### Outlier Analysis


#### The goal of this section is to find the outliers in the data using Tukey's method.

This means that we will look for points that are more than 1.5 times the Inter-quartile range above the third quartile or below the first quartile.

In [1]:
# Install mlbench only when it is not already available, so that re-running
# the notebook does not reinstall the package every time.
if (!requireNamespace("mlbench", quietly = TRUE)) {
    install.packages("mlbench")
}

Updating HTML index of packages in '.Library'
Making 'packages.html' ... done


In [2]:
library(mlbench)

In [3]:
# Load the BostonHousing data set into the workspace.
data(list = "BostonHousing")

In [4]:
# Number of rows and columns in the raw data set.
c(nrow(BostonHousing), ncol(BostonHousing))

### Check for missing data and handle it.


Notice below that we do not have any missing data

In [5]:
# Show every row that has at least one missing value (none are expected).
BostonHousing[which(!complete.cases(BostonHousing)), ]

crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv


### Prepare data for analysing


Let's separate features from the target.

Note that we can keep the categorical column chas among the features, because chas is a binary column whose only values are 0 and 1. We just need to convert the column's data type from factor to numeric; after that we can perform computations on it like any other numeric column.

In [6]:
# The first 13 columns are the features; the remaining column (medv) is the
# target and is left out.
bostonhousing_features <- BostonHousing[, 1:13]
# chas is a factor: convert via character so we obtain the labels (0/1)
# rather than the factor's internal level codes.
bostonhousing_features[["chas"]] <-
    as.numeric(as.character(bostonhousing_features[["chas"]]))
head(bostonhousing_features)

crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14
0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33
0.02985,0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21


#### Log and scale the data

Before you log the data, remember to add a very small number (0.0001) to the features zn and chas, because both contain the value 0: log(0) is -Inf, which would turn into NaN/Inf values once the data is scaled.

In [7]:
# zn and chas contain zeros; shift them by a small offset so that log() stays
# finite for every instance.
bostonhousing_features_m <- transform(
    bostonhousing_features,
    zn = zn + 0.0001,
    chas = chas + 0.0001
)

# Log-transform every feature, then standardise each column with scale()
# and convert the resulting matrix back into a data frame.
bostonhousing_log_features <- log(bostonhousing_features_m)
bostonhousing_log_sc_features <- data.frame(scale(bostonhousing_log_features))

#### Display Outliers

The `display_outliers` function below uses Tukey's rule to identify the outliers of a single feature.

In [8]:
#' Return the rows of a data frame that are Tukey outliers for one feature.
#'
#' A row is flagged when its value in `feature` lies more than `param` times
#' the inter-quartile range below the first quartile or above the third
#' quartile.
#'
#' @param dataframe A data frame containing the column `feature`.
#' @param feature Name (or index) of the numeric column to test.
#' @param param Multiplier applied to the inter-quartile range (default 1.5).
#' @return A data frame with all columns of `dataframe`, restricted to the
#'   flagged rows (original row names are kept). It has zero rows when
#'   nothing is flagged.
display_outliers <- function(dataframe, feature, param = 1.5) {
    feature_vec <- as.vector(dataframe[[feature]])
    # na.rm = TRUE: quantile() stops with an error on missing values otherwise.
    quartiles <- quantile(feature_vec, c(0.25, 0.75), na.rm = TRUE, names = FALSE)
    tukey_window <- param * (quartiles[2] - quartiles[1])
    tukey_mask <- feature_vec < quartiles[1] - tukey_window |
        feature_vec > quartiles[2] + tukey_window
    # which() discards NA comparisons (an NA in a logical index would add an
    # all-NA row); drop = FALSE keeps the result a data frame even when
    # `dataframe` has a single column.
    dataframe[which(tukey_mask), , drop = FALSE]
}

#### Let's look at the outlier instances for each feature. 

In [9]:
# Rows flagged as Tukey outliers on the crim feature.
display_outliers(bostonhousing_log_sc_features, feature = "crim")

crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat


In [10]:
# Rows flagged as Tukey outliers on the zn feature.
display_outliers(bostonhousing_log_sc_features, feature = "zn")

crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat


In [11]:
# Rows flagged as Tukey outliers on the indus feature.
display_outliers(bostonhousing_log_sc_features, feature = "indus")

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
57,-1.435886,1.816415,-3.167744,-0.2723291,-1.3975,0.1938494,-0.8443067,1.908681,-1.3425573,-0.4672479,-0.4583125,0.3022931,-1.02896
196,-1.619725,1.805691,-3.779625,-0.2723291,-1.254321,2.0639136,-1.0234577,1.007031,-0.5502381,-0.9842937,-1.9282409,0.2935664,-2.134168


In [12]:
# Rows flagged as Tukey outliers on the chas feature.
display_outliers(bostonhousing_log_sc_features, feature = "chas")

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
143,0.91612914,-0.5986281,1.0480441,3.664771,2.34219579,-1.290092974,0.84220282,-1.685098923,-0.2951683,0.1703775,-1.7630504,0.30229314,1.5280345
153,0.41609706,-0.5986281,1.0480441,3.664771,2.34219579,-1.958860473,0.63289397,-1.319022485,-0.2951683,0.1703775,-1.7630504,0.11464823,0.20618006
155,0.52115011,-0.5986281,1.0480441,3.664771,2.34219579,-0.167660431,0.77536265,-1.165348107,-0.2951683,0.1703775,-1.7630504,0.02797012,0.5742356
156,0.94500688,-0.5986281,1.0480441,3.664771,2.34219579,-0.134314094,0.52920485,-1.169484589,-0.2951683,0.1703775,-1.7630504,-1.64507621,0.56319247
161,0.47277988,-0.5986281,1.0480441,3.664771,0.53353858,0.006386816,0.71632111,-1.114148614,-0.2951683,0.1703775,-1.7630504,0.09812227,-1.10871471
163,0.64143292,-0.5986281,1.0480441,3.664771,0.53353858,1.981001749,0.81246192,-0.879886138,-0.2951683,0.1703775,-1.7630504,0.27832571,-2.86015099
164,0.5543357,-0.5986281,1.0480441,3.664771,0.53353858,2.611947297,0.73914791,-0.7728688,-0.2951683,0.1703775,-1.7630504,0.27447065,-1.94877225
209,-0.56225356,-0.5986281,0.2570422,3.664771,-0.52295782,-0.262581018,-0.01894744,0.475106709,-0.5502381,-0.7755123,0.1221582,0.25051947,0.52281919
210,-0.02328444,-0.5986281,0.2570422,3.664771,-0.52295782,-1.387844297,0.84220282,0.308617065,-0.5502381,-0.7755123,0.1221582,0.30229314,1.2788243
211,-0.44662399,-0.5986281,0.2570422,3.664771,-0.52295782,-0.416590947,0.70745614,0.309621221,-0.5502381,-0.7755123,0.1221582,0.29034852,0.79549444


In [13]:
# Rows flagged as Tukey outliers on the nox feature.
display_outliers(bostonhousing_log_sc_features, feature = "nox")

crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat


In [14]:
# Rows flagged as Tukey outliers on the rm feature.
display_outliers(bostonhousing_log_sc_features, feature = "rm")

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
98,-0.61651407,-0.5986281,-1.414354,-0.2723291,-0.9909291,2.280574,0.3928518,0.11742862,-1.34255729,-0.7846368,-0.1405361,0.3022931,-1.5535296
145,0.83383522,-0.5986281,1.0480441,-0.2723291,2.3421958,-2.154612,0.8057788,-1.65133018,-0.29516833,0.1703775,-1.7630504,0.3022931,1.6746472
148,0.75980831,-0.5986281,1.0480441,-0.2723291,2.3421958,-2.112947,0.7702379,-1.49949657,-0.29516833,0.1703775,-1.7630504,0.2852756,1.6882279
164,0.5543357,-0.5986281,1.0480441,3.6647712,0.5335386,2.611947,0.7391479,-0.7728688,-0.29516833,0.1703775,-1.7630504,0.2744706,-1.9487723
167,0.68391819,-0.5986281,1.0480441,-0.2723291,0.5335386,2.124753,0.7787703,-0.87516939,-0.29516833,0.1703775,-1.7630504,0.2091093,-1.7684268
205,-1.44635713,1.8360891,-1.5114466,-0.2723291,-1.3242013,2.241874,-1.0285825,0.82426875,-0.55023813,-1.3113079,-1.7630504,0.2814412,-2.1853782
225,-0.17284487,-0.5986281,-0.4319792,-0.2723291,-0.3730008,2.495318,0.4416684,-0.23214784,0.24208104,-0.5160801,-0.4121369,0.2631047,-1.581433
226,0.06463712,-0.5986281,-0.4319792,-0.2723291,-0.3730008,2.976438,0.5371148,-0.23214784,0.24208104,-0.5160801,-0.4121369,0.252823,-1.3952744
227,-0.08396289,-0.5986281,-0.4319792,-0.2723291,-0.3730008,2.24852,0.6047439,-0.03704469,0.24208104,-0.5160801,-0.4121369,0.2709045,-2.0468462
233,0.10524975,-0.5986281,-0.4319792,-0.2723291,-0.3435455,2.571461,0.3336241,0.29102813,0.24208104,-0.5160801,-0.4121369,0.265989,-2.4409521


In [15]:
# Rows flagged as Tukey outliers on the age feature.
display_outliers(bostonhousing_log_sc_features, feature = "age")

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
42,-0.5918796,-0.5986281,-0.2924398,-0.2723291,-0.9575816,0.7178901697,-4.954793,1.0306687,-0.87908029,-1.2119242,-0.1851683,0.2643129,-1.3214543
43,-0.5434745,-0.5986281,-0.2924398,-0.2723291,-0.9575816,-0.1097468319,-3.608298,1.0306687,-0.87908029,-1.2119242,-0.1851683,0.2574514,-1.0174628
44,-0.4884961,-0.5986281,-0.2924398,-0.2723291,-0.9575816,-0.0493403099,-3.633296,1.0306687,-0.87908029,-1.2119242,-0.1851683,0.2943205,-0.6059237
71,-0.7617917,-0.5986281,0.2835052,-0.2723291,-1.3613162,0.2411453478,-3.608298,0.884586,-0.55023813,-0.5325698,0.3765115,0.2586649,-0.7753099
73,-0.7444097,-0.5986281,0.2835052,-0.2723291,-1.3613162,-0.2611130097,-3.334771,0.884586,-0.55023813,-0.5325698,0.3765115,0.2826324,-1.1026741
74,-0.3942191,-0.5986281,0.2835052,-0.2723291,-1.3613162,-0.0007382233,-3.710666,0.884586,-0.55023813,-0.5325698,0.3765115,0.2363717,-0.5837045
75,-0.8132917,-0.5986281,0.5039909,-0.2723291,-1.0809669,0.0390887523,-3.764355,0.4804766,-0.29516833,0.13888,0.1651151,0.2958273,-0.760517
194,-1.4070918,1.7548037,-1.3966627,-0.2723291,-1.507662,0.7572538443,-2.944407,1.1855756,-2.13487646,-0.8872465,-1.2869823,0.290743,-1.257374
201,-1.5028535,1.8360891,-2.2843735,-0.2723291,-1.4829694,1.1853832307,-2.388763,1.5700548,-0.87908029,0.1641093,-0.5984582,0.2605839,-1.4612643
215,-0.2122943,-0.5986281,0.2570422,-0.2723291,-0.5229578,-1.2752756559,-2.96103,0.1657377,-0.55023813,-0.7755123,0.1221582,0.1357542,1.6893546


In [16]:
# Rows flagged as Tukey outliers on the dis feature.
display_outliers(bostonhousing_log_sc_features, feature = "dis")

crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat


In [17]:
# Rows flagged as Tukey outliers on the rad feature.
display_outliers(bostonhousing_log_sc_features, feature = "rad")

crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat


In [18]:
# Rows flagged as Tukey outliers on the tax feature.
display_outliers(bostonhousing_log_sc_features, feature = "tax")

crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat


In [19]:
# Rows flagged as Tukey outliers on the ptratio feature.
display_outliers(bostonhousing_log_sc_features, feature = "ptratio")

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
197,-1.12656647,1.805691,-2.241325,-0.2723291,-1.470669,1.3730501,-0.91938489,1.4847173,-1.3425573,-0.341469,-2.998022,0.3022931,-1.6057282
198,-1.05660436,1.805691,-2.241325,-0.2723291,-1.470669,1.1503773,-0.80354049,1.4847173,-1.3425573,-0.341469,-2.998022,0.1555364,-0.36286235
199,-1.15547247,1.805691,-2.241325,-0.2723291,-1.470669,1.3571535,-0.72920178,1.4847173,-1.3425573,-0.341469,-2.998022,0.2868919,-0.80026081
258,0.13351278,1.560474,-1.005712,-0.2723291,0.8666578,2.9549841,0.61229802,-1.111471,-0.2951683,-0.8967849,-2.747643,0.2786243,-1.22786046
259,0.17123788,1.560474,-1.005712,-0.2723291,0.8666578,1.429073,0.84220282,-1.0175669,-0.2951683,-0.8967849,-2.747643,0.2571816,-0.52942068
260,0.16643098,1.560474,-1.005712,-0.2723291,0.8666578,0.8120722,0.84220282,-0.9073351,-0.2951683,-0.8967849,-2.747643,0.2860015,-0.73131983
261,0.07606382,1.560474,-1.005712,-0.2723291,0.8666578,1.2698288,0.51326938,-0.8161477,-0.2951683,-0.8967849,-2.747643,0.2888682,-0.18346791
262,0.07090562,1.560474,-1.005712,-0.2723291,0.8666578,1.6532566,0.6587378,-0.7919985,-0.2951683,-0.8967849,-2.747643,0.2742044,-0.64668153
263,0.05863832,1.560474,-1.005712,-0.2723291,0.8666578,2.6363631,0.69675442,-0.6674787,-0.2951683,-0.8967849,-2.747643,0.2691678,-0.9890629
264,0.27213959,1.560474,-1.005712,-0.2723291,0.8666578,1.4217856,0.74957697,-0.8456019,-0.2951683,-0.8967849,-2.747643,0.2909073,0.08221613


In [20]:
# Rows flagged as Tukey outliers on the b feature.
display_outliers(bostonhousing_log_sc_features, feature = "b")

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
19,0.2593254,-0.5986281,-0.08159911,-0.2723291,-0.04899171,-1.203188326,-0.8035405,0.2706851,-0.55023813,-0.51608012,1.09443540,-0.107925512,0.14606402
26,0.2806251,-0.5986281,-0.08159911,-0.2723291,-0.04899171,-0.972855925,0.5895302,0.5669666,-0.55023813,-0.51608012,1.09443540,-0.044929107,0.72059798
28,0.3400468,-0.5986281,-0.08159911,-0.2723291,-0.04899171,-0.287574267,0.6477118,0.5664673,-0.55023813,-0.51608012,1.09443540,-0.032377665,0.79645779
33,0.5126120,-0.5986281,-0.08159911,-0.2723291,-0.04899171,-0.431540970,0.5172678,0.3628210,-0.55023813,-0.51608012,1.09443540,-0.388571869,1.58236293
35,0.5820496,-0.5986281,-0.08159911,-0.2723291,-0.04899171,-0.215724401,0.7906414,0.2526814,-0.55023813,-0.51608012,1.09443540,-0.304072621,1.06778743
103,-0.3212902,-0.5986281,-0.01684901,-0.2723291,-0.21788816,0.224481353,0.5837885,-0.3509446,-0.29516833,0.04853567,1.05619449,-1.926394346,-0.01212361
116,-0.4549705,-0.5986281,0.18454990,-0.2723291,0.03334898,-0.464519658,0.6366110,-0.5312085,-0.08676112,0.34569253,-0.23005048,0.120772662,0.64322763
119,-0.5806215,-0.5986281,0.18454990,-0.2723291,0.03334898,-0.549020843,0.3291504,-0.5204044,-0.08676112,0.34569253,-0.23005048,0.097015540,0.60152704
135,0.3498151,-0.5986281,1.19157445,-0.2723291,0.68701021,-0.725106162,0.8157933,-0.6214861,-0.55023813,0.37472528,1.17037395,-0.230943706,0.79934452
146,0.7618969,-0.5986281,1.04804409,-0.2723291,2.34219579,-0.166207990,0.8422028,-1.5531738,-0.29516833,0.17037747,-1.76305040,-0.771972119,1.58775935


In [21]:
# Rows flagged as Tukey outliers on the lstat feature.
display_outliers(bostonhousing_log_sc_features, feature = "lstat")

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
162,0.5370696,-0.5986281,1.048044,-0.2723291,0.5335386,1.616481,0.68418,-0.9443896,-0.2951683,0.1703775,-1.76305,0.2269452,-3.033566


#### Let's count the outliers across all the features.

In [22]:
# Print, for every feature, the number of rows that Tukey's rule flags.
for (feature in names(bostonhousing_log_sc_features)) {
    outlier_count <- nrow(display_outliers(bostonhousing_log_sc_features, feature))
    print(paste(feature, outlier_count))
}

[1] "crim 0"
[1] "zn 0"
[1] "indus 2"
[1] "chas 35"
[1] "nox 0"
[1] "rm 27"
[1] "age 17"
[1] "dis 0"
[1] "rad 0"
[1] "tax 0"
[1] "ptratio 16"
[1] "b 78"
[1] "lstat 1"


#### Let's collect the row names of the outliers for every feature. An instance that is an outlier for more than one feature shows up more than once.

In [24]:
# Gather the row names of the flagged rows for each feature and concatenate
# them in column order; a row flagged for several features appears once per
# feature.
raw_outliers <- unlist(
    lapply(
        colnames(bostonhousing_log_sc_features),
        function(feature) {
            rownames(display_outliers(bostonhousing_log_sc_features, feature))
        }
    ),
    use.names = FALSE
)
raw_outliers

#### Let's remove the outliers

In [32]:
# Keep only the rows whose row name was never flagged as an outlier.
# A logical mask is used instead of -which(...): when nothing matches,
# which() returns integer(0) and negative indexing with it selects zero rows,
# which would silently drop the entire data set.
bostonhousing_wo_outliers <- bostonhousing_log_sc_features[
    !(rownames(bostonhousing_log_sc_features) %in% raw_outliers), ,
    drop = FALSE
]


In [33]:
# Number of rows and columns left after removing the outliers.
c(nrow(bostonhousing_wo_outliers), ncol(bostonhousing_wo_outliers))

In [35]:
# Share of instances removed as outliers, in percent. The counts are taken
# from the data frames themselves rather than hard-coded, so the figure stays
# correct if the data or the outlier rule changes.
outlier_records_percent <-
    (nrow(bostonhousing_log_sc_features) - nrow(bostonhousing_wo_outliers)) /
    nrow(bostonhousing_log_sc_features) * 100
outlier_records_percent

Wow, about 31.6% of the instances were flagged as an outlier for at least one feature.