# Identifying and Removing Outliers

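To identify outliers in the data, we will use the Tukey method: a value is flagged as an outlier when it lies more than 1.5 × IQR below the first quartile or above the third quartile.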

In [17]:
import pandas as pd
import numpy as np

In [18]:
# Load the working DataFrame from a local pickle file.
# NOTE(review): presumably this is the Box-Cox-transformed, scaled dataset (going by the file name) — confirm.
# NOTE(review): unpickling can execute arbitrary code; only load this file if its origin is trusted.
data_box_set = pd.read_pickle('final_box_cox_sc.p')

In [19]:
def display_outliers(dataframe, col, param=1.5):
    """Return the rows of ``dataframe`` whose ``col`` value is a Tukey outlier.

    A value is treated as an outlier when it lies more than ``param`` times
    the interquartile range (IQR) below the first quartile or above the
    third quartile. Missing values are ignored when computing the quartiles
    and are never flagged themselves.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect; it is not modified.
    col : hashable
        Label of the column to test.
    param : float, default 1.5
        Multiplier applied to the IQR to size the fences.

    Returns
    -------
    pandas.DataFrame
        The subset of rows (original index preserved) lying outside the fences.
    """
    values = dataframe[col]
    # nanpercentile skips missing values. Plain np.percentile yields NaN as
    # soon as one NaN is present, which makes both comparisons below False
    # for every row and silently reports zero outliers.
    q1, q3 = np.nanpercentile(values, [25, 75])
    tukey_window = param * (q3 - q1)
    below_fence = values < q1 - tukey_window
    above_fence = values > q3 + tukey_window
    return dataframe[below_fence | above_fence]

In [20]:
# For every column, print its label next to the shape of the frame of rows
# that display_outliers flags for that column.
for col in data_box_set.columns:
    flagged_rows = display_outliers(data_box_set, col)
    print(col, flagged_rows.shape)

CRIM (0, 12)
INDUS (0, 12)
NOX (0, 12)
RM (26, 12)
AGE (0, 12)
DIS (0, 12)
RAD (0, 12)
TAX (0, 12)
PTRATIO (0, 12)
B (70, 12)
LSTAT (0, 12)
MEDV (50, 12)


In [30]:
from collections import Counter

In [31]:
# Gather the row indices flagged in each column into one flat list; an index
# appears once for every column in which that row was flagged.
raw_outliers = []
for col in data_box_set.columns:
    outlier_df = display_outliers(data_box_set, col)
    raw_outliers.extend(outlier_df.index.tolist())

In [32]:
# Tally how many times each row index was flagged, then keep the indices
# that were flagged more than once.
outlier_count = Counter(raw_outliers)
outliers = [
    row_idx
    for row_idx, n_flags in outlier_count.items()
    if n_flags > 1
]

In [33]:
# Number of row indices that passed the preceding filter on the flag count.
len(outliers)

27

In [34]:
# Re-tally the flags per row index and keep only the indices flagged more
# than twice.
# NOTE(review): this rebinds `outliers`, replacing the list built with the
# `> 1` threshold in the earlier cell; later cells see this stricter list.
outlier_count = Counter(raw_outliers)
outliers = [
    row_idx
    for row_idx, n_flags in outlier_count.items()
    if n_flags > 2
]

In [35]:
# Number of row indices that passed the preceding filter on the flag count.
len(outliers)

2

In [36]:
# Inspect the raw (row index, flag count) pairs held in the Counter.
outlier_count.items()

dict_items([(97, 1), (163, 2), (166, 2), (186, 2), (195, 2), (203, 2), (204, 2), (224, 2), (225, 2), (226, 1), (232, 1), (233, 2), (253, 2), (257, 2), (262, 2), (267, 2), (283, 2), (364, 1), (365, 1), (367, 2), (374, 1), (384, 3), (386, 1), (406, 1), (412, 2), (414, 3), (18, 1), (25, 1), (27, 1), (32, 1), (34, 1), (102, 1), (134, 1), (145, 1), (146, 1), (153, 1), (154, 1), (155, 1), (156, 1), (165, 1), (167, 1), (168, 1), (169, 1), (170, 1), (366, 1), (404, 2), (407, 1), (408, 1), (409, 1), (410, 1), (411, 1), (413, 1), (415, 2), (416, 2), (417, 1), (418, 2), (419, 2), (420, 1), (421, 1), (422, 1), (423, 1), (424, 1), (425, 2), (426, 1), (427, 1), (428, 1), (429, 1), (430, 1), (431, 1), (432, 1), (433, 1), (434, 1), (435, 1), (436, 1), (437, 2), (438, 2), (444, 1), (445, 1), (446, 1), (449, 1), (450, 1), (454, 1), (455, 1), (456, 1), (457, 1), (458, 1), (460, 1), (465, 1), (466, 1), (467, 1), (475, 1), (490, 2), (98, 1), (161, 1), (162, 1), (202, 1), (228, 1), (256, 1), (261, 1), (268,

In [37]:
# Display the whole Counter mapping each flagged row index to its flag count.
outlier_count

Counter({18: 1,
         25: 1,
         27: 1,
         32: 1,
         34: 1,
         97: 1,
         98: 1,
         102: 1,
         134: 1,
         145: 1,
         146: 1,
         153: 1,
         154: 1,
         155: 1,
         156: 1,
         161: 1,
         162: 1,
         163: 2,
         165: 1,
         166: 2,
         167: 1,
         168: 1,
         169: 1,
         170: 1,
         186: 2,
         195: 2,
         202: 1,
         203: 2,
         204: 2,
         224: 2,
         225: 2,
         226: 1,
         228: 1,
         232: 1,
         233: 2,
         253: 2,
         256: 1,
         257: 2,
         261: 1,
         262: 2,
         267: 2,
         268: 1,
         280: 1,
         282: 1,
         283: 2,
         364: 1,
         365: 1,
         366: 1,
         367: 2,
         368: 1,
         369: 1,
         370: 1,
         371: 1,
         372: 1,
         374: 1,
         384: 3,
         385: 1,
         386: 1,
         387: 1,
    

We can definitely remove the rows that are flagged as outliers in more than one column.