### Outlier Detection and Treatment using Capping Method

In [31]:
import pandas as pd
import numpy as np

In [32]:
# Load the placement dataset from a path relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="datasets/placement.csv")
# Preview five randomly chosen rows (no seed is set, so the rows differ on every run).
df.sample(n=5)

Unnamed: 0,cgpa,placement_exam_marks,placed
650,6.19,27.0,0
899,7.62,8.0,1
592,7.09,27.0,1
217,6.92,16.0,0
311,6.97,87.0,1


### Using Z-Score

#### $z = \dfrac{x - \bar{x}}{s}$, where $\bar{x}$ is the sample mean and $s$ is the sample standard deviation

In [33]:
# Standardise cgpa: subtract the column mean, then divide by the column standard deviation.
df["cgpa_zscore"] = df["cgpa"].sub(df["cgpa"].mean()).div(df["cgpa"].std())

# Column that the z-score cells below inspect and cap.
feature = "cgpa_zscore"

# Mean and standard deviation of the standardised column itself.
mean, std = df[feature].mean(), df[feature].std()

In [34]:
# Work on a copy so that capping never overwrites `df`.
df_cap_z = df.copy()

# Number of standard deviations from the mean beyond which a value counts as an outlier.
thresh = 3

# Symmetric cut-offs around the mean.
lower_bound, upper_bound = mean - thresh * std, mean + thresh * std

df_cap_z

Unnamed: 0,cgpa,placement_exam_marks,placed,cgpa_zscore
0,7.19,26.0,1,0.371425
1,7.46,38.0,1,0.809810
2,7.54,40.0,1,0.939701
3,6.42,8.0,1,-0.878782
4,7.23,17.0,0,0.436371
...,...,...,...,...
995,8.87,44.0,1,3.099150
996,9.12,65.0,1,3.505062
997,4.89,34.0,0,-3.362960
998,8.62,46.0,1,2.693239


In [35]:
# List the values of the feature that fall outside [lower_bound, upper_bound].
out_f = df_cap_z[feature]

print("Outliers: ")
out_f.loc[out_f.gt(upper_bound) | out_f.lt(lower_bound)]

Outliers: 


485   -3.314251
995    3.099150
996    3.505062
997   -3.362960
999   -3.346724
Name: cgpa_zscore, dtype: float64

In [36]:
# Capping Outliers
# Clamp the feature into [lower_bound, upper_bound]: values above the upper bound become
# upper_bound, values below the lower bound become lower_bound, everything else is kept.
# Series.clip expresses this directly instead of two nested np.where calls.
# NOTE(review): `feature` is the z-score column at this point, so only the standardised
# values are capped; the raw cgpa column in df_cap_z is left as it was.
df_cap_z[feature] = df_cap_z[feature].clip(lower=lower_bound, upper=upper_bound)
df_cap_z

Unnamed: 0,cgpa,placement_exam_marks,placed,cgpa_zscore
0,7.19,26.0,1,0.371425
1,7.46,38.0,1,0.809810
2,7.54,40.0,1,0.939701
3,6.42,8.0,1,-0.878782
4,7.23,17.0,0,0.436371
...,...,...,...,...
995,8.87,44.0,1,3.000000
996,9.12,65.0,1,3.000000
997,4.89,34.0,0,-3.000000
998,8.62,46.0,1,2.693239


### Using IQR - Interquartile Range

In [37]:
# Column examined in the IQR section, and a fresh copy of the data to cap.
feature = "cgpa"
df_cap_iqr = df.copy()

# Multiplier applied to the IQR when placing the whiskers.
factor = 1.5

# First and third quartiles of the feature, and the spread between them.
q1, q3 = (df_cap_iqr[feature].quantile(q) for q in (0.25, 0.75))
iqr = q3 - q1

# Each whisker sits `factor` IQRs beyond its quartile.
lower_whisker, upper_whisker = q1 - factor * iqr, q3 + factor * iqr

print(upper_whisker, lower_whisker)

df_cap_iqr

8.600000000000001 5.319999999999999


Unnamed: 0,cgpa,placement_exam_marks,placed,cgpa_zscore
0,7.19,26.0,1,0.371425
1,7.46,38.0,1,0.809810
2,7.54,40.0,1,0.939701
3,6.42,8.0,1,-0.878782
4,7.23,17.0,0,0.436371
...,...,...,...,...
995,8.87,44.0,1,3.099150
996,9.12,65.0,1,3.505062
997,4.89,34.0,0,-3.362960
998,8.62,46.0,1,2.693239


In [38]:
# List the values of the feature that fall outside [lower_whisker, upper_whisker].
out_f = df_cap_iqr[feature]

print("Outliers: ")
out_f.loc[out_f.gt(upper_whisker) | out_f.lt(lower_whisker)]

Outliers: 


278    5.23
485    4.92
815    5.27
995    8.87
996    9.12
997    4.89
998    8.62
999    4.90
Name: cgpa, dtype: float64

In [39]:
# Capping Outliers
# Clamp the feature into [lower_whisker, upper_whisker]: values above the upper whisker
# become upper_whisker, values below the lower whisker become lower_whisker, the rest are kept.
# Series.clip expresses this directly instead of two nested np.where calls.
# NOTE(review): the cgpa_zscore column copied from `df` is not recomputed here, so it
# still reflects the uncapped cgpa values.
df_cap_iqr[feature] = df_cap_iqr[feature].clip(lower=lower_whisker, upper=upper_whisker)
df_cap_iqr

Unnamed: 0,cgpa,placement_exam_marks,placed,cgpa_zscore
0,7.19,26.0,1,0.371425
1,7.46,38.0,1,0.809810
2,7.54,40.0,1,0.939701
3,6.42,8.0,1,-0.878782
4,7.23,17.0,0,0.436371
...,...,...,...,...
995,8.60,44.0,1,3.099150
996,8.60,65.0,1,3.505062
997,5.32,34.0,0,-3.362960
998,8.60,46.0,1,2.693239
