### Outlier Detection and Treatment using Trimming Method

In [7]:
import pandas as pd
import numpy as np

In [8]:
# Read the placement dataset from disk and preview five randomly chosen rows.
df = pd.read_csv(filepath_or_buffer="datasets/placement.csv")
df.sample(n=5)

Unnamed: 0,cgpa,placement_exam_marks,placed
424,6.91,30.0,0
967,7.35,59.0,0
500,6.07,24.0,1
55,8.12,15.0,0
932,6.89,71.0,0


### Using Z-Score

#### z-score = (x − mean) / standard deviation, i.e. $z = \dfrac{x - \bar{x}}{s}$

In [16]:
feature = "cgpa_zscore"

# z-score = (x - mean) / standard deviation, stored as a new column on df.
df[feature] = df["cgpa"].sub(df["cgpa"].mean()).div(df["cgpa"].std())

# Summary statistics of the z-score column itself (not of the raw cgpa);
# the bounds computed in a later cell are built from `mean` and `std`.
mean, std = df[feature].mean(), df[feature].std()

In [17]:
# Number of standard deviations from the mean beyond which a value is an outlier.
thresh = 3

lower_bound = mean - thresh * std
upper_bound = mean + thresh * std

# Work on a copy so the trimming below never touches `df` itself.
df_trim_z = df.copy()

In [18]:
# Finding Outliers
# A value is an outlier when it lies strictly outside [lower_bound, upper_bound].
out_f = df_trim_z[feature]

print("Outliers: ")
out_f.loc[out_f.lt(lower_bound) | out_f.gt(upper_bound)]

Outliers: 


485   -3.314251
995    3.099150
996    3.505062
997   -3.362960
999   -3.346724
Name: cgpa_zscore, dtype: float64

In [21]:
# Trimming Outliers
# Keep every row whose value lies within [lower_bound, upper_bound].
# The bounds are inclusive on purpose: the outlier-finding cell flags values
# with strict `>` / `<`, so a value exactly equal to a bound is NOT an outlier
# and must be kept. Strict `<` / `>` here would silently drop such a row even
# though it was never reported. Rows with a missing value are still dropped,
# as before.
df_trim_z = df_trim_z[df_trim_z[feature].between(lower_bound, upper_bound)]
df_trim_z

Unnamed: 0,cgpa,placement_exam_marks,placed,cgpa_score,cgpa_zscore
0,7.19,26.0,1,0.371425,0.371425
1,7.46,38.0,1,0.809810,0.809810
2,7.54,40.0,1,0.939701,0.939701
3,6.42,8.0,1,-0.878782,-0.878782
4,7.23,17.0,0,0.436371,0.436371
...,...,...,...,...,...
991,7.04,57.0,0,0.127878,0.127878
992,6.26,12.0,0,-1.138565,-1.138565
993,6.73,21.0,1,-0.375452,-0.375452
994,6.48,63.0,0,-0.781363,-0.781363


### Using IQR - Interquartile Range

In [23]:
feature = "cgpa"
# Multiplier applied to the IQR to place the whiskers.
factor = 1.5

# Work on a copy so the trimming below never touches `df` itself.
df_trim_iqr = df.copy()

# First and third quartiles of the feature, and the spread between them.
q1, q3 = df_trim_iqr[feature].quantile([0.25, 0.75])
iqr = q3 - q1

# Whiskers sit `factor` IQRs outside the quartiles.
lower_whisker = q1 - factor * iqr
upper_whisker = q3 + factor * iqr

In [27]:
# Finding Outliers
# A value is an outlier when it lies strictly outside [lower_whisker, upper_whisker].
out_f = df_trim_iqr[feature]

print("Outliers: ")
out_f.loc[out_f.lt(lower_whisker) | out_f.gt(upper_whisker)]

Outliers: 


278    5.23
485    4.92
815    5.27
995    8.87
996    9.12
997    4.89
998    8.62
999    4.90
Name: cgpa, dtype: float64

In [28]:
# Trimming Outliers
# Keep every row whose value lies within [lower_whisker, upper_whisker].
# The whiskers are inclusive on purpose: the outlier-finding cell flags values
# with strict `>` / `<`, so a value exactly equal to a whisker is NOT an
# outlier and must be kept. Strict `<` / `>` here would silently drop such a
# row even though it was never reported. Rows with a missing value are still
# dropped, as before.
df_trim_iqr = df_trim_iqr[df_trim_iqr[feature].between(lower_whisker, upper_whisker)]
df_trim_iqr

Unnamed: 0,cgpa,placement_exam_marks,placed,cgpa_score,cgpa_zscore
0,7.19,26.0,1,0.371425,0.371425
1,7.46,38.0,1,0.809810,0.809810
2,7.54,40.0,1,0.939701,0.939701
3,6.42,8.0,1,-0.878782,-0.878782
4,7.23,17.0,0,0.436371,0.436371
...,...,...,...,...,...
990,6.17,33.0,1,-1.284694,-1.284694
991,7.04,57.0,0,0.127878,0.127878
992,6.26,12.0,0,-1.138565,-1.138565
993,6.73,21.0,1,-0.375452,-0.375452
