In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# load the titanic dataset
df = pd.read_csv(r"HR Analytics.csv")
df

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54803,3030,Technology,region_14,Bachelor's,m,sourcing,1,48,3.0,17,0,0,78,0
54804,74592,Operations,region_27,Master's & above,f,other,1,37,2.0,6,0,0,56,0
54805,13918,Analytics,region_1,Bachelor's,m,other,1,27,5.0,3,1,0,79,0
54806,13614,Sales & Marketing,region_9,,m,sourcing,1,29,1.0,2,0,0,45,0


## 1. Removing outliers using the interquartile range (IQR) method:

In [2]:
# calculate the IQR
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

# remove outliers
df = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]
df

  df = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]


Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0
6,20379,Operations,region_20,Bachelor's,f,other,1,31,3.0,5,0,0,59,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54794,35000,Operations,region_16,Bachelor's,m,other,1,37,4.0,7,1,0,54,0
54795,13477,Procurement,region_15,Master's & above,m,other,1,35,3.0,7,0,0,70,0
54801,12431,Technology,region_26,Bachelor's,f,sourcing,1,31,,1,0,0,78,0
54804,74592,Operations,region_27,Master's & above,f,other,1,37,2.0,6,0,0,56,0


## 2. Removing outliers using the z-score method:

In [5]:
from scipy import stats
import pandas as pd

# load the HR Analytics dataset
df = pd.read_csv("HR Analytics.csv")

# select only numerical columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
print(num_cols)
# calculate z-scores
z_scores = stats.zscore(df[num_cols])

# remove outliers
df = df[(z_scores < 3).all(axis=1)]


Index(['employee_id', 'no_of_trainings', 'age', 'previous_year_rating',
       'length_of_service', 'KPIs_met >80%', 'awards_won?',
       'avg_training_score', 'is_promoted'],
      dtype='object')


## 3. Removing outliers using a manual threshold:

In [None]:
import numpy as np

# load the titanic dataset
df = pd.read_csv(r"HR Analytics.csv")
df

# calculate mean and standard deviation
mean = np.mean(df['variable'])
std = np.std(df['variable'])

# remove outliers
df = df[(df['variable'] > (mean - 3 * std)) & (df['variable'] < (mean + 3 * std))]


## 4. Removing outliers using a machine learning model:

In [None]:
from sklearn.ensemble import IsolationForest

# load the titanic dataset
df = pd.read_csv(r"HR Analytics.csv")
df

# fit isolation forest model to data
model = IsolationForest(n_estimators=100, contamination=0.01)
model.fit(df)

# predict outliers
outliers = model.predict(df) == -1

# remove outliers
df = df[~outliers]
