In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [72]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [73]:
train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [74]:
train.shape

(54808, 14)

In [75]:
train.isnull().sum()

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [76]:
train.education.value_counts(sort=False)

Master's & above    14925
Below Secondary       805
Bachelor's          36669
Name: education, dtype: int64

In [77]:
print('Missing values percentage in each row:')
for column in train.columns:
    print(column + ': '+str(train[column].isnull().sum()/train.shape[0])+' %')

Missing values percentage in each row:
employee_id: 0.0 %
department: 0.0 %
region: 0.0 %
education: 0.04395343745438622 %
gender: 0.0 %
recruitment_channel: 0.0 %
no_of_trainings: 0.0 %
age: 0.0 %
previous_year_rating: 0.07524448985549555 %
length_of_service: 0.0 %
KPIs_met >80%: 0.0 %
awards_won?: 0.0 %
avg_training_score: 0.0 %
is_promoted: 0.0 %


# LabelEncoding the categoricals

# Department

In [78]:
from sklearn.preprocessing import LabelEncoder

In [79]:
dep_lr = LabelEncoder()

In [80]:
train.department = dep_lr.fit_transform(train.department)

In [81]:
train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,7,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,4,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,7,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,7,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,8,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


# Region

In [82]:
reg_lr = LabelEncoder()

In [83]:
train.region = reg_lr.fit_transform(train.region)

In [84]:
train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,7,31,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,4,14,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,7,10,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,7,15,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,8,18,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


# Education

In [85]:
edu_lr = LabelEncoder()

Education column has missing values so they have to be filled first

In [86]:
edu_df = train[['department','region','gender','length_of_service','age','education']]

In [87]:
edu_df.head()

Unnamed: 0,department,region,gender,length_of_service,age,education
0,7,31,f,8,35,Master's & above
1,4,14,m,4,30,Bachelor's
2,7,10,m,7,34,Bachelor's
3,7,15,m,10,39,Bachelor's
4,8,18,m,2,45,Bachelor's


In [88]:
edu_df.education.value_counts()

Bachelor's          36669
Master's & above    14925
Below Secondary       805
Name: education, dtype: int64

In [89]:
edu_df.groupby(['length_of_service'])['education'].apply(lambda x:x.value_counts().index[0])

length_of_service
1           Bachelor's
2           Bachelor's
3           Bachelor's
4           Bachelor's
5           Bachelor's
6           Bachelor's
7           Bachelor's
8           Bachelor's
9           Bachelor's
10          Bachelor's
11          Bachelor's
12          Bachelor's
13          Bachelor's
14          Bachelor's
15    Master's & above
16    Master's & above
17    Master's & above
18    Master's & above
19    Master's & above
20    Master's & above
21    Master's & above
22          Bachelor's
23    Master's & above
24    Master's & above
25          Bachelor's
26    Master's & above
27          Bachelor's
28          Bachelor's
29    Master's & above
30    Master's & above
31    Master's & above
32    Master's & above
33    Master's & above
34          Bachelor's
37          Bachelor's
Name: education, dtype: object

In [90]:
train['education']= train['education'].fillna(train.groupby(['length_of_service'])['education'].apply(lambda x:x.value_counts().index[0]))

In [91]:
train.isnull().sum()

employee_id                0
department                 0
region                     0
education               2406
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [94]:
train[train['education'].isnull()==True]['length_of_service']

43       2
82       2
87       9
90       1
189      9
        ..
54692    7
54717    4
54729    2
54742    3
54806    2
Name: length_of_service, Length: 2406, dtype: int64