In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [4]:
train.shape

(54808, 14)

In [5]:
train.isnull().sum()

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [6]:
train.education.value_counts(sort=False)

Below Secondary       805
Master's & above    14925
Bachelor's          36669
Name: education, dtype: int64

In [7]:
print('Missing values percentage in each row:')
for column in train.columns:
    print(column + ': '+str(train[column].isnull().sum()/train.shape[0])+' %')

Missing values percentage in each row:
employee_id: 0.0 %
department: 0.0 %
region: 0.0 %
education: 0.04395343745438622 %
gender: 0.0 %
recruitment_channel: 0.0 %
no_of_trainings: 0.0 %
age: 0.0 %
previous_year_rating: 0.07524448985549555 %
length_of_service: 0.0 %
KPIs_met >80%: 0.0 %
awards_won?: 0.0 %
avg_training_score: 0.0 %
is_promoted: 0.0 %


# LabelEncoding the categoricals

# Department

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
dep_lr = LabelEncoder()

In [10]:
train.department = dep_lr.fit_transform(train.department)

In [11]:
train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,7,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,4,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,7,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,7,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,8,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


# Region

In [12]:
reg_lr = LabelEncoder()

In [13]:
train.region = reg_lr.fit_transform(train.region)

In [14]:
train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,7,31,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,4,14,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,7,10,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,7,15,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,8,18,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


# Education

In [15]:
edu_lr = LabelEncoder()

Education column has missing values so they have to be filled first

# Filling missing values in education

In [16]:
edu_df = train[['department','region','gender','length_of_service','age','education']]

In [17]:
edu_df.head()

Unnamed: 0,department,region,gender,length_of_service,age,education
0,7,31,f,8,35,Master's & above
1,4,14,m,4,30,Bachelor's
2,7,10,m,7,34,Bachelor's
3,7,15,m,10,39,Bachelor's
4,8,18,m,2,45,Bachelor's


In [18]:
edu_df.education.value_counts()

Bachelor's          36669
Master's & above    14925
Below Secondary       805
Name: education, dtype: int64

In [19]:
edu_df.groupby(['length_of_service'])['education'].apply(lambda x:x.value_counts().index[0])

length_of_service
1           Bachelor's
2           Bachelor's
3           Bachelor's
4           Bachelor's
5           Bachelor's
6           Bachelor's
7           Bachelor's
8           Bachelor's
9           Bachelor's
10          Bachelor's
11          Bachelor's
12          Bachelor's
13          Bachelor's
14          Bachelor's
15    Master's & above
16    Master's & above
17    Master's & above
18    Master's & above
19    Master's & above
20    Master's & above
21    Master's & above
22          Bachelor's
23    Master's & above
24    Master's & above
25          Bachelor's
26    Master's & above
27          Bachelor's
28          Bachelor's
29    Master's & above
30    Master's & above
31    Master's & above
32    Master's & above
33    Master's & above
34          Bachelor's
37          Bachelor's
Name: education, dtype: object

In [30]:
train['education'] = train['education'].fillna(train.groupby(['age'])['education'].apply(lambda x:x.mode()[0]))

In [31]:
train.isnull().sum()

employee_id                0
department                 0
region                     0
education               2405
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [33]:
train.education = train.education.fillna("Bachelor's")

In [34]:
train.isnull().sum()

employee_id                0
department                 0
region                     0
education                  0
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

# Label Encoding education

In [35]:
train.education = edu_lr.fit_transform(train.education)

In [36]:
train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,7,31,2,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,4,14,0,m,other,1,30,5.0,4,0,0,60,0
2,7513,7,10,0,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,7,15,0,m,other,2,39,1.0,10,0,0,50,0
4,48945,8,18,0,m,other,1,45,3.0,2,0,0,73,0


# Gender

In [38]:
gen_lr = LabelEncoder()

In [39]:
train.gender = gen_lr.fit_transform(train.gender)

In [40]:
train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,7,31,2,0,sourcing,1,35,5.0,8,1,0,49,0
1,65141,4,14,0,1,other,1,30,5.0,4,0,0,60,0
2,7513,7,10,0,1,sourcing,1,34,3.0,7,0,0,50,0
3,2542,7,15,0,1,other,2,39,1.0,10,0,0,50,0
4,48945,8,18,0,1,other,1,45,3.0,2,0,0,73,0


# recruitment_channel

In [41]:
recChannel_lr = LabelEncoder()

In [42]:
train.recruitment_channel = recChannel_lr.fit_transform(train.recruitment_channel)

In [43]:
train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,7,31,2,0,2,1,35,5.0,8,1,0,49,0
1,65141,4,14,0,1,0,1,30,5.0,4,0,0,60,0
2,7513,7,10,0,1,2,1,34,3.0,7,0,0,50,0
3,2542,7,15,0,1,0,2,39,1.0,10,0,0,50,0
4,48945,8,18,0,1,0,1,45,3.0,2,0,0,73,0


# Filling missing values in previous_year_rating

In [67]:
train.dtypes

employee_id               int64
department                int32
region                    int32
education                 int32
gender                    int32
recruitment_channel       int32
no_of_trainings           int64
age                       int64
previous_year_rating    float64
length_of_service         int64
KPIs_met >80%             int64
awards_won?               int64
avg_training_score        int64
is_promoted               int64
dtype: object

In [68]:
train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,7,31,2,0,2,1,35,5.0,8,1,0,49,0
1,65141,4,14,0,1,0,1,30,5.0,4,0,0,60,0
2,7513,7,10,0,1,2,1,34,3.0,7,0,0,50,0
3,2542,7,15,0,1,0,2,39,1.0,10,0,0,50,0
4,48945,8,18,0,1,0,1,45,3.0,2,0,0,73,0


In [72]:
corr= train.corr()
corr

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
employee_id,1.0,-0.005187,-0.003423,0.002106,-0.001542,0.00596,-0.005121,0.000437,0.004533,0.001274,-0.002501,0.00842,-0.000586,0.001206
department,-0.005187,1.0,-0.020592,0.043456,-0.030108,0.004732,0.014152,0.079162,-0.143596,0.05906,-0.088209,-0.002151,-0.251626,0.00013
region,-0.003423,-0.020592,1.0,-0.003786,0.01973,-0.00092,-0.00459,-0.088918,-0.005002,-0.058939,-0.007664,0.000307,0.023572,0.008841
education,0.002106,0.043456,-0.003786,1.0,-0.032403,-0.003469,-0.033501,0.339928,0.018223,0.233668,0.012062,-0.001421,0.032719,0.029231
gender,-0.001542,-0.030108,0.01973,-0.032403,1.0,0.006567,0.084501,-0.016293,-0.024232,-0.019675,-0.0374,0.002381,-0.024494,-0.011109
recruitment_channel,0.00596,0.004732,-0.00092,-0.003469,0.006567,1.0,-0.010405,-0.0114,0.006135,-0.002887,-0.000297,-0.00551,-0.002416,0.002229
no_of_trainings,-0.005121,0.014152,-0.00459,-0.033501,0.084501,-0.010405,1.0,-0.081278,-0.063126,-0.057275,-0.045576,-0.007628,0.042517,-0.024896
age,0.000437,0.079162,-0.088918,0.339928,-0.016293,-0.0114,-0.081278,1.0,0.006008,0.657111,-0.025592,-0.008169,-0.04838,-0.017166
previous_year_rating,0.004533,-0.143596,-0.005002,0.018223,-0.024232,0.006135,-0.063126,0.006008,1.0,0.000253,0.351578,0.027738,0.075139,0.15932
length_of_service,0.001274,0.05906,-0.058939,0.233668,-0.019675,-0.002887,-0.057275,0.657111,0.000253,1.0,-0.077693,-0.039927,-0.038122,-0.01067
