In [1]:
# Libraries for data manipulation and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [2]:
# Libraries for data preprocessing and model building and evaluation
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score

In [3]:
train=pd.read_csv('train_LZdllcl.csv')
train.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [4]:
test=pd.read_csv('test_2umaH9m.csv')
test.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8724,Technology,region_26,Bachelor's,m,sourcing,1,24,,1,1,0,77
1,74430,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51
2,72255,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47
3,38562,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65
4,64486,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61


In [5]:
train.set_index('employee_id',inplace=True)

In [6]:
train.shape,test.shape

((54808, 13), (23490, 13))

In [7]:
train.columns

Index(['department', 'region', 'education', 'gender', 'recruitment_channel',
       'no_of_trainings', 'age', 'previous_year_rating', 'length_of_service',
       'KPIs_met >80%', 'awards_won?', 'avg_training_score', 'is_promoted'],
      dtype='object')

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54808 entries, 65438 to 51526
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   department            54808 non-null  object 
 1   region                54808 non-null  object 
 2   education             52399 non-null  object 
 3   gender                54808 non-null  object 
 4   recruitment_channel   54808 non-null  object 
 5   no_of_trainings       54808 non-null  int64  
 6   age                   54808 non-null  int64  
 7   previous_year_rating  50684 non-null  float64
 8   length_of_service     54808 non-null  int64  
 9   KPIs_met >80%         54808 non-null  int64  
 10  awards_won?           54808 non-null  int64  
 11  avg_training_score    54808 non-null  int64  
 12  is_promoted           54808 non-null  int64  
dtypes: float64(1), int64(7), object(5)
memory usage: 5.9+ MB


In [9]:
train.describe()

Unnamed: 0,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
count,54808.0,54808.0,50684.0,54808.0,54808.0,54808.0,54808.0,54808.0
mean,1.253011,34.803915,3.329256,5.865512,0.351974,0.023172,63.38675,0.08517
std,0.609264,7.660169,1.259993,4.265094,0.47759,0.15045,13.371559,0.279137
min,1.0,20.0,1.0,1.0,0.0,0.0,39.0,0.0
25%,1.0,29.0,3.0,3.0,0.0,0.0,51.0,0.0
50%,1.0,33.0,3.0,5.0,0.0,0.0,60.0,0.0
75%,1.0,39.0,4.0,7.0,1.0,0.0,76.0,0.0
max,10.0,60.0,5.0,37.0,1.0,1.0,99.0,1.0


In [10]:
train.nunique()

department               9
region                  34
education                3
gender                   2
recruitment_channel      3
no_of_trainings         10
age                     41
previous_year_rating     5
length_of_service       35
KPIs_met >80%            2
awards_won?              2
avg_training_score      61
is_promoted              2
dtype: int64

In [11]:
train.isnull().sum()

department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [12]:
train['education'].unique()

array(["Master's & above", "Bachelor's", nan, 'Below Secondary'],
      dtype=object)

In [13]:
train['education'].fillna('Unknown',inplace=True)

In [14]:
train[train['previous_year_rating'].isna()]

Unnamed: 0_level_0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
employee_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
29934,Technology,region_23,Unknown,m,sourcing,1,30,,1,0,0,77,0
71177,Procurement,region_5,Bachelor's,m,other,1,27,,1,0,0,70,0
74759,Sales & Marketing,region_4,Bachelor's,m,sourcing,1,26,,1,0,0,44,0
45709,Sales & Marketing,region_31,Bachelor's,f,other,1,29,,1,0,0,49,0
26599,Sales & Marketing,region_16,Bachelor's,m,other,2,27,,1,1,0,47,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
74615,R&D,region_31,Bachelor's,m,sourcing,1,30,,1,1,0,88,0
11685,Operations,region_15,Bachelor's,m,sourcing,1,31,,1,1,0,56,1
10546,Finance,region_6,Bachelor's,m,other,1,28,,1,1,0,61,0
37919,Finance,region_2,Bachelor's,m,other,1,23,,1,1,0,61,0


In [15]:
train['previous_year_rating'].fillna(0,inplace=True)

In [16]:
train.isnull().sum()

department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
is_promoted             0
dtype: int64

In [17]:
x=train.drop('is_promoted',axis=1)
y=train['is_promoted']

In [18]:
x=pd.get_dummies(x)

In [19]:
sc=StandardScaler()
x=sc.fit_transform(x)

### Random forest classifier

In [20]:
rf_model=RandomForestClassifier(n_estimators=100)

In [21]:
rf_model.fit(x,y)

RandomForestClassifier()

In [22]:
f1_score(y,rf_model.predict(x))

0.9988211338548924

In [23]:
print(cross_val_score(rf_model, x, y, cv=5, scoring='f1').mean())

0.411364195006567


### Testing the model

In [24]:
test.isnull().sum()

employee_id                0
department                 0
region                     0
education               1034
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    1812
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
dtype: int64

In [25]:
test['education'].fillna('Unknown',inplace=True)

In [26]:
test[test['previous_year_rating'].isna()]

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,8724,Technology,region_26,Bachelor's,m,sourcing,1,24,,1,1,0,77
21,5677,Technology,region_17,Bachelor's,m,sourcing,1,25,,1,0,0,80
32,67672,Technology,region_17,Bachelor's,m,other,1,29,,1,1,0,85
39,55325,Analytics,region_22,Bachelor's,m,other,1,25,,1,0,0,88
47,44159,Analytics,region_22,Master's & above,m,other,1,31,,1,1,0,84
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23406,53291,Operations,region_4,Bachelor's,m,sourcing,1,32,,1,1,0,62
23436,10138,Technology,region_2,Bachelor's,f,referred,1,29,,1,0,0,79
23445,65765,Analytics,region_31,Bachelor's,m,sourcing,3,28,,1,0,0,86
23479,39410,Sales & Marketing,region_2,Bachelor's,m,other,3,20,,1,0,0,49


In [27]:
test['previous_year_rating'].fillna(0,inplace=True)

In [28]:
test.isnull().sum()

employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
dtype: int64

In [29]:
x_test=test.drop('employee_id',axis=1)

In [30]:
x_test=pd.get_dummies(x_test)

In [31]:
x_test=sc.fit_transform(x_test)

In [32]:
output=pd.DataFrame({'employee_id':test['employee_id'],'is_promoted':rf_model.predict(x_test)})
output.set_index('employee_id',inplace=True)

In [33]:
output.head()

Unnamed: 0_level_0,is_promoted
employee_id,Unnamed: 1_level_1
8724,0
74430,0
72255,0
38562,0
64486,0


In [34]:
output['is_promoted'].value_counts()

0    22820
1      670
Name: is_promoted, dtype: int64

In [35]:
output.to_csv('C:/Users/sofia/output5.csv')