In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [35]:
df1=pd.read_csv('hr_employee_churn_data.csv')

In [36]:
#basic details about data,its rows and columns,etc
df1.shape

(14999, 10)

In [37]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   empid                  14999 non-null  int64  
 1   satisfaction_level     14997 non-null  float64
 2   last_evaluation        14999 non-null  float64
 3   number_project         14999 non-null  int64  
 4   average_montly_hours   14999 non-null  int64  
 5   time_spend_company     14999 non-null  int64  
 6   Work_accident          14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   salary                 14999 non-null  object 
 9   left                   14999 non-null  int64  
dtypes: float64(2), int64(7), object(1)
memory usage: 1.1+ MB


In [38]:
df1.describe()

Unnamed: 0,empid,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,left
count,14999.0,14997.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0
mean,7500.0,0.612863,0.716102,3.803054,201.050337,3.498233,0.14461,0.021268,0.238083
std,4329.982679,0.248634,0.171169,1.232592,49.943099,1.460136,0.351719,0.144281,0.425924
min,1.0,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0
25%,3750.5,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0
50%,7500.0,0.64,0.72,4.0,200.0,3.0,0.0,0.0,0.0
75%,11249.5,0.82,0.87,5.0,245.0,4.0,0.0,0.0,0.0
max,14999.0,1.0,1.0,7.0,310.0,10.0,1.0,1.0,1.0


# Feature Engineering before Model Building

## First step is  - removing unwanted feature.
#### eg, we have emp id, which is not playing any important role in the given dataset,so we can drop this

In [39]:
# But before dropping this feature, create a copy of dataset by creating new dataframe
df2=df1.copy()

In [40]:
#use drop() method to drop a feature
#inplace=column has to be dropped from the same dataframe
# Was getting Key Error : label not in axis by using the command :- df2.drop(['empid'],axiis=1,inplace=True)
#solved the error by the link :- https://stackoverflow.com/questions/54296214/axis-error-when-dropping-specific-columns-pandas/54296376#:~:text=You%20can%20not%20drop%20columns,x%5D)%20with%20X_train%3DX_train.
df2.drop(df1.columns[[0]], axis=1,inplace=True)

In [41]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14997 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   promotion_last_5years  14999 non-null  int64  
 7   salary                 14999 non-null  object 
 8   left                   14999 non-null  int64  
dtypes: float64(2), int64(6), object(1)
memory usage: 1.0+ MB


In [42]:
df2.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,left
0,0.38,0.53,2,157,3,0,0,low,1
1,0.8,0.86,5,262,6,0,0,medium,1
2,0.11,0.88,7,272,4,0,0,medium,1
3,0.72,0.87,5,223,5,0,0,low,1
4,0.37,0.52,2,159,3,0,0,low,1


## Second step :- Handling missing values
#### We have to find out if any feature has missing values using the method :- isnull()

In [43]:
#handling missing values
# sum() - gives sum of missing values
df2.isnull().sum()

satisfaction_level       2
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
promotion_last_5years    0
salary                   0
left                     0
dtype: int64

In [14]:
# Satisfaction level has 2 missing values
# We will try to fill these 2 missing values

In [15]:
# To fill the missing values, we have to see the stats of the column 'satisfaction_level', which we can see by using the describe() method
df2['satisfaction_level'].describe()

count    14997.000000
mean         0.612863
std          0.248634
min          0.090000
25%          0.440000
50%          0.640000
75%          0.820000
max          1.000000
Name: satisfaction_level, dtype: float64

#### Diff approaches of filling the missing values:-
##### 1) Either we can replace the missing values with mean,mode,median(average values)
##### 2) KNN impute library to impute missing values
#####                          --> KNN works by finding the distances between a query and all the examples in the data, selecting the specified number                                                    examples (K) closest to the query, then votes for the most frequent label (in the case of classification) or averages the                                                   labels (in the case of regression).

##### We will use method 1 - to fill the missing values with the mean of satisfaction_level

In [44]:
# To fill missing values, we use the method :- fillna()
# Inside fillna() - 
#                 --> Inside the bracket of fillna(), we have to write with what value we would like to fill the missing values
#                     We will fill it with mean of satisfaction_level, so df2['satisfaction_level'].mean()
#                 --> Inplace means :- Same column in the same feature has to get updated
df2['satisfaction_level'].fillna(df2['satisfaction_level'].mean(), inplace=True)

In [45]:
# Again running isnull() with satisfaction_level to check if the above command worked
df2.isnull().sum()

satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
promotion_last_5years    0
salary                   0
left                     0
dtype: int64

## Handling Categorical Features

In [46]:
df2.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,left
0,0.38,0.53,2,157,3,0,0,low,1
1,0.8,0.86,5,262,6,0,0,medium,1
2,0.11,0.88,7,272,4,0,0,medium,1
3,0.72,0.87,5,223,5,0,0,low,1
4,0.37,0.52,2,159,3,0,0,low,1


##### In our dataset, the feature 'salary' is having string categorical values.
##### We will have to handle these categorical features first, because the ML model works only on numbers
### Before handling the categorical features, we will try to understand what unique values it has

In [47]:
# To see the unique values, we have to use a method called unique() on the particular feature
df2['salary'].unique()

array(['low', 'medium', 'high'], dtype=object)

#### For the below application of Pandas getDummies, look at GetDummiesExplaination.ipynb (explaination and example)

#### To handle the categorical features, we use a method called getDummies

In [48]:
# drop_first allows you to drop your first variable and identify it through all other columns being 0.
salary_dummies=pd.get_dummies(df2['salary'],drop_first=True)

In [49]:
salary_dummies
# we will combine these features with our original dataframe, df2

Unnamed: 0,low,medium
0,1,0
1,0,1
2,0,1
3,1,0
4,1,0
...,...,...
14994,1,0
14995,1,0
14996,1,0
14997,1,0


In [50]:
# using a method called concat, we can combine 2 data frames
df2=pd.concat([df2,salary_dummies],axis=1)
# axis = 1 :- combine dataframes at column level

In [51]:
df2.head()
#low medium from salary_dummies got added to df2

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,left,low,medium
0,0.38,0.53,2,157,3,0,0,low,1,1,0
1,0.8,0.86,5,262,6,0,0,medium,1,0,1
2,0.11,0.88,7,272,4,0,0,medium,1,0,1
3,0.72,0.87,5,223,5,0,0,low,1,1,0
4,0.37,0.52,2,159,3,0,0,low,1,1,0


In [52]:
# since we have already converted the categorical feature "salary" into encoded feature by using get_dummies(), we don't need
# the column 'salary' anymore
df2.drop(['salary'],axis=1,inplace=True)

In [53]:
df2.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,left,low,medium
0,0.38,0.53,2,157,3,0,0,1,1,0
1,0.8,0.86,5,262,6,0,0,1,0,1
2,0.11,0.88,7,272,4,0,0,1,0,1
3,0.72,0.87,5,223,5,0,0,1,1,0
4,0.37,0.52,2,159,3,0,0,1,1,0


# Feature Engineering over. Now we split data into train and test set

## Before splitting the data, we have to split the dataset into X and Y, where 
## X = Independent Features
## Y = Target dependent variable

In [54]:
# split data set into features and label
# From the X variable, we drop target column
X = df2.drop(labels='left',axis=1)
# To get Y, we only need target column
Y = df2['left']

#### Now we can split X and Y into training and testing sets

In [55]:
# import a librari called : sklearn.model_selection, and the class would be train_test_split
# test_size=0.2 , i.e, 20% test data
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=0)

In [56]:
len(X_train)

11999

In [57]:
len(X_test)

3000

# Model building and Model selection
### It is a supervised and classification type problem, and we have different type of algorithms for classification type problem
### In this example, we will take 2 algorithms:-
###                                                                        1) Random Forest Classifier
###                                                                        2) XG Boost Classifier

#### We will also perform hyper tuning for these algorithms and we will see which model gives the best result

### For hyper parameter tuning, we will use a library called grid_search_cv

In [58]:
# Random Forest and XGB Classifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# using GridSearchCV, we will find out that which model will give us the best result for the given dataset


#### Creating model parameter Dictionary variable (with random values)

In [71]:
model_param={
    'RandomForestClassifier':{
        'model':RandomForestClassifier(),
        'param':{
            'n_estimators':[10,50,100,130],
            'criterion':['gini','entropy'],
            'max_depth':range(2,4,1),
            'max_features':['auto','log2']
        }
    },
    'XGBClassifier':{
        'model':XGBClassifier(objective='binary:logistic'),
        'param':{
            'learning_rate':[0.5,0.1,0.01,0.001],
            'max_depth':[3,5,10,20],
            'n_estimators':[10,50,100,200],
            'use_label_encoder':[False]
        }
    }
}
# the variable param will be used with GridSearchCV to find the best models along with their scores

#### The command to apply GridSearchCV on the above 2 models
##### We are iterating through the dictionary variables for each model,
##### Then using GridSearchCV:- 
#####                                             -> estimator is the model name
#####                                             -> the param_grid will hold the model parameter, the expected random parameters
#####                                             -> cv( cross validation set) = 5, 
#####                                                      -> for each random sample of the parameters, it will run the grid search and try to fit ( in
#####                                                          model_selection.fit(X,Y)
#####                                              -> then append the score in score array object.The score array object will store the columns:-
#####                                                          - model name                    - best_score                      - best_params

In [72]:
scores=[]
for model_name,mp in model_param.items():
    model_selection=GridSearchCV(estimator=mp['model'],param_grid=mp['param'],cv=5,return_train_score=False)
    model_selection.fit(X,Y)
    scores.append({
        'model':model_name,
        'best_score':model_selection.best_score_,
        'best_params':model_selection.best_params_
    }
    )






























In [73]:
scores
# The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. 
#To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing 
 #   XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
 # warnings.warn(label_encoder_deprecation_msg, UserWarning)

[{'model': 'RandomForestClassifier',
  'best_score': 0.916727887073469,
  'best_params': {'criterion': 'gini',
   'max_depth': 3,
   'max_features': 'auto',
   'n_estimators': 130}},
 {'model': 'XGBClassifier',
  'best_score': 0.9902661776147605,
  'best_params': {'learning_rate': 0.1,
   'max_depth': 20,
   'n_estimators': 200,
   'use_label_encoder': False}}]