In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the raw HR employee churn dataset from a CSV file in the working directory
df1=pd.read_csv('hr_employee_churn_data.csv')

In [4]:
# Basic details about the data: shape gives (number of rows, number of columns)
df1.shape

(14999, 10)

In [6]:
# Column names, non-null counts and dtypes - shows which columns have missing values or are non-numeric
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   empid                  14999 non-null  int64  
 1   satisfaction_level     14997 non-null  float64
 2   last_evaluation        14999 non-null  float64
 3   number_project         14999 non-null  int64  
 4   average_montly_hours   14999 non-null  int64  
 5   time_spend_company     14999 non-null  int64  
 6   Work_accident          14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   salary                 14999 non-null  object 
 9   left                   14999 non-null  int64  
dtypes: float64(2), int64(7), object(1)
memory usage: 1.1+ MB


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df1.describe()

Unnamed: 0,empid,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,left
count,14999.0,14997.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0
mean,7500.0,0.612863,0.716102,3.803054,201.050337,3.498233,0.14461,0.021268,0.238083
std,4329.982679,0.248634,0.171169,1.232592,49.943099,1.460136,0.351719,0.144281,0.425924
min,1.0,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0
25%,3750.5,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0
50%,7500.0,0.64,0.72,4.0,200.0,3.0,0.0,0.0,0.0
75%,11249.5,0.82,0.87,5.0,245.0,4.0,0.0,0.0,0.0
max,14999.0,1.0,1.0,7.0,310.0,10.0,1.0,1.0,1.0


# Feature Engineering before Model Building

## First step: removing unwanted features
#### e.g., we have `empid`, which plays no important role in the given dataset, so we can drop it

In [21]:
# Before dropping the feature, create a copy of the dataset in a new dataframe (df2),
# so that the originally loaded dataframe df1 stays unchanged
df2=df1.copy()

In [22]:
# Drop the 'empid' identifier column by name and bind the result to df2.
# Building df2 from df1 (instead of mutating df2 with inplace=True) keeps this cell idempotent:
# re-running the in-place version raises "KeyError: ... not found in axis", because the column
# has already been removed from df2 by the first run.
# Naming the column via columns= also avoids relying on its position in df1 and on the axis argument.
# Background on the axis-related KeyError:
# https://stackoverflow.com/questions/54296214/axis-error-when-dropping-specific-columns-pandas/54296376
df2 = df1.drop(columns=['empid'])

In [23]:
# Check the remaining columns, their non-null counts and dtypes after the drop
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14997 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   promotion_last_5years  14999 non-null  int64  
 7   salary                 14999 non-null  object 
 8   left                   14999 non-null  int64  
dtypes: float64(2), int64(6), object(1)
memory usage: 1.0+ MB


In [24]:
# Preview the first five rows of df2
df2.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,left
0,0.38,0.53,2,157,3,0,0,low,1
1,0.8,0.86,5,262,6,0,0,medium,1
2,0.11,0.88,7,272,4,0,0,medium,1
3,0.72,0.87,5,223,5,0,0,low,1
4,0.37,0.52,2,159,3,0,0,low,1


## Second step :- Handling missing values
#### We have to find out if any feature has missing values using the method :- isnull()

In [25]:
# Handling missing values
# isnull() flags every missing cell; sum() then counts the flagged cells per column
df2.isnull().sum()

satisfaction_level       2
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
promotion_last_5years    0
salary                   0
left                     0
dtype: int64

In [26]:
# Satisfaction level has 2 missing values
# We will try to fill these 2 missing values

In [27]:
# To choose a fill value, look at the summary statistics of the column 'satisfaction_level'
# (count, mean, std, min, quartiles, max) using the describe() method
df2['satisfaction_level'].describe()

count    14997.000000
mean         0.612863
std          0.248634
min          0.090000
25%          0.440000
50%          0.640000
75%          0.820000
max          1.000000
Name: satisfaction_level, dtype: float64

#### Different approaches to filling the missing values:
##### 1) Replace the missing values with a central value of the column (mean, median or mode)
##### 2) Use a KNN imputer to impute the missing values
#####    --> KNN works by finding the distances between a query and all the examples in the data, selecting the specified number of examples (K) closest to the query, and then voting for the most frequent label (in the case of classification) or averaging the labels (in the case of regression). For imputation, the missing value is filled in from the values of that feature in the K nearest rows (typically their mean).

##### We will use method 1 - to fill the missing values with the mean of satisfaction_level

In [28]:
# Fill the missing values of 'satisfaction_level' with the column mean using fillna().
#   --> The argument of fillna() is the value that replaces the missing entries;
#       here it is df2['satisfaction_level'].mean() (mean() skips missing values by default).
#   --> The filled Series is assigned back to the column instead of calling
#       fillna(..., inplace=True) on df2['satisfaction_level']: an in-place method on a selected
#       column is chained assignment, which recent pandas 2.x versions warn about and which
#       no longer updates df2 under Copy-on-Write (pandas 3.0).
df2['satisfaction_level'] = df2['satisfaction_level'].fillna(df2['satisfaction_level'].mean())

In [30]:
# Re-run the per-column missing-value count to check whether the fill above worked
# (satisfaction_level should now show 0)
df2.isnull().sum()

satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
promotion_last_5years    0
salary                   0
left                     0
dtype: int64

## Handling Categorical Features

In [31]:
# Preview the first five rows again to see which columns hold non-numeric values
df2.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary,left
0,0.38,0.53,2,157,3,0,0,low,1
1,0.8,0.86,5,262,6,0,0,medium,1
2,0.11,0.88,7,272,4,0,0,medium,1
3,0.72,0.87,5,223,5,0,0,low,1
4,0.37,0.52,2,159,3,0,0,low,1


##### In our dataset, the feature 'salary' has categorical values stored as strings.
##### We have to handle such categorical features first, because most ML models work only with numeric inputs.
### Before handling the categorical feature, we will look at which unique values it has

In [32]:
# unique() returns the distinct values that occur in the selected column, here 'salary'
df2['salary'].unique()

array(['low', 'medium', 'high'], dtype=object)