# Ques: From the employees csv file, do EDA and make a model that will predict which person is likely to leave the company

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("employees.csv")

In [3]:
print(df.keys())

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'Department', 'salary'],
      dtype='object')


# Doing EDA to find independent variables

In [4]:
(df.groupby("left").mean())

Unnamed: 0_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years
left,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.66681,0.715473,3.786664,199.060203,3.380032,0.175009,0.026251
1,0.440098,0.718113,3.855503,207.41921,3.876505,0.047326,0.005321


# By doing EDA we've found the independent variables which are:
##### satisfaction level, average monthly hours, promotion in last 5 years, Salary.

In [5]:
df1 = df

In [6]:
df1 = df1.drop("Department",axis = 1)

In [7]:
df1

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary
0,0.38,0.53,2,157,3,0,1,0,low
1,0.80,0.86,5,262,6,0,1,0,medium
2,0.11,0.88,7,272,4,0,1,0,medium
3,0.72,0.87,5,223,5,0,1,0,low
4,0.37,0.52,2,159,3,0,1,0,low
...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,1,0,low
14995,0.37,0.48,2,160,3,0,1,0,low
14996,0.37,0.53,2,143,3,0,1,0,low
14997,0.11,0.96,6,280,4,0,1,0,low


# creating dummy variables because salary col is in text

In [8]:
salary_dummies = pd.get_dummies(df["salary"],prefix="salary")

In [9]:
print(salary_dummies)

       salary_high  salary_low  salary_medium
0                0           1              0
1                0           0              1
2                0           0              1
3                0           1              0
4                0           1              0
...            ...         ...            ...
14994            0           1              0
14995            0           1              0
14996            0           1              0
14997            0           1              0
14998            0           1              0

[14999 rows x 3 columns]


In [10]:
finaldf = pd.concat((df1,salary_dummies),axis = "columns")

In [11]:
finaldf

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,low,0,1,0
1,0.80,0.86,5,262,6,0,1,0,medium,0,0,1
2,0.11,0.88,7,272,4,0,1,0,medium,0,0,1
3,0.72,0.87,5,223,5,0,1,0,low,0,1,0
4,0.37,0.52,2,159,3,0,1,0,low,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,1,0,low,0,1,0
14995,0.37,0.48,2,160,3,0,1,0,low,0,1,0
14996,0.37,0.53,2,143,3,0,1,0,low,0,1,0
14997,0.11,0.96,6,280,4,0,1,0,low,0,1,0


In [16]:
finaldf= finaldf.drop(["last_evaluation","number_project","time_spend_company","Work_accident","salary"],axis = 1)

In [17]:
finaldf

Unnamed: 0,satisfaction_level,left,promotion_last_5years,salary_high,salary_low,salary_medium
0,0.38,1,0,0,1,0
1,0.80,1,0,0,0,1
2,0.11,1,0,0,0,1
3,0.72,1,0,0,1,0
4,0.37,1,0,0,1,0
...,...,...,...,...,...,...
14994,0.40,1,0,0,1,0
14995,0.37,1,0,0,1,0
14996,0.37,1,0,0,1,0
14997,0.11,1,0,0,1,0


In [18]:
from sklearn.model_selection import train_test_split

In [23]:
x_train,x_test,y_train,y_test = train_test_split(finaldf.drop("left",axis = 1),finaldf["left"],test_size=0.3)

In [24]:
print(x_train)

       satisfaction_level  promotion_last_5years  salary_high  salary_low  \
5879                 0.74                      0            0           1   
10387                0.76                      0            0           1   
9437                 0.70                      0            0           1   
13339                0.96                      0            0           0   
9817                 0.56                      0            1           0   
...                   ...                    ...          ...         ...   
13971                0.32                      1            0           0   
12981                0.72                      0            1           0   
6303                 0.74                      0            0           1   
9079                 0.61                      0            0           0   
8840                 0.74                      0            0           0   

       salary_medium  
5879               0  
10387              0  
9437  

In [25]:
print(y_train)

5879     0
10387    0
9437     0
13339    0
9817     0
        ..
13971    0
12981    0
6303     0
9079     0
8840     0
Name: left, Length: 10499, dtype: int64


In [26]:
from sklearn.linear_model import LogisticRegression

In [27]:
model = LogisticRegression()

In [28]:
model.fit(x_train,y_train)

LogisticRegression()

In [29]:
print(model.predict(x_test))

[0 0 0 ... 0 0 0]


In [30]:
print(model.score(x_test,y_test))

0.7608888888888888
