In [20]:
# Import pandas & numpy
import pandas as pd
import numpy as np

# import classifier models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# data split module
from sklearn.model_selection import train_test_split

# import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# import plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
# Load the employee records from the CSV file that sits next to the notebook
df = pd.read_csv(filepath_or_buffer = 'employee_data.csv')
# Preview the first five rows
df.head(n = 5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,quit,promotion_last_5years,department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [22]:
# One-hot encode the non-numeric columns ('department' & 'salary') so that
# every feature handed to the classifiers is numeric
df = pd.get_dummies(data = df)
# Preview the first five rows of the encoded frame
df.head(n = 5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,quit,promotion_last_5years,department_IT,department_RandD,...,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [23]:
# Feature matrix: every column except the 'quit' target
X = df.drop(columns = ['quit'])
# Preview the first five rows of the features
X.head(n = 5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,department_IT,department_RandD,department_accounting,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0


In [24]:
# Target vector: the 'quit' column on its own
y = df.loc[:, 'quit']
# Preview the first five target values
y.head(n = 5)

0    1
1    1
2    1
3    1
4    1
Name: quit, dtype: int64

In [25]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 3,
)


In [26]:
def classif_metrics(y_act, y_pred):
    """Score predicted labels against the actual labels.

    Returns a list of four values, each rounded to 3 decimal places, in
    this order: accuracy, recall, precision, F1-score.
    """
    # The order is significant: the model functions in this notebook pair
    # these values positionally with their metric row labels.
    scorers = (accuracy_score, recall_score, precision_score, f1_score)
    return [round(scorer(y_act, y_pred), 3) for scorer in scorers]

In [27]:
def lrModel(X_train, X_test, y_train, y_test):
    """Fit a logistic regression classifier and tabulate its scores.

    The classifier is trained on (X_train, y_train) and its predictions
    for both the train and the test features are scored with
    classif_metrics.

    Returns a DataFrame indexed by 'Accuracy', 'Recall', 'Precision' and
    'F1-Score' with the columns 'LR_train' and 'LR_test'.
    """
    # Build the classifier with its default settings and train it
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)

    # Score the predictions made for each data set
    train_scores = classif_metrics(y_train, classifier.predict(X_train))
    test_scores = classif_metrics(y_test, classifier.predict(X_test))

    # One column per data set, one row per metric
    return pd.DataFrame(
        {'LR_train': train_scores, 'LR_test': test_scores},
        index = ['Accuracy', 'Recall', 'Precision', 'F1-Score'],
    )
     

In [28]:
def rfModel(X_train, X_test, y_train, y_test, random_state = 3):
    """Fit a random forest classifier and tabulate its scores.

    Parameters
    ----------
    X_train, X_test : feature sets used for fitting / evaluation.
    y_train, y_test : target labels matching the feature sets.
    random_state : seed handed to RandomForestClassifier so that the
        reported scores are repeatable between runs. Defaults to 3, the
        same seed the notebook uses for the train/test split. Pass None
        to get an unseeded forest.

    Returns
    -------
    DataFrame indexed by 'Accuracy', 'Recall', 'Precision' and 'F1-Score'
    with the columns 'RF_train' and 'RF_test'.
    """
    # instantiate the model; seeded, because a random forest is stochastic
    rf = RandomForestClassifier(random_state = random_state)
    # fit the model on train data set
    rf.fit(X_train, y_train)
    # predict the target variable on train data set
    y_train_pred = rf.predict(X_train)
    # predict the target variable on test data set
    y_test_pred = rf.predict(X_test)
    # Performance metrics of the Random Forest model
    train_dataSet_metrics = classif_metrics(y_train, y_train_pred)
    test_dataSet_metrics = classif_metrics(y_test, y_test_pred)
    # one column per data set, one row per metric
    metrics_df = pd.DataFrame(index = ['Accuracy', 'Recall', 'Precision', 'F1-Score'])
    metrics_df['RF_train'] = train_dataSet_metrics
    metrics_df['RF_test'] = test_dataSet_metrics
    return metrics_df

In [29]:
def dtModel(X_train, X_test, y_train, y_test, random_state = 3):
    """Fit a decision tree classifier and tabulate its scores.

    Parameters
    ----------
    X_train, X_test : feature sets used for fitting / evaluation.
    y_train, y_test : target labels matching the feature sets.
    random_state : seed handed to DecisionTreeClassifier so that the
        reported scores are repeatable between runs. Defaults to 3, the
        same seed the notebook uses for the train/test split. Pass None
        to get an unseeded tree.

    Returns
    -------
    DataFrame indexed by 'Accuracy', 'Recall', 'Precision' and 'F1-Score'
    with the columns 'DT_train' and 'DT_test'.
    """
    # instantiate the model; seeded, because the tree builder permutes the
    # features at each split and can otherwise differ from run to run
    dt = DecisionTreeClassifier(random_state = random_state)
    # fit the model on train data set
    dt.fit(X_train, y_train)
    # predict the target variable on train data set
    y_train_pred = dt.predict(X_train)
    # predict the target variable on test data set
    y_test_pred = dt.predict(X_test)
    # Performance metrics of the Decision Tree model
    train_dataSet_metrics = classif_metrics(y_train, y_train_pred)
    test_dataSet_metrics = classif_metrics(y_test, y_test_pred)
    # one column per data set, one row per metric
    metrics_df = pd.DataFrame(index = ['Accuracy', 'Recall', 'Precision', 'F1-Score'])
    metrics_df['DT_train'] = train_dataSet_metrics
    metrics_df['DT_test'] = test_dataSet_metrics
    return metrics_df

In [30]:
# Train the three classifiers in turn and line their score tables up side by
# side; all three tables share the same metric-name index
res_df = (
    lrModel(X_train, X_test, y_train, y_test)
    .join(rfModel(X_train, X_test, y_train, y_test))
    .join(dtModel(X_train, X_test, y_train, y_test))
)
res_df



Unnamed: 0,LR_train,LR_test,RF_train,RF_test,DT_train,DT_test
Accuracy,0.795,0.792,0.998,0.987,1.0,0.98
Recall,0.361,0.353,0.994,0.954,1.0,0.966
Precision,0.618,0.609,1.0,0.99,1.0,0.95
F1-Score,0.456,0.447,0.997,0.972,1.0,0.958


In [31]:
# Save the combined score table as an Excel workbook next to the notebook
res_df.to_excel(excel_writer = 'employee_attr_classification_results.xlsx')