In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import sklearn as sk
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
df=pd.read_csv("HR_clean.csv")
df.head(10)

Unnamed: 0.1,Unnamed: 0,satisfaction_level,last_evaluation_score,project_count,average_monthly_hours,years_of_working,workplace_acciedent_count,employment_status,promotion_last_5years,department,salary_range,Score_variance
0,0,0.38,0.53,2,157,3,0,1,0,sales,low,0.15
1,2,0.11,0.88,7,272,4,0,1,0,sales,medium,0.77
2,3,0.72,0.87,5,223,5,0,1,0,sales,low,0.15
3,4,0.37,0.52,2,159,3,0,1,0,sales,low,0.15
4,5,0.41,0.5,2,153,3,0,1,0,sales,low,0.09
5,6,0.1,0.77,6,247,4,0,1,0,sales,low,0.67
6,7,0.92,0.85,5,259,5,0,1,0,sales,low,-0.07
7,8,0.89,1.0,5,224,5,0,1,0,sales,low,0.11
8,9,0.42,0.53,2,142,3,0,1,0,sales,low,0.11
9,10,0.45,0.54,2,135,3,0,1,0,sales,low,0.09


In [4]:
df.columns

Index(['Unnamed: 0', 'satisfaction_level', 'last_evaluation_score',
       'project_count', 'average_monthly_hours', 'years_of_working',
       'workplace_acciedent_count', 'employment_status',
       'promotion_last_5years', 'department', 'salary_range',
       'Score_variance'],
      dtype='object')

In [5]:
df.drop(columns=['Unnamed: 0'], inplace=True)
df.columns

Index(['satisfaction_level', 'last_evaluation_score', 'project_count',
       'average_monthly_hours', 'years_of_working',
       'workplace_acciedent_count', 'employment_status',
       'promotion_last_5years', 'department', 'salary_range',
       'Score_variance'],
      dtype='object')

In [6]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

np.random.seed(41)

x = df.drop("employment_status", axis=1)
y = df["employment_status"]

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer 


categorical_features = ["department", "salary_range"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer ([(
                                    "one_hot",
                                    one_hot,
                                    categorical_features,)],
                                    remainder = "passthrough")
transformed_x=transformer.fit_transform(x)
transformed_x


x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.2)

clf = LinearSVC()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

0.8263205013428827

In [7]:
from sklearn.metrics import classification_report
print(classification_report(y_test, clf.predict(x_test)))

              precision    recall  f1-score   support

           0       0.84      0.97      0.90      1849
           1       0.48      0.12      0.20       385

    accuracy                           0.83      2234
   macro avg       0.66      0.55      0.55      2234
weighted avg       0.78      0.83      0.78      2234



## Method 1 Undersampling

In [8]:
df1= pd.get_dummies(data=df, columns=["department", "salary_range"])
df1.columns

Index(['satisfaction_level', 'last_evaluation_score', 'project_count',
       'average_monthly_hours', 'years_of_working',
       'workplace_acciedent_count', 'employment_status',
       'promotion_last_5years', 'Score_variance', 'department_IT',
       'department_RandD', 'department_accounting', 'department_hr',
       'department_management', 'department_marketing',
       'department_product_mng', 'department_sales', 'department_support',
       'department_technical', 'salary_range_high', 'salary_range_low',
       'salary_range_medium'],
      dtype='object')

In [9]:
# Class count
count_class_0, count_class_1 = df.employment_status.value_counts()

# Divide by class
df_class_0 = df1[df1["employment_status"] == 0]
df_class_1 = df1[df1["employment_status"] == 1]

In [10]:
df_class_0.shape

(9285, 22)

In [11]:
df_class_1.shape

(1882, 22)

In [12]:
df_class_0_under = df_class_0.sample(count_class_1)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis = 0 )


print('Random under-sampling:')
print(df_test_under.employment_status.value_counts())



Random under-sampling:
0    1882
1    1882
Name: employment_status, dtype: int64


In [13]:
x = df_test_under.drop("employment_status", axis=1)
y = df_test_under["employment_status"]


In [14]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=15, stratify=y)

In [15]:
y_train.value_counts()

1    1506
0    1505
Name: employment_status, dtype: int64

In [16]:
clf = LinearSVC()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

0.49800796812749004

In [17]:
from sklearn.metrics import classification_report
print(classification_report(y_test, clf.predict(x_test)))

              precision    recall  f1-score   support

           0       0.50      0.99      0.66       377
           1       0.00      0.00      0.00       376

    accuracy                           0.50       753
   macro avg       0.25      0.50      0.33       753
weighted avg       0.25      0.50      0.33       753



## Trying the second  model

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
model = RandomForestClassifier()

In [20]:
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.950863213811421

In [21]:
print(classification_report(y_test, model.predict(x_test)))

              precision    recall  f1-score   support

           0       0.92      0.99      0.95       377
           1       0.99      0.91      0.95       376

    accuracy                           0.95       753
   macro avg       0.95      0.95      0.95       753
weighted avg       0.95      0.95      0.95       753



## Trying the third model

In [22]:
from sklearn.linear_model import LogisticRegression
log= LogisticRegression()

In [23]:
log.fit(x_train,y_train)
log.score(x_test,y_test)

0.8592297476759628

In [24]:
print(classification_report(y_test, log.predict(x_test)))

              precision    recall  f1-score   support

           0       0.90      0.81      0.85       377
           1       0.82      0.91      0.87       376

    accuracy                           0.86       753
   macro avg       0.86      0.86      0.86       753
weighted avg       0.86      0.86      0.86       753



## Method 2- Oversampling

In [25]:
count_class_0, count_class_1

(9285, 1882)

In [29]:
df_class_1_over = df_class_1.sample(count_class_0, replace=True)

df_test_over = pd.concat([df_class_0, df_class_1_over], axis = 0)

print('Random over-sampling:')
print(df_test_over.employment_status.value_counts())

Random over-sampling:
0    9285
1    9285
Name: employment_status, dtype: int64


In [28]:
df_test_over.shape

(18570, 22)

In [30]:
x = df_test_over.drop("employment_status", axis=1)
y = df_test_over["employment_status"]

In [36]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=15, stratify=y)

In [39]:
y_train.value_counts()

1    7428
0    7428
Name: employment_status, dtype: int64

In [41]:
y_test.value_counts()

0    1857
1    1857
Name: employment_status, dtype: int64

In [43]:
from sklearn.svm import LinearSVC

In [46]:
clf1 = LinearSVC

In [48]:
clf1 = LinearSVC()
clf1.fit(x_train, y_train)
clf1.score(x_test, y_test)

0.5850834679590737

In [49]:
print(classification_report(y_test, clf1.predict(x_test)))

              precision    recall  f1-score   support

           0       0.55      0.96      0.70      1857
           1       0.85      0.21      0.33      1857

    accuracy                           0.59      3714
   macro avg       0.70      0.59      0.52      3714
weighted avg       0.70      0.59      0.52      3714



## Model 2

In [54]:
X = df_test_over.drop("employment_status", axis=1)
Y = df_test_over["employment_status"]

In [86]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=15, stratify=Y)

In [87]:
from sklearn.ensemble import RandomForestClassifier
model2= RandomForestClassifier()
model2.fit(X_train, Y_train)
model2.score(X_test, Y_test)

0.9975767366720517

In [88]:
print(classification_report(Y_test, model2.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1857
           1       1.00      1.00      1.00      1857

    accuracy                           1.00      3714
   macro avg       1.00      1.00      1.00      3714
weighted avg       1.00      1.00      1.00      3714



## model 3

In [59]:
X1 = df_test_over.drop("employment_status", axis=1)
Y1 = df_test_over["employment_status"]

In [83]:
from sklearn.model_selection import train_test_split
X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1, Y1, test_size=0.2, random_state=15, stratify=Y1)

In [84]:
from sklearn.linear_model import LogisticRegression
log1 = LogisticRegression()
log1.fit(X1_train,Y1_train)
log1.score(X1_test,Y1_test)

0.8739903069466882

In [85]:
print(classification_report(Y1_test, log1.predict(X1_test)))

              precision    recall  f1-score   support

           0       0.91      0.83      0.87      1857
           1       0.84      0.92      0.88      1857

    accuracy                           0.87      3714
   macro avg       0.88      0.87      0.87      3714
weighted avg       0.88      0.87      0.87      3714



## Method 3: SMOTE 

In [79]:
df2= pd.get_dummies(data=df, columns=["department", "salary_range"])
df2.columns

Index(['satisfaction_level', 'last_evaluation_score', 'project_count',
       'average_monthly_hours', 'years_of_working',
       'workplace_acciedent_count', 'employment_status',
       'promotion_last_5years', 'Score_variance', 'department_IT',
       'department_RandD', 'department_accounting', 'department_hr',
       'department_management', 'department_marketing',
       'department_product_mng', 'department_sales', 'department_support',
       'department_technical', 'salary_range_high', 'salary_range_low',
       'salary_range_medium'],
      dtype='object')

In [80]:
X3 = df2.drop("employment_status", axis=1)
Y3 = df2["employment_status"]

In [66]:
!pip install -U imbalanced-learn



In [81]:
Y3.value_counts()


0    9285
1    1882
Name: employment_status, dtype: int64

In [82]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample(X3, Y3)

y_sm.value_counts()

1    9285
0    9285
Name: employment_status, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_sm, Y_sm, test_size=0.2, random_state=15, stratify=Y_sm)

In [89]:
y_train.value_counts()

1    7428
0    7428
Name: employment_status, dtype: int64

In [90]:
y_test.value_counts()

0    1857
1    1857
Name: employment_status, dtype: int64

## model 1

In [91]:
from sklearn.svm import LinearSVC
clf3 = LinearSVC()
clf3.fit(x_train, y_train)
clf3.score(x_test, y_test)

0.6965535810446958

In [92]:
print(classification_report (y_test, clf3.predict(x_test)))

              precision    recall  f1-score   support

           0       0.63      0.94      0.76      1857
           1       0.89      0.45      0.60      1857

    accuracy                           0.70      3714
   macro avg       0.76      0.70      0.68      3714
weighted avg       0.76      0.70      0.68      3714



## Model 2

In [93]:
from sklearn.ensemble import RandomForestClassifier
mod = RandomForestClassifier()
mod.fit(x_train,y_train)
mod.score(x_test,y_test)

0.9967689822294022

In [94]:
print(classification_report (y_test, mod.predict(x_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1857
           1       1.00      1.00      1.00      1857

    accuracy                           1.00      3714
   macro avg       1.00      1.00      1.00      3714
weighted avg       1.00      1.00      1.00      3714



## Model 3

In [96]:
from sklearn.linear_model import LogisticRegression
logm = LogisticRegression()
logm.fit(x_train,y_train)
logm.score(x_test,y_test)

0.8739903069466882

In [97]:
print(classification_report (y_test, logm.predict(x_test)))

              precision    recall  f1-score   support

           0       0.91      0.83      0.87      1857
           1       0.84      0.92      0.88      1857

    accuracy                           0.87      3714
   macro avg       0.88      0.87      0.87      3714
weighted avg       0.88      0.87      0.87      3714

