In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [2]:
df = pd.read_csv('employee_attrition_data.csv')
df.drop(columns = ['Employee_ID'],inplace=True)
df.head()

Unnamed: 0,Age,Gender,Department,Job_Title,Years_at_Company,Satisfaction_Level,Average_Monthly_Hours,Promotion_Last_5Years,Salary,Attrition
0,27,Male,Marketing,Manager,9,0.586251,151,0,60132,0
1,53,Female,Sales,Engineer,10,0.261161,221,1,79947,0
2,59,Female,Marketing,Analyst,8,0.304382,184,0,46958,1
3,42,Female,Engineering,Manager,1,0.480779,242,0,40662,0
4,44,Female,Sales,Engineer,10,0.636244,229,1,74307,0


In [3]:
df.columns

Index(['Age', 'Gender', 'Department', 'Job_Title', 'Years_at_Company',
       'Satisfaction_Level', 'Average_Monthly_Hours', 'Promotion_Last_5Years',
       'Salary', 'Attrition'],
      dtype='object')

In [4]:
df['Job_Title'].value_counts()

Job_Title
Engineer         214
Manager          206
Accountant       206
Analyst          195
HR Specialist    179
Name: count, dtype: int64

In [5]:
X = df.drop(columns=['Attrition'])
y = df['Attrition']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size =0.2 , random_state = 42)
X_train.shape


(800, 9)

In [6]:
preprocessor = ColumnTransformer(
    transformers=[
        ('scaling',StandardScaler(),['Age','Years_at_Company','Average_Monthly_Hours','Salary']),
        ('encoding',OneHotEncoder(),['Gender','Department','Job_Title'])
        
    ], remainder='passthrough'
)
       

In [7]:
X_train = preprocessor.fit_transform(X_train)


In [8]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

X_test = preprocessor.fit_transform(X_test)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.435

## more data

In [9]:
columns = ['Age','Gender', 'Years at Company', 'Monthly Income', 'Job Role', 'Job Satisfaction', 'Number of Promotions', 'Distance from Home','Remote Work','Leadership Opportunities','Attrition']
df = pd.read_csv("train.csv")[columns]
print(df.shape)

(59598, 11)


In [10]:
label_enc = LabelEncoder()
df['Attrition'] = label_enc.fit_transform(df['Attrition'])

list(label_enc.inverse_transform([1,0]))

['Stayed', 'Left']

In [11]:
df.head()

Unnamed: 0,Age,Gender,Years at Company,Monthly Income,Job Role,Job Satisfaction,Number of Promotions,Distance from Home,Remote Work,Leadership Opportunities,Attrition
0,31,Male,19,5390,Education,Medium,2,22,No,No,1
1,59,Female,4,5534,Media,High,3,21,No,No,1
2,24,Female,10,8159,Healthcare,High,0,11,No,No,1
3,36,Female,7,3989,Education,High,1,27,Yes,No,1
4,56,Male,41,4821,Education,Very High,0,71,No,No,1


In [12]:
X_train = df.drop(columns=['Attrition'])
y_train = df['Attrition']


In [13]:
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal_enc',OrdinalEncoder(dtype=np.int16),['Job Role','Job Satisfaction']),
        ('one_hot',OneHotEncoder(sparse_output=False, drop='first',dtype=np.int16),['Gender','Remote Work','Leadership Opportunities']),
#         ('scaler',StandardScaler(),['Age','Years at Company','Monthly Income','Number of Promotions', 'Company Tenure'])
        
    ], remainder='passthrough'
)
  

In [14]:
X_train = preprocessor.fit_transform(X_train)

In [28]:
X_train[0]

array([   0,    2,    1,    0,    0,   31,   19, 5390,    2,   22],
      dtype=int64)

In [27]:
X_train.shape

(59598, 10)

In [15]:
# model = RandomForestClassifier(
#     n_estimators=400,
#     max_depth = 70,
# #     max_features = 'auto',
#     min_samples_leaf = 4,
#     min_samples_split  =10,  
# )

# model = LogisticRegression()
# model = GradientBoostingClassifier()

# model.fit(X_train, y_train)

# # X_test = preprocessor.fit_transform(X_test)
# y_pred = model.predict(X_train)
# accuracy_score(y_train , y_pred)

In [16]:
from xgboost import XGBClassifier

model = XGBClassifier(
    n_estimators=1000,
    max_depth=6,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0,
    reg_alpha=0.05,
    reg_lambda=1,
    scale_pos_weight=1,
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
)

model.fit(X_train, y_train)

# X_test = preprocessor.fit_transform(X_test)
y_pred = model.predict(X_train)
accuracy_score(y_train , y_pred)

0.6601731601731602

## test

In [36]:
df_test = pd.read_csv("test.csv")[columns]
df_test.head()

Unnamed: 0,Age,Gender,Years at Company,Monthly Income,Job Role,Job Satisfaction,Number of Promotions,Distance from Home,Remote Work,Leadership Opportunities,Attrition
0,36,Male,13,8029,Healthcare,High,1,83,No,No,Stayed
1,35,Male,7,4563,Education,High,1,55,No,No,Left
2,50,Male,7,5583,Education,High,3,14,No,No,Stayed
3,58,Male,44,5525,Media,Very High,0,43,No,No,Left
4,39,Male,24,4604,Education,High,0,47,Yes,No,Stayed


In [37]:
df.head()

Unnamed: 0,Age,Gender,Years at Company,Monthly Income,Job Role,Job Satisfaction,Number of Promotions,Distance from Home,Remote Work,Leadership Opportunities,Attrition
0,31,Male,19,5390,Education,Medium,2,22,No,No,1
1,59,Female,4,5534,Media,High,3,21,No,No,1
2,24,Female,10,8159,Healthcare,High,0,11,No,No,1
3,36,Female,7,3989,Education,High,1,27,Yes,No,1
4,56,Male,41,4821,Education,Very High,0,71,No,No,1


In [38]:
df_test['Attrition'].value_counts()

Attrition
Stayed    7868
Left      7032
Name: count, dtype: int64

In [39]:
df_test['Attrition']=label_enc.transform(df_test['Attrition'])
df_test.head()

Unnamed: 0,Age,Gender,Years at Company,Monthly Income,Job Role,Job Satisfaction,Number of Promotions,Distance from Home,Remote Work,Leadership Opportunities,Attrition
0,36,Male,13,8029,Healthcare,High,1,83,No,No,1
1,35,Male,7,4563,Education,High,1,55,No,No,0
2,50,Male,7,5583,Education,High,3,14,No,No,1
3,58,Male,44,5525,Media,Very High,0,43,No,No,0
4,39,Male,24,4604,Education,High,0,47,Yes,No,1


In [51]:
X_test = df_test.drop(columns=['Attrition'])
y_test = df_test['Attrition']

In [52]:
X_test.shape

(14900, 10)

In [55]:
preprocessor.transform(pd.DataFrame([X_test.iloc[1,:]]))

array([[   0,    0,    1,    0,    0,   35,    7, 4563,    1,   55]],
      dtype=int64)

In [22]:
X_test = preprocessor.transform(X_test)

In [23]:
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.6236912751677852

In [24]:
import pickle

pickle.dump(model, open('model.pkl','wb'))
pickle.dump(preprocessor, open('preprocessor.pkl','wb'))

In [31]:
X_test.shape

(14900, 10)

In [32]:
model.predict([X_test[0]])

array([0])