In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
import optuna                               
%matplotlib inline
sns.set_palette('deep')
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from warnings import filterwarnings
import xgboost as xgb
from sklearn.model_selection import KFold   
from sklearn.metrics import accuracy_score
filterwarnings('ignore')
import time

In [2]:
train=pd.read_csv('Train_Dataset.csv')

In [3]:
def fill_workexperience(Designation,WorkExperience):
    if pd.isna(WorkExperience):
        if Designation=='AVP':
            ab=25.0
        elif Designation=='Executive':
            ab=6.0
        elif Designation=='Manager':
            ab=10.0
        elif Designation=='Senior Manager':
            ab=14.0
        elif Designation=='VP':
            ab=25.0
    else:
        ab=WorkExperience
    return ab

In [4]:
def fill_age(Designation,Age):
    if pd.isna(Age):
        if Designation=='AVP':
            ab=48
        elif Designation=='Executive':
            ab=31
        elif Designation=='Manager':
            ab=36
        elif Designation=='Senior Manager':
            ab=39
        elif Designation=='VP':
            ab=48
    else:
        ab=Age
    return ab

In [5]:
def fill_lastpromotion(Designation,LastPromotion):
    if pd.isna(LastPromotion):
        if Designation=='AVP':
            ab=3.0
        elif Designation=='Executive':
            ab=1.0
        elif Designation=='Manager':
            ab=1.0
        elif Designation=='Senior Manager':
            ab=2.0
        elif Designation=='VP':
            ab=3.0
    else:
        ab=LastPromotion
    return ab

In [6]:
def fill_currentprofile(Designation,CurrentProfile):
    if pd.isna(CurrentProfile):
        if Designation=='AVP':
            ab=7.0
        elif Designation=='Executive':
            ab=2.0
        elif Designation=='Manager':
            ab=4.0
        elif Designation=='Senior Manager':
            ab=7.0
        elif Designation=='VP':
            ab=8.0
    else:
        ab=CurrentProfile
    return ab

In [7]:
def fill_currentmi(Designation,MonthlyIncome):
    if pd.isna(MonthlyIncome):
        if Designation=='AVP':
            ab=31307.0
        elif Designation=='Executive':
            ab=18145.0
        elif Designation=='Manager':
            ab=20765.0
        elif Designation=='Senior Manager':
            ab=25333.0
        elif Designation=='VP':
            ab=34609.5
    else:
        ab=MonthlyIncome
    return ab

In [8]:
def remove_outliers(df):
    df.drop(782,inplace=True)
    df.drop([1157,1404,4351],inplace=True)
    df=df[df['NumCompaniesWorked']<=10]
    df=df[df['WorkExperience']<=32]
    df=df[df['LastPromotion']<=8]
    df.drop([1528,4781],inplace=True)

In [9]:
def process_data(df):
    df=df.dropna(how='all')
    df.drop('EmployeeID',axis=1,inplace=True)
    df['Gender']=df['Gender'].replace(to_replace='F',value='Female')
    df['MaritalStatus']=df['MaritalStatus'].replace(to_replace='M',value='Married')
    #df['Age']=df['Age'].fillna(value=df['Age'].mean())
    df['Age']=df.apply(lambda x: fill_age(x['Designation'], x['Age']), axis=1)
    df['Department']=df['Department'].fillna(value='Analytics')
    df['HomeToWork']=df['HomeToWork'].fillna(value=df['HomeToWork'].median())
    df['Gender']=df['Gender'].fillna(value='Male')
    df['HourlnWeek']=df['HourlnWeek'].fillna(value=df['HourlnWeek'].median())
    df['Designation']=df['Designation'].fillna(value='Executive')
    df['SalaryHikelastYear']=df['SalaryHikelastYear'].fillna(value=df['SalaryHikelastYear'].median())
    df['WorkExperience']=df.apply(lambda x: fill_workexperience(x['Designation'], x['WorkExperience']), axis=1)
    df['LastPromotion']=df.apply(lambda x: fill_lastpromotion(x['Designation'], x['LastPromotion']), axis=1)
    df['CurrentProfile']=df.apply(lambda x: fill_currentprofile(x['Designation'], x['CurrentProfile']), axis=1)
    df['MonthlyIncome']=df.apply(lambda x: fill_currentmi(x['Designation'], x['MonthlyIncome']), axis=1)
    df=pd.get_dummies(df,drop_first=True)
    return df

In [10]:
df_train=remove_outliers(train)

In [11]:
df_train=process_data(train)

In [12]:
df_train.head()

Unnamed: 0,Attrition,Age,HomeToWork,HourlnWeek,Involvement,WorkLifeBalance,JobSatisfaction,ESOPs,NumCompaniesWorked,OverTime,...,EducationField_Marketing Diploma,EducationField_Other,EducationField_Statistics,Gender_Male,Designation_Executive,Designation_Manager,Designation_Senior Manager,Designation_VP,MaritalStatus_Married,MaritalStatus_Single
0,0.0,35.0,5.0,69.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0,0,0,1,1,0,0,0,1,0
1,1.0,32.0,5.0,62.0,4.0,3.0,2.0,0.0,8.0,0.0,...,0,0,1,0,1,0,0,0,0,1
2,0.0,31.0,5.0,45.0,5.0,3.0,2.0,1.0,3.0,0.0,...,0,0,1,0,0,1,0,0,0,1
3,0.0,34.0,10.0,32.0,3.0,2.0,4.0,1.0,1.0,0.0,...,0,0,1,0,0,1,0,0,0,0
4,0.0,37.0,27.0,49.0,3.0,4.0,4.0,1.0,8.0,0.0,...,0,0,1,0,0,1,0,0,0,0


In [13]:
X= df_train.drop('Attrition',axis=1)

In [14]:
y= df_train['Attrition']

## Random Forest Classifier

In [15]:
rf=RandomForestClassifier(n_estimators=500,max_depth =20)

In [16]:
rf.fit(X.values,y.values)

RandomForestClassifier(max_depth=20, n_estimators=500)

X,y=df_train.drop('Attrition',axis=1),df_train['Attrition']

In [17]:
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.30)

In [18]:
rf.score(X_train.values,y_train.values)

1.0

In [19]:
rf.score(X_test.values,y_test.values)

1.0

In [27]:
pd.DataFrame(rf.feature_importances_,index=X.columns).sort_values(0,ascending=False)

Unnamed: 0,0
MonthlyIncome,0.092367
Age,0.087705
WorkExperience,0.077466
OverTime,0.072938
HomeToWork,0.069789
HourlnWeek,0.069353
CurrentProfile,0.056006
SalaryHikelastYear,0.055636
NumCompaniesWorked,0.052011
JobSatisfaction,0.049365


## Processing Testset and making prediction

In [20]:
test=pd.read_csv('Test_Dataset.csv')
df_test=process_data(test)
test['Attrition']=rf.predict(df_test.values)
test[['EmployeeID','Attrition']].to_csv('Submit030322{}.csv'.format(time.time()),index=False)

# Using XGBClassifier

In [21]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 350, 600),
        'max_depth': trial.suggest_int('max_depth', 6, 25),
       }
        
    clf = xgb.XGBClassifier(**params)
    auccuracies=[]
    X_train_k = X.values
    y_train_k = y.values
    kf = KFold(n_splits=3,random_state=2000,shuffle=True)
    for train_idx, valid_idx in kf.split(X_train_k,y_train_k):
        clf.fit(X_train_k[train_idx, :], y_train_k[train_idx])
        pred = clf.predict(X_train_k[valid_idx, :])
        accuracy = accuracy_score(y_train_k[valid_idx],pred)
        auccuracies.append(accuracy)
    print(f'Trial done: Accuracy values on folds: {accuracy}')
    return np.average(auccuracies)

In [22]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)
print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial
print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2022-03-03 20:36:08,704][0m A new study created in memory with name: no-name-b760e310-92ec-4188-a28e-8e4549202359[0m




[32m[I 2022-03-03 20:36:12,025][0m Trial 0 finished with value: 0.9719852665401559 and parameters: {'n_estimators': 531, 'max_depth': 17}. Best is trial 0 with value: 0.9719852665401559.[0m


Trial done: Accuracy values on folds: 0.9715942028985507


[32m[I 2022-03-03 20:36:14,567][0m Trial 1 finished with value: 0.9719854904529196 and parameters: {'n_estimators': 362, 'max_depth': 13}. Best is trial 1 with value: 0.9719854904529196.[0m


Trial done: Accuracy values on folds: 0.9681159420289855


[32m[I 2022-03-03 20:36:17,388][0m Trial 2 finished with value: 0.9727578775309139 and parameters: {'n_estimators': 403, 'max_depth': 25}. Best is trial 2 with value: 0.9727578775309139.[0m


Trial done: Accuracy values on folds: 0.9710144927536232


[32m[I 2022-03-03 20:36:20,476][0m Trial 3 finished with value: 0.9710191949216584 and parameters: {'n_estimators': 469, 'max_depth': 17}. Best is trial 2 with value: 0.9727578775309139.[0m


Trial done: Accuracy values on folds: 0.9710144927536232


[32m[I 2022-03-03 20:36:23,366][0m Trial 4 finished with value: 0.9715982333282952 and parameters: {'n_estimators': 415, 'max_depth': 14}. Best is trial 2 with value: 0.9727578775309139.[0m


Trial done: Accuracy values on folds: 0.9681159420289855


[32m[I 2022-03-03 20:36:26,808][0m Trial 5 finished with value: 0.9731440150917203 and parameters: {'n_estimators': 523, 'max_depth': 21}. Best is trial 5 with value: 0.9731440150917203.[0m


Trial done: Accuracy values on folds: 0.9715942028985507


[32m[I 2022-03-03 20:36:30,519][0m Trial 6 finished with value: 0.9727577655745322 and parameters: {'n_estimators': 593, 'max_depth': 23}. Best is trial 5 with value: 0.9731440150917203.[0m


Trial done: Accuracy values on folds: 0.9710144927536232


[32m[I 2022-03-03 20:36:33,837][0m Trial 7 finished with value: 0.9715987931102043 and parameters: {'n_estimators': 519, 'max_depth': 17}. Best is trial 5 with value: 0.9731440150917203.[0m


Trial done: Accuracy values on folds: 0.9710144927536232


[32m[I 2022-03-03 20:36:36,476][0m Trial 8 finished with value: 0.9696659781349185 and parameters: {'n_estimators': 385, 'max_depth': 8}. Best is trial 5 with value: 0.9731440150917203.[0m


Trial done: Accuracy values on folds: 0.9669565217391304


[32m[I 2022-03-03 20:36:39,640][0m Trial 9 finished with value: 0.9715991289793496 and parameters: {'n_estimators': 474, 'max_depth': 13}. Best is trial 5 with value: 0.9731440150917203.[0m


Trial done: Accuracy values on folds: 0.9692753623188406
Number of finished trials: 10
Best trial:
  Value: 0.9731440150917203
  Params: 
    n_estimators: 523
    max_depth: 21


In [23]:
#Best Parameter from Optuna Hyperparameter optimization
best_param = study.best_params

### Dropping correlated features and fitting model

In [24]:
#Fitting XGBoost model
model = xgb.XGBClassifier(**best_param)
model.fit(X.drop(['WorkExperience','MonthlyIncome'],axis=1),y)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=21, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=523, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [25]:
test=pd.read_csv('Test_Dataset.csv')
df_test=process_data(test)
test['Attrition']=model.predict(df_test.drop(['WorkExperience','MonthlyIncome'],axis=1).values)
test[['EmployeeID','Attrition']].to_csv('Submit030322{}.csv'.format(time.time()),index=False)