In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

# Model Creating

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, f1_score, recall_score

In [9]:
models = {
    'Logisrtic': LogisticRegression(),
    'SVC': SVC(),
    'Naive Bayes': GaussianNB(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Ada Boost': AdaBoostClassifier(),
    'Gradient Boost': GradientBoostingClassifier(),
    'XGB': XGBClassifier()
}

In [12]:
for i in range(len(models)):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    # Model prediction
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Train data performance
    train_as = accuracy_score(y_train, y_pred_train)
    train_f1 = f1_score(y_train, y_pred_train)
    train_recall = recall_score(y_train, y_pred_train)

    # Test data performance
    test_as = accuracy_score(y_test, y_pred_test)
    test_f1 = f1_score(y_test, y_pred_test)
    test_recall = recall_score(y_test, y_pred_test)

    print(list(models.keys())[i])
    print('\t-Train accuracy score:', train_as)
    print('\t-Train f1 score: ', train_f1)
    print('\t-Train recall score: ', train_recall, '\n')

    print('\t-Test accuracy score: ', test_as)
    print('\t-Test f1 score: ', test_f1)
    print('\t-Test recall score: ', test_recall)

    print('*'*15)

Logisrtic
	-Train accuracy score: 0.9513358778625954
	-Train f1 score:  0.9516129032258065
	-Train recall score:  0.9570610687022901 

	-Test accuracy score:  0.9025844930417495
	-Test f1 score:  0.6918238993710691
	-Test recall score:  0.7432432432432432
***************
SVC
	-Train accuracy score: 0.982824427480916
	-Train f1 score:  0.9828408007626311
	-Train recall score:  0.9837786259541985 

	-Test accuracy score:  0.9204771371769384
	-Test f1 score:  0.7058823529411764
	-Test recall score:  0.6486486486486487
***************
Naive Bayes
	-Train accuracy score: 0.8869274809160306
	-Train f1 score:  0.8882602545968883
	-Train recall score:  0.898854961832061 

	-Test accuracy score:  0.8389662027833003
	-Test f1 score:  0.5317919075144508
	-Test recall score:  0.6216216216216216
***************
KNN
	-Train accuracy score: 0.9270038167938931
	-Train f1 score:  0.9319697643397066
	-Train recall score:  1.0 

	-Test accuracy score:  0.7892644135188867
	-Test f1 score:  0.5225225225225

# Hyperparameter tuning

In [33]:
param_grid_svc = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.001, 0.01, 0.1, 1]
}


param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.5],
    'estimator': [DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=3)]
}


param_grid_gb = {
    'n_estimators': [50, 100, 200, 300],  # Number of boosting stages
    'learning_rate': [0.01, 0.05, 0.1, 0.2],  # Controls step size of updates
    'max_depth': [1, 3, 4, 5, 6],  # Depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split
    'min_samples_leaf': [1, 2, 5, 10],  # Minimum samples per leaf
    'subsample': [0.6, 0.8, 1.0],  # Fraction of samples per boosting iteration
    'max_features': ['sqrt', 'log2', None],  # Number of features considered for best split
}

param_grid_xgb = {
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'min_child_weight': [1, 3, 5]
}


In [35]:
randomized_cv = [
    ('SVC', SVC(), param_grid_svc),
    ('AdaBoost', AdaBoostClassifier(), param_grid_adaboost),
    ('GradientBoost', GradientBoostingClassifier(), param_grid_gb),
    ('XGBoost', XGBClassifier(), param_grid_xgb)
]

In [37]:
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [39]:
from sklearn.model_selection import RandomizedSearchCV

models_param = {}
for name, model, param in randomized_cv:
    r_cv = RandomizedSearchCV(estimator=model, param_distributions=param,
                              n_iter=100,
                              cv=cv, 
                              verbose=2, 
                              n_jobs=-1)
    r_cv.fit(X_train, y_train)
    models_param[name] = r_cv.best_params_

    for name in models_param:
        print(f"--------- Best Params for Model {name} ----------")
        print(models_param[name])

Fitting 5 folds for each of 32 candidates, totalling 160 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

--------- Best Params for Model SVC ----------
{'kernel': 'rbf', 'gamma': 0.1, 'C': 10}
Fitting 5 folds for each of 18 candidates, totalling 90 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

--------- Best Params for Model SVC ----------
{'kernel': 'rbf', 'gamma': 0.1, 'C': 10}
--------- Best Params for Model AdaBoost ----------
{'n_estimators': 200, 'learning_rate': 0.05, 'estimator': DecisionTreeClassifier(max_depth=3)}
Fitting 5 folds for each of 100 candidates, totalling 500 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

--------- Best Params for Model SVC ----------
{'kernel': 'rbf', 'gamma': 0.1, 'C': 10}
--------- Best Params for Model AdaBoost ----------
{'n_estimators': 200, 'learning_rate': 0.05, 'estimator': DecisionTreeClassifier(max_depth=3)}
--------- Best Params for Model GradientBoost ----------
{'subsample': 0.8, 'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'max_depth': 6, 'learning_rate': 0.2}
Fitting 5 folds for each of 81 candidates, totalling 405 fits
--------- Best Params for Model SVC ----------
{'kernel': 'rbf', 'gamma': 0.1, 'C': 10}
--------- Best Params for Model AdaBoost ----------
{'n_estimators': 200, 'learning_rate': 0.05, 'estimator': DecisionTreeClassifier(max_depth=3)}
--------- Best Params for Model GradientBoost ----------
{'subsample': 0.8, 'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'max_depth': 6, 'learning_rate': 0.2}
--------- Best Params for Model XGBoost ----------
{

In [48]:
models = {
    'SVC': SVC(kernel= 'rbf', gamma= 0.1, C= 10, class_weight='balanced'),
    
    'AdaBoost': AdaBoostClassifier(n_estimators= 200, learning_rate= 0.05,
                                   estimator= DecisionTreeClassifier(max_depth=3, class_weight='balanced')),

    'GradientBoost': GradientBoostingClassifier(subsample= 0.8, n_estimators= 300, min_samples_split= 10, 
                                                min_samples_leaf= 10, max_features= 'sqrt', max_depth= 6, learning_rate= 0.2),

    'XGBoost': XGBClassifier(n_estimators= 500, min_child_weight= 5, max_depth= 6, learning_rate= 0.1)
}

In [51]:
for i in range(len(models)):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    # Model prediction
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Train data performance
    train_as = accuracy_score(y_train, y_pred_train)
    train_f1 = f1_score(y_train, y_pred_train)
    train_recall = recall_score(y_train, y_pred_train)

    # Test data performance
    test_as = accuracy_score(y_test, y_pred_test)
    test_f1 = f1_score(y_test, y_pred_test)
    test_recall = recall_score(y_test, y_pred_test)

    print(list(models.keys())[i])
    print('\t-Train accuracy score:', train_as)
    print('\t-Train f1 score: ', train_f1)
    print('\t-Train recall score: ', train_recall, '\n')

    print('\t-Test accuracy score: ', test_as)
    print('\t-Test f1 score: ', test_f1)
    print('\t-Test recall score: ', test_recall)

    print('*'*15)

SVC
	-Train accuracy score: 1.0
	-Train f1 score:  1.0
	-Train recall score:  1.0 

	-Test accuracy score:  0.8926441351888668
	-Test f1 score:  0.5263157894736842
	-Test recall score:  0.40540540540540543
***************
AdaBoost
	-Train accuracy score: 0.9995229007633588
	-Train f1 score:  0.9995231282784931
	-Train recall score:  1.0 

	-Test accuracy score:  0.9025844930417495
	-Test f1 score:  0.5882352941176471
	-Test recall score:  0.47297297297297297
***************
GradientBoost
	-Train accuracy score: 1.0
	-Train f1 score:  1.0
	-Train recall score:  1.0 

	-Test accuracy score:  0.9165009940357853
	-Test f1 score:  0.6499999999999999
	-Test recall score:  0.527027027027027
***************
XGBoost
	-Train accuracy score: 1.0
	-Train f1 score:  1.0
	-Train recall score:  1.0 

	-Test accuracy score:  0.9184890656063618
	-Test f1 score:  0.7007299270072993
	-Test recall score:  0.6486486486486487
***************


# Final prediction

In [56]:
xgb = XGBClassifier(n_estimators= 500, min_child_weight= 5, max_depth= 6, learning_rate= 0.1)

xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

In [62]:
final_df = pd.DataFrame(y_pred, columns=['Attrition'])

In [64]:
final_df['Attrition'] = np.where(final_df['Attrition']==0, 'No', 'Yes')

In [66]:
final_df.to_csv('Final_df.csv', index=False)