In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter("ignore")

In [4]:
# Load the prepared churn data and discard the "Unnamed: 0" column
# (presumably the index written out by an earlier to_csv -- confirm).
df = pd.read_csv("Churn_analyzed.csv").drop(columns = "Unnamed: 0")

In [5]:
# Quick look at the first five rows.
df.head(n = 5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,6.480741,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,6.403124,1,83807.86,1,0,1,112542.58,0
2,502,France,0,6.480741,8,159660.8,3,1,0,113931.57,1
3,699,France,0,6.244998,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,6.557439,2,125510.82,1,1,1,79084.1,0


***Train-Test-Split***

In [6]:
# "Exited" is the label; every remaining column is a predictor.
y = df["Exited"]
X = df.drop(columns = ["Exited"])

In [8]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

In [10]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified on y -- consider stratify = y given
# the class imbalance visible in the reports below.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 39,
)

# ***LogisticRegression***

In [15]:
# Baseline: logistic regression on one-hot encoded geography and standardized
# numeric features; every other column is passed through untouched.
#
# Columns are addressed by name instead of position (the former indices [1] and
# [0, 3, 5, 9] correspond to these names in the column order shown by
# df.head()), so the pipeline keeps working if the column order changes.
#
# The encoder is left at its default output and the ColumnTransformer is told
# to always densify (sparse_threshold = 0). This replaces `sparse = False`,
# which newer scikit-learn releases reject (renamed to `sparse_output` in 1.2,
# removed in 1.4), without tying the notebook to one scikit-learn version.
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), ["Geography"]),
    ("scaler", StandardScaler(), ["CreditScore", "Age", "Balance", "EstimatedSalary"])
], remainder = "passthrough", sparse_threshold = 0)

model = LogisticRegression()

pipe = Pipeline([
    ("step1", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)

pred_train = pipe.predict(X_train)
pred_test = pipe.predict(X_test)

print("Training Accuracy :", accuracy_score(y_train, pred_train))
print("Testing Accuracy :", accuracy_score(y_test, pred_test))
# cross_val_score refits a clone of the whole pipeline per fold, so the encoder
# and scaler only ever see that fold's training rows.
score = cross_val_score(pipe, X_train, y_train, cv = 5).mean()
print("Cross Validation Score :", score)
# With 0/1 labels the MAE is the test misclassification rate (1 - accuracy).
print("Mean Absolute Error :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 0.808875
Testing Accuracy : 0.833
Cross Validation Score : 0.806875
Mean Absolute Error : 0.167


In [16]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true = y_test, y_pred = pred_test)

array([[1568,   49],
       [ 285,   98]], dtype=int64)

In [17]:
# Per-class precision / recall / F1 for the test predictions held in pred_test.
print(classification_report(y_true = y_test, y_pred = pred_test))

              precision    recall  f1-score   support

           0       0.85      0.97      0.90      1617
           1       0.67      0.26      0.37       383

    accuracy                           0.83      2000
   macro avg       0.76      0.61      0.64      2000
weighted avg       0.81      0.83      0.80      2000



# ***SVC***

In [59]:
# RBF-kernel support vector classifier (C = 10) behind the shared preprocessing:
# one-hot encoded geography, standardized numeric features, rest passed through.
#
# Columns are addressed by name instead of position (the former indices [1] and
# [0, 3, 5, 9] correspond to these names in the column order shown by
# df.head()), so the pipeline keeps working if the column order changes.
#
# The encoder is left at its default output and the ColumnTransformer is told
# to always densify (sparse_threshold = 0). This replaces `sparse = False`,
# which newer scikit-learn releases reject (renamed to `sparse_output` in 1.2,
# removed in 1.4), without tying the notebook to one scikit-learn version.
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), ["Geography"]),
    ("scaler", StandardScaler(), ["CreditScore", "Age", "Balance", "EstimatedSalary"])
], remainder = "passthrough", sparse_threshold = 0)

model = SVC(C = 10, kernel = "rbf")

pipe = Pipeline([
    ("step1", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)

pred_train = pipe.predict(X_train)
pred_test = pipe.predict(X_test)

print("Training Accuracy :", accuracy_score(y_train, pred_train))
print("Testing Accuracy :", accuracy_score(y_test, pred_test))
# cross_val_score refits a clone of the whole pipeline per fold, so the encoder
# and scaler only ever see that fold's training rows.
score = cross_val_score(pipe, X_train, y_train, cv = 5).mean()
print("Cross Validation Score :", score)
# With 0/1 labels the MAE is the test misclassification rate (1 - accuracy).
print("Mean Absolute Error :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 0.859125
Testing Accuracy : 0.867
Cross Validation Score : 0.85175
Mean Absolute Error : 0.133


In [60]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true = y_test, y_pred = pred_test)

array([[1580,   37],
       [ 229,  154]], dtype=int64)

In [56]:
# Per-class precision / recall / F1 for the test predictions held in pred_test.
print(classification_report(y_true = y_test, y_pred = pred_test))

              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1617
           1       0.76      0.45      0.56       383

    accuracy                           0.87      2000
   macro avg       0.82      0.71      0.74      2000
weighted avg       0.86      0.87      0.85      2000



# ***KNeighborsClassifier***

In [26]:
# k-nearest-neighbours classifier (k = 11) behind the shared preprocessing:
# one-hot encoded geography, standardized numeric features, rest passed through.
#
# Columns are addressed by name instead of position (the former indices [1] and
# [0, 3, 5, 9] correspond to these names in the column order shown by
# df.head()), so the pipeline keeps working if the column order changes.
#
# The encoder is left at its default output and the ColumnTransformer is told
# to always densify (sparse_threshold = 0). This replaces `sparse = False`,
# which newer scikit-learn releases reject (renamed to `sparse_output` in 1.2,
# removed in 1.4), without tying the notebook to one scikit-learn version.
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), ["Geography"]),
    ("scaler", StandardScaler(), ["CreditScore", "Age", "Balance", "EstimatedSalary"])
], remainder = "passthrough", sparse_threshold = 0)

model = KNeighborsClassifier(n_neighbors = 11)

pipe = Pipeline([
    ("step1", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)

pred_train = pipe.predict(X_train)
pred_test = pipe.predict(X_test)

print("Training Accuracy :", accuracy_score(y_train, pred_train))
print("Testing Accuracy :", accuracy_score(y_test, pred_test))
# cross_val_score refits a clone of the whole pipeline per fold, so the encoder
# and scaler only ever see that fold's training rows.
score = cross_val_score(pipe, X_train, y_train, cv = 5).mean()
print("Cross Validation Score :", score)
# With 0/1 labels the MAE is the test misclassification rate (1 - accuracy).
print("Mean Absolute Error :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 0.849125
Testing Accuracy : 0.836
Cross Validation Score : 0.828125
Mean Absolute Error : 0.164


In [27]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true = y_test, y_pred = pred_test)

array([[1565,   52],
       [ 276,  107]], dtype=int64)

In [28]:
# Per-class precision / recall / F1 for the test predictions held in pred_test.
print(classification_report(y_true = y_test, y_pred = pred_test))

              precision    recall  f1-score   support

           0       0.85      0.97      0.91      1617
           1       0.67      0.28      0.39       383

    accuracy                           0.84      2000
   macro avg       0.76      0.62      0.65      2000
weighted avg       0.82      0.84      0.81      2000



# ***DecisionTreeClassifier***

In [31]:
# Depth-limited decision tree (entropy criterion) behind the shared
# preprocessing: one-hot encoded geography, standardized numeric features,
# rest passed through.
#
# Columns are addressed by name instead of position (the former indices [1] and
# [0, 3, 5, 9] correspond to these names in the column order shown by
# df.head()), so the pipeline keeps working if the column order changes.
#
# The encoder is left at its default output and the ColumnTransformer is told
# to always densify (sparse_threshold = 0). This replaces `sparse = False`,
# which newer scikit-learn releases reject (renamed to `sparse_output` in 1.2,
# removed in 1.4), without tying the notebook to one scikit-learn version.
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), ["Geography"]),
    ("scaler", StandardScaler(), ["CreditScore", "Age", "Balance", "EstimatedSalary"])
], remainder = "passthrough", sparse_threshold = 0)

model = DecisionTreeClassifier(max_depth = 6, criterion = "entropy")

pipe = Pipeline([
    ("step1", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)

pred_train = pipe.predict(X_train)
pred_test = pipe.predict(X_test)

print("Training Accuracy :", accuracy_score(y_train, pred_train))
print("Testing Accuracy :", accuracy_score(y_test, pred_test))
# cross_val_score refits a clone of the whole pipeline per fold, so the encoder
# and scaler only ever see that fold's training rows.
score = cross_val_score(pipe, X_train, y_train, cv = 5).mean()
print("Cross Validation Score :", score)
# With 0/1 labels the MAE is the test misclassification rate (1 - accuracy).
print("Mean Absolute Error :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 0.86375
Testing Accuracy : 0.87
Cross Validation Score : 0.8553750000000001
Mean Absolute Error : 0.13


In [32]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true = y_test, y_pred = pred_test)

array([[1555,   62],
       [ 198,  185]], dtype=int64)

In [33]:
# Per-class precision / recall / F1 for the test predictions held in pred_test.
print(classification_report(y_true = y_test, y_pred = pred_test))

              precision    recall  f1-score   support

           0       0.89      0.96      0.92      1617
           1       0.75      0.48      0.59       383

    accuracy                           0.87      2000
   macro avg       0.82      0.72      0.76      2000
weighted avg       0.86      0.87      0.86      2000



# ***RandomForestClassifier***

In [40]:
# Random forest (17 trees) behind the shared preprocessing: one-hot encoded
# geography, standardized numeric features, rest passed through.
#
# Columns are addressed by name instead of position (the former indices [1] and
# [0, 3, 5, 9] correspond to these names in the column order shown by
# df.head()), so the pipeline keeps working if the column order changes.
#
# The encoder is left at its default output and the ColumnTransformer is told
# to always densify (sparse_threshold = 0). This replaces `sparse = False`,
# which newer scikit-learn releases reject (renamed to `sparse_output` in 1.2,
# removed in 1.4), without tying the notebook to one scikit-learn version.
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), ["Geography"]),
    ("scaler", StandardScaler(), ["CreditScore", "Age", "Balance", "EstimatedSalary"])
], remainder = "passthrough", sparse_threshold = 0)

# The forest draws bootstrap samples and random feature subsets; without a seed
# the scores below change on every run. Reuse the seed of the train/test split.
model = RandomForestClassifier(n_estimators = 17, random_state = 39)

pipe = Pipeline([
    ("step1", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)

pred_train = pipe.predict(X_train)
pred_test = pipe.predict(X_test)

print("Training Accuracy :", accuracy_score(y_train, pred_train))
print("Testing Accuracy :", accuracy_score(y_test, pred_test))
# cross_val_score refits a clone of the whole pipeline per fold, so the encoder
# and scaler only ever see that fold's training rows.
score = cross_val_score(pipe, X_train, y_train, cv = 5).mean()
print("Cross Validation Score :", score)
# With 0/1 labels the MAE is the test misclassification rate (1 - accuracy).
print("Mean Absolute Error :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 0.99525
Testing Accuracy : 0.867
Cross Validation Score : 0.852625
Mean Absolute Error : 0.133


In [41]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true = y_test, y_pred = pred_test)

array([[1542,   75],
       [ 191,  192]], dtype=int64)

In [42]:
# Per-class precision / recall / F1 for the test predictions held in pred_test.
print(classification_report(y_true = y_test, y_pred = pred_test))

              precision    recall  f1-score   support

           0       0.89      0.95      0.92      1617
           1       0.72      0.50      0.59       383

    accuracy                           0.87      2000
   macro avg       0.80      0.73      0.76      2000
weighted avg       0.86      0.87      0.86      2000



# ***AdaBoostClassifier***

In [43]:
# AdaBoost (13 estimators) behind the shared preprocessing: one-hot encoded
# geography, standardized numeric features, rest passed through.
#
# Columns are addressed by name instead of position (the former indices [1] and
# [0, 3, 5, 9] correspond to these names in the column order shown by
# df.head()), so the pipeline keeps working if the column order changes.
#
# The encoder is left at its default output and the ColumnTransformer is told
# to always densify (sparse_threshold = 0). This replaces `sparse = False`,
# which newer scikit-learn releases reject (renamed to `sparse_output` in 1.2,
# removed in 1.4), without tying the notebook to one scikit-learn version.
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), ["Geography"]),
    ("scaler", StandardScaler(), ["CreditScore", "Age", "Balance", "EstimatedSalary"])
], remainder = "passthrough", sparse_threshold = 0)

model = AdaBoostClassifier(n_estimators = 13)

pipe = Pipeline([
    ("step1", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)

pred_train = pipe.predict(X_train)
pred_test = pipe.predict(X_test)

print("Training Accuracy :", accuracy_score(y_train, pred_train))
print("Testing Accuracy :", accuracy_score(y_test, pred_test))
# cross_val_score refits a clone of the whole pipeline per fold, so the encoder
# and scaler only ever see that fold's training rows.
score = cross_val_score(pipe, X_train, y_train, cv = 5).mean()
print("Cross Validation Score :", score)
# With 0/1 labels the MAE is the test misclassification rate (1 - accuracy).
print("Mean Absolute Error :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 0.85525
Testing Accuracy : 0.8695
Cross Validation Score : 0.853125
Mean Absolute Error : 0.1305


In [44]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true = y_test, y_pred = pred_test)

array([[1567,   50],
       [ 211,  172]], dtype=int64)

In [45]:
# Per-class precision / recall / F1 for the test predictions held in pred_test.
print(classification_report(y_true = y_test, y_pred = pred_test))

              precision    recall  f1-score   support

           0       0.88      0.97      0.92      1617
           1       0.77      0.45      0.57       383

    accuracy                           0.87      2000
   macro avg       0.83      0.71      0.75      2000
weighted avg       0.86      0.87      0.86      2000



# ***GradientBoostingClassifier***

In [46]:
# Gradient boosting (15 stages, learning rate 0.4) behind the shared
# preprocessing: one-hot encoded geography, standardized numeric features,
# rest passed through.
#
# Columns are addressed by name instead of position (the former indices [1] and
# [0, 3, 5, 9] correspond to these names in the column order shown by
# df.head()), so the pipeline keeps working if the column order changes.
#
# The encoder is left at its default output and the ColumnTransformer is told
# to always densify (sparse_threshold = 0). This replaces `sparse = False`,
# which newer scikit-learn releases reject (renamed to `sparse_output` in 1.2,
# removed in 1.4), without tying the notebook to one scikit-learn version.
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), ["Geography"]),
    ("scaler", StandardScaler(), ["CreditScore", "Age", "Balance", "EstimatedSalary"])
], remainder = "passthrough", sparse_threshold = 0)

model = GradientBoostingClassifier(n_estimators = 15, learning_rate = 0.4)

pipe = Pipeline([
    ("step1", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)

pred_train = pipe.predict(X_train)
pred_test = pipe.predict(X_test)

print("Training Accuracy :", accuracy_score(y_train, pred_train))
print("Testing Accuracy :", accuracy_score(y_test, pred_test))
# cross_val_score refits a clone of the whole pipeline per fold, so the encoder
# and scaler only ever see that fold's training rows.
score = cross_val_score(pipe, X_train, y_train, cv = 5).mean()
print("Cross Validation Score :", score)
# With 0/1 labels the MAE is the test misclassification rate (1 - accuracy).
print("Mean Absolute Error :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 0.866875
Testing Accuracy : 0.871
Cross Validation Score : 0.860625
Mean Absolute Error : 0.129


In [47]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true = y_test, y_pred = pred_test)

array([[1552,   65],
       [ 193,  190]], dtype=int64)

In [48]:
# Per-class precision / recall / F1 for the test predictions held in pred_test.
print(classification_report(y_true = y_test, y_pred = pred_test))

              precision    recall  f1-score   support

           0       0.89      0.96      0.92      1617
           1       0.75      0.50      0.60       383

    accuracy                           0.87      2000
   macro avg       0.82      0.73      0.76      2000
weighted avg       0.86      0.87      0.86      2000



# ***XGBClassifier***

In [49]:
# XGBoost classifier (15 rounds, learning rate 0.3, gamma 0.4) behind the shared
# preprocessing: one-hot encoded geography, standardized numeric features,
# rest passed through.
#
# Columns are addressed by name instead of position (the former indices [1] and
# [0, 3, 5, 9] correspond to these names in the column order shown by
# df.head()), so the pipeline keeps working if the column order changes.
#
# The encoder is left at its default output and the ColumnTransformer is told
# to always densify (sparse_threshold = 0). This replaces `sparse = False`,
# which newer scikit-learn releases reject (renamed to `sparse_output` in 1.2,
# removed in 1.4), without tying the notebook to one scikit-learn version.
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), ["Geography"]),
    ("scaler", StandardScaler(), ["CreditScore", "Age", "Balance", "EstimatedSalary"])
], remainder = "passthrough", sparse_threshold = 0)

model = XGBClassifier(n_estimators = 15, learning_rate = 0.3, gamma = 0.4)

pipe = Pipeline([
    ("step1", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)

pred_train = pipe.predict(X_train)
pred_test = pipe.predict(X_test)

print("Training Accuracy :", accuracy_score(y_train, pred_train))
print("Testing Accuracy :", accuracy_score(y_test, pred_test))
# cross_val_score refits a clone of the whole pipeline per fold, so the encoder
# and scaler only ever see that fold's training rows.
score = cross_val_score(pipe, X_train, y_train, cv = 5).mean()
print("Cross Validation Score :", score)
# With 0/1 labels the MAE is the test misclassification rate (1 - accuracy).
print("Mean Absolute Error :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 0.887
Testing Accuracy : 0.87
Cross Validation Score : 0.8602500000000001
Mean Absolute Error : 0.13


In [50]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true = y_test, y_pred = pred_test)

array([[1547,   70],
       [ 190,  193]], dtype=int64)

In [53]:
# Per-class precision / recall / F1 for the test predictions held in pred_test.
print(classification_report(y_true = y_test, y_pred = pred_test))

              precision    recall  f1-score   support

           0       0.89      0.96      0.92      1617
           1       0.73      0.50      0.60       383

    accuracy                           0.87      2000
   macro avg       0.81      0.73      0.76      2000
weighted avg       0.86      0.87      0.86      2000



# ***Final Model***

In [79]:
# Final model: the same XGBoost configuration evaluated in the XGBClassifier
# section, refitted here so that `pipe` holds the pipeline that gets saved and
# used for prediction.
#
# Columns are addressed by name instead of position (the former indices [1] and
# [0, 3, 5, 9] correspond to these names in the column order shown by
# df.head()), so prediction frames do not have to replicate the training
# column order exactly.
#
# The encoder is left at its default output and the ColumnTransformer is told
# to always densify (sparse_threshold = 0). This replaces `sparse = False`,
# which newer scikit-learn releases reject (renamed to `sparse_output` in 1.2,
# removed in 1.4), without tying the notebook to one scikit-learn version.
step1 = ColumnTransformer(transformers = [
    ("col_tnf", OneHotEncoder(drop = "first"), ["Geography"]),
    ("scaler", StandardScaler(), ["CreditScore", "Age", "Balance", "EstimatedSalary"])
], remainder = "passthrough", sparse_threshold = 0)

model = XGBClassifier(n_estimators = 15, learning_rate = 0.3, gamma = 0.4)

pipe = Pipeline([
    ("step1", step1),
    ("model", model)
])

pipe.fit(X_train, y_train)

pred_train = pipe.predict(X_train)
pred_test = pipe.predict(X_test)

print("Training Accuracy :", accuracy_score(y_train, pred_train))
print("Testing Accuracy :", accuracy_score(y_test, pred_test))
# cross_val_score refits a clone of the whole pipeline per fold, so the encoder
# and scaler only ever see that fold's training rows.
score = cross_val_score(pipe, X_train, y_train, cv = 5).mean()
print("Cross Validation Score :", score)
# With 0/1 labels the MAE is the test misclassification rate (1 - accuracy).
print("Mean Absolute Error :", mean_absolute_error(y_test, pred_test))

Training Accuracy : 0.887
Testing Accuracy : 0.87
Cross Validation Score : 0.8602500000000001
Mean Absolute Error : 0.13


In [62]:
# Persist the fitted pipeline (preprocessing + classifier) currently bound to
# `pipe` under the file name "xgb_model".
# NOTE(review): run this only after the final-model cell so the saved object is
# the intended one.
from joblib import dump
dump(value = pipe, filename = "xgb_model")

['xgb_model']

# ***Predicting with New Data***

In [107]:
# Score a single hand-written record with the fitted pipeline.
#
# The model was trained on a transformed Age column, not on the raw age: the
# values shown by df.head() (6.480741, 6.403124, 6.244998, 6.557439) are the
# square roots of 42, 41, 39 and 43. A raw age therefore has to go through the
# same square root before prediction, otherwise the scaler sees a value far
# outside the training range.
raw_age = 41

new_data = pd.DataFrame({
    "CreditScore": 608,
    "Geography": "Spain",
    "Gender": 0,
    "Age": np.sqrt(raw_age),
    "Tenure": 1,
    "Balance": 500000,
    "NumOfProducts": 4,
    "HasCrCard": 0,
    "IsActiveMember": 1,
    "EstimatedSalary":112542.58,
}, index = [0])

# predict returns one label per row; there is a single row, so take element 0.
pred = pipe.predict(new_data)[0]
if pred == 0:
    print("Employee will not Churn from the company.")
else:
    print("Employee will churn from the company")

Employee will churn from the company
