In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline


In [71]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=8000),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

In [44]:
def splitting(data, target):
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(data, target):
        train_set = data.loc[train_index]
        test_set = data.loc[test_index]
    return train_set, test_set

In [45]:
def data_info(data):
    print(data.head())
    print(data.info())
    print(data.describe())

In [46]:
def drop(data, columns):
    return data.drop(columns, axis=1)

def isnull(data):
    return data.isnull().sum()

In [47]:
df = pd.read_csv('Churn_Modelling.csv')

In [48]:
data_info(df)

   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         790

In [49]:
df = drop(df, ['RowNumber', 'CustomerId', 'Surname'])

In [50]:
isnull(df)

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [51]:
col = ['Geography', 'Gender']
encoder = LabelEncoder()
for i in col:
    df[i] = encoder.fit_transform(df[i])


In [54]:
pipeline = my_pipeline.fit(df)
df = pd.DataFrame(pipeline.transform(df), columns=df.columns)

In [55]:
x, y = df.drop('Exited', axis=1), df['Exited']

In [56]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in sss.split(x, y):
    x_train, x_test = x.loc[train_index], x.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]

In [57]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((8000, 10), (2000, 10), (8000,), (2000,))

In [58]:
x_train.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
2151,753.0,0.0,1.0,57.0,7.0,0.0,1.0,1.0,0.0,159475.08
8392,739.0,1.0,1.0,32.0,3.0,102128.27,1.0,1.0,0.0,63981.37
5006,755.0,1.0,0.0,37.0,0.0,113865.23,2.0,1.0,1.0,117396.25
4117,561.0,0.0,1.0,37.0,5.0,0.0,2.0,1.0,0.0,83093.25
7182,692.0,1.0,1.0,49.0,6.0,110540.43,2.0,0.0,1.0,107472.99


In [72]:
for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_train)
    print(f'{name} accuracy: {accuracy_score(y_train, y_pred)}')

Logistic Regression accuracy: 0.808125
Support Vector Machine accuracy: 0.79625
Random Forest accuracy: 1.0
Decision Tree accuracy: 1.0
K-Nearest Neighbors accuracy: 0.816125


In [73]:
for name, model in models.items():
    y_pred = model.predict(x_test)
    print(f'{name} accuracy: {accuracy_score(y_test, y_pred)}')

Logistic Regression accuracy: 0.8055
Support Vector Machine accuracy: 0.7965
Random Forest accuracy: 0.86
Decision Tree accuracy: 0.7865
K-Nearest Neighbors accuracy: 0.764


In [75]:
# classfications report

for name, model in models.items():
    y_pred = model.predict(x_test)
    print(f'{name} classification report: \n{classification_report(y_test, y_pred)}')

Logistic Regression classification report: 
              precision    recall  f1-score   support

         0.0       0.82      0.97      0.89      1593
         1.0       0.59      0.14      0.23       407

    accuracy                           0.81      2000
   macro avg       0.70      0.56      0.56      2000
weighted avg       0.77      0.81      0.75      2000

Support Vector Machine classification report: 
              precision    recall  f1-score   support

         0.0       0.80      1.00      0.89      1593
         1.0       0.00      0.00      0.00       407

    accuracy                           0.80      2000
   macro avg       0.40      0.50      0.44      2000
weighted avg       0.63      0.80      0.71      2000

Random Forest classification report: 
              precision    recall  f1-score   support

         0.0       0.87      0.97      0.92      1593
         1.0       0.77      0.44      0.56       407

    accuracy                           0.86      2000

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [78]:
# confusion matrix

for name, model in models.items():
    y_pred = model.predict(x_test)
    print(f'{name} confusion matrix: \n{confusion_matrix(y_test, y_pred)}\n')

Logistic Regression confusion matrix: 
[[1553   40]
 [ 349   58]]

Support Vector Machine confusion matrix: 
[[1593    0]
 [ 407    0]]

Random Forest confusion matrix: 
[[1539   54]
 [ 226  181]]

Decision Tree confusion matrix: 
[[1374  219]
 [ 208  199]]

K-Nearest Neighbors confusion matrix: 
[[1494   99]
 [ 373   34]]

