In [11]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotnine as p9

%matplotlib inline 


from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from xgboost import XGBClassifier



from sklearn.metrics import accuracy_score, confusion_matrix,precision_score,recall_score,f1_score,roc_auc_score

import warnings
warnings.filterwarnings("ignore")


In [4]:
#constants 
SEED = 42
REMOVE_COLUMNS = ['RowNumber','CustomerId','Surname']

In [23]:
#reading the data
df = pd.read_csv('data/processed_data.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42.0,2,0.0,1,1.0,1.0,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,3,15619304,Onio,502,France,Female,42.0,8,159660.8,3,1.0,0.0,113931.57,1
3,4,15701354,Boni,699,France,Female,39.0,1,0.0,2,0.0,0.0,93826.63,0
4,6,15574012,Chu,645,Spain,Male,44.0,8,113755.78,2,1.0,0.0,149756.71,1


In [24]:
proc_df = df.drop(columns=REMOVE_COLUMNS)
proc_df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42.0,2,0.0,1,1.0,1.0,101348.88,1
1,608,Spain,Female,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,502,France,Female,42.0,8,159660.8,3,1.0,0.0,113931.57,1
3,699,France,Female,39.0,1,0.0,2,0.0,0.0,93826.63,0
4,645,Spain,Male,44.0,8,113755.78,2,1.0,0.0,149756.71,1


In [25]:
proc_df = (proc_df.assign(
    Geography = proc_df.Geography.astype('category'),
    Gender = proc_df.Gender.astype('category'),
    Exited = proc_df.Exited.astype('category')
))
proc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9996 entries, 0 to 9995
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   CreditScore      9996 non-null   int64   
 1   Geography        9996 non-null   category
 2   Gender           9996 non-null   category
 3   Age              9996 non-null   float64 
 4   Tenure           9996 non-null   int64   
 5   Balance          9996 non-null   float64 
 6   NumOfProducts    9996 non-null   int64   
 7   HasCrCard        9996 non-null   float64 
 8   IsActiveMember   9996 non-null   float64 
 9   EstimatedSalary  9996 non-null   float64 
 10  Exited           9996 non-null   category
dtypes: category(3), float64(5), int64(3)
memory usage: 654.5 KB


In [42]:
x = proc_df.loc[:,proc_df.columns != 'Exited']
y= proc_df['Exited']

In [43]:
num_cols = x.select_dtypes(['integer']).columns
cat_cols = x.select_dtypes(['category']).columns
print(f'Categorical Columns : {str(cat_cols)}')
print(f'Numerical Columns : {str(num_cols)}')

Categorical Columns : Index(['Geography', 'Gender'], dtype='object')
Numerical Columns : Index(['CreditScore', 'Tenure', 'NumOfProducts'], dtype='object')


In [44]:
# Create Column Transformer with 3 types of transformers
numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_cols),
         ("StandardScaler", numeric_transformer, num_cols),        
    ]
)

x = preprocessor.fit_transform(x)
x.shape

(9996, 8)

In [45]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=SEED)
X_train.shape, X_test.shape


((7996, 8), (2000, 8))

In [46]:
def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    f1score = f1_score(true,predicted)
    auc = roc_auc_score(true, predicted)
    return accuracy,precision,recall,f1score,auc


In [51]:
def model_executor(models,X_train,X_test,y_train,y_test):
    model_list = []
    auc_list =[]
    f1_list = []

    for i in range(len(list(models))):
        model = list(models.values())[i]
        model.fit(X_train, y_train) # Train model

        # Make predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Evaluate Train and Test dataset
        model_train_accuracy , model_train_precision, model_train_recall,model_train_f1score,model_train_auc= evaluate_model(y_train, y_train_pred)
        model_test_accuracy , model_test_precision, model_test_recall,model_test_f1score,model_test_auc = evaluate_model(y_test, y_test_pred)


        print(list(models.keys())[i])
        model_list.append(list(models.keys())[i])

        print('Model performance for Training set')
        print("- Accuracy: {:.4f}".format(model_train_accuracy))
        print("- Precision: {:.4f}".format(model_train_precision))
        print("- Recall: {:.4f}".format(model_train_recall))
        print("- F1 Score: {:.4f}".format(model_train_f1score))
        print("- AUC: {:.4f}".format(model_train_auc))

        print('----------------------------------')

        print('Model performance for Test set')
        print("- Accuracy: {:.4f}".format(model_test_accuracy))
        print("- Precision: {:.4f}".format(model_test_precision))
        print("- Recall: {:.4f}".format(model_test_recall))
        print("- F1 Score: {:.4f}".format(model_test_f1score))
        print("- AUC: {:.4f}".format(model_test_auc))
        auc_list.append(model_test_auc)
        f1_list.append(model_test_f1score)

        print('='*35)
        print('\n')
    return model_list,auc_list,f1_list

In [52]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Linear SVC": LinearSVC(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "Ada Boost Classifier": AdaBoostClassifier(),
    "XGBoost Classifier": XGBClassifier()
    }
model_list,auc_list,f1_list = model_executor(models,X_train,X_test,y_train,y_test)

Logistic Regression
Model performance for Training set
- Accuracy: 0.7964
- Precision: 0.0000
- Recall: 0.0000
- F1 Score: 0.0000
- AUC: 0.5000
----------------------------------
Model performance for Test set
- Accuracy: 0.7955
- Precision: 0.0000
- Recall: 0.0000
- F1 Score: 0.0000
- AUC: 0.5000


Linear SVC
Model performance for Training set
- Accuracy: 0.7964
- Precision: 0.0000
- Recall: 0.0000
- F1 Score: 0.0000
- AUC: 0.5000
----------------------------------
Model performance for Test set
- Accuracy: 0.7955
- Precision: 0.0000
- Recall: 0.0000
- F1 Score: 0.0000
- AUC: 0.5000


Decision Tree Classifier
Model performance for Training set
- Accuracy: 0.9791
- Precision: 0.9993
- Recall: 0.8980
- F1 Score: 0.9460
- AUC: 0.9489
----------------------------------
Model performance for Test set
- Accuracy: 0.7335
- Precision: 0.3473
- Recall: 0.3447
- F1 Score: 0.3460
- AUC: 0.5891


Random Forest Classifier
Model performance for Training set
- Accuracy: 0.9791
- Precision: 0.9710
- 

In [53]:
perfDf = pd.DataFrame(list(zip(model_list, auc_list,f1_list)), columns=['Model Name', 'AUC','F1Score']).sort_values(by=["AUC","F1Score"],ascending=[False,False])
perfDf

Unnamed: 0,Model Name,AUC,F1Score
5,XGBoost Classifier,0.597353,0.341137
3,Random Forest Classifier,0.595549,0.353247
2,Decision Tree Classifier,0.589091,0.346012
4,Ada Boost Classifier,0.583796,0.29572
0,Logistic Regression,0.5,0.0
1,Linear SVC,0.5,0.0
