In [None]:
#Importing required libraries

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read the raw bank-churn dataset from CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer='/content/BankChurners.csv')

In [None]:
# Preview the first rows to see the available columns and typical values.
data.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,6,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,4,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,3,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0.000134,0.99987
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,5,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0,2.2e-05,0.99998


We see that the data has 10,000+ rows and 23 columns.

The last two columns are unnecessary and can be excluded, along with the 'CLIENTNUM' column.

Also, we have multiple categorical columns which need to be encoded before being passed to the model.

# Data Pre-processing

In [None]:
data.isna().sum() # count the missing (NaN) values in each column

Unnamed: 0,0
CLIENTNUM,0
Attrition_Flag,0
Customer_Age,0
Gender,0
Dependent_count,0
Education_Level,0
Marital_Status,0
Income_Category,0
Card_Category,0
Months_on_book,0


In [None]:
# Drop the two precomputed Naive Bayes classifier output columns.
# They are selected by name rather than by position so that re-running this
# cell cannot remove two further (real) feature columns.
naive_bayes_columns = [
    column for column in data.columns
    if str(column).startswith('Naive_Bayes_Classifier_')
]
data = data.drop(columns=naive_bayes_columns)

In [None]:
# Drop the client identifier column. errors='ignore' keeps the cell
# re-runnable once the column has already been removed.
data = data.drop(columns='CLIENTNUM', errors='ignore')

In [None]:
# Re-check the per-column missing-value counts after dropping columns.
data.isna().sum()

Unnamed: 0,0
Attrition_Flag,0
Customer_Age,0
Gender,0
Dependent_count,0
Education_Level,0
Marital_Status,0
Income_Category,0
Card_Category,0
Months_on_book,0
Total_Relationship_Count,0


In [None]:
# Summarise the remaining columns: names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Attrition_Flag            10127 non-null  object 
 1   Customer_Age              10127 non-null  int64  
 2   Gender                    10127 non-null  object 
 3   Dependent_count           10127 non-null  int64  
 4   Education_Level           10127 non-null  object 
 5   Marital_Status            10127 non-null  object 
 6   Income_Category           10127 non-null  object 
 7   Card_Category             10127 non-null  object 
 8   Months_on_book            10127 non-null  int64  
 9   Total_Relationship_Count  10127 non-null  int64  
 10  Months_Inactive_12_mon    10127 non-null  int64  
 11  Contacts_Count_12_mon     10127 non-null  int64  
 12  Credit_Limit              10127 non-null  float64
 13  Total_Revolving_Bal       10127 non-null  int64  
 14  Avg_Op

In [None]:
# List the distinct labels present in the target column.

data['Attrition_Flag'].unique()

# The result shows which class labels the target column takes.

array(['Existing Customer', 'Attrited Customer'], dtype=object)

In [None]:
# Collect the distinct values of every categorical (object-dtype) column.

{name: values.unique().tolist() for name, values in data.select_dtypes('object').items()}

{'Attrition_Flag': ['Existing Customer', 'Attrited Customer'],
 'Gender': ['M', 'F'],
 'Education_Level': ['High School',
  'Graduate',
  'Uneducated',
  'Unknown',
  'College',
  'Post-Graduate',
  'Doctorate'],
 'Marital_Status': ['Married', 'Single', 'Unknown', 'Divorced'],
 'Income_Category': ['$60K - $80K',
  'Less than $40K',
  '$80K - $120K',
  '$40K - $60K',
  '$120K +',
  'Unknown'],
 'Card_Category': ['Blue', 'Gold', 'Silver', 'Platinum']}

We observe the following columns:


*   'Attrition_Flag' & 'Gender' can be binary encoded
*   'Education_Level' & 'Income_Category' can be ordinally encoded
*   'Marital_Status' & 'Card_Category' can be one-hot encoded



In [None]:
# Count how often the placeholder 'Unknown' occurs in each categorical column.

{name: values.eq('Unknown').sum() for name, values in data.select_dtypes('object').items()}

{'Attrition_Flag': 0,
 'Gender': 0,
 'Education_Level': 1519,
 'Marital_Status': 749,
 'Income_Category': 1112,
 'Card_Category': 0}

Several columns contain "Unknown" as a value. These are effectively missing values, and we need to handle them.

First we change "Unknown" to numpy NaN and then handle the missing values accordingly, either by replacing them with the mode or by relying on one-hot encoding.

In [None]:
# Replace the 'Unknown' placeholder with NaN so pandas treats it as missing.
# np.nan is the supported spelling; the np.NaN alias no longer exists in NumPy 2.0+.

data = data.replace('Unknown', np.nan)
data.isna().sum()

Unnamed: 0,0
Attrition_Flag,0
Customer_Age,0
Gender,0
Dependent_count,0
Education_Level,1519
Marital_Status,749
Income_Category,1112
Card_Category,0
Months_on_book,0
Total_Relationship_Count,0


In [None]:
#Replacing NaN values in 'Education_Level' and 'Income_Category' with modes

import statistics

# Series.mode() returns a Series (there may be ties), so take its first entry
# to print just the value rather than the whole Series repr.
print("Modes for the following columns")
print(f"Education_Level: {data['Education_Level'].mode().iloc[0]}")
print(f"Income_Category: {data['Income_Category'].mode().iloc[0]}")


Modes for the following columns
Education_Level: 0    Graduate
Name: Education_Level, dtype: object
Income_Category: 0    Less than $40K
Name: Income_Category, dtype: object


In [None]:
# Fill the missing values with each column's own most frequent value.
# The mode is computed from the data rather than typed in by hand, so the
# cell stays correct if the dataset changes.
for column in ['Education_Level', 'Income_Category']:
  data[column] = data[column].fillna(data[column].mode().iloc[0])
print("Null value replaced with mode successfully !!")


Null value replaced with mode successfully !!


In [None]:
# Confirm which columns still contain missing values after the mode fill.
data.isna().sum()

Unnamed: 0,0
Attrition_Flag,0
Customer_Age,0
Gender,0
Dependent_count,0
Education_Level,0
Marital_Status,749
Income_Category,0
Card_Category,0
Months_on_book,0
Total_Relationship_Count,0


# Defining Encoding Functions

In [None]:
#Encoding different categorical columns

def binary_encode(df, column, positive_value):
  """Return a copy of ``df`` with ``column`` recoded as 0/1.

  Entries equal to ``positive_value`` become 1 and every other entry
  becomes 0. The input frame itself is left untouched.
  """
  encoded = df.copy()
  is_positive = encoded[column] == positive_value
  encoded[column] = is_positive.astype('int64')
  return encoded

def ordinal_encode(df, column, ordering):
  """Return a copy of ``df`` with ``column`` replaced by each value's position in ``ordering``.

  Args:
    df: Input DataFrame; it is not modified.
    column: Label of the column to encode.
    ordering: Sequence of category values from lowest to highest. A value is
      encoded as its (first) position in this sequence.

  Returns:
    A new DataFrame in which ``column`` holds integer positions.

  Raises:
    ValueError: If ``column`` holds a value (or a missing value) that is not
      listed in ``ordering``.
  """
  df = df.copy()

  # Build the lookup once instead of scanning the list for every row.
  # setdefault keeps the first position for duplicated entries, like list.index.
  positions = {}
  for position, value in enumerate(ordering):
    positions.setdefault(value, position)

  encoded = df[column].map(positions)

  # Anything that did not map is outside the ordering; report it with context.
  unmapped = encoded.isna()
  if unmapped.any():
    unexpected = df.loc[unmapped, column].unique().tolist()
    raise ValueError(
        f"Column {column!r} contains values not present in ordering: {unexpected}"
    )

  df[column] = encoded.astype('int64')
  return df

def onehot_encode(df, column, prefix):
  """Return a new frame where ``column`` is replaced by indicator columns.

  One indicator column per distinct value is created with
  ``pd.get_dummies`` (named using ``prefix``) and appended after the
  remaining columns. The input frame is not modified.
  """
  indicator_columns = pd.get_dummies(df[column], prefix=prefix)
  remaining_columns = df.drop(column, axis = 1)
  return pd.concat([remaining_columns, indicator_columns], axis = 1)

# Creating a Pre-process function

In [None]:
def preprocess_inputs(df):
  """Encode the categorical columns, split off the target and standardise the features.

  Args:
    df: DataFrame containing the 'Attrition_Flag', 'Gender',
      'Education_Level', 'Income_Category', 'Marital_Status' and
      'Card_Category' columns. 'Education_Level' and 'Income_Category' may
      only hold the values listed in the orderings below. All other columns
      are passed to the scaler as they are.

  Returns:
    Tuple ``(x, y)``. ``x`` is the standardised feature DataFrame and ``y``
    is the 0/1 'Attrition_Flag' Series (1 = 'Attrited Customer'). Both keep
    the index of ``df``.
  """
  df = df.copy()

  # Binary encode the 'Attrition_Flag' and 'Gender' columns
  df = binary_encode(df, 'Attrition_Flag', positive_value='Attrited Customer')
  df = binary_encode(df, 'Gender', positive_value='M')

  # Ordinally encode 'Education_Level' & 'Income_Category' (lowest to highest)
  education_ordering = [
              'Uneducated',
              'High School',
              'College',
              'Graduate',
              'Post-Graduate',
              'Doctorate'
               ]
  income_ordering = [
              'Less than $40K',
              '$40K - $60K',
              '$60K - $80K',
              '$80K - $120K',
              '$120K +'
               ]

  df = ordinal_encode(df, 'Education_Level', ordering = education_ordering)
  df = ordinal_encode(df, 'Income_Category', ordering = income_ordering)

  # One-hot encode the nominal columns
  df = onehot_encode(df, 'Marital_Status', prefix = 'MS')
  df = onehot_encode(df, 'Card_Category', prefix = 'CC')

  # Split into features x and target y
  y = df['Attrition_Flag'].copy()
  x = df.drop('Attrition_Flag', axis = 1).copy()

  # Scale every feature to mean 0 and variance 1.
  # NOTE(review): the scaler is fit on every row passed in. If the train/test
  # split happens after this call, test rows influence the scaling statistics;
  # consider fitting the scaler on the training rows only.
  scaler = StandardScaler()
  # Keep the original index so x stays row-aligned with y even when df does
  # not have a default RangeIndex.
  x = pd.DataFrame(scaler.fit_transform(x), columns = x.columns, index = x.index)

  return x, y

In [None]:
# Build the scaled feature frame x and the 0/1 target y from the cleaned data.
x, y = preprocess_inputs(data)

In [None]:
# Display the scaled feature frame.
x

Unnamed: 0,Customer_Age,Gender,Dependent_count,Education_Level,Income_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,MS_Divorced,MS_Married,MS_Single,CC_Blue,CC_Gold,CC_Platinum,CC_Silver
0,-0.165406,1.059956,0.503368,-0.893680,0.597300,0.384621,0.763943,-1.327136,0.492404,0.446622,-0.473422,0.488971,2.623494,-0.959707,-0.973895,3.834003,-0.775882,-0.282405,1.077338,-0.798507,0.270611,-0.107644,-0.044484,-0.240794
1,0.333570,-0.943436,2.043199,0.593388,-0.887628,1.010715,1.407306,-1.327136,-0.411616,-0.041367,-0.366667,-0.008486,3.563293,-0.916433,-1.357340,12.608573,-0.616276,-0.282405,-0.928214,1.252337,0.270611,-0.107644,-0.044484,-0.240794
2,0.583058,1.059956,0.503368,0.593388,1.339764,0.008965,0.120579,-1.327136,-2.219655,-0.573698,-1.426858,-0.445658,8.367214,-0.740982,-1.911206,6.807864,-0.997155,-0.282405,1.077338,-0.798507,0.270611,-0.107644,-0.044484,-0.240794
3,-0.789126,-0.943436,1.273283,-0.893680,-0.887628,-0.241473,-0.522785,1.641478,-1.315636,-0.585251,1.661686,-0.734100,2.942843,-0.951758,-1.911206,6.807864,1.759686,-0.282405,-0.928214,-0.798507,0.270611,-0.107644,-0.044484,-0.240794
4,-0.789126,1.059956,0.503368,-1.637214,0.597300,-1.869317,0.763943,-1.327136,-2.219655,-0.430877,-1.426858,-0.302868,6.455682,-1.056263,-1.570365,7.509325,-0.997155,-0.282405,1.077338,-0.798507,0.270611,-0.107644,-0.044484,-0.240794
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,0.458314,1.059956,-0.266547,0.593388,-0.145164,0.509840,-0.522785,-0.337598,0.492404,-0.509330,0.844455,-0.584929,-0.259771,3.259358,2.221481,0.608119,0.678714,-0.282405,-0.928214,1.252337,0.270611,-0.107644,-0.044484,-0.240794
10123,-0.664382,1.059956,-0.266547,0.593388,-0.145164,-1.368442,0.120579,-0.337598,0.492404,-0.479181,1.255524,-0.591639,0.201004,1.283475,0.176440,-0.122745,0.856458,3.541013,-0.928214,-0.798507,0.270611,-0.107644,-0.044484,-0.240794
10124,-0.290150,-0.943436,-1.036462,-0.893680,-0.887628,0.008965,0.763943,0.651940,1.396424,-0.354626,-1.426858,-0.226632,0.269436,1.732994,-0.207005,0.444305,-0.997155,-0.282405,1.077338,-0.798507,0.270611,-0.107644,-0.044484,-0.240794
10125,-2.036565,1.059956,-0.266547,0.593388,-0.145164,0.008965,0.120579,0.651940,0.492404,-0.368710,-1.426858,-0.240713,-1.026208,1.174848,-0.121795,0.041070,-0.997155,-0.282405,-0.928214,-0.798507,0.270611,-0.107644,-0.044484,-0.240794


In [None]:
# Display the encoded target (1 = 'Attrited Customer', 0 otherwise).
y

Unnamed: 0,Attrition_Flag
0,0
1,0
2,0
3,0
4,0
...,...
10122,0
10123,1
10124,1
10125,1


In [None]:
# Split into training (70% of rows) and test sets; the fixed random_state
# makes the split repeatable.

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    random_state=123,
)

In [None]:
# Train several classifiers and report their accuracy on the held-out test set.

SEED = 123  # seeds the estimators that use randomness so the scores are repeatable

# Keep each display name next to its estimator so the two cannot drift apart.
named_models = [
    ("Logistic Regression", LogisticRegression()),
    ("Support Vector Machine", SVC()),
    ("Decision Tree", DecisionTreeClassifier(random_state=SEED)),
    ("Neural Network", MLPClassifier(random_state=SEED)),
    ("Random Forest", RandomForestClassifier(random_state=SEED)),
]

models = [model for _, model in named_models]

for model in models:
  model.fit(x_train, y_train)

# Right-align the names to the longest one so the printed scores line up.
name_width = max(len(name) for name, _ in named_models)
model_names = [name.rjust(name_width) for name, _ in named_models]

for model, name in zip(models, model_names):
  # model.score returns the mean accuracy; show it as a percentage with
  # four decimal places.
  print(name + ": {:.4f}%".format(model.score(x_test, y_test)*100))



  Logistic Rregression: 90.457387%
Support Vector Machine: 92.991115%
         Decision Tree: 93.188549%
        Neural Network: 93.747943%
         Random Forest: 96.117144%
