# This Notebook Includes Classification Using Tabular Data, by Implementing Different Machine Learning Models

In [1]:
#Importing necessary packages
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import lazypredict
from lazypredict.Supervised import LazyClassifier
import seaborn as sns

In [2]:
#Importing the data
df = pd.read_csv(r"E:\bank-data\bank-tabular.csv", sep="\t")
df

Unnamed: 0,customer_id,date,customer_gender,customer_age,customer_location,customer_type,has_cc,has_mortgage,convenience,customer_service,online_banking,interest_rates,fees_charges,community_involvement,products_services,privacy_security,reputation,satisfied
0,216604,2022-08-22,Male,50.00,Munster,Personal,True,False,4.00,5.00,4.00,4.00,4.00,4.00,5.00,2.00,4.00,True
1,259276,2022-11-23,Female,61.00,Leinster,Personal,True,False,5.00,5.00,5.00,3.00,5.00,4.00,4.00,5.00,5.00,True
2,265459,2022-01-21,Female,63.00,Munster,Business,True,False,2.00,2.00,5.00,5.00,2.00,,4.00,4.00,,True
3,58770,2022-03-13,f,,Leinster,Business,True,False,,4.00,5.00,1.00,5.00,5.00,5.00,5.00,5.00,True
4,318031,2022-08-08,Female,41.00,Leinster,Personal,True,True,1.00,1.00,1.00,1.00,2.00,2.00,4.00,5.00,2.00,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,322582,2021-09-23,Male,41.00,Munster,Personal,True,True,3.00,3.00,3.00,3.00,5.00,3.00,3.00,1.00,5.00,False
2996,53418,2021-03-07,f,57.00,Munster,Business,False,False,3.00,2.00,5.00,1.00,2.00,2.00,2.00,2.00,3.00,True
2997,79364,2021-08-01,m,,Munster,Personal,True,True,3.00,3.00,3.00,4.00,4.00,3.00,4.00,4.00,4.00,False
2998,371134,2021-06-25,m,42.00,Leinster,Business,False,False,3.00,2.00,1.00,5.00,4.00,4.00,3.00,4.00,1.00,True


## Data Preprocessing Steps: Data Transformation, Filling in Missing Values, Dropping Unnecessary Columns and Checking Correlation to Remove Redundant Features

In [3]:
df.drop(['customer_id','date'],axis=1,inplace=True)
df

Unnamed: 0,customer_gender,customer_age,customer_location,customer_type,has_cc,has_mortgage,convenience,customer_service,online_banking,interest_rates,fees_charges,community_involvement,products_services,privacy_security,reputation,satisfied
0,Male,50.00,Munster,Personal,True,False,4.00,5.00,4.00,4.00,4.00,4.00,5.00,2.00,4.00,True
1,Female,61.00,Leinster,Personal,True,False,5.00,5.00,5.00,3.00,5.00,4.00,4.00,5.00,5.00,True
2,Female,63.00,Munster,Business,True,False,2.00,2.00,5.00,5.00,2.00,,4.00,4.00,,True
3,f,,Leinster,Business,True,False,,4.00,5.00,1.00,5.00,5.00,5.00,5.00,5.00,True
4,Female,41.00,Leinster,Personal,True,True,1.00,1.00,1.00,1.00,2.00,2.00,4.00,5.00,2.00,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,Male,41.00,Munster,Personal,True,True,3.00,3.00,3.00,3.00,5.00,3.00,3.00,1.00,5.00,False
2996,f,57.00,Munster,Business,False,False,3.00,2.00,5.00,1.00,2.00,2.00,2.00,2.00,3.00,True
2997,m,,Munster,Personal,True,True,3.00,3.00,3.00,4.00,4.00,3.00,4.00,4.00,4.00,False
2998,m,42.00,Leinster,Business,False,False,3.00,2.00,1.00,5.00,4.00,4.00,3.00,4.00,1.00,True


In [4]:
df['satisfied'].value_counts()

False    1737
True     1263
Name: satisfied, dtype: int64

### THIS SHOWS THAT DATA IS SLIGHTLY UNBALANCED, THEREFORE THE BEST METRIC TO EVALUATE THESE MODELS WOULD BALANCED ACCURACY

In [5]:
#Checking Unique Values
df.nunique()

customer_gender           6
customer_age             71
customer_location         4
customer_type             3
has_cc                    2
has_mortgage              2
convenience               5
customer_service          5
online_banking            5
interest_rates            5
fees_charges              5
community_involvement     5
products_services         5
privacy_security          5
reputation                5
satisfied                 2
dtype: int64

In [6]:
#Checknig missing values across all columns
df.isna().sum()

customer_gender          312
customer_age             470
customer_location        365
customer_type              0
has_cc                     0
has_mortgage               0
convenience               64
customer_service          29
online_banking            30
interest_rates           165
fees_charges              76
community_involvement    300
products_services        100
privacy_security         262
reputation               402
satisfied                  0
dtype: int64

### Since the data is categorical in nature, the rule of thumb is to fill missing values of that column by its mode

In [7]:
df['customer_age'].fillna(value=df['customer_age'].mode()[0], inplace=True)
df['interest_rates'].fillna(value=df['interest_rates'].mode()[0], inplace=True)
df['customer_location'].fillna(value=df['customer_location'].mode()[0], inplace=True)
df['customer_gender'].fillna(value=df['customer_gender'].mode()[0], inplace=True)
df['convenience'].fillna(value=df['convenience'].mode()[0], inplace=True)
df['customer_service'].fillna(value=df['customer_service'].mode()[0], inplace=True)
df['online_banking'].fillna(value=df['online_banking'].mode()[0], inplace=True)
df['fees_charges'].fillna(value=df['fees_charges'].mode()[0], inplace=True)
df['community_involvement'].fillna(value=df['community_involvement'].mode()[0], inplace=True)
df['products_services'].fillna(value=df['products_services'].mode()[0], inplace=True)
df['privacy_security'].fillna(value=df['privacy_security'].mode()[0], inplace=True)
df['reputation'].fillna(value=df['reputation'].mode()[0], inplace=True)

In [8]:
#Succesfully removed missing values
df.isna().sum()

customer_gender          0
customer_age             0
customer_location        0
customer_type            0
has_cc                   0
has_mortgage             0
convenience              0
customer_service         0
online_banking           0
interest_rates           0
fees_charges             0
community_involvement    0
products_services        0
privacy_security         0
reputation               0
satisfied                0
dtype: int64

In [9]:
df['customer_gender'].value_counts()

Female           1436
Male             1057
Not specified     221
f                 134
m                 118
Unspecified        34
Name: customer_gender, dtype: int64

In [10]:
#Cleaning gender column
df['customer_gender'] = df['customer_gender'].str.replace('Not specified','Undisclosed')
df['customer_gender'] = df['customer_gender'].str.replace('Unspecified','Undisclosed')
df['customer_gender'] = df['customer_gender'].str.replace('f','Female')
df['customer_gender'] = df['customer_gender'].str.replace('m','Male')
df['customer_gender'] = df['customer_gender'].str.replace('FeMaleale','Female')
df

Unnamed: 0,customer_gender,customer_age,customer_location,customer_type,has_cc,has_mortgage,convenience,customer_service,online_banking,interest_rates,fees_charges,community_involvement,products_services,privacy_security,reputation,satisfied
0,Male,50.00,Munster,Personal,True,False,4.00,5.00,4.00,4.00,4.00,4.00,5.00,2.00,4.00,True
1,Female,61.00,Leinster,Personal,True,False,5.00,5.00,5.00,3.00,5.00,4.00,4.00,5.00,5.00,True
2,Female,63.00,Munster,Business,True,False,2.00,2.00,5.00,5.00,2.00,4.00,4.00,4.00,4.00,True
3,Female,48.00,Leinster,Business,True,False,2.00,4.00,5.00,1.00,5.00,5.00,5.00,5.00,5.00,True
4,Female,41.00,Leinster,Personal,True,True,1.00,1.00,1.00,1.00,2.00,2.00,4.00,5.00,2.00,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,Male,41.00,Munster,Personal,True,True,3.00,3.00,3.00,3.00,5.00,3.00,3.00,1.00,5.00,False
2996,Female,57.00,Munster,Business,False,False,3.00,2.00,5.00,1.00,2.00,2.00,2.00,2.00,3.00,True
2997,Male,48.00,Munster,Personal,True,True,3.00,3.00,3.00,4.00,4.00,3.00,4.00,4.00,4.00,False
2998,Male,42.00,Leinster,Business,False,False,3.00,2.00,1.00,5.00,4.00,4.00,3.00,4.00,1.00,True


In [11]:
#Converting object data to categorical
for col_name in df.columns:
    if(df[col_name].dtype == 'object'):
        df[col_name]= df[col_name].astype('category')
        df[col_name] = df[col_name].cat.codes
df        

Unnamed: 0,customer_gender,customer_age,customer_location,customer_type,has_cc,has_mortgage,convenience,customer_service,online_banking,interest_rates,fees_charges,community_involvement,products_services,privacy_security,reputation,satisfied
0,1,50.00,2,2,True,False,4.00,5.00,4.00,4.00,4.00,4.00,5.00,2.00,4.00,True
1,0,61.00,1,2,True,False,5.00,5.00,5.00,3.00,5.00,4.00,4.00,5.00,5.00,True
2,0,63.00,2,0,True,False,2.00,2.00,5.00,5.00,2.00,4.00,4.00,4.00,4.00,True
3,0,48.00,1,0,True,False,2.00,4.00,5.00,1.00,5.00,5.00,5.00,5.00,5.00,True
4,0,41.00,1,2,True,True,1.00,1.00,1.00,1.00,2.00,2.00,4.00,5.00,2.00,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,1,41.00,2,2,True,True,3.00,3.00,3.00,3.00,5.00,3.00,3.00,1.00,5.00,False
2996,0,57.00,2,0,False,False,3.00,2.00,5.00,1.00,2.00,2.00,2.00,2.00,3.00,True
2997,1,48.00,2,2,True,True,3.00,3.00,3.00,4.00,4.00,3.00,4.00,4.00,4.00,False
2998,1,42.00,1,0,False,False,3.00,2.00,1.00,5.00,4.00,4.00,3.00,4.00,1.00,True


In [12]:
#converting boolean data for dependent variable to binary
for col_name in df.columns:
    if(df[col_name].dtype == 'bool'):
        df[col_name]= df[col_name].astype('category')
        df[col_name] = df[col_name].cat.codes
df

Unnamed: 0,customer_gender,customer_age,customer_location,customer_type,has_cc,has_mortgage,convenience,customer_service,online_banking,interest_rates,fees_charges,community_involvement,products_services,privacy_security,reputation,satisfied
0,1,50.00,2,2,1,0,4.00,5.00,4.00,4.00,4.00,4.00,5.00,2.00,4.00,1
1,0,61.00,1,2,1,0,5.00,5.00,5.00,3.00,5.00,4.00,4.00,5.00,5.00,1
2,0,63.00,2,0,1,0,2.00,2.00,5.00,5.00,2.00,4.00,4.00,4.00,4.00,1
3,0,48.00,1,0,1,0,2.00,4.00,5.00,1.00,5.00,5.00,5.00,5.00,5.00,1
4,0,41.00,1,2,1,1,1.00,1.00,1.00,1.00,2.00,2.00,4.00,5.00,2.00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,1,41.00,2,2,1,1,3.00,3.00,3.00,3.00,5.00,3.00,3.00,1.00,5.00,0
2996,0,57.00,2,0,0,0,3.00,2.00,5.00,1.00,2.00,2.00,2.00,2.00,3.00,1
2997,1,48.00,2,2,1,1,3.00,3.00,3.00,4.00,4.00,3.00,4.00,4.00,4.00,0
2998,1,42.00,1,0,0,0,3.00,2.00,1.00,5.00,4.00,4.00,3.00,4.00,1.00,1


In [13]:
#Checking the correlation between all features of the data 
#Reference:https://www.projectpro.io/recipes/drop-out-highly-correlated-features-in-python
corr_matrix=df.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.50)]
to_drop
upper

Unnamed: 0,customer_gender,customer_age,customer_location,customer_type,has_cc,has_mortgage,convenience,customer_service,online_banking,interest_rates,fees_charges,community_involvement,products_services,privacy_security,reputation,satisfied
customer_gender,,0.0,0.02,0.01,0.01,0.0,0.01,0.01,0.01,0.0,0.02,0.0,0.02,0.02,0.0,0.02
customer_age,,,0.0,0.02,0.21,0.03,0.0,0.0,0.15,0.09,0.05,0.06,0.02,0.03,0.03,0.09
customer_location,,,,0.04,0.0,0.03,0.03,0.02,0.01,0.0,0.02,0.02,0.03,0.0,0.01,0.02
customer_type,,,,,0.01,0.46,0.02,0.06,0.19,0.2,0.16,0.11,0.12,0.12,0.07,0.38
has_cc,,,,,,0.01,0.0,0.0,0.0,0.02,0.0,0.02,0.03,0.03,0.02,0.01
has_mortgage,,,,,,,0.11,0.12,0.16,0.23,0.17,0.04,0.04,0.04,0.07,0.44
convenience,,,,,,,,0.43,0.28,0.13,0.17,0.06,0.06,0.08,0.1,0.3
customer_service,,,,,,,,,0.25,0.05,0.05,0.01,0.02,0.03,0.03,0.21
online_banking,,,,,,,,,,0.3,0.21,0.11,0.08,0.09,0.21,0.46
interest_rates,,,,,,,,,,,0.33,0.11,0.08,0.08,0.33,0.35


### Since there are no two columns with high correlation, the data can be fed to a classifier without dropping any feature.

In [14]:
X=df.drop(['satisfied'],axis=1).values
y=df['satisfied'].values

In [15]:
#Scaling Values of Training Set
standard = preprocessing.scale(X)
X

array([[ 1., 50.,  2., ...,  5.,  2.,  4.],
       [ 0., 61.,  1., ...,  4.,  5.,  5.],
       [ 0., 63.,  2., ...,  4.,  4.,  4.],
       ...,
       [ 1., 48.,  2., ...,  4.,  4.,  4.],
       [ 1., 42.,  1., ...,  3.,  4.,  1.],
       [ 1., 42.,  1., ...,  2.,  2.,  3.]])

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state= 2023)

In [17]:
clf=LazyClassifier(verbose=0,ignore_warnings=True,custom_metric=None)
models,predictions=clf.fit(X_train, X_test, y_train, y_test)
print(models)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:03<00:00,  8.77it/s]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
RandomForestClassifier             0.89               0.87     0.87      0.89   
SVC                                0.88               0.87     0.87      0.88   
ExtraTreesClassifier               0.88               0.86     0.86      0.88   
XGBClassifier                      0.87               0.86     0.86      0.87   
LGBMClassifier                     0.88               0.86     0.86      0.87   
AdaBoostClassifier                 0.86               0.85     0.85      0.86   
BaggingClassifier                  0.86               0.85     0.85      0.86   
NuSVC                              0.85               0.84     0.84      0.85   
GaussianNB                         0.84               0.83     0.83      0.84   
QuadraticDiscriminantAnalysis      0.83               0.83     0.83      0.83   
KNeighborsClassifier        




## Conclusion 

Even though the results are similar and comparable, Random Forest Classifier and SVM(C) have performed the best across all Classification Metrics, especially Balanced Accuracy as the output classes were Imbalanced.