## 1.Setup


In [27]:
# Common imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
import warnings
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

np.random.seed(1)

## 2. Data Preprocessing

### 2.1 Load Data

In [28]:
df = pd.read_csv('UniversalBank.csv')
df.head(5)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [29]:
# drop ID, and Zip Code as predictors
df = df.drop(columns=['ID', 'ZIP Code'])

### 2.2 Checking for any null values

In [30]:
df.isnull().sum()

Age                   0
Experience            0
Income                0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

In [31]:
df.describe()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,45.3384,20.1046,73.7742,2.3964,1.937938,1.881,56.4988,0.096,0.1044,0.0604,0.5968,0.294
std,11.463166,11.467954,46.033729,1.147663,1.747659,0.839869,101.713802,0.294621,0.305809,0.23825,0.490589,0.455637
min,23.0,-3.0,8.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,35.0,10.0,39.0,1.0,0.7,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,45.0,20.0,64.0,2.0,1.5,2.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,55.0,30.0,98.0,3.0,2.5,3.0,101.0,0.0,0.0,0.0,1.0,1.0
max,67.0,43.0,224.0,4.0,10.0,3.0,635.0,1.0,1.0,1.0,1.0,1.0


### 2.3 Checking for any categorical variable

In [32]:
df.dtypes

Age                     int64
Experience              int64
Income                  int64
Family                  int64
CCAvg                 float64
Education               int64
Mortgage                int64
Personal Loan           int64
Securities Account      int64
CD Account              int64
Online                  int64
CreditCard              int64
dtype: object

### 31. Scaling continuous variables

In [33]:
# Create scaler for standardization
scaler = preprocessing.StandardScaler()

#Apply to numerical columns
df.iloc[:,[0,1,2,4,6]] = scaler.fit_transform(df.iloc[:,[0,1,2,4,6]])


In [34]:
df.describe()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,2.478018e-17,-1.693312e-16,1.939449e-16,2.3964,-2.078338e-17,1.881,2.810197e-16,0.096,0.1044,0.0604,0.5968,0.294
std,1.0001,1.0001,1.0001,1.147663,1.0001,0.839869,1.0001,0.294621,0.305809,0.23825,0.490589,0.455637
min,-1.948906,-2.014911,-1.428969,1.0,-1.108987,1.0,-0.5555239,0.0,0.0,0.0,0.0,0.0
25%,-0.9019702,-0.8812043,-0.7554825,1.0,-0.7084116,1.0,-0.5555239,0.0,0.0,0.0,0.0,0.0
50%,-0.02952359,-0.009121982,-0.2123482,2.0,-0.2506106,2.0,-0.5555239,0.0,0.0,0.0,1.0,0.0
75%,0.842923,0.8629604,0.5263146,3.0,0.3216407,3.0,0.4375576,0.0,0.0,0.0,1.0,1.0
max,1.889859,1.996667,3.263712,4.0,4.613525,3.0,5.688108,1.0,1.0,1.0,1.0,1.0


### 3.2 Splitting Data into test and training set

In [35]:
# splitting data into test and training set
train_df, test_df = train_test_split(df, test_size=0.3, random_state=1)


In [36]:
# defining predictors and target
target = 'CD Account'
predictors = list(df.columns)
predictors.remove(target)


## 3.Model the data


In [37]:
performance = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

### 3.1 Fit and test a Logistic Regression model

In [38]:
log_reg_model = LogisticRegression(penalty='none', max_iter=900)
_ = log_reg_model.fit(train_X, np.ravel(train_y))

In [39]:
model_preds = log_reg_model.predict(test_X)
c_matrix = confusion_matrix(test_y, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"default logistic", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,default logistic,0.9788,1.0,0.649007,0.787149


### 3.2 Fit a SVM Model Using Linear Kernal

In [40]:
svm_lin_model = SVC(kernel="linear")
_ = svm_lin_model.fit(train_X, np.ravel(train_y))

In [41]:
model_preds = svm_lin_model.predict(test_X)
c_matrix = confusion_matrix(test_y, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"linear svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,default logistic,0.9788,1.0,0.649007,0.787149
0,linear svm,0.9788,1.0,0.649007,0.787149


### 3.3 Fit a SVM classification model using rbf kernal

In [42]:
svm_rbf_model = SVC(kernel="rbf", C=10, gamma='scale')
_ = svm_rbf_model.fit(train_X, np.ravel(train_y))

In [43]:
model_preds = svm_rbf_model.predict(test_X)
c_matrix = confusion_matrix(test_y, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"rbf svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,default logistic,0.9788,1.0,0.649007,0.787149
0,linear svm,0.9788,1.0,0.649007,0.787149
0,rbf svm,0.9814,1.0,0.692053,0.818004


### 3.4 Fit a SVM classification model using polynomial kernal

In [44]:
svm_poly_model = SVC(kernel="poly", degree=3, coef0=1, C=10)
svm_poly = svm_poly_model.fit(train_X, np.ravel(train_y))

In [45]:
model_preds = svm_poly_model.predict(test_X)
c_matrix = confusion_matrix(test_y, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"poly svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,default logistic,0.9788,1.0,0.649007,0.787149
0,linear svm,0.9788,1.0,0.649007,0.787149
0,rbf svm,0.9814,1.0,0.692053,0.818004
0,poly svm,0.9816,1.0,0.695364,0.820312


### 3.5 Decision Tree Random Search

In [46]:
score_measure = "recall"
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(1,200),  
    'min_samples_leaf': np.arange(1,200),
    'max_leaf_nodes': np.arange(5, 200), 
    'max_depth': np.arange(1,50), 
    'criterion': ['entropy', 'gini'],
}

dtree = DecisionTreeClassifier()
rand_search = RandomizedSearchCV(estimator = dtree, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = rand_search.fit(train_X, train_y)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

bestRecallTree = rand_search.best_estimator_

warnings.filterwarnings('ignore')

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
The best recall score is 0.6887978142076503
... with parameters: {'min_samples_split': 129, 'min_samples_leaf': 22, 'max_leaf_nodes': 55, 'max_depth': 23, 'criterion': 'gini'}


In [47]:
c_matrix = confusion_matrix(test_y, rand_search.predict(test_X))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]

performance = pd.concat([performance, pd.DataFrame({'model':"Decision tree Random Search", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,default logistic,0.9788,1.0,0.649007,0.787149
0,linear svm,0.9788,1.0,0.649007,0.787149
0,rbf svm,0.9814,1.0,0.692053,0.818004
0,poly svm,0.9816,1.0,0.695364,0.820312
0,Decision tree Random Search,0.967,0.768627,0.649007,0.70377


### 3.6 Decision Tree Grid Search

In [48]:
score_measure = "recall"
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(90,94),  
    'min_samples_leaf': np.arange(2,5),
    'min_impurity_decrease': np.arange(0.0048, 0.0054, 0.0001),
    'max_leaf_nodes': np.arange(46,50), 
    'max_depth': np.arange(24,28), 
    'criterion': ['entropy'],
}

dtree = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator = dtree, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(train_X, train_y)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestRecallTree = grid_search.best_estimator_

Fitting 5 folds for each of 1344 candidates, totalling 6720 fits
The best recall score is 0.6359562841530055
... with parameters: {'criterion': 'entropy', 'max_depth': 24, 'max_leaf_nodes': 46, 'min_impurity_decrease': 0.0048, 'min_samples_leaf': 2, 'min_samples_split': 90}


In [49]:
c_matrix = confusion_matrix(test_y, grid_search.predict(test_X))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Decision tree Grid Search", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,default logistic,0.9788,1.0,0.649007,0.787149
0,linear svm,0.9788,1.0,0.649007,0.787149
0,rbf svm,0.9814,1.0,0.692053,0.818004
0,poly svm,0.9816,1.0,0.695364,0.820312
0,Decision tree Random Search,0.967,0.768627,0.649007,0.70377
0,Decision tree Grid Search,0.9762,0.955224,0.635762,0.763419


In [50]:
performance.sort_values(by=['Accuracy'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Decision tree Random Search,0.967,0.768627,0.649007,0.70377
0,Decision tree Grid Search,0.9762,0.955224,0.635762,0.763419
0,default logistic,0.9788,1.0,0.649007,0.787149
0,linear svm,0.9788,1.0,0.649007,0.787149
0,rbf svm,0.9814,1.0,0.692053,0.818004
0,poly svm,0.9816,1.0,0.695364,0.820312


In [51]:
performance.sort_values(by=['Precision'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Decision tree Random Search,0.967,0.768627,0.649007,0.70377
0,Decision tree Grid Search,0.9762,0.955224,0.635762,0.763419
0,default logistic,0.9788,1.0,0.649007,0.787149
0,linear svm,0.9788,1.0,0.649007,0.787149
0,rbf svm,0.9814,1.0,0.692053,0.818004
0,poly svm,0.9816,1.0,0.695364,0.820312


In [52]:
performance.sort_values(by=['Recall'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Decision tree Grid Search,0.9762,0.955224,0.635762,0.763419
0,default logistic,0.9788,1.0,0.649007,0.787149
0,linear svm,0.9788,1.0,0.649007,0.787149
0,Decision tree Random Search,0.967,0.768627,0.649007,0.70377
0,rbf svm,0.9814,1.0,0.692053,0.818004
0,poly svm,0.9816,1.0,0.695364,0.820312


In [53]:
performance.sort_values(by=['F1'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Decision tree Random Search,0.967,0.768627,0.649007,0.70377
0,Decision tree Grid Search,0.9762,0.955224,0.635762,0.763419
0,default logistic,0.9788,1.0,0.649007,0.787149
0,linear svm,0.9788,1.0,0.649007,0.787149
0,rbf svm,0.9814,1.0,0.692053,0.818004
0,poly svm,0.9816,1.0,0.695364,0.820312


Among all the models, Poly SVM had highest accuracy,precision,recall and F1 rate of 0.9816, 1.0, 0.6953, 0.8203 respectively. 
Poly SVM model is fitting the data quite accurately and it shows great performance for all the four metrics when compared to other models. Among all the 6 models, Poly SVM is the best fit for the data.