# Universal bank marketing campaign to existing customers (selling new Securities account product) using 
##### a.1)Logistic Regression using RandomSearchCV
##### a.2)Logistic Regression using GridSearchCV
##### b.1)SVM classification with Linear|RFB|Polynomial Kernal using RandomSearchCV
##### b.2)SVM classification with Linear|RFB|Polynomial Kernal using GridSearchCV
##### c.1)Decision trees using RandomSearchCV
##### c.2)Decision trees using GridSearchCV

#### Target Variable : Securities Account
#### Scoring Measure : Precision

### Shambhavi Mishra

### U01040066

### Import Libraries

In [1]:
import pandas as pd
from sklearn.svm import SVC
from matplotlib import pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report  
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

np.random.seed(1)

### 1. Loading data

In [2]:
df = pd.read_csv("/Users/shambhavimishra/Downloads/DSP/UniversalBank.csv") 
df.head(5)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


#### Number of rows & columns

In [4]:
df.shape 

(5000, 14)

 #### Summary of numerical value

In [5]:
df.describe()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,2500.5,45.3384,20.1046,73.7742,93152.503,2.3964,1.937938,1.881,56.4988,0.096,0.1044,0.0604,0.5968,0.294
std,1443.520003,11.463166,11.467954,46.033729,2121.852197,1.147663,1.747659,0.839869,101.713802,0.294621,0.305809,0.23825,0.490589,0.455637
min,1.0,23.0,-3.0,8.0,9307.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1250.75,35.0,10.0,39.0,91911.0,1.0,0.7,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2500.5,45.0,20.0,64.0,93437.0,2.0,1.5,2.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,3750.25,55.0,30.0,98.0,94608.0,3.0,2.5,3.0,101.0,0.0,0.0,0.0,1.0,1.0
max,5000.0,67.0,43.0,224.0,96651.0,4.0,10.0,3.0,635.0,1.0,1.0,1.0,1.0,1.0


#### Statistical summary of numeriacal values in the data

In [6]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          5000 non-null   int64  
 3   Income              5000 non-null   int64  
 4   ZIP Code            5000 non-null   int64  
 5   Family              5000 non-null   int64  
 6   CCAvg               5000 non-null   float64
 7   Education           5000 non-null   int64  
 8   Mortgage            5000 non-null   int64  
 9   Personal Loan       5000 non-null   int64  
 10  Securities Account  5000 non-null   int64  
 11  CD Account          5000 non-null   int64  
 12  Online              5000 non-null   int64  
 13  CreditCard          5000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 547.0 KB


#### Target Variable "Securities Account"

In [7]:
df['Securities Account'].value_counts() 

0    4478
1     522
Name: Securities Account, dtype: int64

#### Missing value check

In [8]:
df.isna().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

#### Check for null values

In [9]:
df.isnull().sum() 

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

In [10]:
df.corr()['Securities Account'].sort_values(ascending=False)

Securities Account    1.000000
CD Account            0.317034
Personal Loan         0.021954
Family                0.019994
CCAvg                 0.015086
Online                0.012627
ZIP Code              0.004704
Age                  -0.000436
Experience           -0.001232
Income               -0.002616
Mortgage             -0.005411
Education            -0.010812
CreditCard           -0.015028
ID                   -0.016972
Name: Securities Account, dtype: float64

#### Data Cleaning

In [11]:
df.drop(columns=["ID"])
df.drop(columns=["ZIP Code"])
df.head(5)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


#### Using One-hot Encoding to encode the categorical variable "Education"

In [12]:
dummies_df = pd.get_dummies(df['Education'], prefix='Education', drop_first=True)
df = df.join(dummies_df)
df.drop('Education', axis=1, inplace = True)
df

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard,Education_2,Education_3
0,1,25,1,49,91107,4,1.6,0,0,1,0,0,0,0,0
1,2,45,19,34,90089,3,1.5,0,0,1,0,0,0,0,0
2,3,39,15,11,94720,1,1.0,0,0,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,0,0,0,0,0,0,1,0
4,5,35,8,45,91330,4,1.0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,29,3,40,92697,1,1.9,0,0,0,0,1,0,0,1
4996,4997,30,4,15,92037,4,0.4,85,0,0,0,1,0,0,0
4997,4998,63,39,24,93023,2,0.3,0,0,0,0,0,0,0,1
4998,4999,65,40,49,90034,3,0.5,0,0,0,0,1,0,1,0


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          5000 non-null   int64  
 3   Income              5000 non-null   int64  
 4   ZIP Code            5000 non-null   int64  
 5   Family              5000 non-null   int64  
 6   CCAvg               5000 non-null   float64
 7   Mortgage            5000 non-null   int64  
 8   Personal Loan       5000 non-null   int64  
 9   Securities Account  5000 non-null   int64  
 10  CD Account          5000 non-null   int64  
 11  Online              5000 non-null   int64  
 12  CreditCard          5000 non-null   int64  
 13  Education_2         5000 non-null   uint8  
 14  Education_3         5000 non-null   uint8  
dtypes: float64(1), int64(12), uint8(2)
memory usage: 517.7 

#### Train-Test Split

In [14]:
train_df, test_df = train_test_split(df, test_size=0.3)
target = 'Securities Account'
predictors = list(df.columns)
predictors.remove(target)

In [15]:
scaler = preprocessing.StandardScaler()
cols_to_stdize = predictors               
train_df[cols_to_stdize] = scaler.fit_transform(train_df[cols_to_stdize])
test_df[cols_to_stdize] = scaler.transform(test_df[cols_to_stdize])

train_X = train_df[predictors]
train_y = train_df[target]
test_X = train_df[predictors]
test_y = test_df[target]

#### Splitting dataset into Training & Testing data

In [16]:
X = df.drop('Securities Account',axis=1)
y = df['Securities Account']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 12)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3500, 14), (1500, 14), (3500,), (1500,))

# Model the Data

### a.1) Logistic regression Using RandomSearchCV

In [17]:
score_measure = "precision"
kfolds = 3

param_grid = {'C':[0.001,0.01,0.1,1,10], # C is the regulization strength
               'penalty':['l1', 'l2','elasticnet','none'],
              'solver':['saga','liblinear'],
              'max_iter': np.arange(500,1000)
    
    
}

logreg = LogisticRegression()
randomSearch = RandomizedSearchCV(estimator = logreg, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = randomSearch.fit(train_X,train_y)

print(f"The best {score_measure} score is {randomSearch.best_score_}")
print(f"... with parameters: {randomSearch.best_params_}")

bestlr = randomSearch.best_estimator_

Fitting 3 folds for each of 500 candidates, totalling 1500 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

The best precision score is 0.9333333333333332
... with parameters: {'solver': 'saga', 'penalty': 'l2', 'max_iter': 892, 'C': 0.01}


### a.2) Logistic regression Using GridSearchCV

In [18]:
score_measure = "precision"
kfolds = 3
best_penality = randomSearch.best_params_['penalty']
best_solver = randomSearch.best_params_['solver']
min_regulization_strength=randomSearch.best_params_['C']
min_iter = randomSearch.best_params_['max_iter']

#Using the best parameters from the Random Search to use as range for the parameters to do the grid search
param_grid = {
    
    'C':np.arange(min_regulization_strength-1,min_regulization_strength+1), 
               'penalty':[best_penality],
              'solver':[best_solver],
              'max_iter': np.arange(min_iter-400,min_iter+400)
}

logreg =  LogisticRegression()
gridSearch = GridSearchCV(estimator = logreg, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1, # n_jobs=-1 will utilize all available CPUs 
                return_train_score=True)

_ = gridSearch.fit(train_X,train_y)

print(f"The best {score_measure} score is {gridSearch.best_score_}")
print(f"... with parameters: {gridSearch.best_params_}")

bestlogreg = gridSearch.best_estimator_

Fitting 3 folds for each of 1600 candidates, totalling 4800 fits
The best precision score is 0.9333333333333332
... with parameters: {'C': 0.010000000000000009, 'max_iter': 493, 'penalty': 'l2', 'solver': 'saga'}


2400 fits failed out of a total of 4800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2400 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/shambhavimishra/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/shambhavimishra/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1160, in fit
    self._validate_params()
  File "/Users/shambhavimishra/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 581, in _validate_params
    validate_parameter_constraints(
  File "/Users/shambhavimishra/opt/anaconda3/lib/python3.9/site-packages/skl

### b.1.1)SVM classification model with linear kernel using RandomSearchCV

In [19]:
score_measure = "precision"
kfolds = 3

param_grid = {
     'C': [0.0001, 0.001, 0.1, 1, 10, 100, 1000], 
    'kernel': ['linear']
}

random_linear_SVC = SVC()
random_search = RandomizedSearchCV(estimator = random_linear_SVC, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = random_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {random_search.best_score_}")
print(f"... with parameters: {random_search.best_params_}")

bestPrecisionTree = random_search.best_estimator_



Fitting 3 folds for each of 7 candidates, totalling 21 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

The best precision score is 0.0
... with parameters: {'kernel': 'linear', 'C': 0.0001}


### b.1.2)SVM classification model with rbf kernel using RandomSearchCV

In [20]:
score_measure = "precision"
kfolds = 3

param_grid = {
     'C': [0.1,1, 10], 
    'gamma': [1,0.1,0.011],
    'kernel': ['rbf']
}

random_rbf_SVC = SVC()
random_search = RandomizedSearchCV(estimator = random_rbf_SVC, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = random_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {random_search.best_score_}")
print(f"... with parameters: {random_search.best_params_}")

bestPrecisionTree = random_search.best_estimator_



Fitting 3 folds for each of 9 candidates, totalling 27 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

The best precision score is 0.0
... with parameters: {'kernel': 'rbf', 'gamma': 1, 'C': 0.1}


### b.1.3)SVM classification model with polynomial kernel using RandomSearchCV

In [None]:
score_measure = "precision"
kfolds = 3

param_grid = {
     'C': [0.1,1, 10, 100], 
    'gamma': [1,0.1,0.01,0.001],
    'kernel': ['poly']
}

random_Poly_SVC = SVC()
random_search = RandomizedSearchCV(estimator = random_Poly_SVC, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = random_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {random_search.best_score_}")
print(f"... with parameters: {random_search.best_params_}")

bestPrecisionTree = random_search.best_estimator_



Fitting 3 folds for each of 16 candidates, totalling 48 fits


### b.2.1)SVM classification model with linear kernal using GridSearchCV

In [None]:
score_measure = "precision"
kfolds = 3

param_grid = {
     'C': [0.0001, 0.001, 0.1, 1, 10, 100, 1000], 
    'kernel': ['linear']
}

grid_linear_SVC = SVC()
grid_search = GridSearchCV(estimator = grid_linear_SVC, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestPrecisionTree = grid_search.best_estimator_

### b.2.2)SVM classification model with rbf kernal using GridSearchCV

In [None]:
score_measure = "precision"
kfolds = 3

param_grid = {
     'C': [0.1,1, 10], 
    'gamma': [1,0.1,0.011],
    'kernel': ['rbf']
}

grid_rbf_SVC = SVC()
grid_search = GridSearchCV(estimator = grid_rbf_SVC, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestPrecisionTree = grid_search.best_estimator_

### b.2.3)SVM classification model with polynomial kernal using GridSearchCV

In [None]:
score_measure = "precision"
kfolds = 3

param_grid = {
     'C': [0.1,1, 10, 100], 
    'gamma': [1,0.1,0.01,0.001],
    'kernel': ['poly']
}


grid_Poly_SVC = SVC()
grid_search = GridSearchCV(estimator = grid_Poly_SVC, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestPrecisionTree = grid_search.best_estimator_

### c.1)Decision Tree Model using RandomSearchCV

In [None]:
score_measure = "precision"
kfolds = 3

param_grid = {
    'min_samples_split': np.arange(1,100),  
    'min_samples_leaf': np.arange(1,100),
    'min_impurity_decrease': np.arange(0.0001, 0.01, 0.0005),
    'max_leaf_nodes': np.arange(5, 100), 
    'max_depth': np.arange(1,50), 
    'criterion': ['entropy', 'gini'],
}

decisiontree = DecisionTreeClassifier()
randomSearch = RandomizedSearchCV(estimator = decisiontree, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = randomSearch.fit(X_train, y_train)

print(f"The best {score_measure} score is {randomSearch.best_score_}")
print(f"... with parameters: {randomSearch.best_params_}")

bestRecallTree = randomSearch.best_estimator_

### c.2)Decision Tree Model using GridSearchCV

In [None]:
score_measure = "precision"
kfolds = 3

param_grid = {
    'min_samples_split': np.arange(94,99),  
    'min_samples_leaf': np.arange(9,13),
    'min_impurity_decrease': np.arange(0.0038, 0.0044, 0.0001),
    'max_leaf_nodes': np.arange(92,98), 
    'max_depth': np.arange(38,44), 
    'criterion': ['entropy'],
}

d_tree = DecisionTreeClassifier()
gridSearch = GridSearchCV(estimator = d_tree, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = gridSearch.fit(X_train, y_train)

print(f"The best {score_measure} score is {gridSearch.best_score_}")
print(f"... with parameters: {gridSearch.best_params_}")

bestRecallTree = gridSearchh.best_estimator_

## Conclusion: 
#### From the above analysis, the following can be inferred:
#### Universal bank marketing campaign to existing customers (selling new Securities account product) where the Target Variable is "Securities Account"
#### Precision Score using the following:
##### a.1)Logistic Regression using RandomSearchCV: 0.9333333333333332 i.e., 93%
##### a.2)Logistic Regression using GridSearchCV: 0.9333333333333332 i.e., 93%

###### The data was divided into training datasets which comprises 70% of data and test datasets which comprises 30% of data. 

#### The Model that stands the best is Logistic Regression bith RandomSearchCV & GridSearchCV. 