In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score
from sklearn.metrics import classification_report, confusion_matrix

encoder = LabelEncoder()

bank_data = pd.read_csv('bank-additional-full.csv', sep=',')

bank_data['marital'].replace('unknown', 'married', inplace=True)
bank_data['default'].replace('unknown', 'no', inplace=True)
bank_data['loan'].replace('unknown', 'no', inplace=True)


bank_data.drop(bank_data[bank_data.housing == 'unknown'].index, inplace=True)
bank_data.drop(bank_data[bank_data.education == 'unknown'].index, inplace=True)
bank_data.drop(bank_data[bank_data.job == 'unknown'].index, inplace=True)
bank_data.drop(bank_data[bank_data.age >= 70].index, inplace=True)

bank_data.describe(include='all')

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
count,37899.0,37899,37899,37899,37899,37899,37899,37899,37899,37899,37899.0,37899.0,37899.0,37899,37899.0,37899.0,37899.0,37899.0,37899.0,37899
unique,,11,3,7,2,2,2,2,10,5,,,,3,,,,,,2
top,,admin.,married,university.degree,no,yes,no,cellular,may,thu,,,,nonexistent,,,,,,no
freq,,9943,22974,11787,37896,20376,31991,24120,12793,7943,,,,32879,,,,,,33817
mean,39.458825,,,,,,,,,,2.574158,965.357661,0.164859,,0.108509,93.574986,-40.58519,3.651885,5168.87972,
std,9.574816,,,,,,,,,,2.778507,179.656227,0.477472,,1.551501,0.571991,4.566371,1.716182,70.583983,
min,17.0,,,,,,,,,,1.0,0.0,0.0,,-3.4,92.201,-50.8,0.634,4963.6,
25%,32.0,,,,,,,,,,1.0,999.0,0.0,,-1.8,93.075,-42.7,1.354,5099.1,
50%,38.0,,,,,,,,,,2.0,999.0,0.0,,1.1,93.444,-41.8,4.857,5191.0,
75%,46.0,,,,,,,,,,3.0,999.0,0.0,,1.4,93.994,-36.4,4.961,5228.1,


In [2]:
X = bank_data.drop(columns=['y'])
# encoding the data from strings to numbers
X.job = encoder.fit_transform(X.job)
X.marital = encoder.fit_transform(X.marital)
X.education = encoder.fit_transform(X.education)
X.default = encoder.fit_transform(X.default)
X.housing = encoder.fit_transform(X.housing)
X.contact = encoder.fit_transform(X.contact)
X.loan = encoder.fit_transform(X.loan)
X.month = encoder.fit_transform(X.month)
X.day_of_week = encoder.fit_transform(X.day_of_week)
X.poutcome = encoder.fit_transform(X.poutcome)

# confirm target variable has been removed --> should print out one less column
X.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,56,3,1,0,0,0,0,1,6,1,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0
1,57,7,1,3,0,0,0,1,6,1,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0
2,37,7,1,3,0,1,0,1,6,1,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0
3,40,0,1,1,0,0,0,1,6,1,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0
4,56,7,1,3,0,0,1,1,6,1,1,999,0,1,1.1,93.994,-36.4,4.857,5191.0


In [3]:
y = bank_data['y'].values
scores = cross_val_score(MLPClassifier(hidden_layer_sizes=(19, 19, 19),
                                       activation='relu', solver='adam', max_iter=500), X, y, cv=10, n_jobs=-1)

print(scores.mean())

0.8719433215578215


In [6]:
# split dataset into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

# feature scaling --> so that all of them can be uniformly evaluated.. 
scaler = StandardScaler()
# Fit only to the training data
scaler.fit(X_train)
# Now apply the transformations to the data:
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

mlp = MLPClassifier(hidden_layer_sizes=(19, 19, 19),activation='relu', solver='adam', max_iter=500)
mlp.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(19, 19, 19), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=500,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [7]:
# Making predictions
predictions = mlp.predict(X_test)

print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))


[[6603  161]
 [ 607  209]]
              precision    recall  f1-score   support

          no       0.92      0.98      0.95      6764
         yes       0.56      0.26      0.35       816

    accuracy                           0.90      7580
   macro avg       0.74      0.62      0.65      7580
weighted avg       0.88      0.90      0.88      7580



In [8]:
# Cross validation
model = MLPClassifier(hidden_layer_sizes=(19, 19, 19),activation='relu', solver='adam', max_iter=500)
scores = cross_val_score(model,X,y,cv=10,n_jobs=-1)
scores.mean()

0.8973574665170876

In [9]:
# RMSE
scores_rmse = cross_val_score(model,X,y,cv=10,n_jobs=-1, scoring='neg_brier_score')
scores_rmse.mean()

-0.11270010446060885

In [10]:
#ROC_AUC
scores_auc = cross_val_score(model,X,y,cv=10,n_jobs=-1, scoring='roc_auc')
scores_auc.mean()


0.6962372877142636

In [14]:
# One hot encoding... 
one_enc_X = pd.get_dummies(bank_data.drop(columns=["y"]))
one_enc_X

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,job_admin.,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,56,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,0,0,0,1,0,0,0,0,1,0
1,57,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,0,0,0,1,0,0,0,0,1,0
2,37,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,0,0,0,1,0,0,0,0,1,0
3,40,1,999,0,1.1,93.994,-36.4,4.857,5191.0,1,...,0,0,0,1,0,0,0,0,1,0
4,56,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41181,37,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,1,...,0,0,1,0,0,0,0,0,1,0
41182,29,1,9,1,-1.1,94.767,-50.8,1.028,4963.6,0,...,0,0,1,0,0,0,0,0,0,1
41184,46,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,0,...,0,0,1,0,0,0,0,0,1,0
41185,56,2,999,0,-1.1,94.767,-50.8,1.028,4963.6,0,...,0,0,1,0,0,0,0,0,1,0


In [15]:
# Cross validation on one hot encoded data.. 
model = MLPClassifier(hidden_layer_sizes=(19, 19, 19),activation='relu', solver='adam', max_iter=500)
scores = cross_val_score(model,one_enc_X,y,cv=10,n_jobs=-1)
scores.mean()

0.8080589346608813

In [16]:
# RMSE on onehotEncoded dataset.
scores_rmse = cross_val_score(model,one_enc_X,y,cv=10,n_jobs=-1, scoring='neg_brier_score')
scores_rmse.mean()

-0.1524685375023547

In [17]:
#ROC_AUC on oneHotEncoded
scores_auc = cross_val_score(model,one_enc_X,y,cv=10,n_jobs=-1, scoring='roc_auc')
scores_auc.mean()

0.5670004496194865