## Bank Marketing data set analysis

In [460]:
# import useful libraries
import pandas as pd
import numpy as np

### 1- Data preprocessing

In [461]:
# Load the bank marketing data set; the file is read with ';' as the field separator.
data = pd.read_csv('bank-full.csv',sep=';')

In [462]:
# Preview the last three rows of the raw data.
data.tail(3)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no
45210,37,entrepreneur,married,secondary,no,2971,no,no,cellular,17,nov,361,2,188,11,other,no


* Traitement des donnees manquantes du type (NaN ou Null)

In [463]:
# Report whether any cell of the frame is missing, once via isna() and once via isnull().
data.isna().values.any(),data.isnull().values.any()

(False, False)

* Traitement des variables categorielles

In [464]:
# Summary of the categorical variables: print how often each category occurs.
labels = ['y','job','marital','education','default','housing','loan','contact','month','poutcome']
for column_name in labels:
    category_counts = data[column_name].value_counts()
    print(column_name+":", category_counts, '-------------------------------', sep='\n')


y:
no     39922
yes     5289
Name: y, dtype: int64
-------------------------------
job:
blue-collar      9732
management       9458
technician       7597
admin.           5171
services         4154
retired          2264
self-employed    1579
entrepreneur     1487
unemployed       1303
housemaid        1240
student           938
unknown           288
Name: job, dtype: int64
-------------------------------
marital:
married     27214
single      12790
divorced     5207
Name: marital, dtype: int64
-------------------------------
education:
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: education, dtype: int64
-------------------------------
default:
no     44396
yes      815
Name: default, dtype: int64
-------------------------------
housing:
yes    25130
no     20081
Name: housing, dtype: int64
-------------------------------
loan:
no     37967
yes     7244
Name: loan, dtype: int64
-------------------------------
contact:
cellular     29285
unknown      

In [465]:
# Replace the categories of every categorical variable with integer codes.
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
l_encoder = LabelEncoder()
labels = ['y','job','marital','education','default','housing','loan','contact','month','poutcome']
for column_name in labels:
    # The same encoder object is refitted on each column in turn.
    integer_codes = l_encoder.fit_transform(data[column_name])
    data[column_name] = integer_codes

In [466]:
# Preview the last three rows again, now that the categorical columns hold integer codes.
data.tail(3)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
45208,72,5,1,1,0,5715,0,0,0,17,9,1127,5,184,3,2,1
45209,57,1,1,1,0,668,0,0,1,17,9,508,4,-1,0,3,0
45210,37,2,1,1,0,2971,0,0,0,17,9,361,2,188,11,1,0


In [467]:
# Separate the independent variables (every column except the last one)
# from the dependent variable (the last column, y).
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [468]:
# Preview the last three rows of the feature frame.
X.tail(3)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
45208,72,5,1,1,0,5715,0,0,0,17,9,1127,5,184,3,2
45209,57,1,1,1,0,668,0,0,1,17,9,508,4,-1,0,3
45210,37,2,1,1,0,2971,0,0,0,17,9,361,2,188,11,1


In [469]:
#X.corr()

* Traitement des "Dummy variables"

In [470]:
# Dummy variables for the non-boolean categorical columns.
# l_index holds the positions (within the feature frame X) of the columns to expand.
l_index = [1] #,2,3,8,10,15

# Resolve the positions to column names once, before anything is transformed:
# expanding a column changes the position of every other column, so re-using
# positional indexes after a transformation would target the wrong columns.
# This cell expects X to still be the DataFrame produced by the X / y split.
dummy_columns = [X.columns[position] for position in l_index]

# One indicator column per category; the first category of each variable is
# dropped to avoid the dummy-variable trap. The former implementation relied on
# OneHotEncoder(categorical_features=...), which scikit-learn no longer accepts.
dummies = pd.get_dummies(X[dummy_columns], columns=dummy_columns, drop_first=True, dtype=float)

# Same layout as before: indicator columns first, untouched columns after them,
# returned as a dense float array.
X = pd.concat([dummies, X.drop(columns=dummy_columns)], axis=1).to_numpy(dtype=float)

* Selection des features

In [471]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
# L1-penalised linear SVM whose fitted coefficients drive the feature selection.
# NOTE(review): it is fitted on all of X and y, i.e. before the train/test split
# performed further down - confirm that this is intended.
lv = LinearSVC(penalty='l1', C=0.01, dual=False)
lv = lv.fit(X, y)

In [472]:
# Wrap the already-fitted linear SVM and keep only the features it selects.
model = SelectFromModel(estimator=lv, prefit=True)
X = X_new = model.transform(X)

* Séparation des données en données pour l'apprentissage et données pour le test

In [473]:

from sklearn.model_selection import train_test_split
# Hold out one sixth of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4, test_size=1/6)

# Feature scaling: the scaler is fitted on the training part only,
# then applied to both the training and the test part.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

### 2 - Creation du modele

In [474]:
# SVM classifier with an RBF kernel

from sklearn.svm import SVC
classifier = SVC(random_state=3, kernel='rbf')
# Training phase on the scaled training set
classifier.fit(X_train, y_train)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=3, shrinking=True,
  tol=0.001, verbose=False)

### 4 -  Evaluation du modele par le k-fold cross-validation 

In [475]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the SVM on the training set; the cell shows the mean score.
accuracies = cross_val_score(classifier, X_train, y_train, cv=5)
accuracies.mean()

0.8973855341738555

### 5- Prediction


In [476]:
# Predict the held-out samples, then tabulate the predictions against the true labels.
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [478]:
# Show the confusion matrix computed from y_test and y_pred in the previous cell.
print(cm)

[[6549  118]
 [ 648  221]]


### 6- Optimiser le modèle par la méthode "Grid Search" pour trouver les hyper-paramètres optimaux

In [479]:
# Grid search over the SVM hyper-parameters
from sklearn.model_selection import GridSearchCV
# Two families of candidates: a linear kernel (linearity test) and an RBF kernel
# combined with several gamma values.
linear_candidates = {'C': [1.0, 1.5], 'kernel': ['linear']}
rbf_candidates = {'C': [1.0, 1.5], 'kernel': ['rbf'], 'gamma': [0.1, 0.2, 0.3]}
parameters = [linear_candidates, rbf_candidates]

# Accuracy-scored search with 3-fold cross-validation on the training set.
grid = GridSearchCV(cv=3, scoring='accuracy', param_grid=parameters, estimator=classifier)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
best_accuracy, best_parm = grid.best_score_, grid.best_params_

In [480]:
# Best cross-validated score found by the grid search (grid.best_score_).
best_accuracy

0.897677504976775

In [481]:
# Hyper-parameter combination that achieved the best score (grid.best_params_).
best_parm

{'C': 1.5, 'gamma': 0.1, 'kernel': 'rbf'}

In [482]:
#print(y_pred)

### 7- Test d'un second modèle : Deep Learning

In [486]:
#Build an ANN
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout

def build_nn(activation='relu',dropout_rate=0.0, optimizer = 'adam',X=X):
    """Build and compile the feed-forward network used for binary classification.

    Parameters
    ----------
    activation : activation applied by the 12-unit and 6-unit hidden layers.
    dropout_rate : rate given to the Dropout layer placed between those two layers.
    optimizer : optimizer passed to ``compile``.
    X : array whose second dimension gives the number of input features. The
        default is bound to the module-level ``X`` when this function is defined.

    Returns
    -------
    The compiled ``Sequential`` model. Its summary is printed on every call.
    """
    init = 'uniform'
    n_features = X.shape[1]

    network = Sequential([
        # Layer receiving the features; no activation argument is passed to it.
        Dense(12, input_shape=(n_features,), kernel_initializer=init),
        # First hidden layer
        Dense(12, activation=activation, kernel_initializer=init),
        # Dropout layer
        Dropout(dropout_rate),
        # Second hidden layer
        Dense(6, activation=activation, kernel_initializer=init),
        # Output layer: a single sigmoid unit
        Dense(1, activation='sigmoid', kernel_initializer=init),
    ])

    network.summary()

    network.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    return network


# Wrap the network builder so that it can be used like a scikit-learn estimator.
from keras.wrappers.scikit_learn import KerasClassifier
classifier = KerasClassifier(build_fn=build_nn, batch_size=10, epochs=5)

# Train on the training set, then predict the held-out samples.
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# 3-fold cross-validation of the wrapped network on the training set.
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(classifier, X_train, y_train, cv=3)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 12)                252       
_________________________________________________________________
dense_3 (Dense)              (None, 12)                156       
_________________________________________________________________
dropout_1 (Dropout)          (None, 12)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 78        
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 7         
Total params: 493
Trainable params: 493
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
_________________________________________________________________
Layer (type)                

In [488]:
# Mean of the cross-validation scores obtained for the neural network.
accuracies.mean()

0.891758636590165

* Recherche des hyper-parametres optimaux pour le Deep learning a l' aide du "Grid search"

In [None]:
from sklearn.model_selection import GridSearchCV
# Candidate settings for the wrapped network: training options (epochs, batch_size)
# and the build_nn arguments (activation, dropout_rate, optimizer).
parameters = [{
    'epochs': [10],
    'batch_size': [10, 15],
    'activation': ['relu', 'sigmoid', 'tanh'],
    'dropout_rate': [0.0, 0.4],
    'optimizer': ['adam', 'SGD', 'Adagrad'],
}]

# Accuracy-scored search with 3-fold cross-validation on the training set.
grid = GridSearchCV(cv=3, scoring='accuracy', param_grid=parameters, estimator=classifier)
grid = grid.fit(X_train, y_train)