In [None]:
###############################################
# Author: Pedro Igor Salvador Alves
# Project: Bank Marketing Prediction
# Database Link: https://archive.ics.uci.edu/ml/datasets/bank+marketing
# Start Date: 31/12/2021
# Type: Data Science - Machine Learning 
###############################################

In [1]:
import pandas as pd

dataset = pd.read_csv("data/bank.csv")

In [2]:
dataset

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [3]:
#drop duration feature. The idea is understading if a client would buy a product before calling the client.
dataset = dataset.drop("duration", axis=1)

In [10]:
X = dataset.iloc[:, :-1]
Y = dataset.iloc[:, -1]

X = pd.get_dummies(X, drop_first=True)
Y = pd.get_dummies(Y, drop_first=True)

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 157)

In [15]:
#random forest
from sklearn.ensemble import RandomForestClassifier

ranfor1 = RandomForestClassifier(random_state = 157)
ranfor1.fit(X_train, Y_train.values.ravel())
Y_predict1 = ranfor1.predict(X_test)

In [16]:
#evaluate
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test, Y_predict1)

array([[10631,   353],
       [  995,   378]], dtype=int64)

In [17]:
ranfor1.score(X_test, Y_test)

0.890912033665129

In [18]:
#Recursive Feature Elimination
from sklearn.feature_selection import RFE

In [19]:
ranfor2 = RandomForestClassifier(random_state = 157)
rfe = RFE(estimator = ranfor2, n_features_to_select = 30, step=1)

In [21]:
rfe.fit(X, Y.values.ravel())

RFE(estimator=RandomForestClassifier(random_state=157), n_features_to_select=30)

In [None]:
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

In [25]:
ranfor2.fit(X_train_rfe, Y_train.values.ravel())
Y_predict2 = ranfor2.predict(X_test_rfe)

In [26]:
confusion_matrix(Y_test, Y_predict2)

array([[10607,   377],
       [  981,   392]], dtype=int64)

In [27]:
ranfor2.score(X_test_rfe, Y_test)

0.890102775754633

In [28]:
#ranking features
columns = list(X.columns)
rank = rfe.ranking_

In [31]:
feature_importance = ranfor1.feature_importances_

In [36]:
rfe_selected = pd.DataFrame()
rfe_selected = pd.concat([pd.DataFrame(columns), pd.DataFrame(rank), pd.DataFrame(feature_importance)], axis=1)
rfe_selected.columns = ["Feature Name", "Rank", "Feature Importance (Gini Index)"]

In [35]:
rfe_selected

Unnamed: 0,Feature Name,Rank,Feature Importance (Gini Scale)
0,age,1,0.170767
1,campaign,1,0.083434
2,pdays,1,0.034975
3,previous,1,0.014687
4,emp.var.rate,1,0.01938
5,cons.price.idx,1,0.022327
6,cons.conf.idx,1,0.030267
7,euribor3m,1,0.129868
8,nr.employed,1,0.051898
9,job_blue-collar,1,0.013861
