In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score

# Seed reused by every stochastic step below (train/test split, SMOTE, random forest).
RS = 56

# bank-full.csv uses ';' as its field delimiter rather than a comma.
data = pd.read_csv(filepath_or_buffer='./bankDataset/bank-full.csv', delimiter=';')

## FEATURE ENGINEERING

In [10]:
# List the column names (features plus the target 'y') of the loaded dataset
data.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [14]:
# Count the missing values in each column
data.isna().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [17]:
# Information about the dataset: per-column non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [24]:
# Replace every object-typed (categorical) column with integer codes.
# A fresh LabelEncoder is fitted per column, and `data` is overwritten in place.
categorical_columns = data.select_dtypes(include=['object']).columns
for name in categorical_columns:
    encoder = LabelEncoder()
    data[name] = encoder.fit_transform(data[name])

In [27]:
# Distinct values of the target column after the label encoding above
data["y"].unique()

array([0, 1])

In [23]:
# Class balance of the target column.
# By this point 'y' has been label-encoded by the loop above. LabelEncoder
# assigns codes in sorted label order, so 'no' -> 0 and 'yes' -> 1. Comparing
# against the raw strings "yes"/"no" would match no rows and print 0 for both
# classes on a fresh top-to-bottom run, so count the integer codes instead.
print("The count of yes output : ", (data["y"] == 1).sum())
print("The count of no output : ", (data["y"] == 0).sum())

The count of yes output :  5289
The count of no output :  39922


In [26]:
# The dataset is imbalanced, so the training data is resampled with SMOTE.
# Only the training split is resampled; the test split is left as drawn.

# Separate the target from the features
y = data['y']
X = data.drop(columns='y')

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RS,
)

# Oversample the training split
smote = SMOTE(random_state=RS)
X_res, y_res = smote.fit_resample(X_train, y_train)

## CLASSIFICATION WITH RANDOM FOREST

In [40]:
# Train a random forest on the SMOTE-resampled training data, then evaluate
# it on the held-out test split (which was not resampled).
rf = RandomForestClassifier(random_state=RS).fit(X_res, y_res)

y_pred = rf.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_recall = recall_score(y_test, y_pred)

print("Accuracy: ", test_accuracy)
print("Recall : ", test_recall)


Accuracy:  0.880128276014597
Recall :  0.6538461538461539
