In [17]:
import pandas as pd
import numpy as np


# Read the semicolon-delimited bank-additional-full CSV into a DataFrame.
df = pd.read_csv(
    './data/bank-additional-full.csv',
    sep=';',
)

In [18]:
# Total number of cells in the frame (rows * columns), not the row count.
# NOTE(review): if the intent was to see the dimensions, df.shape is the clearer call.
df.size

864948

In [19]:
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [20]:
# Per-column dtypes: separates the text (object) columns from the numeric ones.
df.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                  object
dtype: object

In [21]:
# Summary statistics for the numeric columns.
# NOTE(review): pdays equals 999 from the 25th percentile through the max, which
# looks like a sentinel value rather than a real count — confirm before treating
# it as an ordinary numeric feature.
df.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,40.02406,258.28501,2.567593,962.475454,0.172963,0.081886,93.575664,-40.5026,3.621291,5167.035911
std,10.42125,259.279249,2.770014,186.910907,0.494901,1.57096,0.57884,4.628198,1.734447,72.251528
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


In [22]:
# Number of rows in each `poutcome` category.
df['poutcome'].groupby(df['poutcome']).count()

poutcome
failure         4252
nonexistent    35563
success         1373
Name: poutcome, dtype: int64

In [23]:
# Count missing values per column.
df.isna().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [24]:
# Categorical (string-valued) columns that are one-hot encoded before modelling.
cat_feature_cols = [
    "marital",
    "education",
    "contact",
    "default",
    "housing",
    "loan",
    "poutcome",
]

In [25]:
# Numeric columns that are standardized before modelling.
num_feature_cols = [
    "age",
    "pdays",
    "previous",
    "emp.var.rate",
    "euribor3m",
    "nr.employed",
]

In [26]:
# Full feature list: categorical columns first, then numeric columns.
feature_cols = [*cat_feature_cols, *num_feature_cols]

In [27]:
# Independent copy of the selected feature columns, so later work on X
# cannot touch the original frame.
X = df.loc[:, feature_cols].copy()

In [28]:
# Binary target: 1 where the label is 'yes', 0 otherwise. A vectorized
# comparison replaces the per-row lambda; astype() already returns a new
# Series, so no extra .copy() is needed.
y = (df['y'] == 'yes').astype('int64')

In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
# NOTE(review): the split is not stratified although the classes are imbalanced —
# consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [30]:
from sklearn.preprocessing import OneHotEncoder

# Learn the category vocabulary from the training rows only, then apply the
# same encoding to both splits. handle_unknown='ignore' keeps transform() from
# raising on categories that were not seen during fit.
enc = OneHotEncoder(handle_unknown='ignore').fit(X_train[cat_feature_cols])
X_train_cat_encoded = enc.transform(X_train[cat_feature_cols])
X_test_cat_encoded = enc.transform(X_test[cat_feature_cols])

In [31]:
from sklearn.preprocessing import StandardScaler

# Fit the scaling statistics on the training rows only, then apply them to
# both splits so no test-set information leaks into the transform.
scaler = StandardScaler().fit(X_train[num_feature_cols])
X_train_num_scaled = scaler.transform(X_train[num_feature_cols])
X_test_num_scaled = scaler.transform(X_test[num_feature_cols])

In [32]:
# Build the final dense feature matrices: one-hot columns first, scaled numeric
# columns after. The encoder output is converted with .toarray() before stacking.
# Note: this rebinds X_train / X_test from DataFrames to NumPy arrays, so the
# encoding and scaling cells cannot be re-run after this one without re-splitting.
X_train = np.hstack((X_train_cat_encoded.toarray(), X_train_num_scaled))
X_test = np.hstack((X_test_cat_encoded.toarray(), X_test_num_scaled))

In [34]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class on the training split only; the test split is
# left untouched. SMOTE draws random synthetic samples, so it is seeded to make
# the balanced training set (and everything fitted on it) reproducible.
sm = SMOTE(random_state=42)
X_balanced, y_balanced = sm.fit_resample(X_train, y_train)

In [36]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Random forest trained on the SMOTE-balanced training set. The seed makes the
# fitted model, and therefore the reported score, reproducible; n_jobs=-1 only
# parallelizes tree building across cores.
rfc = RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1)
rfc.fit(X_balanced, y_balanced)

# A cell only echoes its last expression, so the intermediate results are
# printed explicitly instead of being computed and silently discarded.
print("F1 on held-out test set:", f1_score(y_test.values, rfc.predict(X_test)))
print("Class counts after resampling:")
print(y_balanced.value_counts())

# Class distribution of the held-out test labels (the cell's displayed output).
y_test.value_counts()

0    7303
1     935
Name: y, dtype: int64