## import libraries

In [67]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## read in data

In [2]:
df = pd.read_csv('../Dataset/bank-additional-full.csv', sep=';')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
age               41188 non-null int64
job               41188 non-null object
marital           41188 non-null object
education         41188 non-null object
default           41188 non-null object
housing           41188 non-null object
loan              41188 non-null object
contact           41188 non-null object
month             41188 non-null object
day_of_week       41188 non-null object
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
poutcome          41188 non-null object
emp.var.rate      41188 non-null float64
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m         41188 non-null float64
nr.employed       41188 non-null float64
y                 41188 non-null object
dtypes: float64(5), int64(5), object(11)
memory usa

In [4]:
num_cols = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
cat_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']

In [5]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


## convert categorical variables

In [6]:
_df = pd.get_dummies(df, columns=cat_cols, prefix=cat_cols, drop_first=True)

In [22]:
_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 54 columns):
age                              41188 non-null int64
duration                         41188 non-null int64
campaign                         41188 non-null int64
pdays                            41188 non-null int64
previous                         41188 non-null int64
emp.var.rate                     41188 non-null float64
cons.price.idx                   41188 non-null float64
cons.conf.idx                    41188 non-null float64
euribor3m                        41188 non-null float64
nr.employed                      41188 non-null float64
y                                41188 non-null object
job_blue-collar                  41188 non-null uint8
job_entrepreneur                 41188 non-null uint8
job_housemaid                    41188 non-null uint8
job_management                   41188 non-null uint8
job_retired                      41188 non-null uint8
job_self-employe

In [7]:
_df.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success
0,56,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,0,0,0,1,0,0,0,1,0
1,57,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,0,0,0,1,0,0,0,1,0
2,37,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,0,0,0,1,0,0,0,1,0
3,40,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,0,0,0,1,0,0,0,1,0
4,56,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,0,0,0,1,0,0,0,1,0


## split data into X and y

In [25]:
X = _df.drop(['y'], axis=1)
X = X.values

In [26]:
y = df['y'].apply(lambda x: 0 if x == 'no' else 1)
y = y.values

## split data into training and evaluation sets

In [27]:
train_X, eval_X, train_y, eval_y = train_test_split(X, y, test_size=0.3, random_state=0)

In [28]:
val_X, test_X, val_y, test_y = train_test_split(eval_X, eval_y, random_state=0)

## create an instance of LogisticRegression

In [29]:
lr_model = LogisticRegression()

## fit the training data to the LR model

In [30]:
lr_model.fit(train_X, train_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

## Use LR model to predict on validation data

In [31]:
lr_preds = lr_model.predict(val_X)

## Use LR prediction to generate classification report

In [34]:
lr_report = classification_report(val_y, lr_preds)

## Print classification report

In [35]:
print(lr_report)

              precision    recall  f1-score   support

           0       0.93      0.97      0.95      8220
           1       0.65      0.41      0.50      1047

    accuracy                           0.91      9267
   macro avg       0.79      0.69      0.73      9267
weighted avg       0.90      0.91      0.90      9267



## Create instance of DecisionTreeClassifier

In [62]:
dt_model = DecisionTreeClassifier(max_depth= 6)

## Fit training data on DecisionTreeClassifier

In [63]:
dt_model.fit(train_X, train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

## Use DecisionTreeClassifier to predict on validation data

In [64]:
dt_preds = dt_model.predict(val_X)

## Use DT predictions to generate classification report

In [65]:
dt_report = classification_report(val_y, dt_preds)

## Print Classification Report

In [66]:
print(dt_report)

              precision    recall  f1-score   support

           0       0.94      0.96      0.95      8220
           1       0.66      0.54      0.60      1047

    accuracy                           0.92      9267
   macro avg       0.80      0.75      0.78      9267
weighted avg       0.91      0.92      0.91      9267



## Create an instance of RandomForestClassifier

In [143]:
rf_model = RandomForestClassifier(n_estimators=1000)

## Fit training data on RandomForestClassifier model

In [144]:
rf_model.fit(train_X, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

## Use RandomForestClassifier model to predict on validation data

In [145]:
rf_preds = rf_model.predict(val_X)

## Generate classification report for RandomForestClassifier

In [146]:
rf_report = classification_report(val_y, rf_preds)

## Print the report

In [147]:
print(rf_report)

              precision    recall  f1-score   support

           0       0.94      0.97      0.95      8220
           1       0.68      0.48      0.56      1047

    accuracy                           0.92      9267
   macro avg       0.81      0.72      0.76      9267
weighted avg       0.91      0.92      0.91      9267



## Compare the R2 Score of all three models

In [150]:
lr_preds.shape

(9267,)

In [152]:
print('Linear Score: {}, DecisionTree Score: {}, RandomForest Score: {}'.format(lr_model.score(val_X, val_y), dt_model.score(val_X, val_y), rf_model.score(val_X, val_y)))

Linear Score: 0.9087083198446099, DecisionTree Score: 0.9172331930506097, RandomForest Score: 0.9156145462393439
