# Importing Dependencies

In [11]:
# Make sure all outputs are displayed and not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Import all necessary dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model, model_selection, decomposition, metrics, svm, tree, neighbors
from sklearn.ensemble import RandomForestClassifier
import time

 

# Reading dataset

In [12]:
df = pd.read_csv("dataset_2/bank-full.csv", sep=';')
df.head()

# Replace yes and no to 1 and 0 for easy classification and making scoring possible in
# cross-validation (cross_val_score())
df.y.replace(('yes', 'no'),(1,0), inplace=True)
df.head()

# Amount of yes and no
print ((df[['y']] == 1).sum()) # 5289
print ((df[['y']] == 0).sum()) # 39922


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,0


y    5289
dtype: int64
y    39922
dtype: int64


# Convert string dataset to binary 

In [13]:
df = pd.get_dummies(df, columns=['job','marital','education','default','housing',
                                   'loan','contact','month','poutcome'])
df.head()


Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y,job_admin.,job_blue-collar,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,2143,5,261,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,44,29,5,151,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,33,2,5,76,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,47,1506,5,92,1,-1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
4,33,1,5,198,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [14]:
# Make y as the target feature
df['Target'] = df['y']

# Drop target columns
df.drop(['y'], axis=1, inplace=True)

df.head()

# Getting Target column [n_samples]
y = df['Target']

features = df.columns
# Getting Features matrix [n_samples, n_features]
x = df[features[:-1]].as_matrix()


Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,Target
0,58,2143,5,261,1,-1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,44,29,5,151,1,-1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
2,33,2,5,76,1,-1,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
3,47,1506,5,92,1,-1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
4,33,1,5,198,1,-1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0


In [15]:
# Splitting data to train and test
x_train, x_test, y_train, y_test = \
    model_selection.train_test_split(x, y, test_size=0.2, random_state=0)

x.shape
y.shape
x_train.shape
x_test.shape
y_train.shape
y_test.shape

(45211, 51)

(45211,)

(36168, 51)

(9043, 51)

(36168,)

(9043,)

# Logistic Regression

In [16]:
logistic = linear_model.LogisticRegression(C=1e5)

tick = time.time()

# Doing cross-validation 5-folds
scores = model_selection.cross_val_score(logistic, x, y, scoring='accuracy', cv=5)
print ("Cross-validation scores for accuracy: {}".format(scores))
scores = model_selection.cross_val_score(logistic, x, y, scoring='precision', cv=5)
print ("Cross-validation scores for precision: {}".format(scores))
scores = model_selection.cross_val_score(logistic, x, y, scoring='recall', cv=5)
print ("Cross-validation scores for recall: {}".format(scores))
scores = model_selection.cross_val_score(logistic, x, y, scoring='f1', cv=5)
print ("Cross-validation scores for F1: {}".format(scores))

tock = time.time()-tick
print ("Total cross-validation time: %.8f" % tock)

tick = time.time()
logistic.fit(x_train,y_train)
tock = time.time()-tick
print ("time: %.8f" % tock)


y_pred = logistic.predict(x_test)

print("Accuracy: %.8f" % metrics.accuracy_score(y_test, y_pred))

print(metrics.classification_report(y_test, y_pred))


Cross-validation scores for accuracy: [ 0.89251355  0.87238748  0.84140677  0.63691661  0.33856874]
Cross-validation scores for precision: [ 0.93        0.39237668  0.22189349  0.12630165  0.12066574]
Cross-validation scores for recall: [ 0.0879017   0.16540643  0.14177694  0.35538752  0.74077578]
Cross-validation scores for F1: [ 0.16062176  0.23271277  0.17301038  0.18636927  0.20752717]
Total cross-validation time: 17.20432019


LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

time: 0.82156301
Accuracy: 0.89936968
             precision    recall  f1-score   support

          0       0.92      0.97      0.94      7980
          1       0.63      0.35      0.45      1063

avg / total       0.88      0.90      0.89      9043



# Support Vector Machine

In [17]:
svm_clf = svm.LinearSVC(loss='hinge')


tick = time.time()

scores = model_selection.cross_val_score(svm_clf, x, y, scoring='accuracy', cv=5)
print ("Cross-validation scores for accuracy: {}".format(scores))
scores = model_selection.cross_val_score(svm_clf, x, y, scoring='precision', cv=5)
print ("Cross-validation scores for precision: {}".format(scores))
scores = model_selection.cross_val_score(svm_clf, x, y, scoring='recall', cv=5)
print ("Cross-validation scores for recall: {}".format(scores))
scores = model_selection.cross_val_score(svm_clf, x, y, scoring='f1', cv=5)
print ("Cross-validation scores for F1: {}".format(scores))

tock = time.time()-tick
print ("Total cross-validation time: %.8f" % tock)

tick = time.time()
svm_clf.fit(x_train,y_train)
tock = time.time()-tick
print ("time: %.8f" % tock)

y_pred = svm_clf.predict(x_test)

print("Accuracy: %.8f" % metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))


Cross-validation scores for accuracy: [ 0.91042796  0.86685834  0.68436187  0.57697412  0.61917929]
Cross-validation scores for precision: [ 1.          0.48108108  0.47961165  0.13904861  0.18095439]
Cross-validation scores for recall: [ 0.86672968  0.28922495  0.86672968  0.53591682  0.60170293]
Cross-validation scores for F1: [ 0.04770017  0.32679739  0.31210606  0.22208041  0.26278223]
Total cross-validation time: 109.82704997


LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0)

time: 5.17533803
Accuracy: 0.89372996
             precision    recall  f1-score   support

          0       0.92      0.96      0.94      7980
          1       0.57      0.41      0.48      1063

avg / total       0.88      0.89      0.89      9043



# Decision tree

In [18]:
tree_clf = tree.DecisionTreeClassifier()

tick = time.time()

scores = model_selection.cross_val_score(tree_clf, x, y, scoring='accuracy', cv=5)
print ("Cross-validation scores for accuracy: {}".format(scores))
scores = model_selection.cross_val_score(tree_clf, x, y, scoring='precision', cv=5)
print ("Cross-validation scores for precision: {}".format(scores))
scores = model_selection.cross_val_score(tree_clf, x, y, scoring='recall', cv=5)
print ("Cross-validation scores for recall: {}".format(scores))
scores = model_selection.cross_val_score(tree_clf, x, y, scoring='f1', cv=5)
print ("Cross-validation scores for F1: {}".format(scores))

tock = time.time()-tick
print ("Total cross-validation time: %.8f" % tock)

tick = time.time()
tree_clf.fit(x_train,y_train)
tock = time.time()-tick
print ("time: %.8f" % tock)

y_pred = tree_clf.predict(x_test)

print("Accuracy: %.8f" % metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))


Cross-validation scores for accuracy: [ 0.74466438  0.64071658  0.68989162  0.52598983  0.21159164]
Cross-validation scores for precision: [ 0.05151515  0.05705354  0.08760172  0.07797322  0.10895141]
Cross-validation scores for recall: [ 0.06521739  0.13988658  0.17391304  0.30812854  0.79754021]
Cross-validation scores for F1: [ 0.06060606  0.08543689  0.11042945  0.13225539  0.19294277]
Total cross-validation time: 6.63773203


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

time: 0.29571080
Accuracy: 0.87139224
             precision    recall  f1-score   support

          0       0.93      0.92      0.93      7980
          1       0.45      0.47      0.46      1063

avg / total       0.87      0.87      0.87      9043



# Random forest

In [19]:
forest_clf = RandomForestClassifier(n_estimators=10)

tick = time.time()

scores = model_selection.cross_val_score(forest_clf, x, y, scoring='accuracy', cv=5)
print ("Cross-validation scores for accuracy: {}".format(scores))
scores = model_selection.cross_val_score(forest_clf, x, y, scoring='precision', cv=5)
print ("Cross-validation scores for precision: {}".format(scores))
scores = model_selection.cross_val_score(forest_clf, x, y, scoring='recall', cv=5)
print ("Cross-validation scores for recall: {}".format(scores))
scores = model_selection.cross_val_score(forest_clf, x, y, scoring='f1', cv=5)
print ("Cross-validation scores for F1: {}".format(scores))

tock = time.time()-tick
print ("Total cross-validation time: %.8f" % tock)

tick = time.time()
forest_clf.fit(x_train,y_train)
tock = time.time()-tick
print ("time: %.8f" % tock)

y_pred = forest_clf.predict(x_test)

print("Accuracy: %.8f" % metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

Cross-validation scores for accuracy: [ 0.88322459  0.77562756  0.73169653  0.52410971  0.27784537]
Cross-validation scores for precision: [ 0.7         0.0310559   0.06533036  0.04846871  0.12195469]
Cross-validation scores for recall: [ 0.00378072  0.03308129  0.06899811  0.18147448  0.81456954]
Cross-validation scores for F1: [ 0.02222222  0.02902494  0.07686275  0.08958249  0.20944507]
Total cross-validation time: 7.60256696


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

time: 0.31405210
Accuracy: 0.89682627
             precision    recall  f1-score   support

          0       0.92      0.97      0.94      7980
          1       0.61      0.34      0.43      1063

avg / total       0.88      0.90      0.88      9043



# K-NN

In [20]:
nn_clf = neighbors.KNeighborsClassifier()

tick = time.time()

scores = model_selection.cross_val_score(nn_clf, x, y, scoring='accuracy', cv=5)
print ("Cross-validation scores for accuracy: {}".format(scores))
scores = model_selection.cross_val_score(nn_clf, x, y, scoring='precision', cv=5)
print ("Cross-validation scores for precision: {}".format(scores))
scores = model_selection.cross_val_score(nn_clf, x, y, scoring='recall', cv=5)
print ("Cross-validation scores for recall: {}".format(scores))
scores = model_selection.cross_val_score(nn_clf, x, y, scoring='f1', cv=5)
print ("Cross-validation scores for F1: {}".format(scores))

tock = time.time()-tick
print ("Total cross-validation time: %.8f" % tock)

tick = time.time()
forest_clf.fit(x_train,y_train)
tock = time.time()-tick
print ("time: %.8f" % tock)

y_pred = forest_clf.predict(x_test)

print("Accuracy: %.8f" % metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

Cross-validation scores for accuracy: [ 0.89350879  0.88399867  0.88630834  0.85445698  0.84338016]
Cross-validation scores for precision: [ 0.62025316  0.50959488  0.5375      0.33290155  0.31664964]
Cross-validation scores for recall: [ 0.231569    0.22589792  0.20321361  0.24291115  0.29328288]
Cross-validation scores for F1: [ 0.33723331  0.31303209  0.29492455  0.28087432  0.30451866]
Total cross-validation time: 9.13027096


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

time: 0.30589700
Accuracy: 0.89594161
             precision    recall  f1-score   support

          0       0.92      0.97      0.94      7980
          1       0.60      0.35      0.44      1063

avg / total       0.88      0.90      0.88      9043

