# Data Pre-processing and Feature Engineering

In [34]:
import pandas as pd
import numpy as np

data = pd.read_csv("DSA Data Set.csv")

In [35]:
data['y'] = data.y.map({'no':0,'yes':1}) 

In [36]:
data['marital'] = data.marital.map({'divorced':0,'unknown':0,'single':0,'married':1}) 

In [37]:
data.loc[data['age'] <= 21, 'age'] = 0 
data.loc[(data['age'] > 21) & (data['age'] <= 30), 'age'] = 1 
data.loc[(data['age'] > 30) & (data['age'] <= 50), 'age'] = 2 
data.loc[(data['age'] > 50) & (data['age'] <= 65), 'age'] = 3 
data.loc[data['age'] > 65, 'age'] = 4 

In [38]:
data['job'] = data.job.map({'admin.':0,'management':0,'housemaid':1,'services':1,'blue-collar':1,'technician':1,'self-employed':2,'entrepreneur':2,'retired':3,'unemployed':3,'unknown':3,'student':4})

In [39]:
data['default'] = data.default.map({'no':0,'unknown':0,'yes':1})

In [40]:
data['housing'] = data.housing.map({'no':0,'unknown':0,'yes':1})

In [41]:
data['loan'] = data.loan.map({'no':0,'unknown':0,'yes':1})

In [42]:
data['contact'] = data.contact.map({'telephone':0,'cellular':1})

In [43]:
data.loc[data['previous'] == 0, 'previous'] = 0
data.loc[data['previous'] > 0, 'previous'] = 1

In [44]:
data['month'] = data.month.map({'mar':1,'sep':1,'oct':1,'nov':1,'dec':1,'apr':0,'may':0,'jun':0,'jul':0,'aug':0})

In [45]:
data['day_of_week'] = data.day_of_week.map({'mon':0,'tue':1,'wed':2,'thu':3,'fri':4})

In [46]:
data['education'] = data.education.map({'basic.4y':0,'basic.6y':0,'basic.9y':0,'high.school':1,'professional.course':1,'university.degree':1,'unknown':2,'illiterate':2})

In [47]:
data['poutcome'] = data.poutcome.map({'nonexistent':0,'success':1,'failure':2})

In [48]:
data = data.drop(['duration','emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed','ModelPrediction','campaign','pdays'], axis = 1)

In [49]:
data.to_csv('dsa_data.csv')

# Model Building

In [50]:
new_data = pd.read_csv("dsa_data.csv")

In [51]:
X = new_data.drop(['y'],axis=1)
y = new_data['y']

In [52]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.svm import SVC

## Logistic Regression

In [67]:
lr_model = LogisticRegression(max_iter=100, solver='lbfgs')

param_grid = {'C': [0.1, 0.5, 1, 10, 100]}
cv = StratifiedKFold(n_splits=20, random_state=0, shuffle=True)

grid = GridSearchCV(lr_model, param_grid, cv = cv, scoring='recall', return_train_score=False)
grid.fit(X, y.values.ravel())

print("Best Parameter: {}".format(grid.best_params_))

Best Parameter: {'C': 0.1}


In [68]:
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=0)

lr_model = LogisticRegression(C = 0.1, solver='lbfgs')

lr_model.fit(X,y)

y_test_hat=lr_model.predict(X_test)

print("The accuracy score of Logistic Regression model is:", accuracy_score(y_test,y_test_hat, normalize = True) * 100)

The accuracy score of Logistic Regression model is: 88.44379703811605


In [69]:
from sklearn.metrics import classification_report, confusion_matrix
print("Confusion Matrix")
print(confusion_matrix(y_test,y_test_hat))
print("------------------------------------------------------------------------")
print(classification_report(y_test,y_test_hat))

Confusion Matrix
[[7276   43]
 [ 909   10]]
------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.89      0.99      0.94      7319
           1       0.19      0.01      0.02       919

    accuracy                           0.88      8238
   macro avg       0.54      0.50      0.48      8238
weighted avg       0.81      0.88      0.84      8238



## Decision Tree

#### Recall

In [57]:
tree = DecisionTreeClassifier()

param_grid = {'max_depth': [2,3,4,5,6,7,8,9,10]}

cv = StratifiedKFold(n_splits=15, random_state=0, shuffle=True)
grid = GridSearchCV(tree, param_grid, cv = cv, scoring='recall', return_train_score=True)
grid.fit(X, y)

print("Best Parameter: {}".format(grid.best_params_))
print("Best Cross Validation Score: {}".format(grid.best_score_))

Best Parameter: {'max_depth': 2}
Best Cross Validation Score: 0.3094905340512978


In [58]:
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.2, random_state=0)

tree = DecisionTreeClassifier(max_depth = 2)

tree.fit(X,y)

y_test_hat = tree.predict(X_test)

print("The accuracy score of Decision Tree model is:", accuracy_score(y_test,y_test_hat, normalize = True) * 100)

The accuracy score of Decision Tree model is: 89.08715707696044


In [59]:
from sklearn.metrics import classification_report, confusion_matrix
print("Confusion Matrix")
print(confusion_matrix(y_test,y_test_hat))
print("------------------------------------------------------------------------")
print(classification_report(y_test,y_test_hat))

Confusion Matrix
[[7055  264]
 [ 635  284]]
------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.92      0.96      0.94      7319
           1       0.52      0.31      0.39       919

    accuracy                           0.89      8238
   macro avg       0.72      0.64      0.66      8238
weighted avg       0.87      0.89      0.88      8238



#### Accuracy

In [60]:
tree = DecisionTreeClassifier()

param_grid = {'max_depth': [2,3,4,5,6,7,8,9,10]}

cv = StratifiedKFold(n_splits=15, random_state=0, shuffle=True)
grid = GridSearchCV(tree, param_grid, cv = cv, scoring='accuracy', return_train_score=True)
grid.fit(X, y)

print("Best Parameter: {}".format(grid.best_params_))
print("Best Cross Validation Score: {}".format(grid.best_score_))

Best Parameter: {'max_depth': 7}
Best Cross Validation Score: 0.8998737496358162


In [62]:
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.2, random_state=0)

tree = DecisionTreeClassifier(max_depth = 7)

tree.fit(X,y)

y_test_hat = tree.predict(X_test)

print("The accuracy score of Decision Tree model is:", accuracy_score(y_test,y_test_hat, normalize = True) * 100)

The accuracy score of Decision Tree model is: 90.58023792182568


In [63]:
from sklearn.metrics import classification_report, confusion_matrix
print("Confusion Matrix")
print(confusion_matrix(y_test,y_test_hat))
print("------------------------------------------------------------------------")
print(classification_report(y_test,y_test_hat))

Confusion Matrix
[[7218  101]
 [ 675  244]]
------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.91      0.99      0.95      7319
           1       0.71      0.27      0.39       919

    accuracy                           0.91      8238
   macro avg       0.81      0.63      0.67      8238
weighted avg       0.89      0.91      0.89      8238



## SVM

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify = y)

svm = SVC(kernel='rbf',  class_weight='balanced', C = 0.1,gamma = 0.0005)

svm.fit(X_train, y_train)

SVC(C=0.1, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0005, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [65]:
y_test_hat=svm.predict(X_test)
accuracy_score(y_test,y_test_hat, normalize = True) * 100

83.98883224083515

In [66]:
from sklearn.metrics import classification_report, confusion_matrix
print("Confusion Matrix")
print(confusion_matrix(y_test,y_test_hat))
print("------------------------------------------------------------------------")
print(classification_report(y_test,y_test_hat))

Confusion Matrix
[[6333  977]
 [ 342  586]]
------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.95      0.87      0.91      7310
           1       0.37      0.63      0.47       928

    accuracy                           0.84      8238
   macro avg       0.66      0.75      0.69      8238
weighted avg       0.88      0.84      0.86      8238

