In [2]:
import pandas as pd
from sklearn.preprocessing import label_binarize
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, auc

from sklearn import svm, datasets
import warnings
warnings.filterwarnings("ignore")

from google.colab import drive
drive.mount('/gdrive')

%cd /gdrive

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /gdrive
/gdrive


In [3]:
trainfile = r'/gdrive/My Drive/assignment2/Portugese Bank Data - TRAIN.csv'
trainData = pd.read_csv(trainfile)
testfile = r'/gdrive/My Drive/assignment2/Portugese Bank Data - TEST.csv'
testData = pd.read_csv(testfile)

print(trainData.shape)
print(testData.shape)

trainData.head()
testData.head()

(4521, 17)
(45211, 17)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
trainData.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [0]:
categoricalFeatures = ['job','marital','education','default','housing','loan','contact','month','poutcome']
combined_Data = pd.concat([trainData,testData], keys=[0,1])
combined_Data = pd.get_dummies(combined_Data,columns=categoricalFeatures)
trainData = combined_Data.xs(0)
testData = combined_Data.xs(1)
y_train = trainData["y"]
X_train = trainData.drop(["y"], axis=1)
y_test = testData["y"]
X_test = testData.drop(["y"], axis=1)



In [6]:
#Decision Tree Classifier
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf_predict=clf.predict(X_test)

print("accuracy Score (training) for Decision Tree:{0:6f}".format(clf.score(X_test,y_test)))
print("Confusion Matrix for Decision Tree")
print(confusion_matrix(y_test,clf_predict))
print(classification_report(y_test,clf_predict))

#print(confusion_matrix(y_pred = clf_predict, y_true = y_test))

accuracy Score (training) for Decision Tree:0.881467
Confusion Matrix for Decision Tree
[[37165  2757]
 [ 2602  2687]]
              precision    recall  f1-score   support

          no       0.93      0.93      0.93     39922
         yes       0.49      0.51      0.50      5289

    accuracy                           0.88     45211
   macro avg       0.71      0.72      0.72     45211
weighted avg       0.88      0.88      0.88     45211



In [7]:
#Hyperparameter tuning done for decision tree classifier
parameters={'max_depth': range(10,70,10),'max_leaf_nodes': range(10,50,10),'criterion':['gini','entropy'],'max_features':[3,4,5]}
#random search
print("RandomizedSearchCV-Decision tree")
clf_random = RandomizedSearchCV(clf,parameters,n_iter=40,cv=5)
clf_random.fit(X_train, y_train)
grid_parm=clf_random.best_params_
print(grid_parm)
#Now do grid search
print("GridSearchCV-Decision tree")
clf_grid = GridSearchCV(clf,parameters)
clf_grid.fit(X_train, y_train)
grid_parm1=clf_grid.best_params_
print(grid_parm1)

RandomizedSearchCV-Decision tree
{'max_leaf_nodes': 30, 'max_features': 4, 'max_depth': 30, 'criterion': 'entropy'}
GridSearchCV-Decision tree
{'criterion': 'gini', 'max_depth': 10, 'max_features': 4, 'max_leaf_nodes': 30}


In [0]:
#Construct Decision Trees using the best parameters
clf = DecisionTreeClassifier(**grid_parm)
clfr = DecisionTreeClassifier(**grid_parm1)

clf.fit(X_train,y_train)
clf_predict = clf.predict(X_test)
clfr.fit(X_train,y_train)
clfr_predict = clfr.predict(X_test)

In [15]:
#Obtain accuracy, confusion matrix, classification report
print("accuracy Score (training) after hypertuning randomized search for Decision Tree:{0:6f}".format(clf.score(X_test,y_test)))
print("accuracy Score (training) after hypertuning grid search for Decision Tree:{0:6f}".format(clfr.score(X_test,y_test)))
print("Confusion Matrix after hypertuning for Decision Tree")
print(confusion_matrix(y_test,clf_predict))
print("=== Classification Report ===")
print(classification_report(y_test,clf_predict))

clf_cv_score = cross_val_score(clf, X_train, y_train, cv=10, scoring="balanced_accuracy")
print(clf_cv_score)

print(confusion_matrix(y_test,clfr_predict))
print("=== Classification Report ===")
print(classification_report(y_test,clfr_predict))

clfr_cv_score = cross_val_score(clfr, X_train, y_train, cv=10, scoring="balanced_accuracy")
print(clfr_cv_score)


accuracy Score (training) after hypertuning randomized search for Decision Tree:0.886621
accuracy Score (training) after hypertuning grid search for Decision Tree:0.885382
Confusion Matrix after hypertuning for Decision Tree
[[39615   307]
 [ 4819   470]]
=== Classification Report ===
              precision    recall  f1-score   support

          no       0.89      0.99      0.94     39922
         yes       0.60      0.09      0.15      5289

    accuracy                           0.89     45211
   macro avg       0.75      0.54      0.55     45211
weighted avg       0.86      0.89      0.85     45211

[0.53966981 0.53221154 0.50836538 0.51923077 0.55394231 0.54432692
 0.54432692 0.63048077 0.51884615 0.55394231]
[[39806   116]
 [ 5066   223]]
=== Classification Report ===
              precision    recall  f1-score   support

          no       0.89      1.00      0.94     39922
         yes       0.66      0.04      0.08      5289

    accuracy                           0.89     4

In [10]:
#Normal randomforest
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_predict=rfc.predict(X_test)
print("accuracy Score (training) for RandomForest:{0:6f}".format(rfc.score(X_test,y_test)))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test,rfc_predict))
print(classification_report(y_test,rfc_predict))


accuracy Score (training) for RandomForest:0.905709
Confusion Matrix for Random Forest:
[[39258   664]
 [ 3599  1690]]
              precision    recall  f1-score   support

          no       0.92      0.98      0.95     39922
         yes       0.72      0.32      0.44      5289

    accuracy                           0.91     45211
   macro avg       0.82      0.65      0.70     45211
weighted avg       0.89      0.91      0.89     45211



In [11]:
rand_parameters={'criterion':['gini','entropy'],'min_impurity_decrease': range(2,22,4),'min_samples_leaf': range(4,24,4),'max_features':[2,3,4]}
#random search with cross-validation
print("RandomizedSearchCV-Decision tree")
rfc_random = RandomizedSearchCV(rfc,rand_parameters,n_iter=30,cv=10)
rfc_random.fit(X_train, y_train)
grid_parm_rfc=rfc_random.best_params_
print(grid_parm_rfc)
#grid search
print("GridSearchCV-Decision tree")
rfc_grid = GridSearchCV(rfc,rand_parameters)
rfc_grid.fit(X_train, y_train)
grid_parm_rfc1=rfc_grid.best_params_
print(grid_parm_rfc1)

RandomizedSearchCV-Decision tree
{'min_samples_leaf': 8, 'min_impurity_decrease': 14, 'max_features': 3, 'criterion': 'entropy'}
GridSearchCV-Decision tree
{'criterion': 'gini', 'max_features': 2, 'min_impurity_decrease': 2, 'min_samples_leaf': 4}


In [13]:
rfc= RandomForestClassifier(**grid_parm_rfc)
rfc.fit(X_train,y_train)
rfc_predict = rfc.predict(X_test)
rfcr= RandomForestClassifier(**grid_parm_rfc1)
rfcr.fit(X_train,y_train)
rfcr_predict = rfcr.predict(X_test)
print("accuracy Score (training) after hypertuning random search for Random Forest:{0:6f}".format(rfc.score(X_test,y_test)))
print("accuracy Score (training) after hypertuning grid search for Random Forest:{0:6f}".format(rfcr.score(X_test,y_test)))
print("Confusion Matrix after hypertuning for Random Forest:")
print(confusion_matrix(y_test,rfc_predict))
print("=== Classification Report ===")
print(classification_report(y_test,rfc_predict))
rfc_cv_score = cross_val_score(rfc, X_train, y_train, cv=10)
print(rfc_cv_score)

print(confusion_matrix(y_test,rfcr_predict))
print("=== Classification Report ===")
print(classification_report(y_test,rfcr_predict))
rfcr_cv_score = cross_val_score(rfcr, X_train, y_train, cv=10)
print(rfcr_cv_score)


accuracy Score (training) after hypertuning random search for Random Forest:0.883015
accuracy Score (training) after hypertuning grid search for Random Forest:0.883015
Confusion Matrix after hypertuning for Random Forest:
[[39922     0]
 [ 5289     0]]
=== Classification Report ===
              precision    recall  f1-score   support

          no       0.88      1.00      0.94     39922
         yes       0.00      0.00      0.00      5289

    accuracy                           0.88     45211
   macro avg       0.44      0.50      0.47     45211
weighted avg       0.78      0.88      0.83     45211

[0.88300221 0.88495575 0.88495575 0.88495575 0.88495575 0.88495575
 0.88495575 0.88495575 0.88495575 0.88495575]
[[39922     0]
 [ 5289     0]]
=== Classification Report ===
              precision    recall  f1-score   support

          no       0.88      1.00      0.94     39922
         yes       0.00      0.00      0.00      5289

    accuracy                           0.88     4521