# Bagging

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

In [4]:
df = pd.read_csv("bank.csv")
df.head()
df.shape

(11162, 17)

In [5]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,0,1,1,0,2343,1,0,2,5,8,1042,1,-1,0,3,1
1,56,0,1,1,0,45,0,0,2,5,8,1467,1,-1,0,3,1
2,41,9,1,1,0,1270,1,0,2,5,8,1389,1,-1,0,3,1
3,55,7,1,1,0,2476,1,0,2,5,8,579,1,-1,0,3,1
4,54,0,1,2,0,184,0,0,2,5,8,673,2,-1,0,3,1


In [6]:
df.deposit.value_counts()

0    5873
1    5289
Name: deposit, dtype: int64

In [7]:
x = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [8]:
xtrain, xtest, ytrain, ytest = train_test_split(x,y, test_size=0.3, random_state=1)

# Voting Classifier

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

In [8]:
dt_clf = DecisionTreeClassifier(max_depth=10)
log_clf = LogisticRegression()
svm_clf = SVC()

In [9]:
voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('dt', dt_clf), ('svc', svm_clf)])

In [11]:
import warnings
warnings.filterwarnings("ignore")

In [12]:
voting_clf.fit(xtrain, ytrain)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('dt', DecisionTreeClassifier()), ('svc', SVC())])

In [13]:
from sklearn.metrics import accuracy_score

In [22]:
for clf in (log_clf, dt_clf, svm_clf, voting_clf):
    clf.fit(xtrain, ytrain)
    y_pred = clf.predict(xtest)
    print(clf.__class__.__name__, accuracy_score(ytest, y_pred))

LogisticRegression 0.7659002687369364
DecisionTreeClassifier 0.7876978202448492
SVC 0.7414153478650344
VotingClassifier 0.7817258883248731


# Bagging Classifier

In [16]:
from sklearn.ensemble import BaggingClassifier

In [18]:
bag_clf = BaggingClassifier(log_clf,n_estimators=15)
# bag_clf = BaggingClassifier(svm_clf,n_estimators=15)

In [20]:
bag_clf.fit(xtrain,ytrain)

BaggingClassifier(base_estimator=LogisticRegression(), n_estimators=15)

In [23]:
y_pred = bag_clf.predict(xtest)
accuracy_score(y_pred, ytest)

0.7682890415049268

# Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier
my_rf_classifier = RandomForestClassifier()

In [10]:
my_rf_classifier.fit(xtrain, ytrain)

RandomForestClassifier()

In [11]:
my_predictions = my_rf_classifier.predict(xtest)

In [12]:
print(accuracy_score(ytest, my_predictions))

0.8489101224246044


In [13]:
print(classification_report(ytest, my_predictions))

              precision    recall  f1-score   support

           0       0.89      0.82      0.85      1760
           1       0.81      0.88      0.85      1589

    accuracy                           0.85      3349
   macro avg       0.85      0.85      0.85      3349
weighted avg       0.85      0.85      0.85      3349



# Hyperparameter Tunning

In [14]:
my_rf_classifier1 = RandomForestClassifier(n_estimators=200,max_features=5,criterion='entropy')
my_rf_classifier1.fit(xtrain, ytrain)
my_predictions = my_rf_classifier.predict(xtest)
print(classification_report(ytest, my_predictions))

              precision    recall  f1-score   support

           0       0.89      0.82      0.85      1760
           1       0.81      0.88      0.85      1589

    accuracy                           0.85      3349
   macro avg       0.85      0.85      0.85      3349
weighted avg       0.85      0.85      0.85      3349



In [15]:
from sklearn.model_selection import GridSearchCV

params = {'criterion' : ['gini', 'entropy'],
          'n_estimators' : [ 150,200],
          'max_features' : [5,10]
         }

grid_search = GridSearchCV(my_rf_classifier, param_grid= params)

In [16]:
grid_search.fit(xtrain, ytrain)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': [5, 10, 15],
                         'n_estimators': [150, 200, 250]})

In [45]:
grid_search.best_params_

{'criterion': 'entropy', 'n_estimators': 150}

In [46]:
my_best_preds = grid_search.predict(xtest)

In [47]:
print(classification_report(ytest, my_best_preds))

              precision    recall  f1-score   support

           0       0.88      0.82      0.85      1760
           1       0.81      0.88      0.84      1589

    accuracy                           0.85      3349
   macro avg       0.85      0.85      0.85      3349
weighted avg       0.85      0.85      0.85      3349

