In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [21]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn import metrics

## Display Training & Test data (top & bottom five datasets)

In [3]:
#load the csv file
train_data = pd.read_csv('train_data_after_preprocess.csv')
test_data = pd.read_csv('test_data_after_preprocess.csv')

In [4]:
train_data.shape

(63874, 29)

In [5]:
test_data.shape

(9043, 29)

In [6]:
train_data.head()

Unnamed: 0,housing,loan,default,age_1,age_2,age_3,marital_married,marital_single,job_blue-collar,job_entrepreneur,...,education_tertiary,contact_telephone,poutcome_other,poutcome_success,balance_log,pdays_log,duration_log,campaign_log,previous_log,y
0,1,1,0,0,1,0,0,0,0,0,...,0,0,1,0,-0.645275,0.45035,-1.728912,4.961712,-0.420772,0
1,1,0,0,0,1,0,1,0,0,0,...,1,1,0,0,-0.538652,0.45035,0.762074,-0.173538,-0.420772,0
2,1,0,0,0,1,0,1,0,0,0,...,0,0,1,0,-0.369329,0.45035,-0.972209,2.939376,-0.420772,0
3,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,-0.077591,0.45035,-1.019112,-0.802554,-0.420772,0
4,0,0,0,0,0,0,0,1,0,0,...,1,0,1,0,-0.393144,0.45035,0.429922,-0.173538,-0.420772,1


In [7]:
test_data.head()

Unnamed: 0,housing,loan,default,age_1,age_2,age_3,marital_married,marital_single,job_blue-collar,job_entrepreneur,...,education_tertiary,contact_telephone,poutcome_other,poutcome_success,balance_log,pdays_log,duration_log,campaign_log,previous_log,y
0,1,0,0,0,1,0,1,0,0,0,...,1,1,0,0,-0.353943,0.46161,0.033731,-0.811596,-0.435061,0
1,0,0,0,1,0,0,0,1,0,0,...,1,0,0,0,3.367438,0.46161,-0.425208,-0.170094,-0.435061,0
2,1,0,0,0,1,0,1,0,1,0,...,0,0,0,0,-0.112356,0.46161,-0.002151,-0.811596,-0.435061,0
3,0,0,0,0,0,1,1,0,1,0,...,0,1,0,0,0.064959,0.46161,-1.512344,0.322148,-0.435061,0
4,0,0,0,1,0,0,0,1,0,0,...,1,0,1,0,1.839344,0.46161,2.41368,-0.170094,-0.435061,1


In [8]:
# Creating X,Y test and train data
x_train = train_data[train_data.columns[:-1]]
y_train = train_data[train_data.columns[-1]]
x_test = test_data[test_data.columns[:-1]]
y_test = test_data[test_data.columns[-1]]

### Creating individual base classifiers

In [9]:
random_forest = RandomForestClassifier(n_estimators=25, random_state=1)
logistic_regression = LogisticRegression(C = 0.1, max_iter = 200, penalty = 'l1', solver = 'liblinear')
naive_bayes = GaussianNB()

## 1. Ensemble Method - Gradient Boosting for Classification

In [10]:
clf = GradientBoostingClassifier(learning_rate = 0.01, max_depth = 6).fit(x_train, y_train)
print ('Accuracy: ',clf.score(x_test, y_test))

Accuracy:  0.8082494747318367


### Hyperparameter Tunning for GradientBoostingClassifier

In [58]:
clf = GradientBoostingClassifier()
param_grid = {
    'learning_rate': [0.01],
    'max_depth': [6],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3]
}
# Perform grid search with cross-validation to find the best hyperparameters
grid_search = GridSearchCV(clf, param_grid, cv=10, scoring='accuracy')
grid_search.fit(x_train, y_train)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the best model on the test data
y_pred = best_model.predict(x_test)

# Print the best hyperparameters and model performance
print("Best Hyperparameters:", best_params)
print("Best Model Score (Accuracy):", grid_search.best_score_)
print("Test Set Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

KeyboardInterrupt: 

## 2. Ensemble Method - AdaBoost

### Model Performance

In [12]:
for clf, label in zip([random_forest, logistic_regression, naive_bayes], ['random_forest', 'logistic_regression', 'naive_bayes']):
    adaboost_classifier = AdaBoostClassifier(clf,algorithm='SAMME')
   # y_pred = adaboost_classifier.predict(x_test)
   # accuracy = accuracy_score(y_test, y_pred)
    scores = cross_val_score(adaboost_classifier, x_test, y_test,error_score='raise', scoring='accuracy')
    print("Training accuracy: %0.3f (+/- %0.4f) [%s]" % (scores.mean(), scores.std(), label))

Training accuracy: 0.890 (+/- 0.0047) [random_forest]
Training accuracy: 0.883 (+/- 0.0002) [logistic_regression]
Training accuracy: 0.878 (+/- 0.0046) [naive_bayes]


## 3. Ensemble Method - Hard voting classifier

In [13]:
# Soft voting classifier
ensemble_classifier = VotingClassifier(estimators=[
    ('random_forest', random_forest),
    ('logistic_regression', logistic_regression),
    ('naive_bayes',naive_bayes )
], voting='hard')

In [14]:
ensemble_classifier = ensemble_classifier.fit(x_train, y_train)

### Model Performance

In [23]:
for clf, label in zip([random_forest, logistic_regression, naive_bayes, ensemble_classifier], ['random_forest', 'logistic_regression', 'naive_bayes', 'ensemble_classifier']):
    scores = cross_val_score(clf, x_train, y_train, cv=10, scoring='accuracy')
    print("Training accuracy: %0.3f (+/- %0.4f) [%s]" % (scores.mean(), scores.std(),label))
y_pred = ensemble_classifier.predict(x_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Testing accuracy: %0.3f [Ensemble Classifier]', accuracy)

Training accuracy: 0.921 (+/- 0.0248) [random_forest]
Training accuracy: 0.836 (+/- 0.0229) [logistic_regression]
Training accuracy: 0.716 (+/- 0.0274) [naive_bayes]
Training accuracy: 0.872 (+/- 0.0258) [ensemble_classifier]
Testing accuracy: %0.3f [Ensemble Classifier] 0.8148844410040915
