In [54]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [55]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm

## Display Training & Test data (top & bottom five rows)

In [56]:
# Read the preprocessed train / test splits from CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ('train_data_after_preprocess.csv', 'test_data_after_preprocess.csv')
)

In [57]:
# (rows, columns) of the training frame.
train_data.shape

(63874, 29)

In [58]:
# (rows, columns) of the test frame.
test_data.shape

(9043, 29)

In [59]:
# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,housing,loan,default,age_1,age_2,age_3,marital_married,marital_single,job_blue-collar,job_entrepreneur,...,education_tertiary,contact_telephone,poutcome_other,poutcome_success,balance_log,pdays_log,duration_log,campaign_log,previous_log,y
0,1,1,0,0,1,0,0,0,0,0,...,0,0,1,0,-0.645275,0.45035,-1.728912,4.961712,-0.420772,0
1,1,0,0,0,1,0,1,0,0,0,...,1,1,0,0,-0.538652,0.45035,0.762074,-0.173538,-0.420772,0
2,1,0,0,0,1,0,1,0,0,0,...,0,0,1,0,-0.369329,0.45035,-0.972209,2.939376,-0.420772,0
3,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,-0.077591,0.45035,-1.019112,-0.802554,-0.420772,0
4,0,0,0,0,0,0,0,1,0,0,...,1,0,1,0,-0.393144,0.45035,0.429922,-0.173538,-0.420772,1


In [60]:
# Preview the last rows of the test frame.
test_data.tail()

Unnamed: 0,housing,loan,default,age_1,age_2,age_3,marital_married,marital_single,job_blue-collar,job_entrepreneur,...,education_tertiary,contact_telephone,poutcome_other,poutcome_success,balance_log,pdays_log,duration_log,campaign_log,previous_log,y
9038,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,-0.629341,0.46161,-0.283707,0.322148,-0.435061,0
9039,1,0,0,1,0,0,1,0,1,0,...,0,0,0,0,-0.5249,-1.259717,-1.136982,-0.170094,1.584341,0
9040,1,0,0,1,0,0,1,0,1,0,...,0,0,0,0,0.391482,0.46161,-0.113487,-0.811596,-0.435061,0
9041,1,0,0,1,0,0,0,1,0,0,...,0,0,1,0,-0.593074,0.46161,-0.168644,2.020132,-0.435061,0
9042,1,0,0,0,1,0,1,0,1,0,...,0,0,0,0,-0.599099,0.46161,2.528064,-0.811596,-0.435061,0


In [61]:
# Split each frame into a feature matrix (x_*) and a target vector (y_*).
def _split_features_target(frame):
    """Return ``(features, target)`` for one preprocessed frame.

    The target is the frame's last column. 'Unnamed: 0' appears to be the row
    index that was written into the CSV (it counts 0, 1, 2, ... in the
    head()/tail() previews); it identifies a row rather than describing it, so
    it is left out of the features. ``errors='ignore'`` keeps the split working
    if the CSV is later re-exported without that index column.
    """
    target_column = frame.columns[-1]
    features = frame.drop(columns=[target_column, 'Unnamed: 0'], errors='ignore')
    return features, frame[target_column]


x_train, y_train = _split_features_target(train_data)
x_test, y_test = _split_features_target(test_data)

### Creating individual base classifiers (TODO: fill in each model's tuned hyperparameter values here)

In [62]:
# Base learners reused by the AdaBoost and voting sections below.
# The random forest is stochastic, so it gets a fixed seed to keep its scores
# reproducible under Restart Kernel -> Run All.
RANDOM_STATE = 42

random_forest = RandomForestClassifier(random_state=RANDOM_STATE)
logistic_regression = LogisticRegression()
support_vector_machine = svm.SVC()

## Ensemble Method - Gradient Boosting for Classification

In [63]:
# Fit gradient boosting on the training split and report accuracy on the test split.
# A fixed random_state keeps the reported score reproducible from run to run.
clf = GradientBoostingClassifier(random_state=42).fit(x_train, y_train)
print('Accuracy: ', clf.score(x_test, y_test))

Accuracy:  0.7651221939621807


## Ensemble Method - AdaBoost

In [64]:
# Wrap each base classifier in AdaBoost and report 5-fold cross-validated accuracy.
# NOTE(review): the folds are drawn from x_test / y_test, so x_train is never used in
# this cell — confirm that cross-validating on the test split is intended.
base_classifiers = {
    'random_forest': random_forest,
    'logistic_regression': logistic_regression,
    'support_vector_machine': support_vector_machine,
}

# The loop variable is deliberately not called `clf`, so the fitted gradient boosting
# model stored under that name by the previous cell is not overwritten.
for label, base_classifier in base_classifiers.items():
    # random_state pins the boosting run so the printed scores are repeatable.
    adaboost_classifier = AdaBoostClassifier(base_classifier, algorithm='SAMME', random_state=42)
    # error_score='raise' surfaces a failing fold as an exception instead of a NaN score.
    scores = cross_val_score(adaboost_classifier, x_test, y_test, cv=5,
                             error_score='raise', scoring='accuracy')
    print("Accuracy: %0.3f (+/- %0.4f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.893 (+/- 0.0044) [random_forest]
Accuracy: 0.879 (+/- 0.0015) [logistic_regression]
Accuracy: 0.883 (+/- 0.0002) [support_vector_machine]


## Ensemble Method - Hard voting classifier

In [65]:
# Hard voting classifier: each base estimator casts one vote per sample and the
# majority class label wins.
# Each estimator is registered under a label that matches the object it refers to.
ensemble_classifier = VotingClassifier(
    estimators=[
        ('random_forest', random_forest),
        ('logistic_regression', logistic_regression),
        ('support_vector_machine', support_vector_machine),
    ],
    voting='hard',
)

In [66]:
# Fit the voting ensemble on the training split and rebind the name to the value returned by fit().
ensemble_classifier = ensemble_classifier.fit(x_train, y_train)

In [67]:
# Compare each base classifier with the voting ensemble using 5-fold cross-validated accuracy.
# The third entry is `support_vector_machine`: the name `Naive_Bayes` is never defined in
# this notebook, so referencing it raises NameError on a fresh kernel.
# NOTE(review): cross_val_score refits a copy of every estimator on folds of x_test / y_test,
# so the earlier fit on x_train is not what gets scored here — confirm this is intended.
labelled_models = [
    ('random_forest', random_forest),
    ('logistic_regression', logistic_regression),
    ('support_vector_machine', support_vector_machine),
    ('ensemble_classifier', ensemble_classifier),
]

for label, model in labelled_models:
    scores = cross_val_score(model, x_test, y_test, cv=5, scoring='accuracy')
    print("Accuracy: %0.3f (+/- %0.4f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.890 (+/- 0.0044) [random_forest]
Accuracy: 0.895 (+/- 0.0045) [logistic_regression]
Accuracy: 0.894 (+/- 0.0029) [support_vector_machine]
Accuracy: 0.895 (+/- 0.0029) [ensemble_classifier]
