In [2]:
import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
import pandas as pd
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from scipy.stats import rankdata
import os

data_files = os.listdir('data')

seed = 7
num_trees = 100
kfold = model_selection.KFold(n_splits=10, random_state=seed)

results = {
    'Data File': [],
    'Decision Tree': [],
    'Ada Boost': [],
    'Random Forest': [],
    'Voting Ensemble': []
}
rankings = {
    'Decision Tree': [],
    'Ada Boost': [],
    'Random Forest': [],
    'Voting Ensemble': []
}

# iterate through all files in /data folder and apply 4 different ensemble methods.
for data_file in data_files:
    print('Parsing dataset: ' + data_file + '...')
    results['Data File'].append(data_file)

    data = pd.read_csv('data/' + data_file)

    X = data[data.columns[1:-1]]
    Y = data[data.columns[-1]]

    # Bagging with Decision Tree
    cart = DecisionTreeClassifier()
    model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=seed)
    result = model_selection.cross_val_score(model, X, Y, cv=kfold)
    results['Decision Tree'].append(result.mean())

    # Boosting with Ada Boost
    model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
    result = model_selection.cross_val_score(model, X, Y, cv=kfold)
    results['Ada Boost'].append(result.mean())

    # Bagging with Random Forest
    max_features = 2
    model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features)
    result = model_selection.cross_val_score(model, X, Y, cv=kfold)
    results['Random Forest'].append(result.mean())

    # Voting Ensemble
    # create the sub models
    estimators = []
    model1 = LogisticRegression()
    estimators.append(('logistic', model1))
    model2 = DecisionTreeClassifier()
    estimators.append(('cart', model2))
    model3 = SVC()
    estimators.append(('svm', model3))
    # create the ensemble model
    ensemble = VotingClassifier(estimators)
    result = model_selection.cross_val_score(ensemble, X, Y, cv=kfold)
    results['Voting Ensemble'].append(result.mean())

    # modify the results of the dataframe in order to display the ranking of each method in each row
    current_results = [results['Decision Tree'][-1], results['Ada Boost'][-1], results['Random Forest'][-1], results['Voting Ensemble'][-1]]
    current_rankings = (len(current_results) - rankdata(current_results).astype(int)) + 1

    rankings['Decision Tree'].append(current_rankings[0])
    rankings['Ada Boost'].append(current_rankings[1])
    rankings['Random Forest'].append(current_rankings[2])
    rankings['Voting Ensemble'].append(current_rankings[3])

    results['Decision Tree'][-1] = str(round(results['Decision Tree'][-1], 4)) + '(' + str(current_rankings[0]) + ')'
    results['Ada Boost'][-1] = str(round(results['Ada Boost'][-1], 4)) + '(' + str(current_rankings[1]) + ')'
    results['Random Forest'][-1] = str(round(results['Random Forest'][-1], 4)) + '(' + str(current_rankings[2]) + ')'
    results['Voting Ensemble'][-1] = str(round(results['Voting Ensemble'][-1], 4)) + '(' + str(current_rankings[3]) + ')'

# print the results for all the methods applied
df = pd.DataFrame(results)
df.set_index('Data File', inplace=True)
df2 = pd.DataFrame(rankings)
df.loc['Average Rank'] = df2.mean()
print(df)

Parsing dataset: spambase.data...
Parsing dataset: breast-cancer-wisconsin.data...
Parsing dataset: banknote_authentication.data...
Parsing dataset: processed.cleveland.data...
Parsing dataset: movement_libras.data...
Parsing dataset: ultrasonic.data...
Parsing dataset: pop_failures.data...
Parsing dataset: sonar.data...
Parsing dataset: haberman.data...
Parsing dataset: pima-indians-diabetes.data...
Parsing dataset: ionosphere.data...
                             Decision Tree  Ada Boost Random Forest  \
Data File                                                             
spambase.data                     0.902(3)  0.9107(2)     0.9207(1)   
breast-cancer-wisconsin.data     0.9586(3)  0.9571(4)     0.9671(1)   
banknote_authentication.data     0.9446(2)  0.8913(4)     0.9519(1)   
processed.cleveland.data          0.798(2)  0.7812(4)     0.8442(1)   
movement_libras.data             0.1397(1)  0.0083(4)     0.1257(2)   
ultrasonic.data                  0.6167(1)  0.4056(4)     0.544