In [1]:
import pandas as pd

In [2]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import time

In [3]:
df_wines = pd.read_csv('completoWine.csv')

In [11]:
X = df_wines.drop('quality', axis=1)
Y = df_wines.quality

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8)



In [12]:
df_wines.groupby('quality').size()

quality
3      30
4     216
5    2138
6    2836
7    1079
8     193
9       5
dtype: int64

In [13]:
df_scores = pd.DataFrame({})
models = []

In [16]:
MAX_DEPTH = 50
MIN_SAMPLES_SPLIT = [2,4,8,12]
MIN_SAMPLES_LEAF = [2,4,8]
MAX_FEATURES = [2,4,6,8,10,12,13]
N_ESTIMATORS=[100,300,500,700,800,900]

In [None]:
for min_samples_split in MIN_SAMPLES_SPLIT:
    for min_samples_leaf in MIN_SAMPLES_LEAF:
        for max_feature in MAX_FEATURES:
            for n_estimators in N_ESTIMATORS:
                start = time.clock()
                decision_tree_classifier=DecisionTreeClassifier(max_depth=MAX_DEPTH, min_samples_split=min_samples_split, max_features=max_feature, min_samples_leaf=min_samples_leaf, criterion='entropy', presort=True)
                clf = BaggingClassifier(n_estimators=n_estimators, warm_start=True, base_estimator=decision_tree_classifier)
                scores_train = cross_val_score(clf, X_train, Y_train, scoring='f1_micro', cv=3)
                end = time.clock()
                models.append(clf)
                Y_predict = cross_val_predict(clf, X_test, Y_test, cv=3)
                f1_score_test = f1_score(Y_test, Y_predict, average='micro')
                print(min_samples_split, min_samples_leaf, max_feature, n_estimators, scores_train.mean(), f1_score_test)
                df_scores = df_scores.append({'time': start-end,
                                              'min_samples_leaf': min_samples_leaf,
                                              'min_samples_split': min_samples_split,
                                              'max_feature': max_feature,
                                              'n_estimators': n_estimators,
                                              'f1_micro_mean_train': scores_train.mean(),
                                              'f1_micro_mean_test': f1_score_test,
                                              'f1_micro_std_train': scores_train.std()}, 
                                             ignore_index=True)
                                



2 2 2 100 0.6455727807514569 0.5484615384615384




2 2 2 300 0.648455174892231 0.56




2 2 2 500 0.6469173056116747 0.5584615384615385




2 2 2 700 0.6521110586656066 0.553076923076923




2 2 2 800 0.649033986307333 0.5561538461538461




2 2 2 900 0.6448009576920603 0.5607692307692308




2 2 4 100 0.652881435591883 0.5584615384615385




2 2 4 300 0.6488393084745457 0.5561538461538461




2 2 4 500 0.6486492956617679 0.553076923076923




2 2 4 700 0.6501852783137473 0.5492307692307692




2 2 4 800 0.6515336903059273 0.5561538461538461




2 2 4 900 0.6526868708644796 0.5546153846153846




2 2 6 100 0.6413341968921684 0.553076923076923




2 2 6 300 0.6501837229223653 0.5561538461538461




2 2 6 500 0.6497978115850231 0.556923076923077




2 2 6 700 0.6515315790054316 0.5523076923076923




2 2 6 800 0.6503772897063493 0.553076923076923




2 2 6 900 0.6503797330134645 0.5507692307692308




2 2 8 100 0.6442198141516658 0.5476923076923077




2 2 8 300 0.6449906361899456 0.5538461538461539




2 2 8 500 0.6482628295694485 0.5546153846153846




2 2 8 700 0.6486468531240771 0.5561538461538461




2 2 8 800 0.6503780664402599 0.5523076923076923




2 2 8 900 0.6513420120740814 0.5515384615384615




2 2 10 100 0.6457632359832526 0.5484615384615384




2 2 10 300 0.6476863498949269 0.5561538461538461




2 2 10 500 0.647877473756508 0.55




2 2 10 700 0.6484561743744993 0.556923076923077




2 2 10 800 0.6453764367301771 0.5507692307692308




2 2 10 900 0.6488399751807701 0.5561538461538461




2 2 12 100 0.6444139364600515 0.5446153846153846




2 2 12 300 0.6434515450634755 0.5592307692307692




2 2 12 500 0.6478770301833535 0.5492307692307692




2 2 12 700 0.6492262192941562 0.5538461538461539




2 2 12 800 0.6467229613243557 0.5576923076923077




2 2 12 900 0.6453764363454649 0.553076923076923




2 2 13 100 0.6394118612532534 0.5461538461538461




2 2 13 300 0.6457641250531224 0.5592307692307692




2 2 13 500 0.6469175260517592 0.55




2 2 13 700 0.6459533592084289 0.556923076923077




2 2 13 800 0.6428759540741115 0.5538461538461539




2 2 13 900 0.6501882759911274 0.5592307692307692




2 4 2 100 0.6215139156839006 0.5515384615384615




2 4 2 300 0.6274822659568272 0.5546153846153846




2 4 2 500 0.6238257158619395 0.5607692307692308




2 4 2 700 0.622478304121452 0.5538461538461539




2 4 2 800 0.6230542278868599 0.55




2 4 2 900 0.6253650289673428 0.5584615384615385




2 4 4 100 0.6209422095182255 0.5523076923076923




2 4 4 300 0.629790402520686 0.5553846153846154




2 4 4 500 0.6282499791359039 0.5546153846153846




2 4 4 700 0.6276727219580472 0.5553846153846154




2 4 4 800 0.628632336886752 0.55




2 4 4 900 0.6294043807709454 0.5553846153846154




2 4 6 100 0.6274778244546 0.5538461538461539




2 4 6 300 0.629790847247977 0.5492307692307692




2 4 6 500 0.6292143691123041 0.5576923076923077




2 4 6 700 0.6284417681648603 0.5538461538461539




2 4 6 800 0.6280573010370775 0.5523076923076923




2 4 6 900 0.6299829705918262 0.5515384615384615




2 4 8 100 0.6280565254573035 0.5546153846153846




2 4 8 300 0.62882690238358 0.5523076923076923




2 4 8 500 0.6307501270923649 0.5484615384615384




2 4 8 700 0.6301740932992707 0.5607692307692308




2 4 8 800 0.6290181355034228 0.5515384615384615




2 4 8 900 0.6267121098553474 0.5507692307692308




2 4 10 100 0.6226717589541889 0.5476923076923077




2 4 10 300 0.6282502003454127 0.5453846153846154




2 4 10 500 0.6292135916089693 0.5523076923076923




2 4 10 700 0.629983191416623 0.5576923076923077




2 4 10 800 0.6290204683981395 0.55




2 4 10 900 0.6288289009634042 0.5523076923076923




2 4 12 100 0.6220932814692675 0.5423076923076923




2 4 12 300 0.6265199865114982 0.5523076923076923




2 4 12 500 0.6265192090081633 0.5484615384615384




2 4 12 700 0.6288304559700739 0.5538461538461539




2 4 12 800 0.6280604106657047 0.5507692307692308




2 4 12 900 0.6276735002308066 0.553076923076923




2 4 13 100 0.6243993071173427 0.5407692307692308




2 4 13 300 0.6284418789619709 0.553076923076923




2 4 13 500 0.6259438411518692 0.5453846153846154




2 4 13 700 0.6276740542163594 0.556923076923077




2 4 13 800 0.6270974648988638 0.55


In [None]:
df_scores.sort_values('f1_micro_mean_test', ascending=False)

In [14]:
confusion_matrix(Y_test, Y_predict)

array([[  0,   0,   1,   2,   0,   0,   0],
       [  0,   0,   7,   7,   0,   0,   0],
       [  0,   1, 105,  90,   8,   0,   0],
       [  0,   3,  65, 185,  38,   5,   1],
       [  0,   0,   7,  75,  27,   0,   0],
       [  0,   0,   0,  10,   6,   5,   0],
       [  0,   0,   0,   2,   0,   0,   0]])

In [269]:
clf.classes_

AttributeError: 'AdaBoostClassifier' object has no attribute 'classes_'

In [209]:
clf.feature_importances_

array([ 0.05966777,  0.07152935,  0.09708522,  0.06851571,  0.07382377,
        0.07950466,  0.10435181,  0.08513251,  0.06454349,  0.07548736,
        0.080881  ,  0.13813066,  0.00134668])

In [202]:
[clf.tree_.max_depth for clf in clf.estimators_]

[24,
 23,
 23,
 23,
 22,
 26,
 22,
 24,
 23,
 22,
 22,
 21,
 24,
 24,
 24,
 23,
 23,
 24,
 22,
 22,
 24,
 22,
 23,
 22,
 23,
 21,
 25,
 26,
 26,
 26,
 30,
 30,
 30,
 24,
 22,
 22,
 22,
 23,
 22,
 23,
 22,
 25,
 22,
 21,
 24,
 22,
 23,
 26,
 25,
 22,
 29,
 29,
 23,
 29,
 29,
 29,
 26,
 29,
 28,
 29,
 28,
 25,
 30,
 21,
 23,
 22,
 22,
 25,
 21,
 22,
 26,
 29,
 26,
 22,
 27,
 22,
 26,
 23,
 23,
 23,
 26,
 22,
 22,
 22,
 22,
 22,
 23,
 28,
 27,
 22,
 22,
 24,
 25,
 25,
 23,
 28,
 22,
 24,
 21,
 23,
 28,
 28,
 29,
 29,
 21,
 28,
 27,
 27,
 27,
 27,
 25,
 26,
 26,
 26,
 25,
 25,
 28,
 25,
 26,
 27,
 27,
 27,
 27,
 27,
 27,
 31,
 27,
 27,
 27,
 30,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 24,
 26,
 25,
 26,
 25,
 28,
 26,
 26,
 27,
 26,
 27,
 27,
 23,
 29,
 29,
 32,
 29,
 32,
 26,
 25,
 25,
 25,
 27,
 26,
 26,
 27,
 27,
 26,
 27,
 26,
 29,
 28,
 26,
 23,
 26,
 23,
 24,
 23,
 24,
 24,
 26,
 23,
 25,
 23,
 23,
 24,
 24,
 24,
 25,
 29,
 25,
 29,
 29,
 25,
 26,
 26,
 27,
 25,
 27,
 24,
 27,
 27,
 27,


In [None]:
clf.estimators