In [22]:
import pandas as pd

In [23]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import time

In [24]:
df_wines = pd.read_csv('completoWine.csv')

In [25]:
X = df_wines.drop('quality', axis=1)
Y = df_wines.quality

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8)



In [26]:
df_wines.shape

(6497, 14)

In [27]:
df_wines.groupby('quality').size()

quality
3      30
4     216
5    2138
6    2836
7    1079
8     193
9       5
dtype: int64

In [28]:
df_scores = pd.DataFrame({})
models = []

In [29]:
MAX_DEPTH = 50
MIN_SAMPLES_SPLIT = [2,6,10,12]
MIN_SAMPLES_LEAF = [2,4,6]
MAX_FEATURES = [2,6,10,13]
N_ESTIMATORS=[100,600,800,900]

In [None]:
for min_samples_split in MIN_SAMPLES_SPLIT:
    for min_samples_leaf in MIN_SAMPLES_LEAF:
        for max_feature in MAX_FEATURES:
            for n_estimators in N_ESTIMATORS:
                start = time.clock()
                decision_tree_classifier=DecisionTreeClassifier(max_depth=MAX_DEPTH, 
                                                                min_samples_split=min_samples_split, 
                                                                max_features=max_feature, 
                                                                min_samples_leaf=min_samples_leaf, 
                                                                criterion='entropy', presort=True)
                clf = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=0.01, base_estimator=decision_tree_classifier, algorithm='SAMME')
                scores_train = cross_val_score(clf, X_train, Y_train, scoring='f1_micro', cv=3)
                end = time.clock()
                models.append(clf)
                Y_predict = cross_val_predict(clf, X_test, Y_test, cv=3)
                f1_score_test = f1_score(Y_test, Y_predict, average='micro')
                print(min_samples_split, min_samples_leaf, max_feature, n_estimators, scores_train.mean(), f1_score_test)
                df_scores = df_scores.append({'time': start-end,
                                              'min_samples_leaf': min_samples_leaf,
                                              'min_samples_split': min_samples_split,
                                              'max_feature': max_feature,
                                              'n_estimators': n_estimators,
                                              'f1_micro_mean_train': scores_train.mean(),
                                              'f1_micro_mean_test': f1_score_test,
                                              'f1_micro_std_train': scores_train.std()},
                                             ignore_index=True)
                                



2 2 2 100 0.6424877234009028 0.546923076923077




2 2 2 600 0.6542179907073673 0.5507692307692308




2 2 2 800 0.6480633966522993 0.553076923076923




2 2 2 900 0.6509509020801146 0.55




2 2 6 100 0.6363304671339362 0.5376923076923077




2 2 6 600 0.6394055450103667 0.54




2 2 6 800 0.6392098679474932 0.5392307692307692




2 2 6 900 0.636711828347245 0.5415384615384615




2 2 10 100 0.6311337153761293 0.5007692307692307




2 2 10 600 0.6322910036383691 0.53




2 2 10 800 0.6370962967568117 0.5215384615384615




2 2 10 900 0.6353626357713642 0.5330769230769231




2 2 13 100 0.6072709418327347 0.4969230769230769




2 2 13 600 0.6247876696860329 0.5169230769230769




2 2 13 800 0.6257540481169667 0.5261538461538462




2 2 13 900 0.6342087932525597 0.5161538461538462




2 4 2 100 0.635949444467805 0.5676923076923077




2 4 2 600 0.6519127415423509 0.5615384615384615




2 4 2 800 0.6490298969089219 0.5553846153846154




2 4 2 900 0.6488365531285869 0.5615384615384615




2 4 6 100 0.6394099948539546 0.54




2 4 6 600 0.6451841118937408 0.5438461538461539




2 4 6 800 0.6432561087984962 0.5453846153846154




2 4 6 900 0.6497972820534896 0.5384615384615384




2 4 10 100 0.6303630063138151 0.5269230769230769




2 4 10 600 0.6357504357929312 0.5253846153846153




2 4 10 800 0.6365218123317836 0.5323076923076923




2 4 10 900 0.6401765887727318 0.5376923076923077




2 4 13 100 0.6192047798655324 0.5092307692307693




2 4 13 600 0.6319156420196498 0.536923076923077




2 4 13 800 0.6334455194044021 0.5261538461538462




2 4 13 900 0.6355626446992636 0.5246153846153846




2 6 2 100 0.633825874593995 0.5592307692307692




2 6 2 600 0.6521027537106855 0.5553846153846154




2 6 2 800 0.6469077741960597 0.5538461538461539




2 6 2 900 0.6476790366034773 0.5461538461538461




2 6 6 100 0.6386404969954643 0.5438461538461539




2 6 6 600 0.6447945357818655 0.5323076923076923




2 6 6 800 0.6428680882083132 0.5392307692307692




2 6 6 900 0.6428710870438265 0.5461538461538461




2 6 10 100 0.6328674904930117 0.5207692307692308




2 6 10 600 0.6411435359372767 0.5169230769230769




2 6 10 800 0.6386437144760299 0.5184615384615384




2 6 10 900 0.6378750039962032 0.5230769230769231




2 6 13 100 0.6084252428008423 0.5107692307692308




2 6 13 600 0.6244035340529538 0.5046153846153846


In [None]:
clf.base_estimator.max_depth

In [42]:
df_scores['time'] = abs(df_scores['time'])

In [44]:
df_scores.sort_values('f1_micro_mean_test', ascending=False)

Unnamed: 0,f1_micro_mean_test,f1_micro_mean_train,f1_micro_std_train,max_feature,min_samples_leaf,min_samples_split,n_estimators,time
127,0.555385,0.659652,0.002682,2.0,2.0,2.0,800.0,94.604076
125,0.555385,0.657427,0.006613,2.0,2.0,2.0,600.0,71.600046
120,0.553846,0.657260,0.001577,2.0,2.0,2.0,100.0,11.417713
128,0.550769,0.660333,0.005545,2.0,2.0,2.0,900.0,109.645694
139,0.549231,0.661021,0.002090,3.0,2.0,2.0,1000.0,161.850772
124,0.549231,0.658280,0.006251,2.0,2.0,2.0,500.0,60.506666
123,0.547692,0.657936,0.007262,2.0,2.0,2.0,400.0,47.933005
133,0.546154,0.656229,0.004908,3.0,2.0,2.0,400.0,64.586514
138,0.544615,0.655888,0.003713,3.0,2.0,2.0,900.0,145.777091
149,0.544615,0.655379,0.001608,4.0,2.0,2.0,1000.0,204.892024


In [15]:
decision_tree_classifier=DecisionTreeClassifier(max_depth=100, min_samples_split=6, min_samples_leaf=6, criterion='entropy')

In [20]:
clf = AdaBoostClassifier(n_estimators=200, learning_rate=0.01, base_estimator=decision_tree_classifier, algorithm='SAMME')

#clf.fit(X_train, Y_train)

#Y_predict = clf.predict(X_test)

#f1_score(Y_test, Y_predict, average='micro')

In [21]:
scores = cross_val_score(clf, X_train, Y_train, scoring='f1_micro', cv=3)
scores.mean()

0.6239098489561236

In [10]:
Y_predict = cross_val_predict(clf, X_test, Y_test, cv=3)



In [12]:
f1_score(Y_test, Y_predict, average='micro')

0.49538461538461537

In [14]:
confusion_matrix(Y_test, Y_predict)

array([[  0,   0,   1,   2,   0,   0,   0],
       [  0,   0,   7,   7,   0,   0,   0],
       [  0,   1, 105,  90,   8,   0,   0],
       [  0,   3,  65, 185,  38,   5,   1],
       [  0,   0,   7,  75,  27,   0,   0],
       [  0,   0,   0,  10,   6,   5,   0],
       [  0,   0,   0,   2,   0,   0,   0]])

In [269]:
clf.classes_

AttributeError: 'AdaBoostClassifier' object has no attribute 'classes_'

In [209]:
clf.feature_importances_

array([ 0.05966777,  0.07152935,  0.09708522,  0.06851571,  0.07382377,
        0.07950466,  0.10435181,  0.08513251,  0.06454349,  0.07548736,
        0.080881  ,  0.13813066,  0.00134668])

In [65]:
[estimator.tree_.max_depth for estimator in clf.estimators_]

AttributeError: 'AdaBoostClassifier' object has no attribute 'estimators_'

In [None]:
clf.estimators