In [1]:
import pandas as pd

In [2]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import time

In [3]:
df_wines = pd.read_csv('completoWine.csv')

In [4]:
X = df_wines.drop('quality', axis=1)
Y = df_wines.quality

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8)



In [5]:
df_wines.shape

(6497, 14)

In [6]:
df_wines.groupby('quality').size()

quality
3      30
4     216
5    2138
6    2836
7    1079
8     193
9       5
dtype: int64

In [12]:
df_scores = pd.DataFrame({})
models = []

In [13]:
MAX_DEPTH = 50
MIN_SAMPLES_SPLIT = [2,4,8]
MIN_SAMPLES_LEAF = [2,4,8]
MAX_FEATURES = [2,8,13]
N_ESTIMATORS=[100,500,900]

In [14]:
for min_samples_split in MIN_SAMPLES_SPLIT:
    for min_samples_leaf in MIN_SAMPLES_LEAF:
        for max_feature in MAX_FEATURES:
            for n_estimators in N_ESTIMATORS:
                start = time.clock()
                decision_tree_classifier=DecisionTreeClassifier(max_depth=MAX_DEPTH, 
                                                                min_samples_split=min_samples_split, 
                                                                max_features=max_feature, 
                                                                min_samples_leaf=min_samples_leaf, 
                                                                criterion='entropy', presort=True)
                clf = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=0.01, base_estimator=decision_tree_classifier, algorithm='SAMME')
                scores_train = cross_val_score(clf, X_train, Y_train, scoring='f1_micro', cv=3)
                end = time.clock()
                models.append(clf)
                Y_predict = cross_val_predict(clf, X_test, Y_test, cv=3)
                f1_score_test = f1_score(Y_test, Y_predict, average='micro')
                print(min_samples_split, min_samples_leaf, max_feature, n_estimators, scores_train.mean(), f1_score_test)
                df_scores = df_scores.append({'time': start-end,
                                              'min_samples_leaf': min_samples_leaf,
                                              'min_samples_split': min_samples_split,
                                              'max_feature': max_feature,
                                              'n_estimators': n_estimators,
                                              'f1_micro_mean_train': scores_train.mean(),
                                              'f1_micro_mean_test': f1_score_test,
                                              'f1_micro_std_train': scores_train.std()},
                                             ignore_index=True)
                                

2 2 2 100 0.638434501938 0.554615384615
2 2 2 500 0.646511660533 0.543846153846
2 2 2 900 0.642472750383 0.55
2 2 8 100 0.628053870268 0.530769230769
2 2 8 500 0.633823107868 0.531538461538
2 2 8 900 0.632660932473 0.540769230769
2 2 13 100 0.586676276405 0.506153846154
2 2 13 500 0.615930468898 0.521538461538
2 2 13 900 0.623043127743 0.531538461538
2 4 2 100 0.635931906057 0.553076923077
2 4 2 500 0.645168022567 0.548461538462
2 4 2 900 0.639972596145 0.548461538462
2 4 8 100 0.632286683315 0.543846153846
2 4 8 500 0.632466923139 0.545384615385
2 4 8 900 0.6395851289 0.540769230769
2 4 13 100 0.60149793533 0.524615384615
2 4 13 500 0.624964800391 0.543076923077
2 4 13 900 0.630930605023 0.537692307692
2 8 2 100 0.624778346174 0.557692307692
2 8 2 500 0.637475438819 0.555384615385
2 8 2 900 0.637469222503 0.556923076923
2 8 8 100 0.620347079721 0.553846153846
2 8 8 500 0.628044873762 0.542307692308
2 8 8 900 0.632279126275 0.54
2 8 13 100 0.605144386588 0.517692307692
2 8 13 500 0.621

In [16]:
clf.base_estimator.max_depth

50

In [42]:
df_scores['time'] = abs(df_scores['time'])

In [17]:
df_scores.sort_values('f1_micro_mean_test', ascending=False)

Unnamed: 0,f1_micro_mean_test,f1_micro_mean_train,f1_micro_std_train,max_feature,min_samples_leaf,min_samples_split,n_estimators,time,Unnamed: 9
47,0.566154,0.637661,0.012568,2.0,8.0,4.0,900.0,-41.796412,
46,0.561538,0.639010,0.013786,2.0,8.0,4.0,500.0,-23.070250,
73,0.560769,0.640165,0.014923,2.0,8.0,8.0,500.0,-23.141655,
72,0.558462,0.617652,0.016050,2.0,8.0,8.0,100.0,-4.588492,
18,0.557692,0.624778,0.010372,2.0,8.0,2.0,100.0,-6.873611,
28,0.557692,0.646320,0.015540,2.0,4.0,2.0,4.0,500.0,-39.415754
64,0.556923,0.643819,0.013038,2.0,4.0,8.0,500.0,-29.084848,
20,0.556923,0.637469,0.013049,2.0,8.0,2.0,900.0,-62.892523,
54,0.556923,0.636515,0.006741,2.0,6.0,2.0,8.0,100.0,-6.326834
19,0.555385,0.637475,0.007347,2.0,8.0,2.0,500.0,-34.517308,


In [18]:
confusion_matrix(Y_test, Y_predict)

array([[  0,   0,   2,   3,   1,   0,   0],
       [  1,   2,  18,   9,   3,   0,   0],
       [  0,   3, 254, 168,   9,   1,   0],
       [  0,   2, 133, 327,  80,   5,   1],
       [  0,   1,  21, 121,  86,   3,   0],
       [  0,   0,   1,  19,  17,   7,   1],
       [  0,   0,   0,   0,   0,   1,   0]])

In [19]:
df_scores.to_csv('adaboosting.csv')