In [1]:
import pandas as pd

In [2]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import time

In [3]:
df_wines = pd.read_csv('completoWine.csv')

In [4]:
X = df_wines.drop('quality', axis=1)
Y = df_wines.quality

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8)



In [11]:
df_wines.groupby('quality').size()

quality
3      30
4     216
5    2138
6    2836
7    1079
8     193
9       5
dtype: int64

In [12]:
df_scores = pd.DataFrame({})
models = []

In [13]:
MAX_DEPTH = 50
MIN_SAMPLES_SPLIT = [2,4,8]
MIN_SAMPLES_LEAF = [2,4,8]
MAX_FEATURES = [2,8,13]
N_ESTIMATORS=[100,500,900]

In [14]:
for min_samples_split in MIN_SAMPLES_SPLIT:
    for min_samples_leaf in MIN_SAMPLES_LEAF:
        for max_feature in MAX_FEATURES:
            for n_estimators in N_ESTIMATORS:
                start = time.clock()
                decision_tree_classifier=DecisionTreeClassifier(max_depth=MAX_DEPTH, min_samples_split=min_samples_split, max_features=max_feature, min_samples_leaf=min_samples_leaf, criterion='entropy', presort=True)
                clf = BaggingClassifier(n_estimators=n_estimators, warm_start=True, base_estimator=decision_tree_classifier)
                scores_train = cross_val_score(clf, X_train, Y_train, scoring='f1_micro', cv=3)
                end = time.clock()
                models.append(clf)
                Y_predict = cross_val_predict(clf, X_test, Y_test, cv=3)
                f1_score_test = f1_score(Y_test, Y_predict, average='micro')
                print(min_samples_split, min_samples_leaf, max_feature, n_estimators, scores_train.mean(), f1_score_test)
                df_scores = df_scores.append({'time': start-end,
                                              'min_samples_leaf': min_samples_leaf,
                                              'min_samples_split': min_samples_split,
                                              'max_feature': max_feature,
                                              'n_estimators': n_estimators,
                                              'f1_micro_mean_train': scores_train.mean(),
                                              'f1_micro_mean_test': f1_score_test,
                                              'f1_micro_std_train': scores_train.std()}, 
                                             ignore_index=True)
                                



2 2 2 100 0.642677726117 0.561538461538




2 2 2 500 0.641905013988 0.558461538462




2 2 2 900 0.642096248262 0.563076923077




2 2 8 100 0.63959376818 0.566153846154




2 2 8 500 0.635358962195 0.55




2 2 8 900 0.639787223398 0.556153846154




2 2 13 100 0.634975937353 0.554615384615




2 2 13 500 0.635554750307 0.547692307692




2 2 13 900 0.63478048471 0.548461538462




2 4 2 100 0.619968160573 0.55




2 4 2 500 0.622470196697 0.549230769231




2 4 2 900 0.620354737462 0.556923076923




2 4 8 100 0.621893940925 0.546923076923




2 4 8 500 0.62400573616 0.544615384615




2 4 8 900 0.626699561698 0.551538461538




2 4 13 100 0.619006325855 0.540769230769




2 4 13 500 0.622661321712 0.543846153846




2 4 13 900 0.625349928244 0.54




2 8 2 100 0.596878793305 0.540769230769




2 8 2 500 0.598613894138 0.543076923077




2 8 2 900 0.596880902682 0.547692307692




2 8 8 100 0.60033567008 0.544615384615




2 8 8 500 0.600529125682 0.542307692308




2 8 8 900 0.600530680304 0.543846153846




2 8 13 100 0.599762189621 0.535384615385




2 8 13 500 0.598988923891 0.538461538462




2 8 13 900 0.599953201917 0.535384615385




4 2 2 100 0.647101987852 0.561538461538




4 2 2 500 0.638439698936 0.564615384615




4 2 2 900 0.642674837313 0.564615384615




4 2 8 100 0.635166396047 0.553076923077




4 2 8 500 0.637475643275 0.558461538462




4 2 8 900 0.639976792637 0.552307692308




4 2 13 100 0.634210557453 0.553846153846




4 2 13 500 0.636321464773 0.552307692308




4 2 13 900 0.638051568195 0.543846153846




4 4 2 100 0.619393347087 0.549230769231




4 4 2 500 0.620546083303 0.547692307692




4 4 2 900 0.620546304512 0.55




4 4 8 100 0.626700450768 0.546923076923




4 4 8 500 0.625737061812 0.546923076923




4 4 8 900 0.626120195912 0.544615384615




4 4 13 100 0.618621525951 0.543846153846




4 4 13 500 0.624391425134 0.541538461538




4 4 13 900 0.624390425267 0.54




4 8 2 100 0.592069618176 0.546923076923




4 8 2 500 0.599190816232 0.544615384615




4 8 2 900 0.59688168057 0.55




4 8 8 100 0.597260263525 0.541538461538




4 8 8 500 0.60091448111 0.543076923077




4 8 8 900 0.598027200355 0.541538461538




4 8 13 100 0.600145769988 0.537692307692




4 8 13 500 0.597836632018 0.530769230769




4 8 13 900 0.602262116753 0.539230769231




8 2 2 100 0.626704113612 0.546153846154




8 2 2 500 0.63151351149 0.549230769231




8 2 2 900 0.63035822117 0.558461538462




8 2 8 100 0.628048306466 0.554615384615




8 2 8 500 0.633818539195 0.547692307692




8 2 8 900 0.634397461407 0.551538461538




8 2 13 100 0.6247792231 0.544615384615




8 2 13 500 0.630357443666 0.545384615385




8 2 13 900 0.630930591349 0.545384615385




8 4 2 100 0.616116933799 0.543846153846




8 4 2 500 0.623816165767 0.550769230769




8 4 2 900 0.621510250916 0.549230769231




8 4 8 100 0.625352484657 0.544615384615




8 4 8 500 0.62400451393 0.548461538462




8 4 8 900 0.624389758946 0.549230769231




8 4 13 100 0.620541976885 0.546923076923




8 4 13 500 0.626123860296 0.55




8 4 13 900 0.624967904424 0.538461538462




8 8 2 100 0.595148689884 0.538461538462




8 8 2 500 0.597073802375 0.541538461538




8 8 2 900 0.597845180323 0.546923076923




8 8 8 100 0.598221433076 0.540769230769




8 8 8 500 0.603223506359 0.543076923077




8 8 8 900 0.600914481495 0.545384615385




8 8 13 100 0.598990367332 0.539230769231




8 8 13 500 0.599758525238 0.534615384615




8 8 13 900 0.6005290145 0.536153846154


In [15]:
df_scores.sort_values('f1_micro_mean_test', ascending=False)

Unnamed: 0,f1_micro_mean_test,f1_micro_mean_train,f1_micro_std_train,max_feature,min_samples_leaf,min_samples_split,n_estimators,time
3,0.566154,0.639594,0.012400,8.0,2.0,2.0,100.0,-13.757043
28,0.564615,0.638440,0.013554,2.0,2.0,4.0,500.0,-26.210329
29,0.564615,0.642675,0.012390,2.0,2.0,4.0,900.0,-47.626633
2,0.563077,0.642096,0.013559,2.0,2.0,2.0,900.0,-50.425218
0,0.561538,0.642678,0.006058,2.0,2.0,2.0,100.0,-6.125963
27,0.561538,0.647102,0.010231,2.0,2.0,4.0,100.0,-5.247680
1,0.558462,0.641905,0.012638,2.0,2.0,2.0,500.0,-30.021299
56,0.558462,0.630358,0.011160,2.0,2.0,8.0,900.0,-42.271645
31,0.558462,0.637476,0.014950,8.0,2.0,4.0,500.0,-68.521193
11,0.556923,0.620355,0.011890,2.0,4.0,2.0,900.0,-39.651671


In [17]:
confusion_matrix(Y_test, Y_predict)

array([[  0,   0,   4,   2,   0,   0,   0],
       [  0,   0,  17,  13,   0,   0,   0],
       [  0,   0, 242, 166,   5,   0,   0],
       [  0,   0, 140, 394,  53,   0,   0],
       [  0,   0,  13, 152,  61,   0,   0],
       [  0,   0,   1,  21,  15,   0,   0],
       [  0,   0,   0,   0,   1,   0,   0]])

In [18]:
df_scores.to_csv('bagging.csv')