In [1]:
import pandas as pd

In [6]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import time

In [7]:
df_wines = pd.read_csv('completoWine.csv')

In [4]:
X = df_wines.drop('quality', axis=1)
Y = df_wines.quality

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.9)



In [5]:
df_wines.groupby('quality').size()

quality
3      30
4     216
5    2138
6    2836
7    1079
8     193
9       5
dtype: int64

In [8]:
df_scores = pd.DataFrame({})
models = []

In [9]:
MAX_DEPTH = 50
CRITERIA=['gini', 'entropy']
MIN_SAMPLES_SPLIT = [2,4,8,12]
MIN_SAMPLES_LEAF = [2,4,8]
MAX_FEATURES = [2,4,6,8,10,12,13]
N_ESTIMATORS=[100,300,500,700,800,900]

In [10]:
for min_samples_split in MIN_SAMPLES_SPLIT:
    for min_samples_leaf in MIN_SAMPLES_LEAF:
        for max_feature in MAX_FEATURES:
            for criteria in CRITERIA:
                for n_estimators in N_ESTIMATORS:
                    start = time.clock()
                    clf = RandomForestClassifier(n_estimators=n_estimators, warm_start=True,
                                                 criterion=criteria, min_samples_split=min_samples_split,
                                                 min_samples_leaf=min_samples_leaf)
                    scores_train = cross_val_score(clf, X_train, Y_train, scoring='f1_micro', cv=3)
                    end = time.clock()
                    models.append(clf)
                    Y_predict = cross_val_predict(clf, X_test, Y_test, cv=3)
                    f1_score_test = f1_score(Y_test, Y_predict, average='micro')
                    print(min_samples_split, min_samples_leaf, max_feature, n_estimators, scores_train.mean(), f1_score_test)
                    df_scores = df_scores.append({'time': start-end,
                                                  'min_samples_leaf': min_samples_leaf,
                                                  'min_samples_split': min_samples_split,
                                                  'max_feature': max_feature,
                                                  'n_estimators': n_estimators,
                                                  'f1_micro_mean_train': scores_train.mean(),
                                                  'f1_micro_mean_test': f1_score_test,
                                                  'f1_micro_std_train': scores_train.std()
                                                  'depth': clf.max_depth}, 
                                                 ignore_index=True)
                                



2 2 2 100 0.649398920624653 0.536923076923077




2 2 2 300 0.6504226351646988 0.5492307692307692




2 2 2 500 0.651278124838824 0.536923076923077




2 2 2 700 0.6509365950536309 0.5461538461538461




2 2 2 800 0.6502487981522725 0.5507692307692308




2 2 2 900 0.6516221120317273 0.5461538461538461




2 2 2 100 0.6546986845508548 0.5307692307692308




2 2 2 300 0.6562349524313373 0.5492307692307692




2 2 2 500 0.6529870010278774 0.556923076923077




2 2 2 700 0.6550396865651944 0.5446153846153846




2 2 2 800 0.6567508462794266 0.5384615384615384


KeyboardInterrupt: 

In [None]:
df_scores.sort_values('f1_micro_mean_test', ascending=False)

In [15]:
decision_tree_classifier=DecisionTreeClassifier(max_depth=100, min_samples_split=6, min_samples_leaf=6, criterion='entropy')

In [20]:
clf = AdaBoostClassifier(n_estimators=200, learning_rate=0.01, base_estimator=decision_tree_classifier, algorithm='SAMME')

#clf.fit(X_train, Y_train)

#Y_predict = clf.predict(X_test)

#f1_score(Y_test, Y_predict, average='micro')

In [21]:
scores = cross_val_score(clf, X_train, Y_train, scoring='f1_micro', cv=3)
scores.mean()

0.6239098489561236

In [10]:
Y_predict = cross_val_predict(clf, X_test, Y_test, cv=3)



In [12]:
f1_score(Y_test, Y_predict, average='micro')

0.49538461538461537

In [14]:
confusion_matrix(Y_test, Y_predict)

array([[  0,   0,   1,   2,   0,   0,   0],
       [  0,   0,   7,   7,   0,   0,   0],
       [  0,   1, 105,  90,   8,   0,   0],
       [  0,   3,  65, 185,  38,   5,   1],
       [  0,   0,   7,  75,  27,   0,   0],
       [  0,   0,   0,  10,   6,   5,   0],
       [  0,   0,   0,   2,   0,   0,   0]])

In [269]:
clf.classes_

AttributeError: 'AdaBoostClassifier' object has no attribute 'classes_'

In [209]:
clf.feature_importances_

array([ 0.05966777,  0.07152935,  0.09708522,  0.06851571,  0.07382377,
        0.07950466,  0.10435181,  0.08513251,  0.06454349,  0.07548736,
        0.080881  ,  0.13813066,  0.00134668])

In [11]:
[clf.tree_.max_depth for clf in clf.estimators_]

AttributeError: 'RandomForestClassifier' object has no attribute 'estimators_'

In [21]:
clf = RandomForestClassifier(n_estimators=n_estimators, warm_start=True,
                                                 criterion=criteria, min_samples_split=min_samples_split,
                                                 min_samples_leaf=min_samples_leaf)

In [22]:
clf.max_depth