In [5]:
import pandas as pd

In [6]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import time

In [7]:
df_wines = pd.read_csv('completoWine.csv')

In [8]:
X = df_wines.drop('quality', axis=1)
Y = df_wines.quality

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8)



In [9]:
df_wines.groupby('quality').size()

quality
3      30
4     216
5    2138
6    2836
7    1079
8     193
9       5
dtype: int64

In [15]:
df_scores = pd.DataFrame({})
models = []

In [16]:
MAX_DEPTH = 50
CRITERIA=['gini', 'entropy']
MIN_SAMPLES_SPLIT = [2,4,8]
MIN_SAMPLES_LEAF = [2,4,8]
MAX_FEATURES = [2,8,13]
N_ESTIMATORS=[100,500,900]

In [17]:
for min_samples_split in MIN_SAMPLES_SPLIT:
    for min_samples_leaf in MIN_SAMPLES_LEAF:
        for max_feature in MAX_FEATURES:
            for n_estimators in N_ESTIMATORS:
                start = time.clock()
                clf = RandomForestClassifier(n_estimators=n_estimators, warm_start=True,
                                             criterion=criteria, min_samples_split=min_samples_split,
                                             min_samples_leaf=min_samples_leaf)
                scores_train = cross_val_score(clf, X_train, Y_train, scoring='f1_micro', cv=3)
                end = time.clock()
                models.append(clf)
                Y_predict = cross_val_predict(clf, X_test, Y_test, cv=3)
                f1_score_test = f1_score(Y_test, Y_predict, average='micro')
                print(min_samples_split, min_samples_leaf, max_feature, n_estimators, scores_train.mean(), f1_score_test)
                df_scores = df_scores.append({'time': start-end,
                                              'min_samples_leaf': min_samples_leaf,
                                              'min_samples_split': min_samples_split,
                                              'max_feature': max_feature,
                                              'n_estimators': n_estimators,
                                              'f1_micro_mean_train': scores_train.mean(),
                                              'f1_micro_mean_test': f1_score_test,
                                              'f1_micro_std_train': scores_train.std()}, 
                                             ignore_index=True)
                                



2 2 2 100 0.631341391628 0.564615384615




2 2 2 500 0.638080790027 0.568461538462




2 2 2 900 0.63750331446 0.566923076923




2 2 8 100 0.635960112551 0.57




2 2 8 500 0.636156567375 0.57




2 2 8 900 0.637886337632 0.573846153846




2 2 13 100 0.633077045425 0.565384615385




2 2 13 500 0.635577420232 0.569230769231




2 2 13 900 0.637504533358 0.569230769231




2 4 2 100 0.619032089387 0.552307692308




2 4 2 500 0.616913404723 0.561538461538




2 4 2 900 0.616528607384 0.565384615385




2 4 8 100 0.619992706104 0.564615384615




2 4 8 500 0.619989817552 0.557692307692




2 4 8 900 0.617685119809 0.56




2 4 13 100 0.612679156057 0.555384615385




2 4 13 500 0.616145359796 0.559230769231




2 4 13 900 0.617104198499 0.559230769231




2 8 2 100 0.593436442236 0.543076923077




2 8 2 500 0.59670663806 0.554615384615




2 8 2 900 0.597092661991 0.546153846154




2 8 8 100 0.594593734346 0.566153846154




2 8 8 500 0.593434553938 0.546153846154




2 8 8 900 0.59651329428 0.546153846154




2 8 13 100 0.596135707739 0.549230769231




2 8 13 500 0.596321612458 0.551538461538




2 8 13 900 0.595748021849 0.553846153846




4 2 2 100 0.63903518658 0.553846153846




4 2 2 500 0.638078570875 0.563846153846




4 2 2 900 0.63538918223 0.578461538462




4 2 8 100 0.632304560273 0.565384615385




4 2 8 500 0.634805486501 0.570769230769




4 2 8 900 0.63827169024 0.573076923077




4 2 13 100 0.637691550538 0.555384615385




4 2 13 500 0.636725160566 0.563846153846




4 2 13 900 0.637115073301 0.569230769231




4 4 2 100 0.614606712244 0.556923076923




4 4 2 500 0.619416000604 0.559230769231




4 4 2 900 0.618067587712 0.564615384615




4 4 8 100 0.619990034274 0.553076923077




4 4 8 500 0.620760306046 0.561538461538




4 4 8 900 0.617682233181 0.567692307692




4 4 13 100 0.61595457179 0.561538461538




4 4 13 500 0.618646512363 0.555384615385




4 4 13 900 0.615951017686 0.554615384615




4 8 2 100 0.597091104546 0.544615384615




4 8 2 500 0.596130824453 0.549230769231




4 8 2 900 0.595168656061 0.548461538462




4 8 8 100 0.596125157634 0.544615384615




4 8 8 500 0.592471940562 0.553076923077




4 8 8 900 0.596322057443 0.545384615385




4 8 13 100 0.593630674062 0.548461538462




4 8 13 500 0.593050530513 0.550769230769




4 8 13 900 0.594976088118 0.55




8 2 2 100 0.62191215383 0.555384615385




8 2 2 500 0.624418081323 0.571538461538




8 2 2 900 0.626918454205 0.563846153846




8 2 8 100 0.627494376174 0.568461538462




8 2 8 500 0.624608536552 0.563846153846




8 2 8 900 0.624994782975 0.563076923077




8 2 13 100 0.627883840078 0.560769230769




8 2 13 500 0.625957173859 0.567692307692




8 2 13 900 0.628457212042 0.561538461538




8 4 2 100 0.61498940264 0.559230769231




8 4 2 500 0.61980213482 0.563076923077




8 4 2 900 0.619221768778 0.555384615385




8 4 8 100 0.615181968659 0.56




8 4 8 500 0.617492772435 0.562307692308




8 4 8 900 0.620761083807 0.566923076923




8 4 13 100 0.616334151143 0.555384615385




8 4 13 500 0.615565438739 0.568461538462




8 4 13 900 0.621534013944 0.557692307692




8 8 2 100 0.592088476253 0.553076923077




8 8 2 500 0.59459072974 0.546923076923




8 8 2 900 0.593627897719 0.55




8 8 8 100 0.589586768417 0.546153846154




8 8 8 500 0.591318316689 0.550769230769




8 8 8 900 0.594207598207 0.558461538462




8 8 13 100 0.596132716598 0.553846153846




8 8 13 500 0.596317839709 0.551538461538




8 8 13 900 0.597858377483 0.554615384615


In [18]:
df_scores.sort_values('f1_micro_mean_test', ascending=False)

Unnamed: 0,f1_micro_mean_test,f1_micro_mean_train,f1_micro_std_train,max_feature,min_samples_leaf,min_samples_split,n_estimators,time
29,0.578462,0.635389,0.019418,2.0,2.0,4.0,900.0,-44.315380
5,0.573846,0.637886,0.016668,8.0,2.0,2.0,900.0,-63.832949
32,0.573077,0.638272,0.016081,8.0,2.0,4.0,900.0,-43.956533
55,0.571538,0.624418,0.016970,2.0,2.0,8.0,500.0,-33.945936
31,0.570769,0.634805,0.014702,8.0,2.0,4.0,500.0,-28.568849
3,0.570000,0.635960,0.015295,8.0,2.0,2.0,100.0,-7.066481
4,0.570000,0.636157,0.018949,8.0,2.0,2.0,500.0,-35.250809
7,0.569231,0.635577,0.016467,13.0,2.0,2.0,500.0,-35.020282
8,0.569231,0.637505,0.018597,13.0,2.0,2.0,900.0,-63.371226
35,0.569231,0.637115,0.017095,13.0,2.0,4.0,900.0,-44.372366


In [19]:
df_scores.to_csv('random_forests.csv')