In [23]:
# データ解析のライブラリ
import pandas as pd
import numpy as np 

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import GridSearchCV 

In [2]:
X_train = pd.read_csv('train_x.csv')
Y_train = pd.read_csv('train_y.csv')
X_test = pd.read_csv('test_x.csv')
Y_test = pd.read_csv('test_y.csv')
X_train_in = X_train.values
Y_train_in = Y_train.values.ravel()
X_test_in = X_test.values
Y_test_in = Y_test.values.ravel()

In [3]:
rf = RandomForestClassifier(n_estimators = 300, random_state = 42)
rf.fit(X_train_in, Y_train_in)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [4]:
Y_train_pred = rf.predict(X_train_in)
cmat = confusion_matrix(Y_train_pred, Y_train_in, labels=[0,1])
print('訓練データ')
print((cmat[0,0]+cmat[1,1])/len(Y_train_pred))
print(cmat)
Y_test_pred = rf.predict(X_test_in)
cmat = confusion_matrix(Y_test_pred, Y_test_in, labels=[0,1])
print("検証データ")
print((cmat[0,0]+cmat[1,1])/len(Y_test_pred))
print(cmat)

訓練データ
1.0
[[21558     0]
 [    0  2857]]
検証データ
0.909325469959
[[2338  188]
 [  58  129]]


### 過学習になっている。訓練データと検証データで同程度の正答率を探す

In [18]:
rf = RandomForestClassifier(n_estimators = 100, max_depth=8, random_state = 42)
rf.fit(X_train_in, Y_train_in)
Y_train_pred = rf.predict(X_train_in)
cmat = confusion_matrix(Y_train_pred, Y_train_in, labels=[0,1])
print('訓練データ')
print((cmat[0,0]+cmat[1,1])/len(Y_train_pred))
print(cmat)
Y_test_pred = rf.predict(X_test_in)
cmat = confusion_matrix(Y_test_pred, Y_test_in, labels=[0,1])
print("検証データ")
print((cmat[0,0]+cmat[1,1])/len(Y_test_pred))
print(cmat)

訓練データ
0.900675814049
[[21462  2329]
 [   96   528]]
検証データ
0.892738665684
[[2384  279]
 [  12   38]]


In [9]:
rf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

## GridSearchCVを使ってみる

In [30]:
X_train = pd.read_csv('train_all_x.csv')
Y_train = pd.read_csv('train_all_y.csv')
X_train_in = X_train.values
Y_train_in = Y_train.values.ravel()

In [31]:
initial_params = {
    'random_state':42,
}
params = {
    'max_depth':[5,7,9,11,13,15],
    'n_estimators':[10,20,50,100]
}

In [32]:
grid = GridSearchCV( estimator=RandomForestClassifier(**initial_params),
                    param_grid=params,
                    scoring="accuracy",
                    cv=3,
                    verbose=1,
                    n_jobs=-1)
grid.fit(X_train_in, Y_train_in)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   19.2s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   44.4s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [5, 7, 9, 11, 13, 15], 'n_estimators': [10, 20, 50, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [33]:
print(grid.best_params_)
print(grid.best_score_)

{'max_depth': 15, 'n_estimators': 50}
0.902901168577


In [35]:
initial_params = {
    'random_state':42,
}
params = {
    'max_depth':[13,15,20,30],
    'n_estimators':[30,40,50,60,70,80,90]
}

In [36]:
grid = GridSearchCV( estimator=RandomForestClassifier(**initial_params),
                    param_grid=params,
                    scoring="accuracy",
                    cv=3,
                    verbose=1,
                    n_jobs=-1)
grid.fit(X_train_in, Y_train_in)

Fitting 3 folds for each of 28 candidates, totalling 84 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   39.6s
[Parallel(n_jobs=-1)]: Done  84 out of  84 | elapsed:  1.5min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [13, 15, 20, 30], 'n_estimators': [30, 40, 50, 60, 70, 80, 90]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [37]:
print(grid.best_params_)
print(grid.best_score_)

{'max_depth': 30, 'n_estimators': 80}
0.904744350647


In [38]:
initial_params = {
    'random_state':42,
    'n_estimators':80
}
params = {
    'max_depth':[25,30,35,40,50,75,100,200],
}

In [40]:
grid = GridSearchCV( estimator=RandomForestClassifier(**initial_params),
                    param_grid=params,
                    scoring="accuracy",
                    cv=5,
                    verbose=1,
                    n_jobs=-1)
grid.fit(X_train_in, Y_train_in)
print(grid.best_params_)
print(grid.best_score_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.4min finished


{'max_depth': 30}
0.90481807793


In [41]:
initial_params = {
    'random_state':42,
    'max_depth':30
}
params = {
    'n_estimators':[70,80,90,100,150],
}

In [42]:
grid = GridSearchCV( estimator=RandomForestClassifier(**initial_params),
                    param_grid=params,
                    scoring="accuracy",
                    cv=5,
                    verbose=1,
                    n_jobs=-1)
grid.fit(X_train_in, Y_train_in)
print(grid.best_params_)
print(grid.best_score_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  1.5min finished


{'n_estimators': 150}
0.905223577985


In [43]:
initial_params = {
    'random_state':42,
    'max_depth':30
}
params = {
    'n_estimators':[140,150,200,300,500],
}

In [44]:
grid = GridSearchCV( estimator=RandomForestClassifier(**initial_params),
                    param_grid=params,
                    scoring="accuracy",
                    cv=5,
                    verbose=1,
                    n_jobs=-1)
grid.fit(X_train_in, Y_train_in)
print(grid.best_params_)
print(grid.best_score_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  3.0min finished


{'n_estimators': 150}
0.905223577985


In [45]:
initial_params = {
    'random_state':42,
    'max_depth':30
}
params = {
    'n_estimators':[145,150,160,170,180,190],
}

In [46]:
grid = GridSearchCV( estimator=RandomForestClassifier(**initial_params),
                    param_grid=params,
                    scoring="accuracy",
                    cv=5,
                    verbose=1,
                    n_jobs=-1)
grid.fit(X_train_in, Y_train_in)
print(grid.best_params_)
print(grid.best_score_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  3.0min finished


{'n_estimators': 145}
0.905223577985


In [47]:
params = {
    'random_state':42,
    'max_depth':30,
    'n_estimators':145
}
rf = RandomForestClassifier(**params)
rf.fit(X_train_in, Y_train_in)

X_test = pd.read_csv('pred_x.csv')
X_test_in = X_test.values
Y_test_pred = rf.predict_proba(X_test_in)

submit_file = pd.read_csv('data/submit_sample.csv', names=['id','prob'])
t = Y_test_pred[:,1]
submit_file['prob']=t
submit_file.to_csv('submit9_RF_python2.csv', columns=['id','prob'], header=False, index=False)

In [49]:
submit_file[29:50]

Unnamed: 0,id,prob
29,30,0.063188
30,31,0.01476
31,32,0.379592
32,33,0.324183
33,34,0.013916
34,35,0.082771
35,36,0.18142
36,37,0.264204
37,38,0.22069
38,39,0.130264
