In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV 


In [3]:
# Load the preprocessed training table from the output directory.
train = pd.read_csv(filepath_or_buffer="output/train_preprocessed.csv")

In [4]:
# Target labels: the "y" column of the training frame, kept as a Series.
Y_train = train.loc[:, "y"]

In [5]:
# Feature matrix: all rows, restricted to the six predictor columns in this order.
X_train = train.loc[
    :,
    ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'],
]

In [13]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,1,2,3,2,1,1
1,1,3,3,5,1,2
2,4,3,5,2,1,2
3,3,3,3,5,3,2
4,3,3,3,2,2,3


In [6]:
# Flatten the target Series into a 1-D NumPy array for scikit-learn.
y_train = Y_train.values.reshape(-1)

In [15]:
# Baseline forest: 300 trees, every other setting left at its default,
# with the seed pinned to 42.
RF = RandomForestClassifier(random_state=42, n_estimators=300)
RF.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [16]:
# Predict on the same rows the forest was fitted on (training-set predictions).
y_train_pred = RF.predict(X=X_train)

In [17]:
from sklearn.metrics import confusion_matrix 

In [18]:
# Confusion matrix on the training data.
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# true classes and columns are the predicted classes, so the true labels
# must be passed first.
cmat = confusion_matrix(Y_train, y_train_pred, labels=[1, 2, 3, 4])
print(cmat)
# Training accuracy: correctly classified samples (the diagonal) divided by
# the number of predictions. np.trace sums the diagonal without hard-coding
# the number of classes.
np.trace(cmat) / len(y_train_pred)

[[591   0   0   0]
 [  0 206   0   0]
 [  0   0  34   0]
 [  0   0   0  33]]


1.0

## 訓練データでの正解率が1.0なので、おそらく過学習している。GridSearchCVで過学習を抑えられるハイパーパラメータを探す。

In [11]:
# Constructor arguments held fixed for every candidate forest.
initial_params = dict(random_state=42)

# Search grid: four tree depths crossed with four forest sizes (300..600 in steps of 100).
params = {
    'max_depth': [10, 15, 20, 30],
    'n_estimators': list(range(300, 601, 100)),
}

In [12]:
# Exhaustive search over `params`: every combination is scored by accuracy
# using 5-fold cross-validation, with n_jobs=-1 to run the fits in parallel.
grid = GridSearchCV(
    RandomForestClassifier(**initial_params),
    params,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1,
)
grid.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   42.0s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [10, 15, 20, 30], 'n_estimators': [300, 400, 500, 600]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [14]:
# Report the winning parameter combination and its mean cross-validated score,
# one per line.
print(grid.best_params_, grid.best_score_, sep="\n")

{'max_depth': 10, 'n_estimators': 300}
0.958333333333


### 前処理済みのtestデータを読み込み、特徴量を取り出す

In [17]:
# Load the preprocessed test table from the output directory.
test = pd.read_csv("output/test_preprocessed.csv")
# Select exactly the columns (and column order) the model was trained on.
# Reusing X_train.columns instead of repeating the literal list keeps the
# train and test feature sets from drifting apart.
X_test = test.loc[:, X_train.columns]

In [18]:
# Preview the first five rows of the test features.
X_test.head(n=5)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,3,4,2,5,3,1
1,4,4,5,4,1,3
2,2,3,2,2,1,1
3,2,2,4,4,3,2
4,3,3,2,4,2,1


In [22]:
# Predicted labels for the test rows, produced through the fitted grid search
# (GridSearchCV.predict uses the estimator refit with the best parameters).
y_test = grid.predict(X=X_test)

In [39]:
# Read the sample submission. `names` supplies the column labels, and
# header=None makes explicit that the first line of the file is data.
submit_file = pd.read_csv(
    'data/sample_submit.csv',
    header=None,
    names=['id', 'result'],
)

In [41]:
# Replace the 'result' column read from the sample file with the test-set
# predictions. NOTE(review): assumes the sample file lists ids in the same
# row order (and count) as X_test — confirm.
submit_file['result']=y_test

In [43]:
# Write the submission as bare "id,result" rows: no header line, no index column.
submit_file.to_csv(
    path_or_buf='submit2_RF_on_python.csv',
    index=False,
    header=False,
)