# submit5(特徴量追加)に対してパラメタチューニングする

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV 
# XGBoost
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import XGBClassifier

## データを加工する関数

In [2]:
def preprocess( data ):
    data2 = data.copy()
    # doorsとpersonsに非数値があるため、置換する
    # 置換後の型がobjectになっているのでintに変換する
    data2['doors'] = data2['doors'].replace({"5more":"5"}).astype(int)
    data2['persons'] = data2['persons'].replace({"more":'5'}).astype(int)
    
    # buying, maint, lug_boot, safetyをそれぞれ数値に変換
    data2['buying']=data2['buying'].map({'low': 1, 'med': 2, 'high': 3, 'vhigh': 4});
    data2['maint']=data2['maint'].map({'low': 1, 'high': 2, 'med': 3, 'vhigh': 4});
    data2['lug_boot']=data2['lug_boot'].map({'small': 1, 'med': 2, 'big': 3});
    data2['safety']=data2['safety'].map({'low': 1, 'med': 2, 'high': 3});
    
    # 定員とトランクの大きさを足して容量とする。
    data2['capacity']=data2['persons']+data2['lug_boot']
    
    # 売値から整備代を引いて、コストとする
    data2['cost']=data2['buying']-data2['maint']
    
    return data2

## 訓練データを読み込み、目的変数と説明変数に分ける

In [3]:
train = pd.read_csv("data/train.tsv", sep='\t')
Y_train = train['class'].copy()
Y_train = Y_train.map({'unacc':1, 'acc':2, 'good':3, 'vgood':4 })
X_train = preprocess( train.loc[:, ["buying", "maint", 'doors', 'persons',"lug_boot", "safety"]] )

## パラメタチューニングする

In [4]:
initial_params = {
}
params={
    'max_depth':[2,3,4,5,6],
    'min_child_weight':[1,2,3,4]
}

In [5]:
gs = GridSearchCV(estimator = XGBClassifier(**initial_params,seed=42),param_grid=params, cv=5,n_jobs=-1)
gs.fit(X_train, Y_train)
print(gs.best_params_)
print(gs.best_score_)

{'max_depth': 6, 'min_child_weight': 1}
0.967592592593


In [6]:
initial_params = {
    'min_child_weight':1
}
params={
    'max_depth':[5,6,7,8,9,10],
}

In [7]:
gs = GridSearchCV(estimator = XGBClassifier(**initial_params,seed=42),param_grid=params, cv=5,n_jobs=-1)
gs.fit(X_train, Y_train)
print(gs.best_params_)
print(gs.best_score_)

{'max_depth': 7}
0.971064814815


In [8]:
initial_params = {
    'min_child_weight':1,
    'max_depth':8,
}
params={
    'gamma':[0.0,0.1,0.2],
    'subsample':[0.5,0.6,0.7,0.8,0.9,1.0]
}

In [9]:
gs = GridSearchCV(estimator = XGBClassifier(**initial_params,seed=42),param_grid=params, cv=5,n_jobs=-1)
gs.fit(X_train, Y_train)
print(gs.best_params_)
print(gs.best_score_)

{'gamma': 0.1, 'subsample': 1.0}
0.974537037037


In [10]:
initial_params = {
    'min_child_weight':1,
    'max_depth':8,
    'subsample':0.9
}
params={
    'gamma':[0.1,0.2,0.3,0.4,0.5,0.75,1.0],
}
gs = GridSearchCV(estimator = XGBClassifier(**initial_params,seed=42),param_grid=params, cv=5,n_jobs=-1)
gs.fit(X_train, Y_train)
print(gs.best_params_)
print(gs.best_score_)

{'gamma': 0.1}
0.969907407407


In [11]:
initial_params = {
    'min_child_weight':1,
    'max_depth':8,
    'gamma':0.2,
    'subsample':0.9,
}
params = {
    'colsample_bytree':[0.6,0.7,0.8,0.9,1.0],
    'learning_rate':[0.5, 0.2, 0.1, 0.05]
}

In [12]:
gs = GridSearchCV(estimator = XGBClassifier(**initial_params,seed=42),param_grid=params, cv=5,n_jobs=-1)
gs.fit(X_train, Y_train)
print(gs.best_params_)
print(gs.best_score_)

{'colsample_bytree': 1.0, 'learning_rate': 0.5}
0.972222222222


In [13]:
initial_params = {
    'min_child_weight':1,
    'max_depth':8,
    'gamma':0.2,
    'subsample':0.9,
    'colsample_bytree': 0.9,
    'learning_rate': 0.2,
}
params = {
    'n_estimators':[400,500,600,700,800,900,1000]
}

In [14]:
gs = GridSearchCV(estimator = XGBClassifier(**initial_params,seed=42),param_grid=params, cv=5,n_jobs=-1)
gs.fit(X_train, Y_train)
print(gs.best_params_)
print(gs.best_score_)

{'n_estimators': 400}
0.971064814815


In [15]:
initial_params = {
    'min_child_weight':1,
    'max_depth':8,
    'gamma':0.2,
    'subsample':0.9,
    'colsample_bytree': 0.9,
    'learning_rate': 0.2,
}
params = {
    'n_estimators':[100,200,400,300,500,600,700]
}

In [16]:
gs = GridSearchCV(estimator = XGBClassifier(**initial_params,seed=42),param_grid=params, cv=5,n_jobs=-1)
gs.fit(X_train, Y_train)
print(gs.best_params_)
print(gs.best_score_)

{'n_estimators': 400}
0.971064814815


### 上記までで決定したパラメタで学習する

In [17]:
params = {
    'n_estimators': 100,
    'min_child_weight':1,
    'max_depth':8,
    'gamma':0.2,
    'subsample':0.9,
    'colsample_bytree': 0.9,
    'learning_rate': 0.2,
}
xgboost_opt = XGBClassifier(**params, seed=42)
xgboost_opt.fit(X_train,Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.9, gamma=0.2, learning_rate=0.2,
       max_delta_step=0, max_depth=8, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=42, silent=True,
       subsample=0.9)

## テストデータを読み、訓練データと同じ加工をする

In [18]:
test = pd.read_csv('data/test.tsv', sep='\t')
X_test = preprocess( test.loc[:, ["buying", "maint", 'doors', 'persons',"lug_boot", "safety"]] )

In [19]:
Y_test = xgboost_opt.predict(X_test)

In [20]:
submit = pd.read_csv('data/sample_submit.csv',names=['id','result'])
submit['result']=Y_test
submit['result']=submit['result'].map({1:'unacc', 2:'acc', 3:'good', 4:'vgood'})
submit.to_csv('output/submit6.csv',  header=False, index=False)

### 結果確認用のファイルを生成する

In [21]:
X_test['result']=submit['result']
X_test['id']=submit['id']
out=X_test.loc[:,["id","buying", "maint", 'doors', 'persons',"lug_boot", "safety","result"]]
out.to_csv('output/submit6_confirmation.csv',header=True)