# 特徴量を追加する

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV 
# XGBoost
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import XGBClassifier

## データを加工する関数

In [2]:
def preprocess( data ):
    data2 = data.copy()
    # doorsとpersonsに非数値があるため、置換する
    # 置換後の型がobjectになっているのでintに変換する
    data2['doors'] = data2['doors'].replace({"5more":"5"}).astype(int)
    data2['persons'] = data2['persons'].replace({"more":'5'}).astype(int)
    
    # buying, maint, lug_boot, safetyをそれぞれ数値に変換
    data2['buying']=data2['buying'].map({'low': 1, 'med': 2, 'high': 3, 'vhigh': 4});
    data2['maint']=data2['maint'].map({'low': 1, 'med': 2, 'high': 3, 'vhigh': 4});
    data2['lug_boot']=data2['lug_boot'].map({'small': 1, 'med': 2, 'big': 3});
    data2['safety']=data2['safety'].map({'low': 1, 'med': 2, 'high': 3});
    
    # 定員とトランクの大きさを足して容量とする。
    data2['capacity']=data2['persons']+data2['lug_boot']
    
    # 売値から整備代を引いて、コストとする
    data2['cost']=data2['buying']-data2['maint']
    
    return data2

## 訓練データを読み込み、目的変数と説明変数に分ける

In [3]:
train = pd.read_table("data/train.tsv")
Y_train = train['class'].copy()
Y_train = Y_train.map({'unacc':1, 'acc':2, 'good':3, 'vgood':4 })
X_train = preprocess( train.loc[:, ["buying", "maint", 'doors', 'persons',"lug_boot", "safety"]] )

### ハイパーパラメタのチューニング

In [4]:
params = {
    'n_estimators':600,
    'max_depth':4,
    'min_child_weight':9,
    'gamma':0,
    'subsample':1.0,
    'colsample_bytree':0.6,
    'learning_rate':0.1
}
xgboost_opt = XGBClassifier(**params, seed=42)
xgboost_opt.fit(X_train,Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=4, min_child_weight=9, missing=None, n_estimators=600,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42, silent=True,
       subsample=1.0)

## テストデータを読み、訓練データと同じ加工をする

In [5]:
test = pd.read_table('data/test.tsv')
X_test = preprocess( test.loc[:, ["buying", "maint", 'doors', 'persons',"lug_boot", "safety"]] )

In [6]:
Y_test = xgboost_opt.predict(X_test)

In [7]:
submit = pd.read_csv('data/sample_submit.csv',names=['id','result'])
submit['result']=Y_test
submit['result']=submit['result'].map({1:'unacc', 2:'acc', 3:'good', 4:'vgood'})
submit.to_csv('output/submit5.csv',  header=False, index=False)

### 結果確認用のファイルを生成する

In [14]:
X_test['result']=submit['result']
X_test['id']=submit['id']
out=X_test.loc[:,["id","buying", "maint", 'doors', 'persons',"lug_boot", "safety","result"]]
out.to_csv('output/submit5_confirmation.csv',header=True)