# 複数の乱数でアンサンブル学習をする

二つの関数を作る
- データの前処理関数
- 指定の乱数で学習、予測して結果を返す関数

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV 
# XGBoost
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import XGBClassifier
#
import random
import sys

In [2]:
def preprocess( data ):
    """Return a numerically encoded copy of the car-evaluation features.

    The input frame is not modified. It must contain the columns
    ``buying``, ``maint``, ``doors``, ``persons``, ``lug_boot`` and
    ``safety``. Two derived columns, ``capacity`` and ``cost``, are added.

    Category values that are missing from the mappings below become NaN,
    which is the behaviour of ``Series.map`` with a dict.
    """
    data2 = data.copy()

    # 'doors' and 'persons' contain the non-numeric tokens "5more"/"more";
    # replace them with "5" and convert the column to int.
    data2['doors'] = data2['doors'].replace({"5more": "5"}).astype(int)
    data2['persons'] = data2['persons'].replace({"more": '5'}).astype(int)

    # Ordinal encodings. 'buying' and 'maint' share one price scale so that
    # their difference below is meaningful (the previous 'maint' mapping had
    # 'high' and 'med' swapped).
    price_levels = {'low': 1, 'med': 2, 'high': 3, 'vhigh': 4}
    data2['buying'] = data2['buying'].map(price_levels)
    data2['maint'] = data2['maint'].map(price_levels)
    data2['lug_boot'] = data2['lug_boot'].map({'small': 1, 'med': 2, 'big': 3})
    data2['safety'] = data2['safety'].map({'low': 1, 'med': 2, 'high': 3})

    # Capacity: passenger count plus luggage-boot size.
    data2['capacity'] = data2['persons'] + data2['lug_boot']

    # Cost: buying price level minus maintenance price level.
    data2['cost'] = data2['buying'] - data2['maint']

    return data2

In [3]:
def learn( X_train, Y_train, param, rnd_num ):
    """Fit and return an XGBClassifier seeded with ``rnd_num``.

    Parameters
    ----------
    X_train, Y_train
        Training features and labels, passed straight to ``fit``.
    param : dict
        Keyword arguments forwarded to the ``XGBClassifier`` constructor.
    rnd_num : int
        Random seed for this model.

    Returns
    -------
    The fitted ``XGBClassifier``.
    """
    # `random_state` is the documented seed argument of the scikit-learn
    # wrapper; `seed` is its deprecated spelling.
    model = XGBClassifier(**param, random_state=rnd_num)
    model.fit(X_train, Y_train)
    return model

In [4]:
# Raw feature columns used for both the training and the test set.
feature_cols = ["buying", "maint", 'doors', 'persons', "lug_boot", "safety"]

# Training set: encode the target classes as integers 1..4 and preprocess
# the feature columns.
# NOTE(review): recent XGBoost releases may require class labels to start
# at 0 — confirm against the installed version.
train = pd.read_csv("data/train.tsv", sep='\t')
Y_train = train['class'].map({'unacc':1, 'acc':2, 'good':3, 'vgood':4 })
X_train = preprocess(train.loc[:, feature_cols])

# Test set: same feature preprocessing; no target column is read.
test = pd.read_csv('data/test.tsv', sep='\t')
X_test = preprocess(test.loc[:, feature_cols])

In [5]:
# Hyper-parameters shared by every model in the ensemble.
init_param = {
    'n_estimators': 100,
    'min_child_weight':1,
    'max_depth':8,
    'gamma':0.2,
    'subsample':0.9,
    'colsample_bytree': 0.9,
    'learning_rate': 0.2,
}
max_iter = 1000

# Running sum of the predicted labels, one slot per test row.
# It must start at zero: `np.ndarray([])` allocates an *uninitialised* 0-d
# array, so it would add an arbitrary value to every prediction.
acc_ans = np.zeros(len(X_test))
for _ in range(max_iter):
    # Train one model per random seed and accumulate its predictions.
    num = random.randint(0, 65535)
    model = learn(X_train, Y_train, init_param, num )
    ans = model.predict(X_test)
    acc_ans = acc_ans + ans

# Ensemble answer: the mean predicted label, rounded to an integer class.
acc_ans = (acc_ans / max_iter).round().astype(int)

### 答えを用意する

In [6]:
# Integer class -> label string, the inverse of the encoding applied to the
# training target.
class_names = {1:'unacc', 2:'acc', 3:'good', 4:'vgood'}

# The sample submission is read without a header row, so the column names
# are supplied here.
submit = pd.read_csv('data/sample_submit.csv', names=['id', 'result'])

# Overwrite the placeholder results with the ensemble predictions, then
# decode them back to label strings.
submit['result'] = acc_ans
submit['result'] = submit['result'].map(class_names)

# Write the file without header or index.
submit.to_csv('output/submit7.csv', index=False, header=False)