## Cross Validationを取り入れる

In [1]:
import xgboost as xgb
from xgboost import XGBRegressor
# 事前準備処理
# x_train, y_train, x_eval, y_evalを作成する
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time

## Hyperparameters (unpacked into XGBRegressor as keyword arguments)
params = {
    'n_estimators':700,
    'max_depth':6,
    'min_child_weight':9,
    'gamma':0,
    'subsample':1.0,
    'colsample_bytree':0.6,
    'learning_rate':0.1
}

def mean_absolute_percentage_error( y_train: np.array, y_pred: np.array):
    """Return the mean absolute percentage error (MAPE) in percent.

    Computes ``100 * sum(|y_train - y_pred| / y_train) / len(y_train)`` over
    the first axis, vectorised with NumPy.

    Args:
        y_train: Reference values the predictions are scored against. Must be
            non-empty; zero entries produce inf/nan.
        y_pred: Predicted values, one per row of ``y_train``. Rows beyond
            ``len(y_train)`` are ignored.

    Returns:
        A scalar for 1-D inputs. If either input has extra trailing axes
        (e.g. ``DataFrame.values`` of shape (n, 1)), the result keeps those
        axes, i.e. one score per column.

    Raises:
        ValueError: If ``y_train`` is empty.
        IndexError: If ``y_pred`` has fewer rows than ``y_train``.
    """
    actual = np.asarray(y_train, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    n = len(actual)
    if n == 0:
        raise ValueError("y_train must contain at least one value")
    if len(predicted) < n:
        raise IndexError("y_pred has fewer rows than y_train")
    predicted = predicted[:n]
    # Pad the lower-rank array with trailing length-1 axes so rows are paired
    # one-to-one; plain broadcasting of (n, 1) against (n,) would silently
    # build an (n, n) matrix instead.
    rank = max(actual.ndim, predicted.ndim)
    actual = actual.reshape(actual.shape + (1,) * (rank - actual.ndim))
    predicted = predicted.reshape(predicted.shape + (1,) * (rank - predicted.ndim))
    return 100 * np.sum(np.abs(actual - predicted) / actual, axis=0) / n

# Load the v12 feature table, the v12 target table and train_genba.tsv (tab-separated)
train_x = pd.read_csv("data/processed_train_goto_x_v12.csv")
train_y = pd.read_csv("data/processed_train_goto_y_v12.csv")
train_g = pd.read_csv("data/train_genba.tsv", sep='\t')

In [2]:
## train_x, train_y, org_genbaを受けとり、genbaデータで分割を行ったX_train, X_eval, Y_train, Y_evalを返す
from sklearn.model_selection import train_test_split
def split_by_genba( genba, train_x, train_y, train_s, test_s, rs ):
    """Split train_x/train_y into train and eval parts by ``pj_no`` group.

    The ``pj_no`` values listed in ``genba`` are split with
    ``train_test_split``; every row of ``train_x`` then goes to the side its
    ``pj_no`` fell on, so rows sharing a ``pj_no`` are not divided between
    train and eval (assuming ``pj_no`` is unique within ``genba``).

    Args:
        genba: DataFrame with a ``pj_no`` column.
        train_x: Feature DataFrame with ``id`` and ``pj_no`` columns.
        train_y: Target DataFrame with an ``id`` column.
        train_s: ``train_size`` forwarded to ``train_test_split``.
        test_s: ``test_size`` forwarded to ``train_test_split``.
        rs: ``random_state`` forwarded to ``train_test_split``.

    Returns:
        Tuple ``(X_train, X_eval, Y_train, Y_eval)``. Each Y frame holds the
        ``train_y`` rows for the ids of the matching X frame, in the X frame's
        row order, with ``id`` as its first column. Ids missing from
        ``train_y`` are absent from Y.
    """
    # Only the project numbers need to be split; the split indices depend on
    # the number of rows and random_state, not on which columns are passed.
    pj_train, pj_eval = train_test_split(genba['pj_no'], train_size = train_s, test_size = test_s, random_state = rs)
    X_train = pd.merge(pj_train, train_x,  how='inner', on='pj_no')
    X_eval = pd.merge(pj_eval, train_x,  how='inner', on='pj_no')
    # Look the targets up through the ids of the X frames (an inner merge keeps
    # the order of the left keys) so that row k of Y belongs to row k of X even
    # when train_x and train_y are stored in different row orders.
    Y_train = pd.merge(X_train[['id']], train_y, how='inner', on='id')
    Y_eval = pd.merge(X_eval[['id']], train_y, how='inner', on='id')

    return(X_train, X_eval, Y_train, Y_eval)

In [None]:
# One group-based 80/20 split with a fixed random_state; prints the number of training target rows
X_train, X_eval, Y_train, Y_eval = split_by_genba( train_g, train_x, train_y, train_s = 0.8, test_s = 0.2, rs = 11)
print(len(Y_train))

In [None]:
 # Plain row-wise 80/20 split; running this cell replaces X_train/X_eval/Y_train/Y_eval from the group-based split
 X_train, X_eval, Y_train, Y_eval = train_test_split( train_x, train_y, train_size=0.8, test_size=0.2, random_state = 1)

In [3]:
# Repeat the group-based 80/20 split with 10 random split seeds and train one model per split.
# error  : list of (split seed, MAPE on the eval part), one entry per iteration
# models : fitted models, in the same order as `error`
error = []
models = []
for i in range(10):
    # Random seed for this split; it is printed and stored below so the split can be re-created
    s = np.random.randint(2143486417,high=None)
    #X_train, X_eval, Y_train, Y_eval = train_test_split( train_x, train_y, train_size=0.8, test_size=0.2, random_state = s)
    X_train, X_eval, Y_train, Y_eval = split_by_genba(train_g, train_x, train_y, train_s=0.8, test_s=0.2, rs = s)
    
    # Drop the id/pj_no columns before fitting
    x_train = X_train.drop(['id','pj_no'],axis=1)
    y_train = Y_train.drop(['id'],axis=1)
    x_eval = X_eval.drop(['id','pj_no'],axis=1)

    # The model seed is fixed, so only the data split differs between iterations
    model = XGBRegressor(**params, seed=19711022, n_jobs=-1)
    model.fit(x_train, y_train )
    pred = model.predict(x_eval)
    
    # Rows of `pred` are compared positionally with Y_eval['keiyaku_pr']
    e = mean_absolute_percentage_error(Y_eval['keiyaku_pr'].values, pred)
    
    d = (s, e)
    print(d)
    error.append(d)
    models.append(model)

(1331772230, 10.139171265157536)
(1628932642, 10.119895243840316)
(1804610804, 10.750349810344487)
(1248366107, 10.133013824816778)
(1209639770, 11.6642383455501)
(1262238905, 10.802457851738122)
(305833057, 11.375713745279832)
(1939017787, 11.116624664611042)
(903097299, 11.189529106416034)
(1837115670, 10.080631547187274)


In [None]:
# Put the predictions of the most recent split next to its targets.
# concat(axis=1) pairs rows by index label, so Y_eval is given a fresh 0..n-1
# index to match predict_y; a shuffled Y_eval index would otherwise pair the
# wrong rows and introduce NaN.
predict_y = pd.DataFrame(pred, columns=['predict_pr'])
Y_eval_pred = pd.concat([Y_eval.reset_index(drop=True), predict_y], axis=1)

In [None]:
# Per-row percentage error: absolute value in 'mean_error(abs)', signed value in 'mean_error'
Y_eval_pred['mean_error(abs)']=abs(Y_eval_pred['keiyaku_pr']-Y_eval_pred['predict_pr'])/Y_eval_pred['keiyaku_pr']*100
Y_eval_pred['mean_error']=(Y_eval_pred['keiyaku_pr']-Y_eval_pred['predict_pr'])/Y_eval_pred['keiyaku_pr']*100
# Cell output: mean of the absolute percentage errors
Y_eval_pred['mean_error(abs)'].mean()

In [None]:
# Join the eval features with targets/predictions/errors on id and save them for inspection
out = pd.merge(X_eval, Y_eval_pred, how='inner', on = 'id')
out.to_csv("data/difference_v12.csv")

In [4]:
test_x = pd.read_csv("data/processed_test_goto_x_v12.csv")
x_test = test_x.drop(['id','pj_no'],axis=1)

# Select the model with the highest eval MAPE from `error` (entries are
# (seed, mape) in the same order as `models`). A fixed index is only right for
# one particular run, because the split seeds are drawn at random.
worst_idx = max(range(len(error)), key=lambda k: error[k][1])
model = models[worst_idx]
ans = model.predict(x_test)

# Submission file: id and the prediction truncated to int64, tab-separated, no header
submit = pd.DataFrame(test_x[['id']])
submit['keiyaku_pr']=pd.Series(ans).astype(np.int64)
submit.to_csv('data/submit_v12_worst.tsv',sep='\t',header=None, index=False)

In [5]:
# Select the model with the lowest eval MAPE from `error` (entries are
# (seed, mape) in the same order as `models`) instead of a fixed index, which
# is only right for one particular random run.
best_idx = min(range(len(error)), key=lambda k: error[k][1])
model = models[best_idx]
ans = model.predict(x_test)

# Submission file: id and the prediction truncated to int64, tab-separated, no header
submit = pd.DataFrame(test_x[['id']])
submit['keiyaku_pr']=pd.Series(ans).astype(np.int64)
submit.to_csv('data/submit_v12_best.tsv',sep='\t',header=None, index=False)

# V9での実施事項
- 土地売りと建て売りとで分けて訓練、予測を行う

#### ⇒効果ないことが判明した
#### それよりも、効果のないカラムを削除した方が良いのかもしれない

# V8での提出結果
MAPE ... 10.46

### 気づき事項
- 路線ごとにerrorが異なるのではないか？
- 上記の結果では、路線ごとにモデルを作るべきなのかもしれない
- 訓練データでのMAPEが3.56に対して、土地売りだと4.87。土地売りは別モデルとして学習すべき？
- 異常値は除去すべきかもしれない。

## 7/20 実施事項
- V9データと、XGBoostの固定のハイパーパラメタを使い、XGBoostに与える乱数を変更して複数のモデルを作ることで、精度が向上するかを試す

In [None]:
import xgboost as xgb
from xgboost import XGBRegressor
# 事前準備処理
# x_train, y_train, x_eval, y_evalを作成する
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time


def mean_absolute_percentage_error( y_train: np.array, y_pred: np.array):
    """Return the mean absolute percentage error (MAPE) in percent.

    Computes ``100 * sum(|y_train - y_pred| / y_train) / len(y_train)`` over
    the first axis, vectorised with NumPy.

    Args:
        y_train: Reference values the predictions are scored against. Must be
            non-empty; zero entries produce inf/nan.
        y_pred: Predicted values, one per row of ``y_train``. Rows beyond
            ``len(y_train)`` are ignored.

    Returns:
        A scalar for 1-D inputs. If either input has extra trailing axes
        (e.g. ``DataFrame.values`` of shape (n, 1)), the result keeps those
        axes, i.e. one score per column.

    Raises:
        ValueError: If ``y_train`` is empty.
        IndexError: If ``y_pred`` has fewer rows than ``y_train``.
    """
    actual = np.asarray(y_train, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    n = len(actual)
    if n == 0:
        raise ValueError("y_train must contain at least one value")
    if len(predicted) < n:
        raise IndexError("y_pred has fewer rows than y_train")
    predicted = predicted[:n]
    # Pad the lower-rank array with trailing length-1 axes so rows are paired
    # one-to-one; plain broadcasting of (n, 1) against (n,) would silently
    # build an (n, n) matrix instead.
    rank = max(actual.ndim, predicted.ndim)
    actual = actual.reshape(actual.shape + (1,) * (rank - actual.ndim))
    predicted = predicted.reshape(predicted.shape + (1,) * (rank - predicted.ndim))
    return 100 * np.sum(np.abs(actual - predicted) / actual, axis=0) / n

def learn( train_x, train_y, params, s ):
    """Fit an XGBRegressor on the given data and return it.

    Args:
        train_x: Training features.
        train_y: Training targets.
        params: Keyword arguments unpacked into ``XGBRegressor``.
        s: Value passed to ``XGBRegressor`` as ``seed``.

    Returns:
        The fitted ``XGBRegressor`` (created with ``n_jobs=-1``).
    """
    regressor = XGBRegressor(n_jobs=-1, seed=s, **params)
    regressor.fit(train_x, train_y)
    return regressor

# Load the v11 feature/target tables and split the rows 80/20 with a fixed random_state
train_x = pd.read_csv("data/processed_train_goto_x_v11.csv")
train_y = pd.read_csv("data/processed_train_goto_y_v11.csv")
X_train, X_eval, Y_train, Y_eval = train_test_split( train_x, train_y, train_size=0.8, random_state = 19711022)

## Hyperparameters (unpacked into XGBRegressor as keyword arguments)
params = {
    'n_estimators':700,
    'max_depth':6,
    'min_child_weight':9,
    'gamma':0,
    'subsample':1.0,
    'colsample_bytree':0.6,
    'learning_rate':0.1
}

# Drop the id/pj_no columns before fitting and predicting
x_train = X_train.drop(['id','pj_no'],axis=1)
y_train = Y_train.drop(['id'],axis=1)
x_eval = X_eval.drop(['id','pj_no'],axis=1)



In [None]:
import time
# Train 30 models on the same data and hyperparameters; only the XGBoost seed differs.
models= []
preds = []
seeds = []  # seed used for each model, in the same order as `models`

for i in range(30):
    print('iter : ', i,' starting...', end=' ')
    start = time.perf_counter()
    # Keep the drawn seed: without it a single model of the ensemble cannot be re-trained later
    seed = np.random.randint(2143486417,high=None)
    model = learn(x_train, y_train, params, seed)
    seeds.append(seed)
    pred = model.predict(x_eval)
    models.append(model)
    preds.append(pred)
    end = time.perf_counter()
    print('finished ', 'elapsed time : ', end-start)
    

In [None]:
# One row per eval sample, one column per model
df = pd.DataFrame(preds).T

In [None]:
# Ensemble prediction: row-wise mean over all columns of df, truncated to int.
# NOTE: the mean is taken over every column present, so re-running this cell also averages in the existing 'mean' column.
df['mean']=df.apply( lambda x: int(x.mean()),axis=1)

In [None]:
# reset_index() gives Y_eval a 0..n-1 index (old index kept as an 'index' column) so it lines up with df row by row
Y_eval_pred = pd.concat([Y_eval.reset_index(), df], axis=1)
Y_eval_pred['mean_error']=abs(Y_eval_pred['keiyaku_pr']-Y_eval_pred['mean'])/Y_eval_pred['keiyaku_pr']*100
# Cell output: mean absolute percentage error of the averaged prediction on the eval part
Y_eval_pred['mean_error'].mean()

In [None]:
# Preview the first two eval feature rows
X_eval.head(2)

In [None]:
# Keep only id, target, averaged prediction and its percentage error
eval_out = pd.DataFrame(Y_eval_pred[['id','keiyaku_pr','mean','mean_error']])

In [None]:
# Attach the eval feature columns to each row by id
eval_out = pd.merge(eval_out, X_eval,on='id',how='left')

In [None]:
# NOTE: a later cell (df5.to_csv) writes to this same path and replaces this file
eval_out.to_csv("data/submit_v11_error.csv")

#### これで次の提出データを作る

In [None]:
# Load the v11 test features; x_test is the same table without the id/pj_no columns
test_x = pd.read_csv("data/processed_test_goto_x_v11.csv")
x_test = test_x.drop(['id','pj_no'],axis=1)

In [None]:
# Predict x_test with every trained model. Iterating over `models` directly
# keeps the cell correct for any ensemble size (e.g. after reloading the list
# from a pickle).
anss = []

for model in models:
    ans = model.predict(x_test)
    anss.append(ans)

In [None]:
# One row per test sample, one column per model
df = pd.DataFrame(anss).T

In [None]:
# Ensemble prediction: row-wise mean over all columns of df, truncated to int.
# NOTE: the mean is taken over every column present, so re-running this cell also averages in the existing 'mean' column.
df['mean']=df.apply( lambda x: int(x.mean()),axis=1)

In [None]:
# Submission file: id and the averaged prediction, tab-separated, no header.
# The assignment pairs rows by index label (submit comes from test_x, df from the prediction list).
submit = pd.DataFrame(test_x[['id']])
submit['keiyaku_pr']=df['mean']
submit.to_csv('data/submit_v11.tsv',sep='\t',header=None, index=False)

### X_evalで再現テストする

In [None]:
# Reuse the name x_test for the eval features; this replaces the test-set features loaded earlier
x_test = X_eval.drop(['id','pj_no'],axis=1)

In [None]:
# Predict x_test with every trained model. Iterating over `models` directly
# keeps the cell correct for any ensemble size (e.g. after reloading the list
# from a pickle).
anss = []

for model in models:
    ans = model.predict(x_test)
    anss.append(ans)

In [None]:
# One row per sample, one column per model, plus the row-wise mean truncated to int
df = pd.DataFrame(anss).T
df['mean']=df.apply( lambda x: int(x.mean()),axis=1)

In [None]:
# reset_index() gives Y_eval a 0..n-1 index so it lines up with df row by row
Y_eval_pred2=pd.concat([Y_eval.reset_index(), df], axis=1)

In [None]:
# Per-row absolute percentage error of the averaged prediction
Y_eval_pred2['error']=abs(Y_eval_pred2['keiyaku_pr']-Y_eval_pred2['mean'])/Y_eval_pred2['keiyaku_pr']*100

In [None]:
# Cell output: mean of the per-row absolute percentage errors
Y_eval_pred2['error'].mean()

### X_trainで再現テスト

In [None]:
# Reuse the name x_test for the training features (the rows the models were fitted on)
x_test = X_train.drop(['id','pj_no'],axis=1)

In [None]:
# Predict x_test with every trained model. Iterating over `models` directly
# keeps the cell correct for any ensemble size (e.g. after reloading the list
# from a pickle).
anss = []

for model in models:
    ans = model.predict(x_test)
    anss.append(ans)

In [None]:
# Averaged prediction per row (truncated to int), compared with the training targets
df = pd.DataFrame(anss).T
df['mean']=df.apply( lambda x: int(x.mean()),axis=1)
# reset_index() gives Y_train a 0..n-1 index so it lines up with df row by row
Y_eval_pred2=pd.concat([Y_train.reset_index(), df], axis=1)
Y_eval_pred2['error']=abs(Y_eval_pred2['keiyaku_pr']-Y_eval_pred2['mean'])/Y_eval_pred2['keiyaku_pr']*100
# Cell output: mean absolute percentage error on the training part
Y_eval_pred2['error'].mean()

## 訓練データ全体で予測し、訓練データと結合

In [None]:
# Predict every row of train_x (training and eval parts together) with each model.
test = train_x.drop(['id','pj_no'],axis=1)
anss = []
# Iterating over `models` directly keeps the cell correct for any ensemble size
for model in models:
    ans = model.predict(test)
    anss.append(ans)
# One column per model plus the row-wise mean truncated to int
df = pd.DataFrame(anss).T
df['mean']=df.apply( lambda x: int(x.mean()),axis=1)
df2=pd.concat([train_y.reset_index(), df], axis=1)
df2['error']=abs(df2['keiyaku_pr']-df2['mean'])/df2['keiyaku_pr']*100
# Cell output: mean absolute percentage error over all rows of train_y
df2['error'].mean()

In [None]:
# Targets plus averaged prediction and error, joined with train_goto.tsv (by id) and train_genba.tsv (by pj_no)
df3=pd.concat([train_y.reset_index(), df2['mean'],df2['error']], axis=1)
genba = pd.read_csv("data/train_genba.tsv", sep='\t')
goto = pd.read_csv("data/train_goto.tsv", sep='\t')
df4 = pd.merge(df3, goto, on='id', how='left')
df5 = pd.merge(df4, genba, on='pj_no', how='left')

In [None]:
# NOTE: same path as the earlier eval_out.to_csv call, so that file is replaced here
df5.to_csv("data/submit_v11_error.csv")

In [None]:
# Build a pandas_profiling report for df5; as the last expression it is displayed as the cell output
import pandas_profiling as pdp
pdp.ProfileReport(df5)

## シリアライズしておく

In [None]:
import pickle
# Write inside a `with` block so the file is flushed and closed as soon as the dump finishes
with open("model_V11.pkl","wb") as f:
    pickle.dump(models, f)

## デシリアライズ

In [None]:
import pickle
# NOTE: pickle.load can execute arbitrary code; only load model files written by this notebook.
# The `with` block closes the file handle once the list has been read.
with open("model_V11.pkl", "rb") as f:
    models = pickle.load(f)

### 複数モデルから重要度を取得する

In [None]:
import pandas as pd
# NOTE: train_x is rebound here to the v11 features without the id/pj_no columns
train_x = pd.read_csv("data/processed_train_goto_x_v11.csv").drop(['id','pj_no'],axis=1)
# sum() starts from 0 and builds a fresh array, so no array handed back by a
# model is modified in place while accumulating
importances = sum(m.feature_importances_ for m in models)
# Average importance per feature over all models, labelled with the feature names
mean = importances / len(models)
df = pd.DataFrame(mean, index=train_x.columns)
df.to_csv("data/importance_V11.csv")

In [None]:
# Sum of the feature importances of all models. sum() starts from 0 and builds
# a fresh array, so no array handed back by a model is modified in place.
importances = sum(m.feature_importances_ for m in models)

In [None]:
# Average importance per feature: df is labelled with the x_train column names, df2 keeps a plain 0..n-1 index
mean = importances / len(models)
df = pd.DataFrame(mean, index=x_train.columns)
df2 = pd.DataFrame(mean)
In [None]:
# Scratch export of the labelled importances
df.to_csv("data/tmp.csv")

In [None]:
# Preview the first rows of the unlabelled importances
df2.head()

#### 7/20 使わなくなったコード

In [None]:
def calc_values(x):
    """Return min, max, mean, median and std of ``x`` as a Series indexed 0..4."""
    stats = [x.min(), x.max(), x.mean(), x.median(), x.std()]
    return pd.Series(stats)

# Row-wise summary statistics over the columns of df, stored as five new columns
df[['min', 'max', 'mean','median','std']]=df.apply(calc_values, axis=1)

# reset_index() gives Y_eval a 0..n-1 index so it lines up with df row by row
Y_eval_pred = pd.concat([Y_eval.reset_index(), df], axis=1)

# Absolute percentage error when the row mean / row median is used as the prediction
Y_eval_pred['mean_error']=abs(Y_eval_pred['keiyaku_pr']-Y_eval_pred['mean'])/Y_eval_pred['keiyaku_pr']*100
Y_eval_pred['median_error']=abs(Y_eval_pred['keiyaku_pr']-Y_eval_pred['median'])/Y_eval_pred['keiyaku_pr']*100

# Cell output: mean absolute percentage error of the mean-based prediction
Y_eval_pred['mean_error'].mean()

In [None]:
import xgboost as xgb
from xgboost import XGBRegressor
# 事前準備処理
# x_train, y_train, x_eval, y_evalを作成する
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def mean_absolute_percentage_error( y_train: np.array, y_pred: np.array):
    """Return the mean absolute percentage error (MAPE) in percent.

    Computes ``100 * sum(|y_train - y_pred| / y_train) / len(y_train)`` over
    the first axis, vectorised with NumPy.

    Args:
        y_train: Reference values the predictions are scored against. Must be
            non-empty; zero entries produce inf/nan.
        y_pred: Predicted values, one per row of ``y_train``. Rows beyond
            ``len(y_train)`` are ignored.

    Returns:
        A scalar for 1-D inputs. If either input has extra trailing axes
        (e.g. ``DataFrame.values`` of shape (n, 1)), the result keeps those
        axes, i.e. one score per column.

    Raises:
        ValueError: If ``y_train`` is empty.
        IndexError: If ``y_pred`` has fewer rows than ``y_train``.
    """
    actual = np.asarray(y_train, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    n = len(actual)
    if n == 0:
        raise ValueError("y_train must contain at least one value")
    if len(predicted) < n:
        raise IndexError("y_pred has fewer rows than y_train")
    predicted = predicted[:n]
    # Pad the lower-rank array with trailing length-1 axes so rows are paired
    # one-to-one; plain broadcasting of (n, 1) against (n,) would silently
    # build an (n, n) matrix instead.
    rank = max(actual.ndim, predicted.ndim)
    actual = actual.reshape(actual.shape + (1,) * (rank - actual.ndim))
    predicted = predicted.reshape(predicted.shape + (1,) * (rank - predicted.ndim))
    return 100 * np.sum(np.abs(actual - predicted) / actual, axis=0) / n


# Load the v9 feature/target tables and split the rows 80/20 with a fixed random_state
train_x = pd.read_csv("data/processed_train_goto_x_v9.csv")
train_y = pd.read_csv("data/processed_train_goto_y_v9.csv")
X_train, X_eval, Y_train, Y_eval = train_test_split( train_x, train_y, train_size=0.8, random_state = 19711022)



In [None]:
def learn( train_x, train_y, params, s ):
    """Fit an XGBRegressor on the given data and return it.

    Args:
        train_x: Training features.
        train_y: Training targets.
        params: Keyword arguments unpacked into ``XGBRegressor``.
        s: Value passed to ``XGBRegressor`` as ``seed``.

    Returns:
        The fitted ``XGBRegressor`` (created with ``n_jobs=-1``).
    """
    regressor = XGBRegressor(n_jobs=-1, seed=s, **params)
    regressor.fit(train_x, train_y)
    return regressor

def mean_absolute_percentage_error( y_train: np.array, y_pred: np.array):
    """Return the mean absolute percentage error (MAPE) in percent.

    Computes ``100 * sum(|y_train - y_pred| / y_train) / len(y_train)`` over
    the first axis, vectorised with NumPy.

    Args:
        y_train: Reference values the predictions are scored against. Must be
            non-empty; zero entries produce inf/nan.
        y_pred: Predicted values, one per row of ``y_train``. Rows beyond
            ``len(y_train)`` are ignored.

    Returns:
        A scalar for 1-D inputs. If either input has extra trailing axes
        (e.g. ``DataFrame.values`` of shape (n, 1)), the result keeps those
        axes, i.e. one score per column.

    Raises:
        ValueError: If ``y_train`` is empty.
        IndexError: If ``y_pred`` has fewer rows than ``y_train``.
    """
    actual = np.asarray(y_train, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    n = len(actual)
    if n == 0:
        raise ValueError("y_train must contain at least one value")
    if len(predicted) < n:
        raise IndexError("y_pred has fewer rows than y_train")
    predicted = predicted[:n]
    # Pad the lower-rank array with trailing length-1 axes so rows are paired
    # one-to-one; plain broadcasting of (n, 1) against (n,) would silently
    # build an (n, n) matrix instead.
    rank = max(actual.ndim, predicted.ndim)
    actual = actual.reshape(actual.shape + (1,) * (rank - actual.ndim))
    predicted = predicted.reshape(predicted.shape + (1,) * (rank - predicted.ndim))
    return 100 * np.sum(np.abs(actual - predicted) / actual, axis=0) / n


In [None]:
# Load the v9 feature/target tables and split the rows 80/20 with a fixed random_state
train_x = pd.read_csv("data/processed_train_goto_x_v9.csv")
train_y = pd.read_csv("data/processed_train_goto_y_v9.csv")
X_train, X_eval, Y_train, Y_eval = train_test_split( train_x, train_y, train_size=0.8, random_state = 19711022)

## Do the same thing without splitting into land-only sales (土地売り) and built-house sales (建て売り)
params = {
    'n_estimators':700,
    'max_depth':6,
    'min_child_weight':9,
    'gamma':0,
    'subsample':1.0,
    'colsample_bytree':0.6,
    'learning_rate':0.1
}
# One model for all rows, fitted with seed 42 after dropping the id/pj_no columns
model = learn(X_train.drop(['id','pj_no'],axis=1), Y_train.drop(['id'],axis=1), params, 42)
pred_y = model.predict(X_eval.drop(['id','pj_no'],axis=1))

In [None]:
# The index is reset so the 0..n-1 index of pd.Series(pred_y) lines up with the eval rows
Y_pred_all = pd.DataFrame(X_eval[['id','levelplan_土地売り']].copy().reset_index(drop=True))
Y_pred_all['pred_keiyaku_pr'] = pd.Series(pred_y).astype(np.int64)
# Attach the predictions to the eval targets by id
Y_eval_pred = pd.merge(Y_eval, Y_pred_all, on='id', how='left')

In [None]:
# MAPE of the single model on the eval part
print(mean_absolute_percentage_error(Y_eval_pred['keiyaku_pr'].values, Y_eval_pred['pred_keiyaku_pr'].values))

In [None]:
# Preview targets next to predictions
Y_eval_pred.head()

In [None]:
# `out` is another name for the same object as Y_eval_pred, so the 'error' column is added to Y_eval_pred as well
out = Y_eval_pred
out['error']=abs((out['keiyaku_pr']-out['pred_keiyaku_pr'])/out['keiyaku_pr'])*100
# Join with the full v9 feature table by id (merge defaults to an inner join)
output = pd.merge(out, pd.read_csv("data/processed_train_goto_x_v9.csv"),on='id')

In [None]:
# Scratch export; data/tmp.csv is also written by the feature-importance cell
output.to_csv("data/tmp.csv")

### 以降はむだだったコード。建て売りか土地売りかで別モデルを作ったが、結局意味はなかった。

In [None]:
# Split the data into 建て売り (levelplan_土地売り == 0) and 土地売り (levelplan_土地売り == 1).
# The Y frames are filtered with masks built from the X frames, which pairs rows by index label;
# X_train/Y_train and X_eval/Y_eval come from the same train_test_split call.
X_train_tateuri = X_train[X_train['levelplan_土地売り']==0]
X_train_tochiuri = X_train[X_train['levelplan_土地売り']==1]
Y_train_tateuri = Y_train[X_train['levelplan_土地売り']==0]
Y_train_tochiuri = Y_train[X_train['levelplan_土地売り']==1]

X_eval_tateuri = X_eval[X_eval['levelplan_土地売り']==0]
X_eval_tochiuri = X_eval[X_eval['levelplan_土地売り']==1]
Y_eval_tateuri = Y_eval[X_eval['levelplan_土地売り']==0]
Y_eval_tochiuri = Y_eval[X_eval['levelplan_土地売り']==1]

In [None]:
## Try dropping the levelplan columns other than 'levelplan_土地売り'
# Column groups by floor count, named once and reused for the train and eval frames
levelplan_1f_cols = ['levelplan_1F/2LDK','levelplan_1F/3LDK','levelplan_1F/4LDK','levelplan_1F/4LDK+S','levelplan_1F/5LDK']
levelplan_2f_cols = ['levelplan_2F/2LDK','levelplan_2F/2LDK+S','levelplan_2F/3DK','levelplan_2F/3LDK','levelplan_2F/3LDK+2S','levelplan_2F/3LDK+S','levelplan_2F/4DK','levelplan_2F/4LDK','levelplan_2F/4LDK+S','levelplan_2F/5DK','levelplan_2F/5LDK']
levelplan_3f_cols = ['levelplan_3F/2LDK','levelplan_3F/2LDK+2S','levelplan_3F/2LDK+S','levelplan_3F/3DK','levelplan_3F/3LDK','levelplan_3F/3LDK+2S','levelplan_3F/3LDK+S','levelplan_3F/4DK','levelplan_3F/4LDK','levelplan_3F/4LDK+S','levelplan_3F/5LDK']

X_train_1 = X_train.drop(levelplan_1f_cols,axis=1)
X_train_2 = X_train_1.drop(levelplan_2f_cols,axis=1)
X_train_3 = X_train_2.drop(levelplan_3f_cols,axis=1)

X_eval_1 = X_eval.drop(levelplan_1f_cols,axis=1)
X_eval_2 = X_eval_1.drop(levelplan_2f_cols,axis=1)
X_eval_3 = X_eval_2.drop(levelplan_3f_cols,axis=1)


In [None]:
# Call the training routine: one model per subset, same hyperparameters and seed
params = {
    'n_estimators':700,
    'max_depth':6,
    'min_child_weight':9,
    'gamma':0,
    'subsample':1.0,
    'colsample_bytree':0.6,
    'learning_rate':0.1
}

# 'levelplan_土地売り' is constant within each subset, so it is dropped together with id/pj_no
tateuri_model = learn(X_train_tateuri.drop(['id','pj_no','levelplan_土地売り'],axis=1), Y_train_tateuri.drop(['id'],axis=1), params, 42)
tochiuri_model = learn(X_train_tochiuri.drop(['id','pj_no','levelplan_土地売り'],axis=1), Y_train_tochiuri.drop(['id'],axis=1), params, 42)

In [None]:
# Predict each eval subset with the model trained on the matching training subset
pred_y_tateuri = tateuri_model.predict(X_eval_tateuri.drop(['id','pj_no','levelplan_土地売り'],axis=1))
pred_y_tochiuri = tochiuri_model.predict(X_eval_tochiuri.drop(['id','pj_no','levelplan_土地売り'],axis=1))

In [None]:
# Build the predictions as DataFrames (id + pred_keiyaku_pr) for each subset, stack them,
# and attach them to the eval targets by id
Y_pred_tateuri = pd.DataFrame(X_eval_tateuri['id'].copy().reset_index(drop=True))
Y_pred_tateuri['pred_keiyaku_pr'] = pd.Series(pred_y_tateuri)
Y_pred_tochiuri = pd.DataFrame(X_eval_tochiuri['id'].copy().reset_index(drop=True))
Y_pred_tochiuri['pred_keiyaku_pr'] = pd.Series(pred_y_tochiuri)
Y_pred_all = pd.concat([Y_pred_tateuri, Y_pred_tochiuri])
Y_eval_pred = pd.merge(Y_eval, Y_pred_all, on='id', how='left')

In [None]:
# Single model on the frames with the 1F/2F/3F levelplan columns removed
model = learn(X_train_3.drop(['id','pj_no'],axis=1), Y_train.drop(['id'],axis=1), params, 42)
pred_y = model.predict(X_eval_3.drop(['id','pj_no'],axis=1))

In [None]:
# The index is reset so the 0..n-1 index of pd.Series(pred_y) lines up with the eval rows
Y_pred_all = pd.DataFrame(X_eval_3[['id','levelplan_土地売り']].copy().reset_index(drop=True))
Y_pred_all['pred_keiyaku_pr'] = pd.Series(pred_y)
# Attach the predictions to the eval targets by id
Y_eval_pred = pd.merge(Y_eval, Y_pred_all, on='id', how='left')

In [None]:
# MAPE on the eval part for whichever Y_eval_pred was built last
print(mean_absolute_percentage_error(Y_eval_pred['keiyaku_pr'].values, Y_eval_pred['pred_keiyaku_pr'].values))

In [None]:
# 共通処理
# x_train, y_train, x_eval, y_evalを作成する
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def mean_absolute_percentage_error( y_train: np.array, y_pred: np.array):
    """Return the mean absolute percentage error (MAPE) in percent.

    Computes ``100 * sum(|y_train - y_pred| / y_train) / len(y_train)`` over
    the first axis, vectorised with NumPy.

    Args:
        y_train: Reference values the predictions are scored against. Must be
            non-empty; zero entries produce inf/nan.
        y_pred: Predicted values, one per row of ``y_train``. Rows beyond
            ``len(y_train)`` are ignored.

    Returns:
        A scalar for 1-D inputs. If either input has extra trailing axes
        (e.g. ``DataFrame.values`` of shape (n, 1)), the result keeps those
        axes, i.e. one score per column.

    Raises:
        ValueError: If ``y_train`` is empty.
        IndexError: If ``y_pred`` has fewer rows than ``y_train``.
    """
    actual = np.asarray(y_train, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    n = len(actual)
    if n == 0:
        raise ValueError("y_train must contain at least one value")
    if len(predicted) < n:
        raise IndexError("y_pred has fewer rows than y_train")
    predicted = predicted[:n]
    # Pad the lower-rank array with trailing length-1 axes so rows are paired
    # one-to-one; plain broadcasting of (n, 1) against (n,) would silently
    # build an (n, n) matrix instead.
    rank = max(actual.ndim, predicted.ndim)
    actual = actual.reshape(actual.shape + (1,) * (rank - actual.ndim))
    predicted = predicted.reshape(predicted.shape + (1,) * (rank - predicted.ndim))
    return 100 * np.sum(np.abs(actual - predicted) / actual, axis=0) / n


# Split the v8 tables 80/20 with a fixed random_state and save the four parts as CSV files
train_x = pd.read_csv("data/processed_train_goto_x_v8.csv")
train_y = pd.read_csv("data/processed_train_goto_y_v8.csv")
X_train, X_eval, Y_train, Y_eval = train_test_split( train_x, train_y, train_size=0.8, random_state = 19711022)

X_train.to_csv("data/X_train.csv", index=False)
X_eval.to_csv("data/X_eval.csv", index=False)
Y_train.to_csv("data/Y_train.csv", index=False)
Y_eval.to_csv("data/Y_eval.csv", index=False)

# From here on train_x/train_y hold only the saved training part, without the id/pj_no columns
train_x = pd.read_csv('data/X_train.csv').drop(['id','pj_no'],axis=1)
train_y = pd.read_csv('data/Y_train.csv').drop(['id'],axis=1)

import xgboost as xgb
from xgboost import XGBRegressor
import time

# Hyperparameters (unpacked into XGBRegressor as keyword arguments)
params = {
    'n_estimators':700,
    'max_depth':6,
    'min_child_weight':9,
    'gamma':0,
    'subsample':1.0,
    'colsample_bytree':0.6,
    'learning_rate':0.1
}

# Fit on the training part and print the elapsed wall-clock time
print(f"start learning...")
xgboost_opt = XGBRegressor(**params, seed=42, n_jobs=-1)
start = time.perf_counter()
xgboost_opt.fit(train_x, train_y)
end = time.perf_counter()
print(end-start)

# Score on the complete v8 table. It contains the rows the model was fitted on,
# so this is not a hold-out score.
print(f"start estimating...")
eval_x = pd.read_csv('data/processed_train_goto_x_v8.csv').drop(['id','pj_no'],axis=1)
ans_y = pd.read_csv('data/processed_train_goto_y_v8.csv').drop(['id'],axis=1)
pred_y = xgboost_opt.predict(eval_x)
print( mean_absolute_percentage_error(ans_y.values,pred_y))

# Per-row prediction (truncated to int64) and absolute percentage error, joined with the features by id
out = pd.read_csv('data/processed_train_goto_y_v8.csv')
out['pred_keiyaku_pr'] = pd.Series(pred_y).astype(np.int64)
out['error']=abs((out['keiyaku_pr']-out['pred_keiyaku_pr'])/out['keiyaku_pr'])*100
output = pd.merge(out, pd.read_csv("data/processed_train_goto_x_v8.csv"),on='id')
output.to_csv("data/train_data_error.csv")

In [None]:
# Score the fitted model on the saved eval part
print(f"start estimating...")
eval_x = pd.read_csv('data/X_eval.csv').drop(['id','pj_no'],axis=1)
ans_y = pd.read_csv('data/Y_eval.csv').drop(['id'],axis=1)
pred_y = xgboost_opt.predict(eval_x)
print( mean_absolute_percentage_error(ans_y.values,pred_y))

# Per-row prediction (truncated to int64) and absolute percentage error, joined with the eval features by id
out = pd.read_csv('data/Y_eval.csv')
out['pred_keiyaku_pr'] = pd.Series(pred_y).astype(np.int64)
out['error']=abs((out['keiyaku_pr']-out['pred_keiyaku_pr'])/out['keiyaku_pr'])*100
output = pd.merge(out, pd.read_csv("data/X_eval.csv"),on='id')
output.to_csv("data/eval_data_error.csv")

In [None]:
# Feature importances of the fitted model, labelled with the eval_x column names
importance = pd.DataFrame(xgboost_opt.feature_importances_, index=eval_x.columns)
importance.to_csv("data/feature_importances_V8.csv")

In [None]:
# Predict the v8 test features and write id + prediction (truncated to int64), tab-separated, no header
test_x = pd.read_csv("data/processed_test_goto_x_v8.csv")
test_pred = xgboost_opt.predict(test_x.drop(['id','pj_no'],axis=1))
submit = pd.DataFrame(test_x[['id']])
submit['keiyaku_pr']=pd.Series(test_pred).astype(np.int64)
submit.to_csv('data/submit_v8.tsv',sep='\t',header=None, index=False)

### n_estimatorsが700のケースでsubmitしてみることにする(7/7)

In [None]:
# NOTE(review): this reads the unversioned processed_test_goto_x.csv; confirm its columns match
# the features xgboost_opt was fitted on before relying on this submission.
test_x = pd.read_csv("data/processed_test_goto_x.csv")
test_pred = xgboost_opt.predict(test_x.drop(['id','pj_no'],axis=1))
submit = pd.DataFrame(test_x[['id']])
submit['keiyaku_pr']=pd.Series(test_pred).astype(np.int64)
submit.to_csv('data/submit4.tsv',sep='\t',header=None, index=False)

### ここからSageMaker用のデータを作る処理

In [None]:
# Reload the saved training part, this time keeping the id/pj_no columns
train_x = pd.read_csv('data/X_train.csv')
train_y = pd.read_csv('data/Y_train.csv')

In [None]:
# Training file: the train_y columns left after dropping id/keiyaku_pr/tc_mseki come first, followed by
# the features; written without header or index.
# (presumably the first column is the label SageMaker trains on; confirm against the training job setup)
train_input = pd.concat([train_y.drop(['id','keiyaku_pr','tc_mseki'],axis=1),train_x.drop(['id','pj_no'],axis=1)],axis=1)
train_input.to_csv('data/sagemaker_input.csv', header=None, index=False)
# Eval file: features only, without header or index
eval_x = pd.read_csv('data/X_eval.csv')
eval_x.drop(['id','pj_no'],axis=1).to_csv('data/sagemaker_eval_input.csv',header=None, index=False)


### SageMakerの出力から精度を計算する

In [None]:
# Predictions read from the .out file, and the matching reference columns
# (Y_eval.csv without id/keiyaku_pr/tc_mseki)
pred2_y = pd.read_csv('data/sagemaker_eval_input.csv.out', header=None)
ans_y = pd.read_csv('data/Y_eval.csv').drop(['id','keiyaku_pr','tc_mseki'],axis=1)

In [None]:
# MAPE of the predictions read from the .out file against the reference columns
print( mean_absolute_percentage_error(ans_y.values,pred2_y.values))

### SageMaker用予測データを作成する

In [None]:
# Load the unversioned processed test features
test_x = pd.read_csv("data/processed_test_goto_x.csv")

In [None]:
# Test file: features only (id/pj_no removed), without header or index
test_input = test_x.drop(['id','pj_no'],axis=1)
test_input.to_csv('data/sagemaker_test_input.csv', header=None, index=False)

### SageMaker出力からsubmit用データを作る

In [None]:
# Predictions for the test file, read from the .out file (no header row)
tanka = pd.read_csv("data/sagemaker_test_input.csv.out", header=None )

In [None]:
# Reload the test features to get the id and tc_mseki columns
test_x = pd.read_csv("data/processed_test_goto_x.csv")

In [None]:
# Start the submission table from id and tc_mseki
submit = pd.DataFrame(test_x[['id', 'tc_mseki']])

In [None]:
# Add the predictions read from the .out file as 'tanka_pr'
submit['tanka_pr']=tanka

In [None]:
# price = tc_mseki * tanka_pr, truncated to int64
submit['price']=(submit['tc_mseki']*submit['tanka_pr']).astype(np.int64)

In [None]:
# Write only id and price, tab-separated, without header or index
submit.loc[:,['id','price']].to_csv('data/submit3.tsv',sep='\t',header=None, index=False)

In [None]:
# Preview the first rows of the submission table
submit.head()