# V9での実施事項
- 土地売りと建て売りとで分けて訓練、予測を行う

#### ⇒効果ないことが判明した
#### それよりも、効果のないカラムを削除した方が良いのかもしれない

# V8での提出結果
MAPE ... 10.46

### 気づき事項
- 路線ごとにerrorが異なるのではないか？
- 上記の結果では、路線ごとにモデルを作るべきなのかもしれない
- 訓練データでのMAPEが3.56に対して、土地売りだと4.87。土地売りは別モデルとして学習すべき？
- 異常値は除去すべきかもしれない。

## 7/20 実施事項
- V9データと、XGBoostの固定のハイパーパラメタを使い、XGBoostに与える乱数を変更して複数のモデルを作ることで、精度が向上するかを試す

In [250]:
import xgboost as xgb
from xgboost import XGBRegressor
# 事前準備処理
# x_train, y_train, x_eval, y_evalを作成する
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time


def mean_absolute_percentage_error(y_train: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the mean absolute percentage error (MAPE), in percent.

    Both inputs are flattened before they are compared, so a column vector
    of shape (n, 1) (e.g. ``DataFrame.values``) can be paired with a 1-D
    prediction array and the result is always a plain float.

    Args:
        y_train: Ground-truth values. Each value is used as a denominator,
            so it must be non-zero.
        y_pred: Predicted values, one per ground-truth value.

    Returns:
        ``100 * mean(|y_train - y_pred| / y_train)``.

    Raises:
        ValueError: If the inputs are empty or contain a different number
            of elements.
    """
    actual = np.asarray(y_train, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.size != predicted.size:
        raise ValueError(
            f"y_train and y_pred must have the same number of elements "
            f"({actual.size} != {predicted.size})"
        )
    if actual.size == 0:
        raise ValueError("cannot compute MAPE of empty input")
    # The denominator is the signed ground-truth value (not its absolute
    # value), matching the formula used elsewhere in this notebook.
    return float(100 * np.mean(np.abs(actual - predicted) / actual))

def learn( train_x, train_y, params, s ):
    """Fit and return an XGBRegressor.

    Args:
        train_x: Feature table passed to ``fit``.
        train_y: Target table passed to ``fit``.
        params: Keyword arguments forwarded to the ``XGBRegressor`` constructor.
        s: Value passed as the regressor's ``seed``.

    Returns:
        The fitted ``XGBRegressor`` instance.
    """
    regressor = XGBRegressor(seed=s, n_jobs=-1, **params)
    regressor.fit(train_x, train_y)
    return regressor

# Load the V9 feature and target tables.
train_x = pd.read_csv("data/processed_train_goto_x_v9.csv")
train_y = pd.read_csv("data/processed_train_goto_y_v9.csv")

# Keep 80% of the rows for training; the fixed random_state makes the split repeatable.
X_train, X_eval, Y_train, Y_eval = train_test_split(
    train_x,
    train_y,
    train_size=0.8,
    random_state=19711022,
)

# Hyperparameters
params = dict(
    n_estimators=700,
    max_depth=6,
    min_child_weight=9,
    gamma=0,
    subsample=1.0,
    colsample_bytree=0.6,
    learning_rate=0.1,
)

# Remove the identifier columns ('id', 'pj_no') before the tables are handed to the model.
x_train = X_train.drop(['id', 'pj_no'], axis=1)
y_train = Y_train.drop(['id'], axis=1)
x_eval = X_eval.drop(['id', 'pj_no'], axis=1)





In [251]:
import time

# Train 30 models that share the same data and hyperparameters and differ only
# in the seed handed to XGBoost. Each model's predictions on x_eval are kept.
models = []
preds = []
seeds = []  # seed used for models[i], recorded so that a run can be reproduced

for i in range(30):
    print('iter : ', i,' starting...', end=' ')
    start = time.perf_counter()
    # With high=None, randint draws from [0, 2143486417).
    seed = np.random.randint(2143486417,high=None)
    seeds.append(seed)
    model = learn(x_train, y_train, params, seed)
    pred = model.predict(x_eval)
    models.append(model)
    preds.append(pred)
    end = time.perf_counter()
    print('finished ', 'elapsed time : ', end-start)
    

iter :  0  starting... finished  elapsed time :  15.24500465500023
iter :  1  starting... finished  elapsed time :  16.666418582000915
iter :  2  starting... finished  elapsed time :  16.273433882000973
iter :  3  starting... finished  elapsed time :  16.601685476001876
iter :  4  starting... finished  elapsed time :  16.38030853899909
iter :  5  starting... finished  elapsed time :  15.965131813998596
iter :  6  starting... finished  elapsed time :  15.863947455000016
iter :  7  starting... finished  elapsed time :  15.404040953999356
iter :  8  starting... finished  elapsed time :  15.285357493998163
iter :  9  starting... finished  elapsed time :  15.283609405996685
iter :  10  starting... finished  elapsed time :  15.39643827799955
iter :  11  starting... finished  elapsed time :  16.283910065998498
iter :  12  starting... finished  elapsed time :  16.247310938000737
iter :  13  starting... finished  elapsed time :  15.810116494001704
iter :  14  starting... finished  elapsed time 

In [252]:
# Transpose the list of per-model prediction arrays: one row per evaluated sample,
# one column per model.
df = pd.DataFrame(preds).T

In [202]:
def calc_values(x):
    """Return min, max, mean, median and std of ``x`` as a Series indexed 0..4."""
    stats = [x.min(), x.max(), x.mean(), x.median(), x.std()]
    return pd.Series(stats)

# Add per-row summary statistics, computed by calc_values across the columns currently in df.
df[['min', 'max', 'mean','median','std']]=df.apply(calc_values, axis=1)

In [212]:
# Place the held-out targets next to the prediction columns. reset_index() gives Y_eval
# a fresh 0..n-1 index so that the column-wise concat lines up with df's rows.
Y_eval_pred = pd.concat([Y_eval.reset_index(), df], axis=1)

In [217]:
# Per-row absolute percentage error of the 'mean' and 'median' columns against keiyaku_pr.
Y_eval_pred['mean_error'] = (
    (Y_eval_pred['keiyaku_pr'] - Y_eval_pred['mean']).abs()
    / Y_eval_pred['keiyaku_pr']
    * 100
)
Y_eval_pred['median_error'] = (
    (Y_eval_pred['keiyaku_pr'] - Y_eval_pred['median']).abs()
    / Y_eval_pred['keiyaku_pr']
    * 100
)

In [219]:
# Average of the per-row percentage errors of the 'mean' prediction (shown as the cell result).
Y_eval_pred['mean_error'].mean()

8.554185100232068

In [253]:
# Combined prediction: the mean across each row of df, truncated to an integer by int().
df['mean'] = df.apply(lambda row: int(row.mean()), axis=1)

In [256]:
# Held-out targets side by side with the prediction columns (reset_index aligns the rows with df).
Y_eval_pred = pd.concat([Y_eval.reset_index(), df], axis=1)
# Per-row absolute percentage error of the integer 'mean' column.
Y_eval_pred['mean_error'] = (
    (Y_eval_pred['keiyaku_pr'] - Y_eval_pred['mean']).abs()
    / Y_eval_pred['keiyaku_pr']
    * 100
)
# Average of those errors, shown as the cell result.
Y_eval_pred['mean_error'].mean()

8.42161858338207

#### これで次の提出データを作る

In [280]:
# Load the V9 test features; x_test is the same table without the identifier columns.
test_x = pd.read_csv("data/processed_test_goto_x_v9.csv")
x_test = test_x.drop(['id','pj_no'],axis=1)

In [281]:
# Predictions of every trained model for x_test, in the same order as `models`.
# Iterating over `models` directly keeps this independent of the ensemble size.
anss = [model.predict(x_test) for model in models]

In [282]:
# Transpose the list of per-model prediction arrays: one row per sample, one column per model.
df = pd.DataFrame(anss).T

In [283]:
# Combined prediction: the mean across each row of df, truncated to an integer by int().
df['mean'] = df.apply(lambda row: int(row.mean()), axis=1)

In [284]:
# Submission table: the test ids paired with the combined prediction.
submit = pd.DataFrame(test_x[['id']])
submit['keiyaku_pr'] = df['mean']
# Written tab-separated, with neither a header row nor the index.
submit.to_csv(
    'data/submit_v10.tsv',
    sep='\t',
    header=None,
    index=False,
)

### X_evalで再現テストする

In [271]:
# Use the held-out evaluation features (identifier columns removed) as the prediction input.
x_test = X_eval.drop(['id','pj_no'],axis=1)

In [272]:
# Predictions of every trained model for x_test, in the same order as `models`.
# Iterating over `models` directly keeps this independent of the ensemble size.
anss = [model.predict(x_test) for model in models]

In [273]:
# One row per sample, one column per model.
df = pd.DataFrame(anss).T
# Combined prediction: the row mean, truncated to an integer by int().
df['mean'] = df.apply(lambda row: int(row.mean()), axis=1)

In [274]:
# Held-out targets next to the prediction columns; reset_index() aligns Y_eval's rows with df.
Y_eval_pred2=pd.concat([Y_eval.reset_index(), df], axis=1)

In [275]:
# Per-row absolute percentage error of the combined ('mean') prediction.
Y_eval_pred2['error'] = (
    (Y_eval_pred2['keiyaku_pr'] - Y_eval_pred2['mean']).abs()
    / Y_eval_pred2['keiyaku_pr']
    * 100
)

In [276]:
# Average of the per-row percentage errors (shown as the cell result).
Y_eval_pred2['error'].mean()

8.42161858338207

### X_trainで再現テスト

In [277]:
# Use the training features (identifier columns removed) as the prediction input.
x_test = X_train.drop(['id','pj_no'],axis=1)

In [278]:
# Predictions of every trained model for x_test, in the same order as `models`.
# Iterating over `models` directly keeps this independent of the ensemble size.
anss = [model.predict(x_test) for model in models]

In [279]:
# One row per training sample, one column per model.
df = pd.DataFrame(anss).T
# Combined prediction: the row mean, truncated to an integer by int().
df['mean'] = df.apply(lambda row: int(row.mean()), axis=1)
# Training targets next to the predictions (reset_index aligns the rows with df).
Y_eval_pred2 = pd.concat([Y_train.reset_index(), df], axis=1)
Y_eval_pred2['error'] = (
    (Y_eval_pred2['keiyaku_pr'] - Y_eval_pred2['mean']).abs()
    / Y_eval_pred2['keiyaku_pr']
    * 100
)
# Average percentage error on the training rows, shown as the cell result.
Y_eval_pred2['error'].mean()

2.088302589736845

In [26]:
import xgboost as xgb
from xgboost import XGBRegressor
# 事前準備処理
# x_train, y_train, x_eval, y_evalを作成する
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def mean_absolute_percentage_error(y_train: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the mean absolute percentage error (MAPE), in percent.

    Both inputs are flattened before they are compared, so a column vector
    of shape (n, 1) (e.g. ``DataFrame.values``) can be paired with a 1-D
    prediction array and the result is always a plain float.

    Args:
        y_train: Ground-truth values. Each value is used as a denominator,
            so it must be non-zero.
        y_pred: Predicted values, one per ground-truth value.

    Returns:
        ``100 * mean(|y_train - y_pred| / y_train)``.

    Raises:
        ValueError: If the inputs are empty or contain a different number
            of elements.
    """
    actual = np.asarray(y_train, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.size != predicted.size:
        raise ValueError(
            f"y_train and y_pred must have the same number of elements "
            f"({actual.size} != {predicted.size})"
        )
    if actual.size == 0:
        raise ValueError("cannot compute MAPE of empty input")
    # The denominator is the signed ground-truth value (not its absolute
    # value), matching the formula used elsewhere in this notebook.
    return float(100 * np.mean(np.abs(actual - predicted) / actual))


# Load the V9 feature and target tables.
train_x = pd.read_csv("data/processed_train_goto_x_v9.csv")
train_y = pd.read_csv("data/processed_train_goto_y_v9.csv")

# Keep 80% of the rows for training; the fixed random_state makes the split repeatable.
X_train, X_eval, Y_train, Y_eval = train_test_split(
    train_x,
    train_y,
    train_size=0.8,
    random_state=19711022,
)





In [3]:
def learn( train_x, train_y, params, s ):
    """Fit and return an XGBRegressor.

    Args:
        train_x: Feature table passed to ``fit``.
        train_y: Target table passed to ``fit``.
        params: Keyword arguments forwarded to the ``XGBRegressor`` constructor.
        s: Value passed as the regressor's ``seed``.

    Returns:
        The fitted ``XGBRegressor`` instance.
    """
    regressor = XGBRegressor(seed=s, n_jobs=-1, **params)
    regressor.fit(train_x, train_y)
    return regressor

def mean_absolute_percentage_error(y_train: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the mean absolute percentage error (MAPE), in percent.

    Both inputs are flattened before they are compared, so a column vector
    of shape (n, 1) (e.g. ``DataFrame.values``) can be paired with a 1-D
    prediction array and the result is always a plain float.

    Args:
        y_train: Ground-truth values. Each value is used as a denominator,
            so it must be non-zero.
        y_pred: Predicted values, one per ground-truth value.

    Returns:
        ``100 * mean(|y_train - y_pred| / y_train)``.

    Raises:
        ValueError: If the inputs are empty or contain a different number
            of elements.
    """
    actual = np.asarray(y_train, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.size != predicted.size:
        raise ValueError(
            f"y_train and y_pred must have the same number of elements "
            f"({actual.size} != {predicted.size})"
        )
    if actual.size == 0:
        raise ValueError("cannot compute MAPE of empty input")
    # The denominator is the signed ground-truth value (not its absolute
    # value), matching the formula used elsewhere in this notebook.
    return float(100 * np.mean(np.abs(actual - predicted) / actual))


In [39]:
# Load the V9 tables and reproduce the same 80/20 split.
train_x = pd.read_csv("data/processed_train_goto_x_v9.csv")
train_y = pd.read_csv("data/processed_train_goto_y_v9.csv")
X_train, X_eval, Y_train, Y_eval = train_test_split(
    train_x,
    train_y,
    train_size=0.8,
    random_state=19711022,
)

# Do the same thing without splitting the data into land-only sales and built-house sales.
params = dict(
    n_estimators=700,
    max_depth=6,
    min_child_weight=9,
    gamma=0,
    subsample=1.0,
    colsample_bytree=0.6,
    learning_rate=0.1,
)
# Train one model on every training row (seed 42), then predict the held-out rows.
model = learn(
    X_train.drop(['id', 'pj_no'], axis=1),
    Y_train.drop(['id'], axis=1),
    params,
    42,
)
pred_y = model.predict(X_eval.drop(['id', 'pj_no'], axis=1))



In [43]:
# Table of the evaluated ids and their land-sale flag, re-indexed 0..n-1 so that the
# prediction Series built below lines up with it row by row.
Y_pred_all = pd.DataFrame(
    X_eval[['id', 'levelplan_土地売り']].copy().reset_index(drop=True)
)
Y_pred_all['pred_keiyaku_pr'] = pd.Series(pred_y).astype(np.int64)
# Attach the predictions to the true targets by id, keeping every row of Y_eval.
Y_eval_pred = Y_eval.merge(Y_pred_all, on='id', how='left')

In [44]:
# Print the MAPE of the predictions against the true keiyaku_pr values.
print(mean_absolute_percentage_error(Y_eval_pred['keiyaku_pr'].values, Y_eval_pred['pred_keiyaku_pr'].values))

8.58929204425


In [42]:
# Show the first rows of the merged target / prediction table.
Y_eval_pred.head()

Unnamed: 0,id,keiyaku_pr,levelplan_土地売り,pred_keiyaku_pr
0,train_1255,21700000,0,24113422.0
1,train_2319,26300000,0,28799550.0
2,train_4409,30000000,0,32914426.0
3,train_5358,36800000,0,32592012.0
4,train_4592,21000000,0,26503988.0


In [46]:
# `out` is another name for Y_eval_pred (no copy is made), so the column added below
# also appears on Y_eval_pred.
out = Y_eval_pred
# Per-row absolute percentage error of the prediction.
out['error']=abs((out['keiyaku_pr']-out['pred_keiyaku_pr'])/out['keiyaku_pr'])*100
# Join the columns of the V9 training feature file onto each evaluated row by 'id'.
output = pd.merge(out, pd.read_csv("data/processed_train_goto_x_v9.csv"),on='id')

In [48]:
# Write the merged error table to data/tmp.csv.
output.to_csv("data/tmp.csv")

### 以降はむだだったコード。建て売りか土地売りかで別モデルを作ったが、結局意味はなかった。

In [None]:
# Split the data into land-only sales (flag == 1) and built-house sales (flag == 0).
# The target tables are filtered with the mask taken from the matching feature table.
_train_flag = X_train['levelplan_土地売り']
X_train_tateuri = X_train[_train_flag == 0]
X_train_tochiuri = X_train[_train_flag == 1]
Y_train_tateuri = Y_train[_train_flag == 0]
Y_train_tochiuri = Y_train[_train_flag == 1]

_eval_flag = X_eval['levelplan_土地売り']
X_eval_tateuri = X_eval[_eval_flag == 0]
X_eval_tochiuri = X_eval[_eval_flag == 1]
Y_eval_tateuri = Y_eval[_eval_flag == 0]
Y_eval_tochiuri = Y_eval[_eval_flag == 1]

In [None]:
## Try removing the levelplan columns other than the land-sale flag.
# The same three groups of columns are dropped, in stages, from both splits.
_LEVELPLAN_1F = [
    'levelplan_1F/2LDK', 'levelplan_1F/3LDK', 'levelplan_1F/4LDK',
    'levelplan_1F/4LDK+S', 'levelplan_1F/5LDK',
]
_LEVELPLAN_2F = [
    'levelplan_2F/2LDK', 'levelplan_2F/2LDK+S', 'levelplan_2F/3DK',
    'levelplan_2F/3LDK', 'levelplan_2F/3LDK+2S', 'levelplan_2F/3LDK+S',
    'levelplan_2F/4DK', 'levelplan_2F/4LDK', 'levelplan_2F/4LDK+S',
    'levelplan_2F/5DK', 'levelplan_2F/5LDK',
]
_LEVELPLAN_3F = [
    'levelplan_3F/2LDK', 'levelplan_3F/2LDK+2S', 'levelplan_3F/2LDK+S',
    'levelplan_3F/3DK', 'levelplan_3F/3LDK', 'levelplan_3F/3LDK+2S',
    'levelplan_3F/3LDK+S', 'levelplan_3F/4DK', 'levelplan_3F/4LDK',
    'levelplan_3F/4LDK+S', 'levelplan_3F/5LDK',
]

X_train_1 = X_train.drop(_LEVELPLAN_1F, axis=1)
X_train_2 = X_train_1.drop(_LEVELPLAN_2F, axis=1)
X_train_3 = X_train_2.drop(_LEVELPLAN_3F, axis=1)

X_eval_1 = X_eval.drop(_LEVELPLAN_1F, axis=1)
X_eval_2 = X_eval_1.drop(_LEVELPLAN_2F, axis=1)
X_eval_3 = X_eval_2.drop(_LEVELPLAN_3F, axis=1)


In [None]:
# Call the training routine.
params = dict(
    n_estimators=700,
    max_depth=6,
    min_child_weight=9,
    gamma=0,
    subsample=1.0,
    colsample_bytree=0.6,
    learning_rate=0.1,
)

# One model per subset. The flag column used for the split is dropped together
# with the identifier columns.
tateuri_model = learn(
    X_train_tateuri.drop(['id', 'pj_no', 'levelplan_土地売り'], axis=1),
    Y_train_tateuri.drop(['id'], axis=1),
    params,
    42,
)
tochiuri_model = learn(
    X_train_tochiuri.drop(['id', 'pj_no', 'levelplan_土地売り'], axis=1),
    Y_train_tochiuri.drop(['id'], axis=1),
    params,
    42,
)

In [None]:
# Predict each evaluation subset with its own model, dropping the same columns
# that were dropped when the models were trained.
pred_y_tateuri = tateuri_model.predict(X_eval_tateuri.drop(['id','pj_no','levelplan_土地売り'],axis=1))
pred_y_tochiuri = tochiuri_model.predict(X_eval_tochiuri.drop(['id','pj_no','levelplan_土地売り'],axis=1))

In [None]:
# Build the predictions as DataFrames.
# Each id column is re-indexed 0..n-1 so that the prediction Series lines up with it.
Y_pred_tateuri = pd.DataFrame(
    X_eval_tateuri['id'].copy().reset_index(drop=True)
)
Y_pred_tateuri['pred_keiyaku_pr'] = pd.Series(pred_y_tateuri)

Y_pred_tochiuri = pd.DataFrame(
    X_eval_tochiuri['id'].copy().reset_index(drop=True)
)
Y_pred_tochiuri['pred_keiyaku_pr'] = pd.Series(pred_y_tochiuri)

# Stack both subsets and attach them to the true targets by id, keeping every row of Y_eval.
Y_pred_all = pd.concat([Y_pred_tateuri, Y_pred_tochiuri])
Y_eval_pred = Y_eval.merge(Y_pred_all, on='id', how='left')

In [135]:
# Train on the feature table with the listed levelplan columns removed (X_train_3), seed 42,
# then predict the matching reduced evaluation table.
model = learn(X_train_3.drop(['id','pj_no'],axis=1), Y_train.drop(['id'],axis=1), params, 42)
pred_y = model.predict(X_eval_3.drop(['id','pj_no'],axis=1))

In [136]:
# Ids and land-sale flag of the evaluated rows, re-indexed 0..n-1 so that the
# prediction Series lines up with them.
Y_pred_all = pd.DataFrame(X_eval_3[['id','levelplan_土地売り']].copy().reset_index(drop=True))
Y_pred_all['pred_keiyaku_pr'] = pd.Series(pred_y)
# Attach the predictions to the true targets by id, keeping every row of Y_eval.
Y_eval_pred = pd.merge(Y_eval, Y_pred_all, on='id', how='left')

In [137]:
# Print the MAPE of the predictions against the true keiyaku_pr values.
print(mean_absolute_percentage_error(Y_eval_pred['keiyaku_pr'].values, Y_eval_pred['pred_keiyaku_pr'].values))

8.56302858604


In [1]:
# 共通処理
# x_train, y_train, x_eval, y_evalを作成する
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def mean_absolute_percentage_error(y_train: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the mean absolute percentage error (MAPE), in percent.

    Both inputs are flattened before they are compared, so a column vector
    of shape (n, 1) (e.g. ``DataFrame.values``) can be paired with a 1-D
    prediction array and the result is always a plain float.

    Args:
        y_train: Ground-truth values. Each value is used as a denominator,
            so it must be non-zero.
        y_pred: Predicted values, one per ground-truth value.

    Returns:
        ``100 * mean(|y_train - y_pred| / y_train)``.

    Raises:
        ValueError: If the inputs are empty or contain a different number
            of elements.
    """
    actual = np.asarray(y_train, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.size != predicted.size:
        raise ValueError(
            f"y_train and y_pred must have the same number of elements "
            f"({actual.size} != {predicted.size})"
        )
    if actual.size == 0:
        raise ValueError("cannot compute MAPE of empty input")
    # The denominator is the signed ground-truth value (not its absolute
    # value), matching the formula used elsewhere in this notebook.
    return float(100 * np.mean(np.abs(actual - predicted) / actual))


# Load the V8 tables and split them 80/20 with a fixed random_state.
train_x = pd.read_csv("data/processed_train_goto_x_v8.csv")
train_y = pd.read_csv("data/processed_train_goto_y_v8.csv")
X_train, X_eval, Y_train, Y_eval = train_test_split(
    train_x,
    train_y,
    train_size=0.8,
    random_state=19711022,
)

# Save the four parts of the split to CSV files (index not written).
for _frame, _path in (
    (X_train, "data/X_train.csv"),
    (X_eval, "data/X_eval.csv"),
    (Y_train, "data/Y_train.csv"),
    (Y_eval, "data/Y_eval.csv"),
):
    _frame.to_csv(_path, index=False)

# From here on train_x / train_y are the training split reloaded from disk,
# without the identifier columns.
train_x = pd.read_csv('data/X_train.csv').drop(['id', 'pj_no'], axis=1)
train_y = pd.read_csv('data/Y_train.csv').drop(['id'], axis=1)

import xgboost as xgb
from xgboost import XGBRegressor
import time

params = dict(
    n_estimators=700,
    max_depth=6,
    min_child_weight=9,
    gamma=0,
    subsample=1.0,
    colsample_bytree=0.6,
    learning_rate=0.1,
)

# Fit a single model with seed 42 and print how long fitting took (perf_counter difference).
print("start learning...")
xgboost_opt = XGBRegressor(seed=42, n_jobs=-1, **params)
start = time.perf_counter()
xgboost_opt.fit(train_x, train_y)
end = time.perf_counter()
print(end - start)

print("start estimating...")
# NOTE: this scores the model on the complete V8 training file, which contains the rows
# the model was fitted on.
eval_x = pd.read_csv('data/processed_train_goto_x_v8.csv').drop(['id', 'pj_no'], axis=1)
ans_y = pd.read_csv('data/processed_train_goto_y_v8.csv').drop(['id'], axis=1)
pred_y = xgboost_opt.predict(eval_x)
print(mean_absolute_percentage_error(ans_y.values, pred_y))

# Per-row error table: targets, integer predictions, percentage error, plus the feature
# columns joined on 'id'.
out = pd.read_csv('data/processed_train_goto_y_v8.csv')
out['pred_keiyaku_pr'] = pd.Series(pred_y).astype(np.int64)
out['error'] = (
    (out['keiyaku_pr'] - out['pred_keiyaku_pr']) / out['keiyaku_pr']
).abs() * 100
output = out.merge(pd.read_csv("data/processed_train_goto_x_v8.csv"), on='id')
output.to_csv("data/train_data_error.csv")



start learning...
26.849811518000024
start estimating...
[ 3.48094355]


In [2]:
print("start estimating...")
# Score the fitted model on the held-out split that was saved to disk.
eval_x = pd.read_csv('data/X_eval.csv').drop(['id', 'pj_no'], axis=1)
ans_y = pd.read_csv('data/Y_eval.csv').drop(['id'], axis=1)
pred_y = xgboost_opt.predict(eval_x)
print(mean_absolute_percentage_error(ans_y.values, pred_y))

# Per-row error table for the held-out rows, with the feature columns joined on 'id'.
out = pd.read_csv('data/Y_eval.csv')
out['pred_keiyaku_pr'] = pd.Series(pred_y).astype(np.int64)
out['error'] = (
    (out['keiyaku_pr'] - out['pred_keiyaku_pr']) / out['keiyaku_pr']
).abs() * 100
output = out.merge(pd.read_csv("data/X_eval.csv"), on='id')
output.to_csv("data/eval_data_error.csv")

start estimating...
[ 8.70952682]


In [3]:
# Table of the fitted model's feature_importances_, labelled with eval_x's column names,
# written to CSV.
importance = pd.DataFrame(xgboost_opt.feature_importances_, index=eval_x.columns)
importance.to_csv("data/feature_importances_V8.csv")

In [4]:
# Predict the V8 test file with the fitted model (identifier columns removed).
test_x = pd.read_csv("data/processed_test_goto_x_v8.csv")
test_pred = xgboost_opt.predict(test_x.drop(['id', 'pj_no'], axis=1))

# Submission: id plus the prediction cast to int64, written tab-separated with
# neither a header row nor the index.
submit = pd.DataFrame(test_x[['id']])
submit['keiyaku_pr'] = pd.Series(test_pred).astype(np.int64)
submit.to_csv(
    'data/submit_v8.tsv',
    sep='\t',
    header=None,
    index=False,
)

### n_estimatorsが700のケースでsubmitしてみることにする(7/7)

In [None]:
# Predict the (unversioned) processed test file with the fitted model.
test_x = pd.read_csv("data/processed_test_goto_x.csv")
test_pred = xgboost_opt.predict(test_x.drop(['id', 'pj_no'], axis=1))

# Submission: id plus the prediction cast to int64, written tab-separated with
# neither a header row nor the index.
submit = pd.DataFrame(test_x[['id']])
submit['keiyaku_pr'] = pd.Series(test_pred).astype(np.int64)
submit.to_csv(
    'data/submit4.tsv',
    sep='\t',
    header=None,
    index=False,
)

### ここからSageMaker用のデータを作る処理

In [None]:
# Reload the saved training split, keeping the identifier columns this time.
train_x = pd.read_csv('data/X_train.csv')
train_y = pd.read_csv('data/Y_train.csv')

In [None]:
# Training input: whatever remains of train_y after dropping id / keiyaku_pr / tc_mseki
# comes first, followed by the feature columns. Written without header or index.
train_input = pd.concat(
    [
        train_y.drop(['id', 'keiyaku_pr', 'tc_mseki'], axis=1),
        train_x.drop(['id', 'pj_no'], axis=1),
    ],
    axis=1,
)
train_input.to_csv('data/sagemaker_input.csv', header=None, index=False)

# Evaluation input: the held-out features only, also without header or index.
eval_x = pd.read_csv('data/X_eval.csv')
eval_x.drop(['id', 'pj_no'], axis=1).to_csv(
    'data/sagemaker_eval_input.csv',
    header=None,
    index=False,
)


### SageMakerの出力から精度を計算する

In [None]:
# Read the '.out' file that sits next to the evaluation input (no header row);
# presumably the predictions SageMaker produced for it -- confirm.
pred2_y = pd.read_csv('data/sagemaker_eval_input.csv.out', header=None)
# True values: whatever remains of Y_eval after dropping id / keiyaku_pr / tc_mseki.
ans_y = pd.read_csv('data/Y_eval.csv').drop(['id','keiyaku_pr','tc_mseki'],axis=1)

In [None]:
# Print the MAPE of the values read from the .out file against ans_y.
print( mean_absolute_percentage_error(ans_y.values,pred2_y.values))

### SageMaker用予測データを作成する

In [None]:
# Load the processed test features.
test_x = pd.read_csv("data/processed_test_goto_x.csv")

In [None]:
# Test features without the identifier columns, written without header or index.
test_input = test_x.drop(['id','pj_no'],axis=1)
test_input.to_csv('data/sagemaker_test_input.csv', header=None, index=False)

### SageMaker出力からsubmit用データを作る

In [None]:
# Read the '.out' file that sits next to the test input (no header row);
# presumably the predicted unit prices -- confirm.
tanka = pd.read_csv("data/sagemaker_test_input.csv.out", header=None )

In [None]:
# Reload the processed test features (the id and tc_mseki columns are used below).
test_x = pd.read_csv("data/processed_test_goto_x.csv")

In [None]:
# Start the submission table from the id and tc_mseki columns of the test data.
submit = pd.DataFrame(test_x[['id', 'tc_mseki']])

In [None]:
# Add the values read from the .out file as a column; this assumes `tanka` holds a single
# column whose rows are in the same order as test_x -- TODO confirm.
submit['tanka_pr']=tanka

In [None]:
# price = tc_mseki * tanka_pr, cast to int64 (presumably area times unit price -- confirm).
submit['price']=(submit['tc_mseki']*submit['tanka_pr']).astype(np.int64)

In [None]:
# Write only id and price, tab-separated, with neither a header row nor the index.
submit.loc[:,['id','price']].to_csv('data/submit3.tsv',sep='\t',header=None, index=False)

In [None]:
# Show the first rows of the submission table.
submit.head()