In [63]:
import warnings
import pandas as pd

from sklearn.metrics import mean_absolute_error

pd.set_option("display.max_columns", 50)
warnings.simplefilter("ignore")

train  = pd.read_csv("train/01.csv")
test   = pd.read_csv("test.csv")
submit = pd.read_csv("sample_submission.csv")

In [64]:
# 種類：中古マンション等のみ 必要なし
del train["種類"]
del test["種類"]

# 種類：nanのみ 必要なし
del train["地域"]
del test["地域"]

# 重複 "市区町村名" 必要なし
del train["市区町村名"]
del test["市区町村名"]

- 'ID'
- '市区町村コード'
- '都道府県名'
- '市区町村名'
- '地区名'
- '間取り'
- '土地の形状'
- '間口', '延床面積（㎡）'
- '建物の構造'
- '用途'
- '今後の利用目的'
- '前面道路：方位'
- '前面道路：種類'
- '前面道路：幅員（ｍ）'
- '都市計画'
- '建ぺい率（％）'
- '容積率（％）'
- '取引時点'
- '改装'
- '取引の事情等'

In [78]:
train["市区町村名"].unique()

array(['札幌市中央区', '札幌市豊平区', '札幌市北区', '札幌市厚別区', '小樽市', '札幌市白石区', '苫小牧市',
       '札幌市東区', '旭川市', '札幌市手稲区', '札幌市南区', '札幌市西区', '函館市', '釧路市', '札幌市清田区',
       '江別市', '帯広市', '北見市'], dtype=object)

In [65]:
train["市区町村コード"].unique()

array([1101, 1105, 1102, 1108, 1203, 1104, 1213, 1103, 1204, 1109, 1106,
       1107, 1202, 1206, 1110, 1217, 1207, 1208])

In [79]:
test["面積（㎡）"] = test["面積（㎡）"].replace("2000㎡以上", "2000").astype(int)

train["最寄駅：距離（分）"] = train["最寄駅：距離（分）"].replace(["30分?60分","1H?1H30","1H30?2H", "2H?"], ["45","75","105","120"])
test["最寄駅：距離（分）"]  = test["最寄駅：距離（分）"].replace(["30分?60分","1H?1H30","1H30?2H","2H?"], ["45","75","105","120"])

train["市区町村コード"] = train["市区町村コード"].astype("category")
test["市区町村コード"] = test["市区町村コード"].astype("category")

features = ["面積（㎡）", "建築年", "最寄駅：距離（分）", "最寄駅：名称",
            "市区町村コード"
           ]

x_train = train[features]
y_train = train[["取引価格（総額）_log"]]
i_train = train[["ID"]]

x_test  = test[features]
i_test  = test[["ID"]]

In [80]:
for col in x_train.columns:
    if x_train[col].dtype == "O":
        x_train[col] = x_train[col].astype("category")

for col in x_train.columns:
    if x_test[col].dtype == "O":
        x_test[col] = x_test[col].astype("category")

In [81]:
params = {
    "boosting_type"  : "gbdt",
    "objective"      : "regression_l1",
    "metric"         : "mean_absolute_error",
    "learning_rate"  : 0.1,
    "num_leaves"     : 31,
    "n_estimators"   : 1000,
    "random_state"   : 2022,
    "importance_type": "gain"
}

In [82]:
import pickle
import numpy as np
import lightgbm as lgb

from sklearn.model_selection import KFold

df_valid_pred = pd.DataFrame()
metrics = []
df_imp  = pd.DataFrame()

n_splits = 5
cv = list(KFold(n_splits=n_splits, shuffle=True, random_state=2022).split(x_train))

for nfold, i in enumerate(cv):
    print(f"Fold: {nfold}")
    i_tr, i_va = i[0], i[1]
    x_tr, y_tr, i_tr = x_train.loc[i_tr], y_train.loc[i_tr], i_train.loc[i_tr]
    x_va, y_va, i_va = x_train.loc[i_va], y_train.loc[i_va], i_train.loc[i_va]
    print(f"train_x: {x_tr.shape}, valid_x: {x_va.shape}")
    
    print("training start!")
    model = lgb.LGBMRegressor(**params)
    model.fit(x_tr,
              y_tr,
              eval_set=[(x_tr, y_tr), (x_va, y_va)],
              early_stopping_rounds=100,
              verbose=500)
    
    with open(f"model_lgb_fold{nfold}.h5", "wb") as f:
        pickle.dump(model, f, protocol=4)
    
    p_va = model.predict(x_va)
    tmp_pred = pd.DataFrame({"p_va": p_va, "y_va": np.array(y_va).reshape(-1)})
    
    df_valid_pred = pd.concat([df_valid_pred, tmp_pred], ignore_index=True)
    
    metric_va = mean_absolute_error(y_va, p_va)
    metrics.append([nfold, metric_va])
    
    tmp_imp = pd.DataFrame({"col"  : x_tr.columns,
                            "imp"  : model.feature_importances_,
                            "nfold": nfold
                           })
    tmp_imp.sort_values(by="imp", ascending=False)
    df_imp = pd.concat([df_imp, tmp_imp], ignore_index=True)
    
    print()

Fold: 0
train_x: (19840, 6), valid_x: (4960, 6)
training start!
Training until validation scores don't improve for 100 rounds
[500]	training's l1: 0.0901971	valid_1's l1: 0.113672
Early stopping, best iteration is:
[729]	training's l1: 0.0883044	valid_1's l1: 0.113424

Fold: 1
train_x: (19840, 6), valid_x: (4960, 6)
training start!
Training until validation scores don't improve for 100 rounds
[500]	training's l1: 0.0897953	valid_1's l1: 0.113208
Early stopping, best iteration is:
[899]	training's l1: 0.087094	valid_1's l1: 0.113092

Fold: 2
train_x: (19840, 6), valid_x: (4960, 6)
training start!
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[399]	training's l1: 0.0912774	valid_1's l1: 0.109902

Fold: 3
train_x: (19840, 6), valid_x: (4960, 6)
training start!
Training until validation scores don't improve for 100 rounds
[500]	training's l1: 0.0903382	valid_1's l1: 0.112696
[1000]	training's l1: 0.0871327	valid_1's l1: 0.112566
Did not me

In [83]:
df_metrics = pd.DataFrame(metrics, columns=["nfold", "mae"])
mae_mean = df_metrics["mae"].mean()
print(f"MAE: {round(mae_mean, 6)}")

MAE: 0.111397


In [49]:
(df_valid_pred["y_va"] - df_valid_pred["p_va"]).mean()

-0.01013902078902145

In [50]:
df_imp.groupby("col")["imp"].agg(["mean"]).sort_values(by="mean", ascending=False)

Unnamed: 0_level_0,mean
col,Unnamed: 1_level_1
建築年,494545.485434
最寄駅：名称,183846.555234
面積（㎡）,164563.024951
最寄駅：距離（分）,94368.087038
市区町村コード,7441.152718


In [61]:
# n_folds = 5

# def predict_lgb(input_x, input_id, n_folds):
#     for nfold in range(n_folds):
#         print("-"*20, " Fold: ", nfold, " ", "-"*20)
#         print()
#         # モデル読み込み
#         with open(f"./model_lgb_fold{nfold}.h5", "rb") as f:
#             model = pickle.load(f)

#         # 推論
#         pred = model.predict(input_x)

#         # 予測値の格納
#         input_id[f"fold{nfold}"] = pred
            
#     return input_id

In [60]:
# # 学習モデルを用いた推論処理
# df_test_pred = predict_lgb(x_test, i_test, n_folds)

In [62]:
# submit["取引価格（総額）_log"] = df_test_pred[["fold0","fold1","fold2","fold3","fold4"]].mean(axis=1)
# submit.to_csv("submission.csv", index=False)

# learning_rate: 0.02
# cv: 0.111731
# public_score: なし
# public_score(all_data): 0.1140(cv: 0.091347)
# ---------------------------------------------
# learning_rate: 0.1
# cv: 0.111397
# public_score(all_data): 0.1127(cv: 0.090326)