In [2]:
import warnings
import pandas as pd

from sklearn.metrics import mean_absolute_error

pd.set_option("display.max_columns", 50)
warnings.simplefilter("ignore")

train  = pd.read_csv("train/01.csv")
test   = pd.read_csv("test.csv")
submit = pd.read_csv("sample_submission.csv")

# 種類：中古マンション等のみ 必要なし
del train["種類"]
del test["種類"]

# 種類：nanのみ 必要なし
del train["地域"]
del test["地域"]

- '都道府県名'
- '地区名'
- '間取り'
- '土地の形状'
- '間口', '延床面積（㎡）'
- '建物の構造'
- '用途'
- '今後の利用目的'
- '前面道路：方位'
- '前面道路：種類'
- '前面道路：幅員（ｍ）'
- '都市計画'
- '建ぺい率（％）'
- '容積率（％）'
- '取引時点'
- '改装'
- '取引の事情等'

In [12]:
train["地区名"].unique()

array(['南９条西', '平岸５条', '北７条西', '厚別南', '銭函', '中央１条', '三光町', '月寒東１条',
       '厚別中央２条', '中の島１条', '北５条東', '宮下通', '錦町', '平岸１条', '富丘１条', '朝里川温泉',
       '北４条西', '真駒内泉町', '北２条東', '南５条西', '北１条東', '北１３条西', '山の手３条', '大手町',
       '真駒内上町', '月寒中央通', '南８条西', '北４０条西', '北６条西', '北門町', '北１条西', '平岸２条',
       '北３条東', '南郷通', '北３５条西', '北３６条西', '宮の森３条', '北３条西', 'あいの里２条',
       '南１３条西', '南６条西', '東札幌１条', '北３４条西', '５条通', '二十四軒３条', '北１７条東',
       '発寒６条', '二十四軒４条', '南１６条西', '福住１条', '大谷地東', '厚別東５条', '北３２条西',
       '山の手１条', '青葉町', '新中野町', '二十四軒２条', '琴似１条', '前田５条', '菊水８条', '真駒内緑町',
       '豊平６条', '菊水７条', '豊平７条', '新発寒５条', '宮の森２条', '大通西', '屯田６条', '東札幌２条',
       '南１２条西', '南３２条西', '曙２条', '南４条西', '石山１条', '大成町', '本郷通', '北９条東',
       '北３５条東', '北１９条東', '北１０条西', '北４２条東', '潮見台', '菊水４条', '伏古４条', '発寒１５条',
       '西町北', '山の手２条', '澄川４条', '南１８条西', '豊平１条', '西宮の沢３条', '発寒７条', '浪花町',
       '福住３条', '宮前町', '月寒西４条', 'もみじ台北', '新琴似８条', '大谷地西', '麻生町', '北２６条東',
       '旭町', '３条通', '二十四軒１条', '山の手４条', '中の島２条', '北７条東', '北１９条西', '南２４条西',

In [13]:
test["面積（㎡）"] = test["面積（㎡）"].replace("2000㎡以上", "2000").astype(int)

train["最寄駅：距離（分）"] = train["最寄駅：距離（分）"].replace(["30分?60分","1H?1H30","1H30?2H", "2H?"], ["45","75","105","120"])
test["最寄駅：距離（分）"]  = test["最寄駅：距離（分）"].replace(["30分?60分","1H?1H30","1H30?2H","2H?"], ["45","75","105","120"])

train["市区町村コード"] = train["市区町村コード"].astype("category")
test["市区町村コード"] = test["市区町村コード"].astype("category")

features = ["面積（㎡）", "建築年", "最寄駅：距離（分）", "最寄駅：名称",
            "市区町村コード", "地区名"
           ]

x_train = train[features]
y_train = train[["取引価格（総額）_log"]]
i_train = train[["ID"]]

x_test  = test[features]
i_test  = test[["ID"]]

In [14]:
for col in x_train.columns:
    if x_train[col].dtype == "O":
        x_train[col] = x_train[col].astype("category")

for col in x_train.columns:
    if x_test[col].dtype == "O":
        x_test[col] = x_test[col].astype("category")

In [15]:
params = {
    "boosting_type"  : "gbdt",
    "objective"      : "regression_l1",
    "metric"         : "mean_absolute_error",
    "learning_rate"  : 0.1,
    "num_leaves"     : 31,
    "n_estimators"   : 1000,
    "random_state"   : 2022,
    "importance_type": "gain"
}

In [16]:
import pickle
import numpy as np
import lightgbm as lgb

from sklearn.model_selection import KFold

df_valid_pred = pd.DataFrame()
metrics = []
df_imp  = pd.DataFrame()

n_splits = 5
cv = list(KFold(n_splits=n_splits, shuffle=True, random_state=2022).split(x_train))

for nfold, i in enumerate(cv):
    print(f"Fold: {nfold}")
    i_tr, i_va = i[0], i[1]
    x_tr, y_tr, i_tr = x_train.loc[i_tr], y_train.loc[i_tr], i_train.loc[i_tr]
    x_va, y_va, i_va = x_train.loc[i_va], y_train.loc[i_va], i_train.loc[i_va]
    print(f"train_x: {x_tr.shape}, valid_x: {x_va.shape}")
    
    print("training start!")
    model = lgb.LGBMRegressor(**params)
    model.fit(x_tr,
              y_tr,
              eval_set=[(x_tr, y_tr), (x_va, y_va)],
              early_stopping_rounds=100,
              verbose=500)
    
    with open(f"model_lgb_fold{nfold}.h5", "wb") as f:
        pickle.dump(model, f, protocol=4)
    
    p_va = model.predict(x_va)
    tmp_pred = pd.DataFrame({"p_va": p_va, "y_va": np.array(y_va).reshape(-1)})
    
    df_valid_pred = pd.concat([df_valid_pred, tmp_pred], ignore_index=True)
    
    metric_va = mean_absolute_error(y_va, p_va)
    metrics.append([nfold, metric_va])
    
    tmp_imp = pd.DataFrame({"col"  : x_tr.columns,
                            "imp"  : model.feature_importances_,
                            "nfold": nfold
                           })
    tmp_imp.sort_values(by="imp", ascending=False)
    df_imp = pd.concat([df_imp, tmp_imp], ignore_index=True)
    
    print()

Fold: 0
train_x: (19840, 7), valid_x: (4960, 7)
training start!
Training until validation scores don't improve for 100 rounds
[500]	training's l1: 0.0879867	valid_1's l1: 0.111615
Early stopping, best iteration is:
[846]	training's l1: 0.0861055	valid_1's l1: 0.111409

Fold: 1
train_x: (19840, 7), valid_x: (4960, 7)
training start!
Training until validation scores don't improve for 100 rounds
[500]	training's l1: 0.0876383	valid_1's l1: 0.112413
Early stopping, best iteration is:
[732]	training's l1: 0.0861423	valid_1's l1: 0.112355

Fold: 2
train_x: (19840, 7), valid_x: (4960, 7)
training start!
Training until validation scores don't improve for 100 rounds
[500]	training's l1: 0.0880034	valid_1's l1: 0.108085
Early stopping, best iteration is:
[492]	training's l1: 0.0880743	valid_1's l1: 0.108072

Fold: 3
train_x: (19840, 7), valid_x: (4960, 7)
training start!
Training until validation scores don't improve for 100 rounds
[500]	training's l1: 0.0876256	valid_1's l1: 0.110999
Early stop

In [17]:
df_metrics = pd.DataFrame(metrics, columns=["nfold", "mae"])
mae_mean = df_metrics["mae"].mean()
print(f"MAE: {round(mae_mean, 6)}")

MAE: 0.110013


In [18]:
(df_valid_pred["y_va"] - df_valid_pred["p_va"]).mean()

-0.009228932327402179

In [19]:
df_imp.groupby("col")["imp"].agg(["mean"]).sort_values(by="mean", ascending=False)

Unnamed: 0_level_0,mean
col,Unnamed: 1_level_1
建築年,116539.475197
地区名,102092.757532
最寄駅：距離（分）,36073.604515
面積（㎡）,35781.308751
最寄駅：名称,30941.481604
市区町村コード,726.45934
市区町村名,0.0


# cv: 0.110013
# cv(all_data): 0.089208