In [12]:
import warnings
import pandas as pd

from sklearn.metrics import mean_absolute_error

pd.set_option("display.max_columns", 50)
warnings.simplefilter("ignore")

train  = pd.read_csv("train/01.csv")
test   = pd.read_csv("test.csv")
submit = pd.read_csv("sample_submission.csv")

# 種類：中古マンション等のみ 必要なし
del train["種類"]
del test["種類"]

# 種類：nanのみ 必要なし
del train["地域"]
del test["地域"]

# 重複 "市区町村名" 必要なし
del train["市区町村名"]
del test["市区町村名"]

# テストデータ　全てnan
del train['土地の形状']
del test['土地の形状']

# テストデータ　全てnan
del train["間口"]
del test["間口"]

# テストデータ　全てnan
del train["延床面積（㎡）"]
del test["延床面積（㎡）"]

# 全てnan
del train["前面道路：方位"]
del test["前面道路：方位"]

# 全てnan
del train["前面道路：種類"]
del test["前面道路：種類"]

# 全てnan
del train["前面道路：幅員（ｍ）"]
del test["前面道路：幅員（ｍ）"]

- '都市計画'
- '建ぺい率（％）'
- '容積率（％）'
- '取引時点'
- '改装'
- '取引の事情等'

In [19]:
train["都市計画"].value_counts()

近隣商業地域          5443
第１種住居地域         4629
商業地域            4519
第２種中高層住居専用地域    3186
第１種中高層住居専用地域    2076
準工業地域           1646
準住居地域           1179
第２種住居地域          407
第１種低層住居専用地域      242
工業地域             139
工業専用地域            15
第２種低層住居専用地域        9
Name: 都市計画, dtype: int64

In [20]:
test["面積（㎡）"] = test["面積（㎡）"].replace("2000㎡以上", "2000").astype(int)

train["最寄駅：距離（分）"] = train["最寄駅：距離（分）"].replace(["30分?60分","1H?1H30","1H30?2H", "2H?"], ["45","75","105","120"])
test["最寄駅：距離（分）"]  = test["最寄駅：距離（分）"].replace(["30分?60分","1H?1H30","1H30?2H","2H?"], ["45","75","105","120"])

train["市区町村コード"] = train["市区町村コード"].astype("category")
test["市区町村コード"] = test["市区町村コード"].astype("category")

features = ["面積（㎡）", "建築年", "最寄駅：距離（分）", "最寄駅：名称",
            "市区町村コード", "地区名", "間取り", "建物の構造", "用途",
            "今後の利用目的", "都市計画"
           ]

x_train = train[features]
y_train = train[["取引価格（総額）_log"]]
i_train = train[["ID"]]

x_test  = test[features]
i_test  = test[["ID"]]

for col in x_train.columns:
    if x_train[col].dtype == "O":
        x_train[col] = x_train[col].astype("category")

for col in x_train.columns:
    if x_test[col].dtype == "O":
        x_test[col] = x_test[col].astype("category")

params = {
    "boosting_type"  : "gbdt",
    "objective"      : "regression_l1",
    "metric"         : "mean_absolute_error",
    "learning_rate"  : 0.1,
    "num_leaves"     : 31,
    "n_estimators"   : 1000,
    "random_state"   : 2022,
    "importance_type": "gain"
}

import pickle
import numpy as np
import lightgbm as lgb

from sklearn.model_selection import KFold

df_valid_pred = pd.DataFrame()
metrics = []
df_imp  = pd.DataFrame()

n_splits = 5
cv = list(KFold(n_splits=n_splits, shuffle=True, random_state=2022).split(x_train))

for nfold, i in enumerate(cv):
    print(f"Fold: {nfold}")
    i_tr, i_va = i[0], i[1]
    x_tr, y_tr, i_tr = x_train.loc[i_tr], y_train.loc[i_tr], i_train.loc[i_tr]
    x_va, y_va, i_va = x_train.loc[i_va], y_train.loc[i_va], i_train.loc[i_va]
    print(f"train_x: {x_tr.shape}, valid_x: {x_va.shape}")
    
    print("training start!")
    model = lgb.LGBMRegressor(**params)
    model.fit(x_tr,
              y_tr,
              eval_set=[(x_tr, y_tr), (x_va, y_va)],
              early_stopping_rounds=100,
              verbose=500)
    
    with open(f"model_lgb_fold{nfold}.h5", "wb") as f:
        pickle.dump(model, f, protocol=4)
    
    p_va = model.predict(x_va)
    tmp_pred = pd.DataFrame({"p_va": p_va, "y_va": np.array(y_va).reshape(-1)})
    
    df_valid_pred = pd.concat([df_valid_pred, tmp_pred], ignore_index=True)
    
    metric_va = mean_absolute_error(y_va, p_va)
    metrics.append([nfold, metric_va])
    
    tmp_imp = pd.DataFrame({"col"  : x_tr.columns,
                            "imp"  : model.feature_importances_,
                            "nfold": nfold
                           })
    tmp_imp.sort_values(by="imp", ascending=False)
    df_imp = pd.concat([df_imp, tmp_imp], ignore_index=True)
    
    print()

Fold: 0
train_x: (19840, 11), valid_x: (4960, 11)
training start!
Training until validation scores don't improve for 100 rounds
[500]	training's l1: 0.0774522	valid_1's l1: 0.102205
[1000]	training's l1: 0.0746399	valid_1's l1: 0.10195
Did not meet early stopping. Best iteration is:
[1000]	training's l1: 0.0746399	valid_1's l1: 0.10195

Fold: 1
train_x: (19840, 11), valid_x: (4960, 11)
training start!
Training until validation scores don't improve for 100 rounds
[500]	training's l1: 0.0777702	valid_1's l1: 0.102771
[1000]	training's l1: 0.0748009	valid_1's l1: 0.102517
Did not meet early stopping. Best iteration is:
[1000]	training's l1: 0.0748009	valid_1's l1: 0.102517

Fold: 2
train_x: (19840, 11), valid_x: (4960, 11)
training start!
Training until validation scores don't improve for 100 rounds
[500]	training's l1: 0.0779347	valid_1's l1: 0.0992912
Early stopping, best iteration is:
[603]	training's l1: 0.0771548	valid_1's l1: 0.0992545

Fold: 3
train_x: (19840, 11), valid_x: (4960, 

In [21]:
df_metrics = pd.DataFrame(metrics, columns=["nfold", "mae"])
mae_mean = df_metrics["mae"].mean()
print(f"MAE: {round(mae_mean, 6)}")

MAE: 0.100279


In [17]:
df_imp.groupby("col")["imp"].agg(["mean"]).sort_values(by="mean", ascending=False)

Unnamed: 0_level_0,mean
col,Unnamed: 1_level_1
建築年,126585.055883
地区名,120256.712136
最寄駅：距離（分）,41794.790414
最寄駅：名称,39515.578516
面積（㎡）,37747.683897
今後の利用目的,9560.567694
間取り,9043.804749
都市計画,2208.967304
市区町村コード,1227.470817
用途,1214.738141


# cv: 0.100279
# cv(all_data): 0.084151
# public_score:(all_data) 0.0998