In [1]:
import warnings
import pandas as pd

from sklearn.metrics import mean_absolute_error

pd.set_option("display.max_columns", 50)
warnings.simplefilter("ignore")

train  = pd.read_csv("train/01.csv")
test   = pd.read_csv("test.csv")
submit = pd.read_csv("sample_submission.csv")

In [2]:
# 種類：中古マンション等のみ 必要なし
del train["種類"]
del test["種類"]

# 種類：nanのみ 必要なし
del train["地域"]
del test["地域"]

In [3]:
train["最寄駅：距離（分）"].unique()

array(['5', '11', '3', '1', '8', '30分?60分', '2', '15', nan, '28', '9',
       '10', '6', '4', '18', '7', '0', '21', '13', '12', '29', '14', '24',
       '25', '20', '23', '16', '27', '19', '22', '1H?1H30', '17', '26',
       '2H?', '1H30?2H'], dtype=object)

array(['5', '11', '3', '1', '8', '30分?60分', '2', '15', nan, '28', '9',
       '10', '6', '4', '18', '7', '0', '21', '13', '12', '29', '14', '24',
       '25', '20', '23', '16', '27', '19', '22', '1H?1H30', '17', '26',
       '2H?', '1H30?2H'], dtype=object)

In [4]:
test["面積（㎡）"] = test["面積（㎡）"].replace("2000㎡以上", "2000").astype(int)

train["最寄駅：距離（分）"] = train["最寄駅：距離（分）"].replace(["30分?60分","1H?1H30","1H30?2H", "2H?"], ["45","75","105","120"])
test["最寄駅：距離（分）"]  = test["最寄駅：距離（分）"].replace(["30分?60分","1H?1H30","1H30?2H","2H?"], ["45","75","105","120"])

features = ["面積（㎡）", "建築年", "最寄駅：距離（分）"]

x_train = train[features]
y_train = train[["取引価格（総額）_log"]]
i_train = train[["ID"]]

x_test  = test[features]
i_test  = test[["ID"]]

In [5]:
for col in x_train.columns:
    if x_train[col].dtype == "O":
        x_train[col] = x_train[col].astype("category")

for col in x_test.columns:
    if x_test[col].dtype == "O":
        x_test[col] = x_test[col].astype("category")

In [6]:
params = {
    "boosting_type"  : "gbdt",
    "objective"      : "regression_l1",
    "metric"         : "mean_absolute_error",
    "learning_rate"  : 0.02,
    "num_leaves"     : 31,
    "n_estimators"   : 1000,
    "random_state"   : 2022,
    "importance_type": "gain"
}

In [7]:
import pickle
import numpy as np
import lightgbm as lgb

from sklearn.model_selection import KFold

df_valid_pred = pd.DataFrame()
metrics = []
df_imp  = pd.DataFrame()

n_splits = 5
cv = list(KFold(n_splits=n_splits, shuffle=True, random_state=2022).split(x_train))

for nfold, i in enumerate(cv):
    print(f"Fold: {nfold}")
    i_tr, i_va = i[0], i[1]
    x_tr, y_tr, i_tr = x_train.loc[i_tr], y_train.loc[i_tr], i_train.loc[i_tr]
    x_va, y_va, i_va = x_train.loc[i_va], y_train.loc[i_va], i_train.loc[i_va]
    print(f"train_x: {x_tr.shape}, valid_x: {x_va.shape}")
    
    print("training start!")
    model = lgb.LGBMRegressor(**params)
    model.fit(x_tr,
              y_tr,
              eval_set=[(x_tr, y_tr), (x_va, y_va)],
              early_stopping_rounds=100,
              verbose=500)
    
    with open(f"model_lgb_fold{nfold}.h5", "wb") as f:
        pickle.dump(model, f, protocol=4)
    
    p_va = model.predict(x_va)
    tmp_pred = pd.DataFrame({"p_va": p_va, "y_va": np.array(y_va).reshape(-1)})
    
    df_valid_pred = pd.concat([df_valid_pred, tmp_pred], ignore_index=True)
    
    metric_va = mean_absolute_error(y_va, p_va)
    metrics.append([nfold, metric_va])
    
    tmp_imp = pd.DataFrame({"col"  : x_tr.columns,
                            "imp"  : model.feature_importances_,
                            "nfold": nfold
                           })
    tmp_imp.sort_values(by="imp", ascending=False)
    df_imp = pd.concat([df_imp, tmp_imp], ignore_index=True)
    
    print()

Fold: 0
train_x: (19840, 3), valid_x: (4960, 3)
training start!
Training until validation scores don't improve for 100 rounds
[500]	training's l1: 0.119631	valid_1's l1: 0.128775
[1000]	training's l1: 0.116105	valid_1's l1: 0.128301
Did not meet early stopping. Best iteration is:
[1000]	training's l1: 0.116105	valid_1's l1: 0.128301

Fold: 1
train_x: (19840, 3), valid_x: (4960, 3)
training start!
Training until validation scores don't improve for 100 rounds
[500]	training's l1: 0.118821	valid_1's l1: 0.13177
[1000]	training's l1: 0.115158	valid_1's l1: 0.130945
Did not meet early stopping. Best iteration is:
[1000]	training's l1: 0.115158	valid_1's l1: 0.130945

Fold: 2
train_x: (19840, 3), valid_x: (4960, 3)
training start!
Training until validation scores don't improve for 100 rounds
[500]	training's l1: 0.120058	valid_1's l1: 0.123635
Early stopping, best iteration is:
[736]	training's l1: 0.117982	valid_1's l1: 0.123475

Fold: 3
train_x: (19840, 3), valid_x: (4960, 3)
training star

In [8]:
df_metrics = pd.DataFrame(metrics, columns=["nfold", "mae"])
mae_mean = df_metrics["mae"].mean()
print(f"MAE: {round(mae_mean, 6)}")

MAE: 0.127293


In [9]:
(df_valid_pred["y_va"] - df_valid_pred["p_va"]).mean()

-0.012248011713519127

In [10]:
df_imp.groupby("col")["imp"].agg(["mean"]).sort_values(by="mean", ascending=False)

Unnamed: 0_level_0,mean
col,Unnamed: 1_level_1
建築年,456844.268695
面積（㎡）,172898.548331
最寄駅：距離（分）,77273.748341


In [97]:
n_folds = 5

def predict_lgb(input_x, input_id, n_folds):
    for nfold in range(n_folds):
        print("-"*20, " Fold: ", nfold, " ", "-"*20)
        print()
        # モデル読み込み
        with open(f"./model_lgb_fold{nfold}.h5", "rb") as f:
            model = pickle.load(f)

        # 推論
        pred = model.predict(input_x)

        # 予測値の格納
        input_id[f"fold{nfold}"] = pred
            
    return input_id

In [98]:
# 学習モデルを用いた推論処理
df_test_pred = predict_lgb(x_test, i_test, n_folds)

--------------------  Fold:  0   --------------------

--------------------  Fold:  1   --------------------

--------------------  Fold:  2   --------------------

--------------------  Fold:  3   --------------------

--------------------  Fold:  4   --------------------



In [99]:
submit["取引価格（総額）_log"] = df_test_pred[["fold0","fold1","fold2","fold3","fold4"]].mean(axis=1)
submit.to_csv("submission.csv", index=False)

# cv: 0.127293
# public_score: 0.3252