In [1]:
import warnings
import pandas as pd

from sklearn.metrics import mean_absolute_error

# Notebook-wide settings: silence every warning (this also hides deprecation
# notices from the modelling libraries) and show up to 50 columns per frame.
warnings.simplefilter("ignore")
pd.set_option("display.max_columns", 50)

# Read the three input files in a fixed order: training data, test data and
# the submission template.
train, test, submit = (
    pd.read_csv(csv_path)
    for csv_path in ("train/01.csv", "test.csv", "sample_submission.csv")
)

In [2]:
# (rows, columns) of the training and test frames, shown as a pair.
(train.shape, test.shape)

((24800, 28), (21405, 27))

In [5]:
# List the distinct area values of each file. The empty string in the first
# call produces the blank separator line between the two listings.
print("-------- Train --------", train["面積（㎡）"].unique(), "", sep="\n")
print("-------- Test --------", test["面積（㎡）"].unique(), sep="\n")

-------- Train --------
[ 70  80  60  85  95  75  55  30  90  65  15  20 100  45  50  25  40 110
 135 105  35 200 115 145 120 130 125 150 165 170 140 155 360 250 260  10
 190 710 160 180 210 440 185 220]

-------- Test --------
['50' '15' '45' '70' '40' '30' '55' '75' '65' '80' '100' '20' '60' '120'
 '35' '90' '105' '125' '95' '85' '25' '110' '115' '145' '130' '2000㎡以上'
 '10' '135' '140' '160' '180' '560' '150' '200' '170' '175' '230' '210'
 '1100' '490' '500' '600' '155' '360' '400' '165' '250' '195']


In [6]:
# The test file stores area as text because of the open-ended top bucket
# "2000㎡以上" (2000 m² or more). Map that bucket to 2000 and cast to int so
# the column is numeric like the training column.
test["面積（㎡）"] = (
    test["面積（㎡）"]
    .replace(to_replace="2000㎡以上", value="2000")
    .astype(int)
)

# Single-column frames for the model: area as the only feature, the `_log`
# price column as the target (presumably log-transformed price - confirm
# against the data description), and the row IDs.
x_train, y_train, i_train = (
    train[["面積（㎡）"]],
    train[["取引価格（総額）_log"]],
    train[["ID"]],
)
x_test, i_test = test[["面積（㎡）"]], test[["ID"]]

In [29]:
import pickle
import numpy as np
import lightgbm as lgb

from sklearn.model_selection import KFold

# LightGBM hyper-parameters. The L1 objective optimises the same quantity
# (mean absolute error) that is used to score each fold below.
params = {
    "boosting_type"  : "gbdt",
    "objective"      : "regression_l1",
    "metric"         : "mean_absolute_error",
    "learning_rate"  : 0.02,
    "num_leaves"     : 31,
    "n_estimators"   : 1000,
    "random_state"   : 2022,
    "importance_type": "gain"
}

# Per-fold results are collected in lists and concatenated once after the
# loop instead of growing a DataFrame with pd.concat on every iteration.
pred_frames = []
imp_frames = []
metrics = []

n_splits = 5
# Materialise the folds as a list so that later cells can reuse exactly the
# same train/validation splits.
cv = list(KFold(n_splits=n_splits, shuffle=True, random_state=2022).split(x_train))

for nfold, (idx_tr, idx_va) in enumerate(cv):
    print(f"Fold: {nfold}")
    # KFold.split yields *positional* indices, so rows are selected with
    # .iloc; label-based .loc is only equivalent for a default RangeIndex.
    x_tr, y_tr, i_tr = x_train.iloc[idx_tr], y_train.iloc[idx_tr], i_train.iloc[idx_tr]
    x_va, y_va, i_va = x_train.iloc[idx_va], y_train.iloc[idx_va], i_train.iloc[idx_va]
    print(f"train_x: {x_tr.shape}, valid_x: {x_va.shape}")

    print("training start!")
    model = lgb.LGBMRegressor(**params)
    # Early stopping and log silencing are passed as callbacks: the
    # `early_stopping_rounds=` / `verbose=` arguments of fit() are deprecated
    # since LightGBM 3.3 and removed in 4.0.
    model.fit(x_tr,
              y_tr,
              eval_set=[(x_tr, y_tr), (x_va, y_va)],
              categorical_feature=[],
              callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False),
                         lgb.log_evaluation(period=0)])

    # valid: out-of-fold predictions next to the true targets
    p_va = model.predict(x_va)
    tmp_pred = pd.DataFrame({"p_va": p_va, "y_va": np.array(y_va).reshape(-1)})
    pred_frames.append(tmp_pred)

    # metrics: validation MAE of this fold
    metric_va = mean_absolute_error(y_va, p_va)
    metrics.append([nfold, metric_va])

    # importance: gain-based importance per feature, largest first
    tmp_imp = pd.DataFrame({"col"  : x_tr.columns,
                            "imp"  : model.feature_importances_,
                            "nfold": nfold
                           })
    tmp_imp = tmp_imp.sort_values(by="imp", ascending=False)
    imp_frames.append(tmp_imp)

    print()

# Stack the per-fold frames into the result tables used by the cells below.
df_lgb_pred = pd.concat(pred_frames, ignore_index=True)
df_imp = pd.concat(imp_frames, ignore_index=True)

Fold: 0
train_x: (19840, 1), valid_x: (4960, 1)
training start!

Fold: 1
train_x: (19840, 1), valid_x: (4960, 1)
training start!

Fold: 2
train_x: (19840, 1), valid_x: (4960, 1)
training start!

Fold: 3
train_x: (19840, 1), valid_x: (4960, 1)
training start!

Fold: 4
train_x: (19840, 1), valid_x: (4960, 1)
training start!



In [30]:
# Tabulate the per-fold scores collected by the training loop and report
# their unweighted mean, rounded to six decimals.
df_metrics = pd.DataFrame(metrics, columns=["nfold", "mae"])
mae_mean = df_metrics.loc[:, "mae"].mean()
print(f"MAE: {round(mae_mean, 6)}")

MAE: 0.188119


In [11]:
# MAE over all pooled out-of-fold predictions, as a cross-check of the
# fold-averaged score.
(df_lgb_pred["y_va"] - df_lgb_pred["p_va"]).abs().mean()

0.18811914634765328

# CV MAE (area feature only): 0.188119
# CV MAE (all_data): 0.074967
# Public score (all_data): 0.0781

## ------------------------------------
# XGBoost

In [10]:
import xgboost as xgb

# XGBoost hyper-parameters. Note that the model is trained with a
# squared-error objective while MAE is used for monitoring and scoring.
params = {
    # "reg:linear" is the deprecated alias of squared-error regression.
    "objective": "reg:squarederror",
    "n_estimators": 1000,
    "learning_rate": 0.1,
    "eval_metric": "mae",
    "random_state": 2023,
    # Early stopping is configured on the estimator (needs xgboost >= 1.6);
    # passing it to fit() is deprecated and rejected by recent releases.
    "early_stopping_rounds": 20,
}

# Per-fold out-of-fold predictions, concatenated once after the loop.
pred_frames = []
metrics = []

for nfold, (tr_i, va_i) in enumerate(cv):
    print(f"Fold: {nfold}")
    # The fold arrays hold *positional* indices, so rows are selected with
    # .iloc; label-based .loc is only equivalent for a default RangeIndex.
    x_tr, y_tr = x_train.iloc[tr_i], y_train.iloc[tr_i]
    x_va, y_va = x_train.iloc[va_i], y_train.iloc[va_i]

    print("training start!")
    model = xgb.XGBRegressor(**params)
    model.fit(x_tr, y_tr,
              eval_set=[(x_tr, y_tr), (x_va, y_va)],
              verbose=0)

    # valid: out-of-fold predictions next to the true targets
    p_va = model.predict(x_va)
    tmp_pred = pd.DataFrame({"p_va": p_va, "y_va": y_va.values.reshape(-1)})
    pred_frames.append(tmp_pred)

    # metrics: validation MAE of this fold
    metric_va = mean_absolute_error(y_va, p_va)
    metrics.append([nfold, metric_va])
    print(metric_va)
    print()

df_xgb_pred = pd.concat(pred_frames, ignore_index=True)

Fold: 0
training start!
0.1891005196616664

Fold: 1
training start!
0.19420852900052982

Fold: 2
training start!
0.18566349212412278

Fold: 3
training start!
0.19029452247143466

Fold: 4
training start!
0.18702449582522881



In [11]:
# Unweighted mean of the per-fold validation MAE for the XGBoost run.
pd.DataFrame(metrics, columns=["nfold", "mae"]).loc[:, "mae"].mean()

0.1892583118165965

In [12]:
# MAE over all pooled out-of-fold XGBoost predictions, as a cross-check of
# the fold-averaged score.
(df_xgb_pred["y_va"] - df_xgb_pred["p_va"]).abs().mean()

0.18925831181659103

# CatBoost

In [34]:
import catboost as cat

# CatBoost hyper-parameters: optimise MAE directly, everything else default.
# NOTE(review): no random seed is set here, unlike the two models above -
# consider pinning one for reproducibility.
params = {
    "loss_function": "MAE",
    "learning_rate": 0.1,
}

# Per-fold out-of-fold predictions, concatenated once after the loop.
pred_frames = []
metrics = []

for nfold, (tr_i, va_i) in enumerate(cv):
    print(f"Fold: {nfold}")
    # The fold arrays hold *positional* indices, so rows are selected with
    # .iloc; label-based .loc is only equivalent for a default RangeIndex.
    x_tr, y_tr = x_train.iloc[tr_i], y_train.iloc[tr_i]
    x_va, y_va = x_train.iloc[va_i], y_train.iloc[va_i]

    print("training start!")
    model = cat.CatBoostRegressor(**params)
    model.fit(x_tr, y_tr,
              eval_set=[(x_tr, y_tr), (x_va, y_va)],
              cat_features=[],
              early_stopping_rounds=20,
              verbose=0)

    # valid: out-of-fold predictions next to the true targets
    p_va = model.predict(x_va)
    tmp_pred = pd.DataFrame({"p_va": p_va, "y_va": y_va.values.reshape(-1)})
    pred_frames.append(tmp_pred)

    # metrics: validation MAE of this fold
    metric_va = mean_absolute_error(y_va, p_va)
    metrics.append([nfold, metric_va])
    print(metric_va)
    print()

df_cat_pred = pd.concat(pred_frames, ignore_index=True)

Fold: 0
training start!
0.18810220677198206

Fold: 1
training start!
0.1932535717880487

Fold: 2
training start!
0.184779154167478

Fold: 3
training start!
0.18945039554784857

Fold: 4
training start!
0.18487999956506546



In [35]:
# Unweighted mean of the per-fold validation MAE for the CatBoost run.
pd.DataFrame(metrics, columns=["nfold", "mae"]).loc[:, "mae"].mean()

0.18809306556808458

In [24]:
# MAE over all pooled out-of-fold CatBoost predictions, as a cross-check of
# the fold-averaged score.
(df_cat_pred["y_va"] - df_cat_pred["p_va"]).abs().mean()

0.18809306556808678