In [None]:
# import lib
import pandas as pd
import numpy as np

# Load the raw train / test tables and remember their sizes for the sanity
# check after the split below.
xy_train = pd.read_csv("./data/train.csv")
x_test = pd.read_csv("./data/test.csv")
n_train_rows, n_test_rows = len(xy_train), len(x_test)

# Stack train and test so the categorical columns are encoded with a single
# shared mapping. ignore_index=True gives the stacked frame a unique
# RangeIndex; without it both halves keep their own 0..n-1 labels, every
# label is duplicated, and label-based alignment becomes ambiguous.
xy_all = pd.concat([xy_train, x_test], axis=0, ignore_index=True)
cat_features = xy_all.columns[xy_all.dtypes == "object"]

# Preprocessing: encode object (string) columns as int32 codes.
# NaN / missing values and unknown categories are both mapped to -1.
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder(
    dtype=np.int32,
    handle_unknown="use_encoded_value",
    unknown_value=-1,
    encoded_missing_value=-1,
).set_output(transform="pandas")

xy_all[cat_features] = ordinal_encoder.fit_transform(xy_all[cat_features])

# Split back into train / test: rows whose SalePrice is NaN are treated as
# test rows (they received NaN for that column when the frames were stacked).
is_test_row = xy_all["SalePrice"].isna()
xy_train = xy_all[~is_test_row]
x_test = xy_all[is_test_row].drop(columns="SalePrice")

# Fail fast if the NaN-based split does not reproduce the two input tables,
# e.g. because train.csv itself contains rows with a missing SalePrice.
assert len(xy_train) == n_train_rows and len(x_test) == n_test_rows, (
    "SalePrice-based split does not match the sizes of train.csv / test.csv"
)

x_train = xy_train.drop(columns=["SalePrice"])
y_train = xy_train["SalePrice"]

In [None]:
# XGBoost regressor
import xgboost as xgb

# "Id" is the row identifier written to the submission file, not a property
# of the house, so it is excluded from the model inputs. It stays in
# x_train / x_test because later cells still read x_test["Id"].
xgb_feature_cols = [col for col in x_train.columns if col != "Id"]

model = xgb.XGBRegressor(
    colsample_bytree=0.2,
    gamma=0.0,
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=1.5,
    n_estimators=7200,
    reg_alpha=0.9,
    reg_lambda=0.6,
    subsample=0.2,
    # One seed only. This cell used to pass both seed=42 and random_state=7,
    # two names for the same RNG setting, which left the effective seed
    # ambiguous. The value of the former `seed` argument is kept.
    random_state=42,
)

# Train on the feature columns only.
model.fit(x_train[xgb_feature_cols], y_train, verbose=False)

# Predict the test rows using the same columns the model was fitted on.
y_xgb_pred = model.predict(x_test[xgb_feature_cols])

In [None]:
# LightGBM regressor
import lightgbm as lgb

# "Id" is the row identifier written to the submission file, not a property
# of the house, so it is excluded from the model inputs. It stays in
# x_train / x_test because later cells still read x_test["Id"].
lgbm_feature_cols = [col for col in x_train.columns if col != "Id"]

# NOTE: this rebinds the global name `model`, replacing the XGBoost
# estimator; the XGBoost predictions are already stored in y_xgb_pred.
model = lgb.LGBMRegressor(
    objective="regression",
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=720,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)

# Train on the feature columns only.
model.fit(x_train[lgbm_feature_cols], y_train)

# Predict the test rows using the same columns the model was fitted on.
y_lgbm_pred = model.predict(x_test[lgbm_feature_cols])

In [None]:
# Blend the two models' predictions with fixed weights (they sum to 1).
LGBM_WEIGHT = 0.7
XGB_WEIGHT = 0.3

y_pred = LGBM_WEIGHT * y_lgbm_pred + XGB_WEIGHT * y_xgb_pred

In [None]:
# Save the blended predictions as a CSV with columns Id and SalePrice.
from pathlib import Path

submission_path = Path("./output/LGBM+XGB_output.csv")

# DataFrame.to_csv does not create missing folders, so make sure the output
# directory exists before writing (no-op when it is already there).
submission_path.parent.mkdir(parents=True, exist_ok=True)

pd.DataFrame({"Id": x_test["Id"], "SalePrice": y_pred}).to_csv(
    submission_path, index=False
)