In [1]:
# import lib
import pandas as pd
import numpy as np

In [2]:
# read csv
xy_train = pd.read_csv("./data/train.csv")
x_test = pd.read_csv("./data/test.csv")

In [3]:
# delete improper value
xy_train = xy_train.drop(
    xy_train[(xy_train["GrLivArea"] > 4000) & (xy_train["SalePrice"] < 300000)].index
)

In [11]:
# union dataset
xy_all = pd.concat([xy_train, x_test], axis=0)
print(xy_all)

        Id  MSSubClass  MSZoning  LotFrontage  LotArea  Street  Alley  \
0        1          60         3         65.0     8450       1     -1   
1        2          20         3         80.0     9600       1     -1   
2        3          60         3         68.0    11250       1     -1   
3        4          70         3         60.0     9550       1     -1   
4        5          60         3         84.0    14260       1     -1   
...    ...         ...       ...          ...      ...     ...    ...   
1454  2915         160         4         21.0     1936       1     -1   
1455  2916         160         4         21.0     1894       1     -1   
1456  2917          20         3        160.0    20000       1     -1   
1457  2918          85         3         62.0    10441       1     -1   
1458  2919          60         3         74.0     9627       1     -1   

      LotShape  LandContour  Utilities  ...  PoolArea  PoolQC  Fence  \
0            3            3          0  ...        

In [5]:
# progressing value
# object(string) type to in32, NAN and missing value to -1
from sklearn.preprocessing import OrdinalEncoder

cat_features = xy_all.columns[xy_all.dtypes == "object"]
ordinal_encoder = OrdinalEncoder(
    dtype=np.int32,
    handle_unknown="use_encoded_value",
    unknown_value=-1,
    encoded_missing_value=-1
).set_output(transform="pandas")

xy_all[cat_features] = ordinal_encoder.fit_transform(xy_all[cat_features])

In [6]:
# Split
# find the SalePrice is NAN
xy_train = xy_all[~xy_all["SalePrice"].isna()]

x_test = xy_all[xy_all["SalePrice"].isna()].drop(columns="SalePrice")
x_train = xy_train.drop(columns=["SalePrice"])
y_train = xy_train["SalePrice"]

In [7]:
# lightgbm model
import lightgbm as lgb
model = lgb.LGBMRegressor(
    objective="regression",
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=720,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)

In [8]:
# train
model.fit(x_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001157 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1389
[LightGBM] [Info] Number of data points in the train set: 1458, number of used features: 79
[LightGBM] [Info] Start training from score 180932.919067


In [9]:
# predict
y_pred = model.predict(x_test)



In [10]:
# save .csv
pd.DataFrame({
    "Id": x_test["Id"],
    "SalePrice": y_pred
}).to_csv("./output/lightgbm_output.csv", index=False)