In [20]:
# import lib
from pathlib import Path

import numpy as np
import pandas as pd

# Load the train and test splits. Rows whose RainTomorrow label is missing are
# kept here; the label is imputed below instead of the rows being dropped.
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

# Fill gaps in these two columns of the training frame with the previous
# observed value (forward fill), then the next one for any leading gaps.
# NOTE(review): this also invents target labels for rows where RainTomorrow
# is missing - confirm this is preferred over dropping those rows.
columns_miss_object = ["RainToday", "RainTomorrow"]
train[columns_miss_object] = train[columns_miss_object].ffill().bfill()

# Stack train on top of test so both are encoded with one shared vocabulary.
# The train row count is remembered so the frames can be separated again by
# position, and ignore_index gives the combined frame unique row labels.
n_train = len(train)
xy_all = pd.concat([train, test], axis=0, ignore_index=True)
cat_features = [
    "Date",
    "Location",
    "WindGustDir",
    "WindDir9am",
    "WindDir3pm",
    "RainToday",
    "RainTomorrow",
    "Evaporation",
    "Sunshine",
    "Cloud9am",
    "Cloud3pm",
]

# Encode the listed columns as int32 category codes; NaN / missing values and
# unknown categories are mapped to -1.
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder(
    dtype=np.int32,
    handle_unknown="use_encoded_value",
    unknown_value=-1,
    encoded_missing_value=-1,
).set_output(transform="pandas")

xy_all[cat_features] = ordinal_encoder.fit_transform(xy_all[cat_features])
print("xy_all size:", xy_all.shape)

# Split back into train / test by row position. The first n_train rows came
# from train.csv and the rest from test.csv, so the split no longer depends on
# the -1 sentinel that the encoder also emits for any missing label.
xy_train = xy_all.iloc[:n_train]
x_train = xy_train.drop(columns=["RainTomorrow"])
y_train = xy_train["RainTomorrow"]
x_test = xy_all.iloc[n_train:].drop(columns="RainTomorrow")

# Every training row must carry a real label after the imputation above.
assert (y_train != -1).all(), "training rows with a missing RainTomorrow label"

xy_all size: (145460, 24)


In [21]:
# LightGBM: regress on the encoded RainTomorrow codes and score the test rows.
# NOTE(review): x_train still contains the "id" column, so it is used as a
# feature - confirm that is intended.
# NOTE(review): a regressor is fitted on what looks like a binary label; the
# scores are thresholded later - confirm a classifier is not wanted here.
import lightgbm as lgb

lgbm_params = {
    "boosting_type": "gbdt",
    "num_leaves": 165,
    "max_depth": -1,
    "learning_rate": 0.04902036001758038,
    "n_estimators": 380,
    "subsample_for_bin": 200000,
    "objective": None,
    "class_weight": None,
    "min_split_gain": 0.0,
    "min_child_weight": 0.001,
    "min_child_samples": 100,
    "subsample": 0.8983143759937497,
    "colsample_bytree": 0.8051100520465713,
    "reg_alpha": 0.77069295356252,
    "reg_lambda": 0.17987509891243725,
    "random_state": 42,
    "n_jobs": None,
    "importance_type": "split",
}
model = lgb.LGBMRegressor(**lgbm_params)

# Fit on the full training split.
model.fit(x_train, y_train)

# Continuous score for every test row.
y_lgbm_pred = model.predict(x_test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003451 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3160
[LightGBM] [Info] Number of data points in the train set: 101822, number of used features: 23
[LightGBM] [Info] Start training from score 0.225973


In [22]:
# XGBoost: second regressor on the same features, scored on the test rows.
import xgboost as xgb

param = dict(
    subsample=0.6245374117658219,
    reg_lambda=0.1556072714142416,
    reg_alpha=0.993476368183754,
    n_estimators=347,
    max_depth=6,
    learning_rate=0.051759029191354194,
    colsample_bytree=0.9492330511297755,
)

# Note: this rebinds `model`, replacing the LightGBM estimator from the
# previous cell.
model = xgb.XGBRegressor(**param)

# Fit quietly on the full training split.
model.fit(x_train, y_train, verbose=False)

# Continuous score for every test row.
y_xgb_pred = model.predict(x_test)

In [23]:
# Blend the two models with equal weights.
lgbm_share = 0.5 * y_lgbm_pred
xgb_share = 0.5 * y_xgb_pred
y_pred = lgbm_share + xgb_share

# Scores strictly above 0.5 become "Yes"; everything else becomes "No".
rain_mask = y_pred > 0.5
y_pred_str = np.where(rain_mask, "Yes", "No")

In [24]:
# Save the submission: one row per test id with its Yes/No prediction.
submission_path = Path("./output/LGBM+XGB.csv")

# to_csv does not create missing directories, so make sure the output folder
# exists; otherwise a fresh checkout fails on this final step.
submission_path.parent.mkdir(parents=True, exist_ok=True)

submission = pd.DataFrame({"id": x_test["id"], "RainTomorrow": y_pred_str})
submission.to_csv(submission_path, index=False)