Find Param

通过随机算法，寻找LightGBM和XGBoost的参数

In [19]:
# 预处理
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd
import numpy as np

all = pd.read_csv("./data/train.csv")
all = all.dropna(subset=["RainTomorrow"])

columns_miss_object = ["RainToday", "RainTomorrow"]

for column in columns_miss_object:
    all[column] = all[column].ffill().bfill()

cat_features = [
    "Date",
    "Location",
    "WindGustDir",
    "WindDir9am",
    "WindDir3pm",
    "RainToday",
    "RainTomorrow",
    "Evaporation",
    "Sunshine",
    "Cloud9am",
    "Cloud3pm",
]
ordinal_encoder = OrdinalEncoder(
    dtype=np.int32,
    handle_unknown="use_encoded_value",
    unknown_value=-1,
    encoded_missing_value=-1,
).set_output(transform="pandas")

all[cat_features] = ordinal_encoder.fit_transform(all[cat_features])
X = all.drop(columns=["RainTomorrow"])
y = all["RainTomorrow"]

In [20]:
# spilit dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [21]:
# # find params
# import lightgbm as lgb
# from sklearn.model_selection import RandomizedSearchCV

# param = {
#     "num_leaves": np.random.randint(64, 150, size=10),
#     "learning_rate": np.random.uniform(0.001, 0.1, size=10),
#     "reg_alpha": np.random.uniform(0.5, 1.0, size=10),
#     "reg_lambda": np.random.uniform(0.0, 0.5, size=10),
#     "n_estimators": np.random.randint(100, 200, size=10),
#     "min_child_samples": np.random.randint(60, 120, size=10),
#     "subsample": np.random.uniform(0.8, 1.2, size=10), 
#     "colsample_bytree": np.random.uniform(0.6, 1.0, size=10),
# }

# model = lgb.LGBMRegressor(verbosity=0)

In [22]:
# 随机查找
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

param = {
    "learning_rate": np.random.uniform(0.01, 0.1, size=10),
    "reg_alpha": np.random.uniform(0.5, 1, size=10),
    "reg_lambda": np.random.uniform(0.1, 0.2, size=10),
    "n_estimators": np.random.randint(200, 400, size=10),
    "subsample": np.random.uniform(0.6, 0.8, size=10),
    "colsample_bytree": np.random.uniform(0.6, 1.0, size=10),
    "max_depth": np.random.randint(5, 10, size=10),
}

model = xgb.XGBRegressor(booster="gbtree",verbosity=0)

In [23]:
# 负平方根
random_search = RandomizedSearchCV(
    model, param, n_iter=10, cv=5, scoring="neg_mean_squared_error"
)

random_search.fit(X_train, y_train)

print("Best Parameters:", random_search.best_params_)
best_score = random_search.best_score_
print("Best score (neg_root_mean_squared_error): ", best_score)

Best Parameters: {'subsample': 0.6245374117658219, 'reg_lambda': 0.1556072714142416, 'reg_alpha': 0.993476368183754, 'n_estimators': 347, 'max_depth': 6, 'learning_rate': 0.051759029191354194, 'colsample_bytree': 0.9492330511297755}
Best score (neg_root_mean_squared_error):  -0.09934863128950841
