In [6]:
# # PyPI
# !pip install optuna

# # Boost
# !pip install catboost
# !pip install lightgbm
# !pip install xgboost

In [7]:
import json
import os

import joblib
import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.metrics import mean_squared_error, make_scorer, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [8]:
# Mount Google Drive at /content/drive (requires the google.colab package,
# i.e. a Colab runtime); main() reads its CSV from under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# Load the dataset
def load_data(file_path):
    """Read the CSV file at ``file_path`` and return it as a pandas DataFrame."""
    return pd.read_csv(file_path)

In [10]:
# Use new_per_ping as the target y; the remaining columns become X
def prepare_data(df):
    """Split ``df`` into a feature frame and a target series.

    Returns ``(X, y)`` where ``y`` is the ``new_per_ping`` column and ``X`` is
    a copy of ``df`` without the ``new_per_ping`` and
    ``year_month_for_combine`` columns. ``df`` itself is not modified.
    """
    target = df['new_per_ping']
    features = df.drop(columns=['new_per_ping', 'year_month_for_combine'])
    return features, target

In [11]:
# Split the data into training, validation and test sets
def data_split(X=None, y=None, holdout_size=0.3, test_size=0.2, random_state=42):
    """Split features and target into train / validation / test subsets.

    The data is split twice: first ``holdout_size`` of the rows are set aside,
    then ``test_size`` *of that hold-out* becomes the test set and the rest the
    validation set. With the defaults this gives roughly 70% train, 24%
    validation and 6% test.

    Parameters
    ----------
    X, y : optional
        Features and target. When omitted, the module-level names ``X`` and
        ``y`` are used, which keeps the legacy zero-argument call working.
    holdout_size : float, default 0.3
        Fraction of all rows held out of the training set.
    test_size : float, default 0.2
        Fraction of the held-out rows assigned to the test set.
    random_state : int, default 42
        Seed passed to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

    Raises
    ------
    NameError
        If ``X`` or ``y`` is omitted and no module-level value exists.
    """
    if X is None or y is None:
        # Legacy call style: data_split() used to read X / y from module scope.
        module_scope = globals()
        missing = [
            name
            for name, value in (("X", X), ("y", y))
            if value is None and name not in module_scope
        ]
        if missing:
            raise NameError(
                "data_split() needs X and y: pass them as arguments or define "
                f"them at module level (missing: {', '.join(missing)})"
            )
        X = module_scope["X"] if X is None else X
        y = module_scope["y"] if y is None else y

    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=holdout_size, random_state=random_state
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=test_size, random_state=random_state
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

In [12]:
# Hyperparameter selection
# Objective function optimised by Optuna
def objective(trial):
    """Score one Optuna trial: a bagged base regressor evaluated by 3-fold CV.

    The trial first picks a base model family, then samples that family's
    hyperparameters. The resulting estimator is wrapped in a ``BaggingRegressor``
    (10 members, ``random_state=42``) and cross-validated on the module-level
    ``X_train`` / ``y_train``, which ``main()`` publishes via ``global``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the model family and its hyperparameters.

    Returns
    -------
    float
        Mean ``neg_mean_squared_error`` over the 3 folds (higher is better, so
        the study must use ``direction="maximize"``).

    Raises
    ------
    ValueError
        If the sampled model name has no matching construction branch.
    """
    # Pick the base model family
    model_name = trial.suggest_categorical("model", ["SVR", "KNN", "DecisionTree", "RandomForest", "MLP", "XGBoost", "CatBoost", "LightGBM"])

    if model_name == "SVR":
        C = trial.suggest_float("C", 1e-3, 1e3, log=True)
        reg = SVR(C=C)
    elif model_name == "KNN":
        n_neighbors = trial.suggest_int("n_neighbors", 1, 20)
        reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    elif model_name == "DecisionTree":
        max_depth = trial.suggest_int("max_depth", 1, 32, log=True)
        reg = DecisionTreeRegressor(max_depth=max_depth)
    elif model_name == "RandomForest":
        n_estimators = trial.suggest_int("n_estimators", 10, 100)
        max_depth = trial.suggest_int("max_depth", 1, 32, log=True)
        reg = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
    elif model_name == "MLP":
        hidden_layer_sizes = trial.suggest_int("hidden_layer_sizes", 50, 500)
        alpha = trial.suggest_float("alpha", 1e-5, 1e-1, log=True)
        # A single hidden layer whose width is the sampled value.
        reg = MLPRegressor(hidden_layer_sizes=(hidden_layer_sizes,), alpha=alpha, max_iter=5000)
    elif model_name == "XGBoost":
        n_estimators = trial.suggest_int("n_estimators", 10, 100)
        max_depth = trial.suggest_int("max_depth", 1, 32, log=True)
        learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
        reg = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate)
    elif model_name == "CatBoost":
        iterations = trial.suggest_int("iterations", 10, 100)
        depth = trial.suggest_int("depth", 1, 10)
        learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
        reg = CatBoostRegressor(iterations=iterations, depth=depth, learning_rate=learning_rate, verbose=0)
    elif model_name == "LightGBM":
        n_estimators = trial.suggest_int("n_estimators", 10, 100)
        max_depth = trial.suggest_int("max_depth", 1, 32, log=True)
        learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
        reg = LGBMRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate)
    else:
        # Not reachable with the choices above; guards against the choice list
        # and the branches drifting apart, which would leave `reg` unbound.
        raise ValueError(f"Unsupported model: {model_name}")

    # Wrap the base model in a Bagging Regressor. The estimator is passed
    # positionally because the keyword was renamed from `base_estimator` to
    # `estimator` in scikit-learn 1.2 and the old name was removed in 1.4.
    bagging_reg = BaggingRegressor(reg, n_estimators=10, random_state=42)

    # Cross-validate on the training split published at module level
    score = cross_val_score(bagging_reg, X_train, y_train, n_jobs=-1, cv=3, scoring='neg_mean_squared_error').mean()
    return score

In [13]:
# Train the bagging regressors
def train_bagging_regressors(bagging_regressors, X_train, y_train):
    """Fit every model in ``bagging_regressors`` on the given training data.

    Each model is fitted in place and a confirmation line is printed per
    model. The same mapping that was passed in is returned.
    """
    for label in bagging_regressors:
        bagging_regressors[label].fit(X_train, y_train)
        print(f"Trained {label} Bagging Regressor")
    return bagging_regressors

In [14]:
# Predict with the bagging regressors
def make_predictions(bagging_regressors, X_test):
    """Return a dict mapping each model's name to its ``predict(X_test)`` output."""
    return {
        label: estimator.predict(X_test)
        for label, estimator in bagging_regressors.items()
    }

In [15]:
# Evaluate the regressors
def evaluate_and_print_errors(regressors, X, y):
    """Cross-validate each regressor and collect its mean MSE, RMSE and R².

    Every regressor is evaluated with a single 5-fold ``cross_validate`` pass
    that computes all three metrics on the same fitted folds. This avoids
    refitting each model once per metric and keeps the three numbers
    consistent with one another. Despite the name, this function prints
    nothing; it only returns the results.

    Parameters
    ----------
    regressors : dict
        Mapping of display name to an (unfitted or fitted) estimator;
        ``cross_validate`` fits clones, so the passed objects are not altered.
    X, y :
        Features and target to cross-validate on.

    Returns
    -------
    dict
        ``{name: {"MSE": float, "RMSE": float, "R²": float}}`` where each
        value is the mean of the per-fold scores.
    """
    scoring = {
        "MSE": make_scorer(mean_squared_error),  # mean squared error
        "RMSE": make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred))),  # root mean squared error
        "R²": make_scorer(r2_score),  # coefficient of determination
    }

    cv = 5  # number of cross-validation folds
    error_results = {}
    for name, reg in regressors.items():
        # One pass over the folds yields every metric under "test_<metric>".
        fold_scores = cross_validate(reg, X, y, cv=cv, scoring=scoring)
        error_results[name] = {
            metric: np.mean(fold_scores[f"test_{metric}"]) for metric in scoring
        }
    return error_results

In [16]:
# Print the error metrics of each bagging regressor
def print_errors(cv_results):
    """Print the mean MSE, RMSE and R² stored for every entry of ``cv_results``.

    ``cv_results`` maps a model name to a dict with the keys ``"MSE"``,
    ``"RMSE"`` and ``"R²"``; three lines are printed per model, in that order.
    """
    for label in cv_results:
        metrics = cv_results[label]
        print(f"{label} Bagging Regressor Mean MSE: {metrics['MSE']}")
        print(f"{label} Bagging Regressor Mean RMSE: {metrics['RMSE']}")
        print(f"{label} Bagging Regressor Mean R²: {metrics['R²']}")

In [17]:
# Save the Bagging Regressor model
def save_bagging_regressor_model(bagging_regressor, model_path):
    """Serialise ``bagging_regressor`` to ``model_path`` with joblib and print a confirmation."""
    joblib.dump(value=bagging_regressor, filename=model_path)
    print(f"Model saved to {model_path}")

In [18]:
# Save the Bagging Regressor's hyperparameters
def save_bagging_regressor_params(bagging_regressor, params_path):
    """Write ``bagging_regressor.get_params()`` to ``params_path`` as JSON.

    ``get_params()`` on an ensemble includes the wrapped estimator object
    itself, which the ``json`` module cannot encode. Values that are not
    natively JSON-serialisable are therefore converted: NumPy scalars become
    Python scalars, NumPy arrays become lists, and anything else is stored as
    its ``repr()`` string.

    Parameters
    ----------
    bagging_regressor :
        Any object exposing ``get_params()`` that returns a dict.
    params_path : str or path-like
        Destination file; it is overwritten if it exists.
    """
    def _json_fallback(value):
        # Called by json only for objects it cannot encode on its own.
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        return repr(value)

    params = bagging_regressor.get_params()
    # Serialise before opening the file so a failure cannot leave a truncated file behind.
    payload = json.dumps(params, default=_json_fallback)
    with open(params_path, 'w') as f:
        f.write(payload)
    print(f"Parameters saved to {params_path}")

In [19]:
import joblib

def main():
    """Run the whole pipeline: load, split, tune, train, save, predict, evaluate.

    Steps:
      1. Load the CSV from the mounted Drive and split it into features/target.
      2. Split into train (70%) and a hold-out (30%); 20% of the hold-out is
         used as the test set.
      3. Run a 100-trial Optuna study over ``objective`` and persist the study.
      4. Rebuild the best base model, wrap it in a ``BaggingRegressor``, fit it
         on the training split and save the model plus its parameters.
      5. Print test-set predictions and cross-validated error metrics.

    Side effects: sets the module-level ``X_train``, ``X_temp``, ``y_train``
    and ``y_temp`` (read by ``objective``), writes ``model_optuna.pkl``,
    ``bagging_regressor.pkl`` and ``bagging_regressor_params_01.json`` to the
    working directory, and prints progress to stdout.

    Raises
    ------
    ValueError
        If the best trial names a model with no construction branch.
    """
    # objective() reads X_train / y_train from module scope, so the split is
    # published there rather than kept local.
    global X_train, X_temp, y_train, y_temp

    # Load the dataset
    df = load_data('/content/drive/MyDrive/Colab Notebooks/EDA_main_data_for_xgb_version5_store_drop.csv')

    # Separate features and target
    X, y = prepare_data(df)

    # Split into training data and a hold-out, then carve the test set out of
    # the hold-out. The remaining (validation) part is currently unused.
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    _, X_test, _, y_test = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

    # Create and run the Optuna study (objective returns negative MSE, hence "maximize")
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100)

    # Persist the study. The original code dumped an undefined name `model`
    # here, which raised NameError only after all trials had finished.
    joblib.dump(study, 'model_optuna.pkl')
    print("Optuna study saved to model_optuna.pkl")

    # Fetch the best trial
    trial = study.best_trial

    print("Best Score: {}".format(trial.value))
    print("Best Hyperparameters: {}".format(trial.params))

    # Rebuild the winning base model from its hyperparameters; "model" is the
    # family selector, not an estimator argument, so it is stripped out.
    best_model_name = trial.params["model"]
    best_params = {k: v for k, v in trial.params.items() if k != "model"}

    if best_model_name == "SVR":
        reg = SVR(**best_params)
    elif best_model_name == "KNN":
        reg = KNeighborsRegressor(**best_params)
    elif best_model_name == "DecisionTree":
        reg = DecisionTreeRegressor(**best_params)
    elif best_model_name == "RandomForest":
        reg = RandomForestRegressor(**best_params)
    elif best_model_name == "MLP":
        reg = MLPRegressor(**best_params, max_iter=5000)
    elif best_model_name == "XGBoost":
        reg = xgb.XGBRegressor(**best_params)
    elif best_model_name == "CatBoost":
        reg = CatBoostRegressor(**best_params, verbose=0)
    elif best_model_name == "LightGBM":
        reg = LGBMRegressor(**best_params)
    else:
        # Without this branch an unexpected name would surface later as an
        # unbound-variable error on `reg`.
        raise ValueError(f"Unsupported model: {best_model_name}")

    # Wrap the base model in a Bagging Regressor. The estimator is passed
    # positionally because the keyword was renamed from `base_estimator` to
    # `estimator` in scikit-learn 1.2 and the old name was removed in 1.4.
    bagging_reg = BaggingRegressor(reg, n_estimators=10, random_state=42)
    bagging_reg.fit(X_train, y_train)

    # Save the fitted model and its hyperparameters
    save_bagging_regressor_model(bagging_reg, 'bagging_regressor.pkl')
    save_bagging_regressor_params(bagging_reg, 'bagging_regressor_params_01.json')

    # Predict on the test split and log the result
    print("Making Predictions...")
    predictions = bagging_reg.predict(X_test)
    print(predictions)

    # Evaluate the model.
    # NOTE(review): evaluate_and_print_errors cross-validates on the data it is
    # given, so this re-fits clones of the estimator on folds of the test split
    # instead of scoring the already-fitted bagging_reg — confirm this is intended.
    errors = evaluate_and_print_errors({"Best Bagging Regressor": bagging_reg}, X_test, y_test)
    print_errors(errors)

In [None]:
# Run the main function when this code is executed as the entry point
# (in a notebook kernel __name__ is "__main__", so running the cell calls main()).
if __name__ == "__main__":
    main()

[I 2024-07-01 15:49:45,871] A new study created in memory with name: no-name-e305d954-d2fd-447d-9a0c-eb9a0acdb191
[I 2024-07-01 15:50:22,309] Trial 0 finished with value: -26177929588.77118 and parameters: {'model': 'LightGBM', 'n_estimators': 24, 'max_depth': 1, 'learning_rate': 0.05100327326086448}. Best is trial 0 with value: -26177929588.77118.
