In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
import joblib

# Load the preprocessed dataset from the working directory.
one_m_data = pd.read_csv('./processed_data.csv')

# Every column except "popularity" goes into X; the "popularity" column
# itself becomes y (as a NumPy array).
X = one_m_data.drop(columns=["popularity"])
y = one_m_data["popularity"].values

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Base regressor with a squared-error objective and a fixed seed.
xgbr = xgb.XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
)

# Candidate hyperparameter values.
# The full grid has 8 * 10 * 5 * 3 * 3 = 3600 combinations.
param_grid = dict(
    # Learning rate.
    learning_rate=[0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4],
    # Maximum tree depth: odd values 3, 5, ..., 21.
    max_depth=list(range(3, 22, 2)),
    # Number of trees: 100, 150, ..., 300.
    n_estimators=list(range(100, 301, 50)),
    colsample_bytree=[0.4, 0.6, 0.8],
    tree_method=["auto", "exact", "approx"],
)

# Search over param_grid with 5-fold cross-validation on the training
# split. Candidates are ranked by negated mean squared error, and
# n_jobs=-1 asks scikit-learn to use all available processors.
# NOTE(review): XGBoost may also run multi-threaded inside each fit, which
# could oversubscribe the CPU -- confirm the thread settings if this is slow.
grid_search = GridSearchCV(
    estimator=xgbr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the best hyperparameter combination found.
print("Best Parameters:", grid_search.best_params_)

# Keep the estimator that the search selected as best.
best_xgbr = grid_search.best_estimator_

# Persist the selected model to disk before evaluating it.
joblib.dump(value=best_xgbr, filename='xgboost.pkl')

# Score the model on the held-out test split.
y_pred = best_xgbr.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Print both metrics rounded to two decimal places.
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")