In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
import xgboost as xgb
from sklearn.preprocessing import StandardScaler

# Tune an XGBoost regressor that predicts the "popularity" column, then report
# its error on a held-out test split.

# --- Search configuration ---------------------------------------------------
# Every combination of these candidate values is evaluated by the grid search.
param_grid = {
    'max_depth': [3, 5, 7, 9, 11, 13, 15],
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.3, 0.4, 0.5],
    'n_estimators': [50, 65, 75, 100, 120, 150, 180, 200]
}

# Base estimator; the seed is fixed so repeated runs are comparable.
xgbr = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

# 5-fold cross-validated search, ranking candidates by negated MSE.
grid_search = GridSearchCV(
    estimator=xgbr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)

# --- Data -------------------------------------------------------------------
one_m_data = pd.read_csv('./processed_data.csv')

# Features are every column except the target; the target is a plain array.
X = one_m_data.drop(columns=["popularity"])
y = one_m_data["popularity"].values

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# --- Fit and evaluate -------------------------------------------------------
# The search only ever sees the training split; the test split stays held out.
grid_search.fit(X_train, y_train)
best_xgbr = grid_search.best_estimator_

y_pred = best_xgbr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Best Parameters:", grid_search.best_params_)
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

# Results recorded from a previous run of this cell:
#   Best Parameters: {'learning_rate': 0.05, 'max_depth': 9, 'n_estimators': 200}
#   Mean Squared Error (MSE): 123.51
#   R-squared (R2): 0.59

C:\Users\User\.conda\envs\python39\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
C:\Users\User\.conda\envs\python39\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-gcc_10_3_0.dll


Best Parameters: {'learning_rate': 0.05, 'max_depth': 9, 'n_estimators': 200}
Mean Squared Error (MSE): 123.51
R-squared (R2): 0.59
