In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt

# Read the dataset CSV into a DataFrame
df = pd.read_csv('../../data/kaggle_20m/movie_genres_ratings.csv')

# The target is the 'rating' column; the features are every remaining column
# once 'movieId', 'title' and the target itself have been dropped
y = df['rating']
X = df.drop(columns=['movieId', 'title', 'rating'])

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [2]:
# Convert the dataset into DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Hyperparameters for the native `xgb.train` API.
# NOTE: 'n_estimators' is deliberately not listed here. It is an argument of the
# scikit-learn wrapper (e.g. XGBRegressor), and the native API ignores it with the
# warning 'Parameters: { "n_estimators" } are not used.'. With `xgb.train`, the
# number of trees is controlled by the boosting-rounds argument of the call itself.
params = {
    'objective': 'reg:squarederror',  # Regression with squared-error loss
    'eval_metric': 'rmse',  # Root Mean Squared Error
    'booster': 'gbtree',  # Use tree-based models
    'verbosity': 1,  # Print warnings
    'max_depth': 6,  # Maximum depth of each tree
    'learning_rate': 0.1,  # Step-size shrinkage (alias of 'eta')
}

In [3]:
# Fit the booster for a fixed number of boosting rounds
num_rounds = 100
bst = xgb.train(params, dtrain, num_boost_round=num_rounds)

# Score the held-out test set
xgb_predictions = bst.predict(dtest)

# Error metrics on the test set: MAE, MSE, and RMSE (square root of the MSE)
xgb_mae = mean_absolute_error(y_test, xgb_predictions)
xgb_mse = mean_squared_error(y_test, xgb_predictions)
xgb_rmse = np.sqrt(xgb_mse)

# Report all three metrics, one per line, rounded to four decimals
print(
    f"XGBoost MSE on Test Set: {xgb_mse:.4f}",
    f"XGBoost RMSE on Test Set: {xgb_rmse:.4f}",
    f"XGBoost MAE on Test Set: {xgb_mae:.4f}",
    sep="\n",
)

Parameters: { "n_estimators" } are not used.

XGBoost MSE on Test Set: 0.3826
XGBoost RMSE on Test Set: 0.6186
XGBoost MAE on Test Set: 0.4589
