In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt

import pandas as pd
# Load the movie genres/ratings CSV into a DataFrame
df = pd.read_csv('../../data/kaggle_20m/movie_genres_ratings.csv')

# Target is the 'rating' column; every column other than
# 'movieId', 'title' and 'rating' is used as a model feature.
y = df['rating']
X = df.drop(columns=['movieId', 'title', 'rating'])

# Hold out 20% of the rows for testing; the fixed random_state keeps
# the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
import xgboost as xgb

# Wrap the train and test splits (features + labels) in XGBoost DMatrix objects
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

# Booster configuration passed to xgb.train
params = {
    'objective': 'reg:squarederror',  # squared-error regression objective
    'eval_metric': 'rmse',            # evaluation metric: root mean squared error
    'booster': 'gbtree',              # tree-based booster
    'verbosity': 1                    # logging verbosity level
}

# Fit the booster for a fixed number of boosting rounds
num_rounds = 100
bst = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_rounds)


In [5]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Score the held-out test DMatrix with the trained booster
xgb_predictions = bst.predict(data=dtest)

# Compare predictions against the true test ratings:
# mean squared error and mean absolute error
xgb_mse = mean_squared_error(y_true=y_test, y_pred=xgb_predictions)
xgb_mae = mean_absolute_error(y_true=y_test, y_pred=xgb_predictions)

# Report both metrics to four decimal places
print(f"XGBoost MSE on Test Set: {xgb_mse:.4f}")
print(f"XGBoost MAE on Test Set: {xgb_mae:.4f}")


XGBoost MSE on Test Set: 0.3861
XGBoost MAE on Test Set: 0.4611


In [6]:
def recommend_movies_xgb(model, user_data, df, top_n=5):
    """Return the top-rated movies, by model prediction, that the user has not seen.

    Parameters
    ----------
    model
        Object whose ``predict`` method accepts an ``xgb.DMatrix`` and returns
        one predicted rating per row.
    user_data
        DataFrame with a 'movieId' column listing the movies to exclude.
    df
        DataFrame with 'movieId', 'title' and 'rating' columns; all remaining
        columns are passed to the model as features.
    top_n : int, default 5
        Number of rows to return.

    Returns
    -------
    DataFrame
        Columns 'title' and 'predicted_rating' for the ``top_n`` rows of ``df``
        with the largest predicted rating, excluding movies in ``user_data``.
    """
    # Rows of df whose movieId appears in the user's data are excluded
    already_seen = df['movieId'].isin(user_data['movieId'])
    candidates = df.loc[~already_seen]

    # Drop the identifier and target columns so only feature columns are scored
    feature_frame = candidates.drop(columns=['movieId', 'title', 'rating'])
    scores = model.predict(xgb.DMatrix(feature_frame))

    # .assign returns a new frame, so the caller's df is left untouched
    ranked = candidates.assign(predicted_rating=scores)
    return ranked.nlargest(top_n, 'predicted_rating')[['title', 'predicted_rating']]

# Use the rows for these two titles as the example user's data, then print
# the default number of recommendations from the trained booster.
user_data = df.loc[df['title'].isin(['Toy Story (1995)', 'Jumanji (1995)'])]
recommendations = recommend_movies_xgb(model=bst, user_data=user_data, df=df)
print(recommendations)


                                               title  predicted_rating
14661                         Dead Time: Kala (2007)          4.524859
9005   Life On A String (Bian chang Bian Zou) (1991)          4.302702
24702                        The Bloody Olive (1997)          4.294578
15840        Man Vanishes, A (Ningen Johatsu) (1967)          4.234326
12559                 Aerial, The (La antena) (2007)          4.206073
