In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import ndcg_score, average_precision_score
from surprise import Dataset, Reader, SVD
from surprise.model_selection import GridSearchCV

from clearml import Task
from clearml.automation import UniformParameterRange, UniformIntegerParameterRange
from clearml.automation import HyperParameterOptimizer
from clearml.automation.optuna import OptimizerOptuna

from dotenv import load_dotenv
# Read variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the ClearML credentials used below — confirm.
load_dotenv()

True

In [2]:
# Load the train/test rating tables and the three user -> group assignment tables.
data_path = 'data/'
train_data = pd.read_parquet(data_path + 'ratings_train.pq')
test_data = pd.read_parquet(data_path + 'ratings_test.pq')
groups5 = pd.read_parquet(data_path + 'groups5.pq')
groups6 = pd.read_parquet(data_path + 'groups6.pq')
groups7 = pd.read_parquet(data_path + 'groups7.pq')

# Attach each assignment table to the test ratings, renaming the merged-in
# `group` column to group5 / group6 / group7 so the three do not collide.
# merge() defaults to an inner join, so test rows whose userId is missing from
# a groups table are dropped.
for group_size, group in enumerate([groups5, groups6, groups7], start=5):
    test_data = test_data.merge(group, on='userId').rename(columns={'group': f'group{group_size}'})

# The assignment tables are not needed once merged; release them.
del groups5, groups6, groups7

In [3]:
# Register this notebook run as a ClearML task so its logs are tracked remotely.
task = Task.init(
    project_name='MoviesGRS_MFDP',
    task_name='SVDRecommender',
    task_type=Task.TaskTypes.optimizer,
    tags=['SVD', 'HyperParameterTuning'],
    reuse_last_task_id=True,
)

ClearML Task: created new task id=1f9bdbe5e891426988539c51376dce9a
2023-05-28 00:14:23,103 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/f3cb8157bfe7443abdc531a44bb15332/experiments/1f9bdbe5e891426988539c51376dce9a/output/log


In [4]:
# The Reader must be told the rating bounds; take them from the training data.
min_rating, max_rating = train_data['rating'].min(), train_data['rating'].max()
reader = Reader(rating_scale=(min_rating, max_rating))

# Surprise expects exactly these three columns, in (user, item, rating) order.
surprise_train_dataset = Dataset.load_from_df(
    train_data[['userId', 'movieId', 'rating']],
    reader,
)

# The pandas copy is no longer needed once the Surprise dataset exists.
del train_data

In [5]:
# Coarse first pass over the number of latent factors and training epochs.
# NOTE(review): no random seed is passed to SVD or to the CV split here, so
# scores may differ between runs — confirm whether that is acceptable.
param_grid = dict(
    n_factors=[75, 100, 125],
    n_epochs=[25, 50, 75],
)

# Evaluate each combination with 2-fold cross-validation on RMSE and MAE.
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=2,
)
gs.fit(surprise_train_dataset)

ClearML Monitor: Could not detect iteration reporting, falling back to iterations as seconds-from-start


In [17]:
# Best score per measure, then the parameter combination that achieved it.
print(gs.best_score, gs.best_params, sep='\n')

{'rmse': 0.8173886828255589, 'mae': 0.6184700213543655}
{'rmse': {'n_factors': 75, 'n_epochs': 25}, 'mae': {'n_factors': 75, 'n_epochs': 25}}


In [10]:
# The preceding search reported its best result at the smallest values in its
# grid (n_factors=75, n_epochs=25), so this pass explores below them.
param_grid = dict(
    n_factors=[25, 50, 75],
    n_epochs=[10, 15, 25],
)

# Evaluate each combination with 2-fold cross-validation on RMSE and MAE.
gs2 = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=2,
)
gs2.fit(surprise_train_dataset)

In [16]:
# Best score per measure, then the parameter combination that achieved it.
print(gs2.best_score, gs2.best_params, sep='\n')

{'rmse': 0.8145793811843798, 'mae': 0.6160393481400541}
{'rmse': {'n_factors': 25, 'n_epochs': 25}, 'mae': {'n_factors': 25, 'n_epochs': 25}}


In [20]:
# The preceding search reported n_factors=25 (its lowest value) and
# n_epochs=25 (its highest value) as best, so this pass goes lower on factors
# and higher on epochs.
param_grid = dict(
    n_factors=[10, 17, 25],
    n_epochs=[25, 30, 35],
)

# Evaluate each combination with 2-fold cross-validation on RMSE and MAE.
gs3 = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=2,
)
gs3.fit(surprise_train_dataset)

In [21]:
# Best score per measure, then the parameter combination that achieved it.
print(gs3.best_score, gs3.best_params, sep='\n')

{'rmse': 0.8127499756266663, 'mae': 0.6132689833958272}
{'rmse': {'n_factors': 17, 'n_epochs': 35}, 'mae': {'n_factors': 17, 'n_epochs': 35}}


In [22]:
# The preceding search reported n_factors=17 and n_epochs=35 (its highest epoch
# value) as best, so this pass narrows factors around 17 and extends epochs
# upward from 35.
param_grid = dict(
    n_factors=[15, 17, 20],
    n_epochs=[35, 40, 45],
)

# Evaluate each combination with 2-fold cross-validation on RMSE and MAE.
gs4 = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=2,
)
gs4.fit(surprise_train_dataset)

In [23]:
# Best score per measure, then the parameter combination that achieved it.
print(gs4.best_score, gs4.best_params, sep='\n')

{'rmse': 0.8125109695871677, 'mae': 0.6124017024963415}
{'rmse': {'n_factors': 15, 'n_epochs': 35}, 'mae': {'n_factors': 17, 'n_epochs': 40}}


In [24]:
# Close the ClearML task opened by Task.init above now that the searches are done.
task.close()