In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import ndcg_score, average_precision_score
from surprise import Dataset, Reader, SVD
from surprise.model_selection import GridSearchCV

from clearml import Task
from clearml.automation import UniformParameterRange, UniformIntegerParameterRange
from clearml.automation import HyperParameterOptimizer
from clearml.automation.optuna import OptimizerOptuna

from dotenv import load_dotenv
# Load key=value pairs from a .env file into the process environment.
# NOTE(review): presumably this supplies the ClearML credentials used by
# Task.init further down — confirm which variables the .env is expected to hold.
load_dotenv()

True

In [2]:
# Read the train/test rating splits and the per-user group assignments.
data_path = 'data/'
train_data = pd.read_parquet(data_path + 'ratings_train.pq')
test_data = pd.read_parquet(data_path + 'ratings_test.pq')

# One frame per group size; each is joined on userId and contributes a
# `group` column that is renamed to `group<size>` after the merge.
group_frames = {
    5: pd.read_parquet(data_path + 'groups5.pq'),
    6: pd.read_parquet(data_path + 'groups6.pq'),
    7: pd.read_parquet(data_path + 'groups7.pq'),
}

for size, frame in group_frames.items():
    merged = test_data.merge(frame, on='userId')
    test_data = merged.rename(columns={'group': f'group{size}'})

# The individual group frames are not needed once merged into test_data.
del group_frames, merged
test_data

Unnamed: 0,userId,movieId,rating,group5,group6,group7
0,41988,790,4.0,31361,14281,11298
1,41988,524,3.0,31361,14281,11298
2,41988,608,4.0,31361,14281,11298
3,41988,695,3.0,31361,14281,11298
4,41988,566,4.0,31361,14281,11298
...,...,...,...,...,...,...
3596663,7343,164,5.0,17403,2054,4080
3596664,7343,193,4.0,17403,2054,4080
3596665,7343,253,4.0,17403,2054,4080
3596666,7343,483,2.0,17403,2054,4080


In [3]:
# Register this notebook run as a ClearML task (optimizer type) under the
# MoviesGRS_MFDP project; the handle is kept in `task` so it can be closed
# at the end of the notebook.
task = Task.init(
    project_name='MoviesGRS_MFDP',
    task_name='SVDRecommender',
    task_type=Task.TaskTypes.optimizer,
    tags=['SVD', 'HyperParameterTuning'],
    reuse_last_task_id=True,
)

ClearML Task: created new task id=7c6ccd3fcd294026829e5c2f3a67ba63
2023-05-29 00:08:11,008 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/f3cb8157bfe7443abdc531a44bb15332/experiments/7c6ccd3fcd294026829e5c2f3a67ba63/output/log


In [4]:
# Bounds of the rating scale handed to Surprise's Reader.
min_rating, max_rating = 1, 5

# Surprise expects exactly these three columns, in this order.
rating_columns = ['userId', 'movieId', 'rating']

reader = Reader(rating_scale=(min_rating, max_rating))
surprise_train_dataset = Dataset.load_from_df(train_data[rating_columns], reader)

# Drop the notebook-level reference to the raw frame now that the Surprise
# dataset has been built from it.
del train_data

In [5]:
# Sweep 1: 3x3 grid over n_factors in {75, 100, 125} and n_epochs in {25, 50, 75},
# scored by RMSE and MAE with 2-fold cross-validation.
#
# Seed NumPy's global RNG before the search. With no random_state given,
# Surprise draws both the CV fold shuffle and the SVD factor initialisation
# from that global generator, so an unseeded run reports different scores
# (and possibly different best params) on every execution.
np.random.seed(42)

param_grid = {
    'n_factors': [75, 100, 125],
    'n_epochs': [25, 50, 75],
}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=2)
gs.fit(surprise_train_dataset)

ClearML Monitor: Could not detect iteration reporting, falling back to iterations as seconds-from-start


In [6]:
# Best cross-validated score per measure, then the parameters that achieved it.
print(gs.best_score, gs.best_params, sep='\n')

{'rmse': 0.8260481067360091, 'mae': 0.6250276877525159}
{'rmse': {'n_factors': 75, 'n_epochs': 25}, 'mae': {'n_factors': 75, 'n_epochs': 25}}


In [7]:
# Sweep 2: 3x3 grid over n_factors in {25, 50, 75} and n_epochs in {10, 15, 25},
# scored by RMSE and MAE with 2-fold cross-validation.
#
# Seed NumPy's global RNG before the search. With no random_state given,
# Surprise draws both the CV fold shuffle and the SVD factor initialisation
# from that global generator, so an unseeded run reports different scores
# (and possibly different best params) on every execution.
np.random.seed(42)

param_grid = {
    'n_factors': [25, 50, 75],
    'n_epochs': [10, 15, 25],
}

gs2 = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=2)
gs2.fit(surprise_train_dataset)

In [8]:
# Best cross-validated score per measure, then the parameters that achieved it.
print(gs2.best_score, gs2.best_params, sep='\n')

{'rmse': 0.823731995095111, 'mae': 0.6224853277777231}
{'rmse': {'n_factors': 25, 'n_epochs': 25}, 'mae': {'n_factors': 25, 'n_epochs': 25}}


In [9]:
# Sweep 3: 3x3 grid over n_factors in {10, 17, 25} and n_epochs in {25, 30, 35},
# scored by RMSE and MAE with 2-fold cross-validation.
#
# Seed NumPy's global RNG before the search. With no random_state given,
# Surprise draws both the CV fold shuffle and the SVD factor initialisation
# from that global generator, so an unseeded run reports different scores
# (and possibly different best params) on every execution.
np.random.seed(42)

param_grid = {
    'n_factors': [10, 17, 25],
    'n_epochs': [25, 30, 35],
}

gs3 = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=2)
gs3.fit(surprise_train_dataset)

In [10]:
# Best cross-validated score per measure, then the parameters that achieved it.
print(gs3.best_score, gs3.best_params, sep='\n')

{'rmse': 0.8218143757251857, 'mae': 0.6202771955576911}
{'rmse': {'n_factors': 17, 'n_epochs': 30}, 'mae': {'n_factors': 17, 'n_epochs': 30}}


In [12]:
# Close the ClearML task created by Task.init earlier in the notebook.
task.close()