In [1]:
import os
import sys

os.chdir('../')
!ls

[1m[36mDataset[m[m         [1m[36mLangPTune-main[m[m  README.md       test.ipynb
[1m[36mExp[m[m             [1m[36mPrototype[m[m       [1m[36mRecSysFramework[m[m


---

# Dataset definition

In [2]:
import scipy.sparse as sps
from pathlib import Path

data_path = Path('Prototype/data/')
train_path = data_path / 'train_recommendations.csv'
test_path = data_path / 'test_recommendations.csv'
user_embeddings_path = data_path / 'user_embeddings_compressed.npz'

## Create URM train and test

In [3]:
import pandas as pd
train_data = pd.read_csv(train_path)[['user_id', 'review_id']]
test_data = pd.read_csv(test_path)[['user_id', 'review_id']]

train_data

Unnamed: 0,user_id,review_id
0,11806206,39239901
1,4525451,28576430
2,7498875,16553108
3,11647801,30312446
4,11432281,28308884
...,...,...
5421506,11527454,32741657
5421507,8259408,21574511
5421508,1347453,40362916
5421509,865903,39994452


Now, we concat the two dataframes, find unique client_ids and reviews_id for defining the mapping. Then, we define URM_train and URM_test and we run optuna

In [4]:
import numpy as np

x = np.load(user_embeddings_path)
user_embeddings = x['embeddings']
user_ids = x['user_ids']

# Since user_ids are stored as strings, we need to convert them to integers
user_ids = [int(i) for i in user_ids]
# Convert user_ids to numpy array to use with sorted_indices
user_ids = np.array(user_ids)
# Now we sort it to ensure consistency
sorted_indices = np.argsort(user_ids)
user_embeddings = user_embeddings[sorted_indices]
user_ids = user_ids[sorted_indices]

user_ids

array([       0,      198,      212, ..., 14305852, 14305954, 14305966])

In [10]:
unique_user_ids = np.unique(np.concatenate((train_data['user_id'].values, test_data['user_id'].values)))
unique_review_ids = np.unique(np.concatenate((train_data['review_id'].values, test_data['review_id'].values)))

unique_user_ids = np.array(sorted(unique_user_ids))
unique_review_ids = np.array(sorted(unique_review_ids))
unique_user_ids

array([       0,      198,      212, ..., 14305852, 14305954, 14305966])

In [11]:
len(unique_user_ids), len(user_ids), len(unique_review_ids)

(295398, 295398, 6776889)

In [12]:
set(unique_user_ids) == (set(user_ids))

True

#

In [13]:
# Mapping user_ids and review_ids to indices
user_id_to_index = {user_id: index for index, user_id in enumerate(user_ids)}
review_id_to_index = {review_id: index for index, review_id in enumerate(unique_review_ids)}

user_id_to_index

{np.int64(0): 0,
 np.int64(198): 1,
 np.int64(212): 2,
 np.int64(232): 3,
 np.int64(257): 4,
 np.int64(370): 5,
 np.int64(397): 6,
 np.int64(464): 7,
 np.int64(611): 8,
 np.int64(697): 9,
 np.int64(708): 10,
 np.int64(737): 11,
 np.int64(814): 12,
 np.int64(1072): 13,
 np.int64(1209): 14,
 np.int64(1239): 15,
 np.int64(1256): 16,
 np.int64(1403): 17,
 np.int64(1405): 18,
 np.int64(1519): 19,
 np.int64(1555): 20,
 np.int64(1596): 21,
 np.int64(1611): 22,
 np.int64(1678): 23,
 np.int64(1699): 24,
 np.int64(1768): 25,
 np.int64(1834): 26,
 np.int64(1898): 27,
 np.int64(1959): 28,
 np.int64(1976): 29,
 np.int64(1985): 30,
 np.int64(2146): 31,
 np.int64(2241): 32,
 np.int64(2257): 33,
 np.int64(2354): 34,
 np.int64(2683): 35,
 np.int64(2821): 36,
 np.int64(2881): 37,
 np.int64(2919): 38,
 np.int64(2987): 39,
 np.int64(3011): 40,
 np.int64(3050): 41,
 np.int64(3074): 42,
 np.int64(3117): 43,
 np.int64(3128): 44,
 np.int64(3179): 45,
 np.int64(3190): 46,
 np.int64(3201): 47,
 np.int64(3223): 

# URM definition

In [14]:
train_data['user_id'] = train_data['user_id'].map(user_id_to_index)
train_data['review_id'] = train_data['review_id'].map(review_id_to_index)

test_data['user_id'] = test_data['user_id'].map(user_id_to_index)
test_data['review_id'] = test_data['review_id'].map(review_id_to_index)

train_data

Unnamed: 0,user_id,review_id
0,244129,6132950
1,63556,2683615
2,139867,946084
3,238404,3390928
4,230518,2564958
...,...,...
5421506,233916,4304572
5421507,158312,1692743
5421508,19620,6550634
5421509,13166,6423388


In [16]:
train_data['interaction'] = 1
test_data['interaction'] = 1

In [17]:
# Create URM (User-Rating Matrix) for train and test data
n_users = len(user_ids)
n_items = len(unique_review_ids)

# Convert train data to sparse matrix
URM_train = sps.coo_matrix((train_data['interaction'].values, 
                           (train_data['user_id'].values, train_data['review_id'].values)),
                           shape=(n_users, n_items))

# Convert test data to sparse matrix
URM_test = sps.coo_matrix((test_data['interaction'].values, 
                          (test_data['user_id'].values, test_data['review_id'].values)),
                          shape=(n_users, n_items))

# Convert to CSR format for efficient row slicing
URM_train = URM_train.tocsr()
URM_test = URM_test.tocsr()

print(f"URM train shape: {URM_train.shape}, nonzero: {URM_train.nnz}")
print(f"URM test shape: {URM_test.shape}, nonzero: {URM_test.nnz}")

URM train shape: (295398, 6776889), nonzero: 5421511
URM test shape: (295398, 6776889), nonzero: 1355378


# Optuna

In [22]:
# Defining Recommender
from Prototype.Decoder.RecommenderDecoder import RecommenderDecoder
from RecSysFramework.Evaluation.Evaluator import EvaluatorHoldout

evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10], verbose=True)

recommender = RecommenderDecoder(URM_train)

EvaluatorHoldout: Ignoring 12612 ( 4.3%) Users that have less than 1 test interactions
RecommenderDecoder: URM Detected 1355378 (20.0%) items with no interactions.


In [23]:
user_embeddings.shape

(295398, 4096)

In [28]:
user_factors = user_embeddings.shape[1]
METRIC = 'NDCG'
METRIC_K = 10

def objective_function(trial):
    
    '''
            user_factors,
            epochs = 300,
            num_factors = 20,
            alpha = 1.0,
            epsilon = 1.0,
            reg = 1e-3,
            init_mean=0.0,
            init_std=0.1,
            **earlystopping_kwargs):
    '''
    params = {
        "user_factors": user_factors,
        "epochs": trial.suggest_int("epochs", 1, 25),
        "num_factors": user_factors,
        "regularization": trial.suggest_float("regularization", 1e-5, 1e-1, log=True),
        "epsilon": trial.suggest_float("epsilon", 1e-5, 1.0, log=True),
        "confidence_scaling": trial.suggest_categorical("confidence_scaling", ["linear", "log"]),
        "alpha": trial.suggest_float("alpha", 0.0, 50.0),
    }


    recommender.fit(**params)

    result_dict, _ = evaluator_test.evaluateRecommender(recommender)

    result = result_dict.loc[METRIC_K][METRIC]
    print("Current {} = {:.4f} with parameters {}".format(METRIC, result, params))
    
    return result

In [29]:
import optuna

csv_path = "Prototype/logs/RecommenderDecoder/trials_results.csv"

class SaveResults(object):
    
    def __init__(self):
        self.results_df = pd.DataFrame(columns = ["result"])
    
    def __call__(self, optuna_study, optuna_trial):
        hyperparam_dict = optuna_trial.params.copy()
        hyperparam_dict["result"] = optuna_trial.values[0]
        
        self.results_df = pd.concat([self.results_df, pd.DataFrame([hyperparam_dict])], ignore_index=True)
        self.results_df.to_csv(csv_path, index = False)
    
optuna_study = optuna.create_study(direction="maximize", study_name="RecommenderDecoder_Study", load_if_exists=True, storage="sqlite:///Prototype/optuna_study.db")
        
save_results = SaveResults()

optuna_study.optimize(objective_function,
                      callbacks=[save_results],
                      n_trials = 100)

[I 2025-07-05 21:41:27,501] Using an existing study with name 'RecommenderDecoder_Study' instead of creating a new one.


: 