In [1]:
import os
import sys

os.chdir('../')
!ls

[1m[36mDataset[m[m         [1m[36mPrototype[m[m       [1m[36mRecSysFramework[m[m
[1m[36mLangPTune-main[m[m  README.md       test.ipynb


---

# Dataset definition

In [2]:
import scipy.sparse as sps
from pathlib import Path

data_path = Path('Prototype/data/')
train_path = data_path / 'train_recommendations.csv'
test_path = data_path / 'test_recommendations.csv'
user_embeddings_path = data_path / 'user_embeddings_compressed.npz'

## Create URM train and test

In [3]:
import pandas as pd
train_data = pd.read_csv(train_path)[['user_id', 'app_id']]
test_data = pd.read_csv(test_path)[['user_id', 'app_id']]

train_data

Unnamed: 0,user_id,app_id
0,11806206,434520
1,4525451,632070
2,7498875,17480
3,11647801,12710
4,11432281,45750
...,...,...
5421506,11527454,214340
5421507,8259408,269210
5421508,1347453,1296510
5421509,865903,1800730


Now, we concat the two dataframes, find unique client_ids and reviews_id for defining the mapping. Then, we define URM_train and URM_test and we run optuna

In [4]:
import numpy as np

x = np.load(user_embeddings_path)
user_embeddings = x['embeddings']
user_ids = x['user_ids']

# Since user_ids are stored as strings, we need to convert them to integers
user_ids = [int(i) for i in user_ids]
# Convert user_ids to numpy array to use with sorted_indices
user_ids = np.array(user_ids)
# Now we sort it to ensure consistency
sorted_indices = np.argsort(user_ids)
user_embeddings = user_embeddings[sorted_indices]
user_ids = user_ids[sorted_indices]

user_ids

array([       0,      198,      212, ..., 14305852, 14305954, 14305966])

In [5]:
unique_user_ids = np.unique(np.concatenate((train_data['user_id'].values, test_data['user_id'].values)))
unique_item_ids = np.unique(np.concatenate((train_data['app_id'].values, test_data['app_id'].values)))

unique_user_ids = np.array(sorted(unique_user_ids))
unique_item_ids = np.array(sorted(unique_item_ids))
unique_user_ids

array([       0,      198,      212, ..., 14305852, 14305954, 14305966])

In [6]:
len(unique_user_ids), len(user_ids), len(unique_item_ids)

(295398, 295398, 25840)

In [7]:
set(unique_user_ids) == (set(user_ids))

True

#

In [8]:
# Mapping user_ids and review_ids to indices
user_id_to_index = {user_id: index for index, user_id in enumerate(user_ids)}
item_id_to_index = {review_id: index for index, review_id in enumerate(unique_item_ids)}

user_id_to_index

{np.int64(0): 0,
 np.int64(198): 1,
 np.int64(212): 2,
 np.int64(232): 3,
 np.int64(257): 4,
 np.int64(370): 5,
 np.int64(397): 6,
 np.int64(464): 7,
 np.int64(611): 8,
 np.int64(697): 9,
 np.int64(708): 10,
 np.int64(737): 11,
 np.int64(814): 12,
 np.int64(1072): 13,
 np.int64(1209): 14,
 np.int64(1239): 15,
 np.int64(1256): 16,
 np.int64(1403): 17,
 np.int64(1405): 18,
 np.int64(1519): 19,
 np.int64(1555): 20,
 np.int64(1596): 21,
 np.int64(1611): 22,
 np.int64(1678): 23,
 np.int64(1699): 24,
 np.int64(1768): 25,
 np.int64(1834): 26,
 np.int64(1898): 27,
 np.int64(1959): 28,
 np.int64(1976): 29,
 np.int64(1985): 30,
 np.int64(2146): 31,
 np.int64(2241): 32,
 np.int64(2257): 33,
 np.int64(2354): 34,
 np.int64(2683): 35,
 np.int64(2821): 36,
 np.int64(2881): 37,
 np.int64(2919): 38,
 np.int64(2987): 39,
 np.int64(3011): 40,
 np.int64(3050): 41,
 np.int64(3074): 42,
 np.int64(3117): 43,
 np.int64(3128): 44,
 np.int64(3179): 45,
 np.int64(3190): 46,
 np.int64(3201): 47,
 np.int64(3223): 

# URM definition

In [9]:
train_data['user_id'] = train_data['user_id'].map(user_id_to_index)
train_data['review_id'] = train_data['app_id'].map(item_id_to_index)

test_data['user_id'] = test_data['user_id'].map(user_id_to_index)
test_data['review_id'] = test_data['app_id'].map(item_id_to_index)

train_data

Unnamed: 0,user_id,app_id,review_id
0,244129,434520,6546
1,63556,632070,10640
2,139867,17480,376
3,238404,12710,303
4,230518,45750,754
...,...,...,...
5421506,233916,214340,1278
5421507,158312,269210,2236
5421508,19620,1296510,20015
5421509,13166,1800730,24167


In [10]:
train_data['interaction'] = 1
test_data['interaction'] = 1

In [11]:
# Create URM (User-Rating Matrix) for train and test data
n_users = len(user_ids)
n_items = len(unique_item_ids)

# Convert train data to sparse matrix
URM_train = sps.coo_matrix((train_data['interaction'].values, 
                           (train_data['user_id'].values, train_data['review_id'].values)),
                           shape=(n_users, n_items))

# Convert test data to sparse matrix
URM_test = sps.coo_matrix((test_data['interaction'].values, 
                          (test_data['user_id'].values, test_data['review_id'].values)),
                          shape=(n_users, n_items))

# Convert to CSR format for efficient row slicing
URM_train = URM_train.tocsr()
URM_test = URM_test.tocsr()

print(f"URM train shape: {URM_train.shape}, nonzero: {URM_train.nnz}")
print(f"URM test shape: {URM_test.shape}, nonzero: {URM_test.nnz}")

URM train shape: (295398, 25840), nonzero: 5421508
URM test shape: (295398, 25840), nonzero: 1355378


# Optuna

In [12]:
# Defining Recommender
from RecSysFramework.Recommenders.MatrixFactorization.IALSRecommender import IALSRecommender
from RecSysFramework.Evaluation.Evaluator import EvaluatorHoldout

evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10], verbose=True)

recommender = IALSRecommender(URM_train)

EvaluatorHoldout: Ignoring 12612 ( 4.3%) Users that have less than 1 test interactions


In [13]:
user_embeddings.shape

(295398, 4096)

In [14]:
METRIC = 'NDCG'
METRIC_K = 10
import optuna


def objective_function(trial):
    try:
        params = {
            "epochs": trial.suggest_int("epochs", 1, 10),
            "num_factors": trial.suggest_int("num_factors", 10, 1200),
            "reg": trial.suggest_float("regularization", 1e-5, 1e-1, log=True),
            "epsilon": trial.suggest_float("epsilon", 1e-5, 1.0, log=True),
            "confidence_scaling": trial.suggest_categorical("confidence_scaling", ["linear", "log"]),
            "alpha": trial.suggest_float("alpha", 0.0, 50.0),
        }

        print(f"Current parameters: {params}")
        recommender.fit(**params)

        result_dict, _ = evaluator_test.evaluateRecommender(recommender)

        result = result_dict.loc[METRIC_K][METRIC]
        print("Current {} = {:.4f} with parameters {}".format(METRIC, result, params))
    
    except Exception as e:
        print(f"Error during evaluation: {e}")
        import gc
        gc.collect()
        raise optuna.TrialPruned(f"Trial failed with error: {e}")
    return result

  from .autonotebook import tqdm as notebook_tqdm


In [None]:

csv_path = "Prototype/logs/IALS/trials_results.csv"

class SaveResults(object):
    
    def __init__(self):
        os.makedirs(os.path.dirname(csv_path), exist_ok=True, parents=True)
        self.results_df = pd.DataFrame(columns = ["result"])
    
    def __call__(self, optuna_study, optuna_trial):
        hyperparam_dict = optuna_trial.params.copy()
        hyperparam_dict["result"] = optuna_trial.values[0]
        
        self.results_df = pd.concat([self.results_df, pd.DataFrame([hyperparam_dict])], ignore_index=True)
        self.results_df.to_csv(csv_path, index = False)
    
optuna_study = optuna.create_study(direction="maximize", study_name="IALS_Study", load_if_exists=True, storage="sqlite:///Prototype/optuna_study.db")
        
save_results = SaveResults()

optuna_study.optimize(objective_function,
                      callbacks=[save_results],
                      n_trials = 100)

[I 2025-07-06 16:50:31,275] Using an existing study with name 'IALS_Study' instead of creating a new one.


Current parameters: {'epochs': 9, 'num_factors': 12, 'reg': 0.030775598673061455, 'epsilon': 0.26762766804783306, 'confidence_scaling': 'linear', 'alpha': 34.052896636189224}
IALSRecommender: Epoch 1 of 9. Elapsed time 4.50 sec
IALSRecommender: Epoch 2 of 9. Elapsed time 8.97 sec
IALSRecommender: Epoch 3 of 9. Elapsed time 13.53 sec
IALSRecommender: Epoch 4 of 9. Elapsed time 17.97 sec
IALSRecommender: Epoch 5 of 9. Elapsed time 22.41 sec
IALSRecommender: Epoch 6 of 9. Elapsed time 27.03 sec
IALSRecommender: Epoch 7 of 9. Elapsed time 31.48 sec
IALSRecommender: Epoch 8 of 9. Elapsed time 35.99 sec
IALSRecommender: Epoch 9 of 9. Elapsed time 40.50 sec
IALSRecommender: Terminating at epoch 9. Elapsed time 40.51 sec
EvaluatorHoldout: Processed 282786 (100.0%) in 2.14 min. Users per second: 2204


[I 2025-07-06 16:53:20,246] Trial 3 finished with value: 0.0720847132374586 and parameters: {'epochs': 9, 'num_factors': 12, 'regularization': 0.030775598673061455, 'epsilon': 0.26762766804783306, 'confidence_scaling': 'linear', 'alpha': 34.052896636189224}. Best is trial 3 with value: 0.0720847132374586.


Current NDCG = 0.0721 with parameters {'epochs': 9, 'num_factors': 12, 'reg': 0.030775598673061455, 'epsilon': 0.26762766804783306, 'confidence_scaling': 'linear', 'alpha': 34.052896636189224}


  self.results_df = pd.concat([self.results_df, pd.DataFrame([hyperparam_dict])], ignore_index=True)


OSError: Cannot save file into a non-existent directory: 'Prototype/logs/IALS'