<a href="https://colab.research.google.com/github/RecSys-lab/movifex_dataset/blob/main/examples/benchmarking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **MoViFex Dataset - Benchmarking (CF-Side)**

🎬 Dataset: [link](https://huggingface.co/datasets/alitourani/MoViFex_Dataset/tree/main)

🎬 Framework: [link](https://github.com/RecSys-lab/MoViFex)

In [1]:
!pip install -q numpy==1.25.2 cornac==2.2.1

In [2]:
# ╔════════════════════════════════════════════════════════════════════════════╗
#  Block 0 – EXPERIMENT CONFIGURATION
# ╚════════════════════════════════════════════════════════════════════════════╝

# Experiment Parameters
MODEL_CHOICE    = 'cf'            # 'cf' | 'vbpr' | 'amr' | 'vmf'
FAST_Prtye      = True            # Fast prototype → n_epochs = 1
PARALLEL_HPO    = True            # CPU-parallelization flag
USE_GPU_FOR_HPO = False           # CuPy + Cornac use_gpu where available
SEED            = 42              # Seed for reproducibility
VERBOSE         = True            # Logging level
SPLIT_MODE      = 'random'        # 'temporal' | 'random' | 'per-user'
TEXT_MAX_PARTS  = 15
N_EPOCHS        = 20
TEST_RATIO      = 0.2
K_CORE          = 0
COLD_TH         = 5

# MoViFex Dataset
MOVIE_VARIANT = "full_movies"     # 'full_movies' | 'movie_shots' | 'movie_trailers'
FEATURE_EXTRACTOR = "incp3"       # 'incp3' | 'vgg19'
FEATURE_AGGREGATOR = "max"        # 'max' | 'mean'

# Text-Augmented Dataset
LLM_VARIANT = 'openai'            # 'openai' | 'st' | 'llama'
TEXT_AUGMENTED  = True            # True → use augmented textual path

# MovieLenz Dataset
ML_VARIANT = "ml-25m"             # 'ml-25m' | '100k' | '1m'

# Fusion
MULTI_VARIANTS = [
  ('concat', None),
  ('pca',   0.95),
  ('cca',   40),
]

## **[Step 1] Clone & Install Libraries**

In [3]:
import os
import math
import json, copy
import numpy as np
import pandas as pd
import scipy.sparse
import inspect, time, itertools
# SKLearn
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import CCA
from sklearn.preprocessing import StandardScaler
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import train_test_split
# Cornac
import cornac
from cornac.models import MF, VBPR, VMF, AMR, VAECF, MostPop
from cornac.data import ImageModality, FeatureModality, Dataset

# Initialization
rng = np.random.RandomState(SEED)
np.random.seed(SEED)

In [4]:
# Check if we need to clone the helper framework
if os.path.exists('/content/MoViFex'):
  print("✨ The framework is already cloned!")
else:
  # Clone the repo
  print("✨ Cloning the repository")
  !git clone https://github.com/RecSys-lab/MoViFex.git

  # Install the required library
  %cd MoViFex
  !pip install -e .

  # Add the repository to the Python path
  import sys
  sys.path.append('/content/MoViFex')

  # Go back to the root
  %cd ..

✨ The framework is already cloned!


## 🚀 **[Step 2] Load and Prepare Data**

### I. *Load the Dataset Metadata File*

In [5]:
import movifex
from movifex.utils import loadDataFromCSV, loadJsonFromUrl
from movifex.datasets.movielens.downloader import downloadMovielens25m
from movifex.datasets.movifex.helper_visualfeats_agg import generatedAggFeatureAddresses
from movifex.datasets.movifex.helper_visualfeats_agg import loadAggregatedFeaturesIntoDataFrame

# Variables
configs = {
  "name": "MoViFex-visual",
  "path_metadata": "https://huggingface.co/datasets/alitourani/MoViFex_Dataset/resolve/main/stats.json",
  "path_raw": "https://huggingface.co/datasets/alitourani/MoViFex_Dataset/raw/main/",
  "feature_sources": ["full_movies", "movie_shots", "movie_trailers"],
  "agg_feature_sources": ["full_movies_agg", "movie_shots_agg", "movie_trailers_agg"],
  "feature_models": ["incp3", "vgg19"],
  "aggregation_models": ["Max", "Mean"]
}

# Metadata fetching
datasetMetadataUrl = configs['path_metadata']
print(f"✨ Fetching the dataset metadata from '{datasetMetadataUrl}' ...")
jsonData = loadJsonFromUrl(datasetMetadataUrl)
movifexDF_meta = pd.DataFrame(jsonData)
movifexDF_meta = movifexDF_meta.rename(columns={'id': 'itemId'})
movifexDF_meta['itemId'] = movifexDF_meta['itemId'].astype(str).astype(int)
print("'MoViFex-visual' dataset is loaded into a DataFrame:")
print(movifexDF_meta.head(5))

# Aggregated features fetching
print(f"\n✨ Preparing the addresses of aggregated features ...")
aggFeatureAddresses = generatedAggFeatureAddresses(configs)
print(f"\n✨ Now, loading the aggregated features into DataFrames for '{MOVIE_VARIANT}' extracted by '{FEATURE_EXTRACTOR}' ...")
movifexDF_featMax, movifexDF_featMean = loadAggregatedFeaturesIntoDataFrame(aggFeatureAddresses[f'{MOVIE_VARIANT}_agg'][FEATURE_EXTRACTOR])
movifexDF_featMax['embedding'] = movifexDF_featMax['embedding'].apply(lambda x: np.fromstring(x, sep=','))
movifexDF_featMean['embedding'] = movifexDF_featMean['embedding'].apply(lambda x: np.fromstring(x, sep=','))
print(f"\nThe data loaded into DataFrames! Sample of the 'Max' DataFrame:")
movifexDF_featMax.head(3)

✨ Fetching the dataset metadata from 'https://huggingface.co/datasets/alitourani/MoViFex_Dataset/resolve/main/stats.json' ...
'MoViFex-visual' dataset is loaded into a DataFrame:
   itemId                       title  year                      genres
0       6                        Heat  1995   [Action, Crime, Thriller]
1      50         Usual Suspects, The  1995  [Crime, Mystery, Thriller]
2     111                 Taxi Driver  1976    [Crime, Drama, Thriller]
3     150                   Apollo 13  1995    [Adventure, Drama, IMAX]
4     165  Die Hard: With a Vengeance  1995   [Action, Crime, Thriller]

✨ Preparing the addresses of aggregated features ...
- Fetching URL from 'https://huggingface.co/datasets/alitourani/MoViFex_Dataset/resolve/main/stats.json' ...
- Fetching all movie IDs ...
- Found 274 movie IDs ...
- Generating a list of addresses to fetch the aggregated features ...
- Generated 1644 aggregated feature addresses, e.g., https://huggingface.co/datasets/alitourani/MoViF

Unnamed: 0,itemId,embedding
0,6,"[2.20092, 2.158851, 1.767559, 1.588628, 1.9214..."
1,50,"[2.608933, 2.313115, 1.61709, 2.527633, 1.2831..."
2,111,"[2.064346, 1.855269, 1.985471, 2.009896, 1.377..."


### II. *Load MovieLenz-25M*

In [6]:
# Variables
datasetPath = "/content/ML25"
movielenzUrl = f"https://files.grouplens.org/datasets/movielens/{ML_VARIANT}.zip"

def loadGenresMovieLenz(df):
  df['itemId'] = df['itemId'].astype(str)
  df['genres'] = df['genres'].map(lambda s: s.split('|') if isinstance(s,str) else [])
  return df[['itemId','genres']]

# Download the MovieLenz Dataset
if os.path.exists('/content/ML25'):
  print("✨ The dataset is already downloaded!")
  datasetPath = os.path.join(datasetPath, "ml-25m")
else:
  print(f"Downloading the '{ML_VARIANT}' dataset from '{movielenzUrl}' ...")
  isDownloadSuccessful = downloadMovielens25m(movielenzUrl, datasetPath)
  if not isDownloadSuccessful:
    print('- Seems like there was a problem while downloading!')
  datasetPath = os.path.join(datasetPath, "ml-25m")

# Load the Files
print(f"\nLoading '{ML_VARIANT}' files from '{datasetPath}' ...")
mlMoviesDF = loadDataFromCSV(os.path.join(datasetPath, "movies.csv"))
mlRatingsDF = loadDataFromCSV(os.path.join(datasetPath, "ratings.csv"))
# Normalization
mlMoviesDF = mlMoviesDF.rename(columns={'movieId': 'itemId'})
mlRatingsDF = mlRatingsDF.rename(columns={'movieId': 'itemId'})
print(f"{len(mlMoviesDF)} movies and {len(mlRatingsDF)} ratings have been loaded!")
mlMoviesDF.head(5)

# Load Genres
mlGenresDF = loadGenresMovieLenz(mlMoviesDF)
mlGenresDict  = dict(zip(mlGenresDF.itemId, mlGenresDF.genres))

✨ The dataset is already downloaded!

Loading 'ml-25m' files from '/content/ML25/ml-25m' ...
62423 movies and 25000095 ratings have been loaded!


### IV. *Load LLM-Augmented Text*

In [7]:
# Variables
textAugBase = ("https://raw.githubusercontent.com/yasdel/Poison-RAG-Plus/"
                 "main/AttackData/Embeddings_from_Augmentation_Attack_Data/"
                 "ml-latest-small/")
textAugPrefix_aug = "enriched_description_part"
textAugPrefix_raw = "originalraw_combined_all_part"

# Parser Utility
def _parse_safe(s: str) -> np.ndarray:
  vec = np.fromstring(str(s).replace(',', ' '), sep=' ', dtype=np.float32)
  if not np.all(np.isfinite(vec)):
    vec = np.nan_to_num(vec, nan=0.0, posinf=0.0, neginf=0.0)
  return vec

parse = _parse_safe

def loadTextAugmented(model, augmented, max_parts=15, verbose=True):
  prefix = f'{model}_{textAugPrefix_aug}' if augmented else f'{model}_{textAugPrefix_raw}'
  dfs = []
  for i in range(1, max_parts+1):
    url = f"{textAugBase}{prefix}{i}.csv.gz"
    try:
        df = pd.read_csv(url, compression='gzip')
        df['text'] = df.embeddings.map(parse)
        dfs.append(df[['itemId','text']])
    except:
        break
  out = pd.concat(dfs).drop_duplicates('itemId')
  if verbose:
      tag = 'AUG' if augmented else 'ORIG'
      print(f"[Text]  {tag} parts={len(dfs)} items={len(out):,}")
  return out

# Load
print(f"\nLoading Textual '{LLM_VARIANT}' files '{'with' if TEXT_AUGMENTED else 'without'}' augmentation ...")
textAugDF = loadTextAugmented(LLM_VARIANT, TEXT_AUGMENTED)
textAugDF.head(5)


Loading Textual 'openai' files 'with' augmentation ...
[Text]  AUG parts=3 items=1,606


Unnamed: 0,itemId,text
0,1516,"[-0.009714896, -0.024003807, -0.0416483, -0.02..."
1,5952,"[0.0024696812, -0.03361401, -0.019164726, -0.0..."
2,370,"[-0.0020823667, -0.027629452, 0.006294715, -0...."
3,292,"[-0.011372974, -0.038963087, -0.024515806, -0...."
4,1209,"[0.007154904, -0.025495825, -0.011659123, -0.0..."


## **📊 [Step 3] Experiments**

In [8]:
#  Block A – DATASET SPLIT
# ╔════════════════════════════════════════════════════════════════════════════╗

# Trian/Test Split
ratings = mlRatingsDF.copy()
if SPLIT_MODE=='random':
  ratings = ratings.sample(frac=1, random_state=SEED).reset_index(drop=True)
  sz = int(len(ratings)*TEST_RATIO)
  train_df, test_df = ratings.iloc[:-sz].copy(), ratings.iloc[-sz:].copy()
elif SPLIT_MODE=='temporal':
  ratings = ratings.sort_values('timestamp')
  sz = int(len(ratings)*TEST_RATIO)
  train_df, test_df = ratings.iloc[:-sz].copy(), ratings.iloc[-sz:].copy()
else:
  trs, tes = [], []
  for uid, grp in ratings.groupby('userId'):
      grp = grp.sort_values('timestamp')
      tes.append(grp.iloc[-1]); trs.extend(grp.iloc[:-1].to_dict('records'))
  train_df, test_df = pd.DataFrame(trs), pd.DataFrame(tes)
# Get the
if VERBOSE: print(f"✔ Split train={len(train_df):,}, test={len(test_df):,}")
train_set = Dataset.from_uir(train_df[['userId','itemId','rating']].values.tolist())

# Preparation
for df in (movifexDF_featMax, movifexDF_featMean, textAugDF):
  df['itemId']=df.itemId.astype(str)

# Create a DataFrame of common merged data
common = set(movifexDF_featMax.itemId) & set(movifexDF_featMean.itemId) & set(textAugDF.itemId)
movifexDF_featMax, movifexDF_featMean, textAugDF = [df[df.itemId.isin(common)].reset_index(drop=True)
                        for df in (movifexDF_featMax, movifexDF_featMean, textAugDF)]
merged = (
    movifexDF_featMax.merge(movifexDF_featMean, on='itemId')
                     .merge(textAugDF, on='itemId')
                     .rename(columns={
                         'embedding_x': 'visual_max',
                         'embedding_y': 'visual_mean'
                     })
)
merged['all'] = merged.apply(lambda r: np.hstack([r.visual_max, r.visual_mean, r.text]), axis=1)

# Apply on Train/Test sets
keep = set(merged.itemId)
test_df = test_df[test_df.itemId.astype(str).isin(keep)].reset_index(drop=True)
train_df = train_df[train_df.itemId.astype(str).isin(keep)].reset_index(drop=True)
train_set = Dataset.from_uir(train_df[['userId','itemId','rating']].values.tolist())
if VERBOSE:
  print(f"✔ Embeddings intersect keeps {len(keep):,} items!")

#  Block B – MULTIMODALITY VARIANTS
# ╔════════════════════════════════════════════════════════════════════════════╗

def imageModality(col):
  return ImageModality(
    features = np.vstack(merged[col]),
    ids = merged.itemId,
    normalized = True
  )

def featModality(col):
  return FeatureModality(
    features = np.vstack(merged[col]),
    ids = merged.itemId,
    normalized = True
  )

modalitiesDict = {}
modalitiesDict['concat'] = {
  'all_image': imageModality('all'),
  'all_feature': featModality('all'),
  'text_image': imageModality('text'),
  'visual_image_max': imageModality('visual_max'),
  'visual_image_mean': imageModality('visual_mean'),
}
if VERBOSE: print("✔ Concating results are ready!")

for tag, param in MULTI_VARIANTS:
  if tag =='concat': continue
  if tag == 'pca':
    ratio = param; tag_name=f"pca_{int(ratio*100)}"
    mat = StandardScaler().fit_transform(np.vstack(merged['all']))
    mat = PCA(ratio,random_state=SEED).fit_transform(mat)
    merged[tag_name] = list(mat.astype(np.float32))
    modalitiesDict[tag_name] = {
        'all_image':imageModality(tag_name),'all_feature':featModality(tag_name)
    }
    if VERBOSE: print(f"✔ PCA {int(ratio*100)} dims={mat.shape[1]}")
  elif tag == 'cca':
    comps = param; tag_name=f"cca_{comps}"
    half = len(merged['all'][0])//2
    big = np.vstack(merged['all']);X,Y=big[:,:half],big[:,half:]
    cca = CCA(n_components=comps).fit(X,Y)
    merged[tag_name] = list(cca.transform(X,Y)[0].astype(np.float32))
    modalitiesDict[tag_name] = {'all_image':imageModality(tag_name),'all_feature':featModality(tag_name)}
    if VERBOSE: print(f"✔ CCA {comps} dims={comps}")

#  Block C – HELPERS
# ╔════════════════════════════════════════════════════════════════════════════╗
def model_is_selected(tag:str)->bool:
  return (MODEL_CHOICE=='cf'  and tag in {'MF','VAECF','TopPop'}) or \
        (MODEL_CHOICE=='vbpr'and tag=='VBPR') or \
        (MODEL_CHOICE=='vmf' and tag=='VMF') or \
        (MODEL_CHOICE=='amr' and tag=='AMR')

def _fit_with_modalities(model,base_ds,item_img=None,item_feat=None):
  ds=copy.deepcopy(base_ds)
  if item_img is not None: ds.item_image=item_img
  if item_feat is not None:ds.item_feature=item_feat
  model.fit(ds)

✔ Split train=20,000,076, test=5,000,019
✔ Embeddings intersect keeps 121 items!
✔ Concating results are ready!
✔ PCA 95 dims=103
✔ CCA 40 dims=40


In [9]:
#  Block D – GRID SEARCH (parallel + GPU)
# ╔════════════════════════════════════════════════════════════════════════════╗

# Monkey‑patch so that csr_matrix.A → csr_matrix.toarray()
if not hasattr(scipy.sparse.csr_matrix, 'A'):
    scipy.sparse.csr_matrix.A = property(lambda self: self.toarray())

CUPY = False
if USE_GPU_FOR_HPO:
  try:
    import cupy as cp; CUPY=True; print("✔ CuPy enabled")
  except ImportError:
    print("✖ CuPy not found"); USE_GPU_FOR_HPO=False

# Validation split ------------------------------------------------------------
train_fit_df, val_df = train_test_split(train_df, test_size=0.1, random_state=SEED)
val_grp = val_df.groupby('userId')['itemId'].apply(list).to_dict()
train_seen = train_fit_df.groupby('userId')['itemId'].apply(set).to_dict()
train_fit_set = Dataset.from_uir(train_fit_df[['userId','itemId','rating']].values.tolist())
all_iids, iid_map = train_fit_set.item_ids, train_fit_set.iid_map

def metric(model,topN=10):
  rec,ndcg=[],[]
  for uid,gt in val_grp.items():
    if uid not in train_fit_set.uid_map:continue
    uidx=train_fit_set.uid_map[uid]
    scores=model.score(uidx)
    if USE_GPU_FOR_HPO and CUPY:scores=cp.asarray(scores)
    seen=train_seen.get(uid,set())
    cand=[(it,scores[iid_map[it]]) for it in all_iids if it not in seen]
    cand.sort(key=lambda x:float(x[1]),reverse=True)
    top=[c[0] for c in cand[:topN]]
    rec.append(len(set(top)&set(gt))/len(gt) if gt else 0)
    dcg=sum(1/math.log2(r+2) for r,it in enumerate(top) if it in gt)
    idcg=sum(1/math.log2(r+2) for r in range(min(len(gt),topN)))
    ndcg.append(dcg/idcg if idcg else 0)
  return 0.5*(np.mean(rec)+np.mean(ndcg))

def gridSearch(cls,name,scenario,param_grid,*fit_args):
  start=time.time();print(f"🔄 HPO {name} {scenario} – {len(param_grid)} configs")
  def _eval(p):
    p2=p.copy()
    if USE_GPU_FOR_HPO and 'use_gpu' in inspect.signature(cls).parameters:
        p2['use_gpu']=True
    m=cls(seed=SEED,**p2)
    _fit_with_modalities(m,*fit_args)
    s=metric(m)
    if VERBOSE: print(f"    ↳ {p2}  → {s:.4f}")
    return s,m,p2
  if PARALLEL_HPO and len(param_grid)>1:
    with ThreadPoolExecutor(max_workers=min(8,len(param_grid))) as ex:
      results=list(ex.map(_eval,param_grid))
  else: results=[_eval(p) for p in param_grid]
  best=max(results,key=lambda x:x[0])
  print(f"✔ best {name} {scenario} = {best[2]} ({best[0]:.4f}) "
        f"[{time.time()-start:.1f}s]")
  return best[1],best[2]

# Hyper‑param grids
GR_MF = [{'k':k,'learning_rate':lr,'lambda_reg':0.01,'max_iter':50}
         for k in (32,64,128) for lr in (0.01,0.005)][0:5]

GR_VAECF=[{'k':k,'learning_rate':lr,'beta':0.01}
          for k in (32,64,128) for lr in (0.001,0.0005)][0:5]

if FAST_Prtye:
  GR_VBPR = [
      {'k': k, 'k2': k2, 'learning_rate': lr, 'lambda_w': 0.01, 'lambda_b': 0.01, 'n_epochs': 1}
      for k in (32, 64, 128)
      for k2 in (8, 16)
      for lr in (0.001,)
  ][0:5]
else:
  GR_VBPR = [
      {'k': k, 'k2': k2, 'learning_rate': lr, 'lambda_w': 0.01, 'lambda_b': 0.01, 'n_epochs': N_EPOCHS}
      for k in (32, 64, 128)
      for k2 in (8, 16)
      for lr in (0.001,)
  ][0:5]

if FAST_Prtye:
  GR_VMF=[{'k':k,'learning_rate':lr,'n_epochs':1}
          for k in (32,64,128) for lr in (0.01,)][0:5]
else:
  GR_VMF=[{'k':k,'learning_rate':lr, 'n_epochs': N_EPOCHS}
          for k in (32,64,128) for lr in (0.01,)][0:5]

if FAST_Prtye:
  GR_AMR=[{'k':k,'k2':k2,'learning_rate':lr,'n_epochs':1}
          for k in (32,64,128) for k2 in (16,32) for lr in (0.001,)][0:5]
else:
  GR_AMR=[{'k':k,'k2':k2,'learning_rate':lr}
          for k in (32,64,128) for k2 in (16,32) for lr in (0.001,)][0:5]

# Model Configurations
modelsCfg={}

if model_is_selected('MF'):
  modelsCfg['MF'] = gridSearch(MF,'MF','(na)',GR_MF,train_fit_set)

# if model_is_selected('VAECF'):
#   modelsCfg['VAECF'] = gridSearch(VAECF,'VAECF','(na)',GR_VAECF,train_fit_set)

if model_is_selected('VBPR'):
  for mod in ('visual_max','visual_mean','text'):
    modelsCfg[f'VBPR_{mod}'] = gridSearch(VBPR,'VBPR', mod, GR_VBPR,
                                        train_fit_set, modalitiesDict['concat'][f'{mod}_image'])
  for mv in modalitiesDict:
    if mv=='concat':continue
    modelsCfg[f'VBPR_{mv}'] = gridSearch(VBPR,'VBPR',mv,GR_VBPR,
                                    train_fit_set,
                                    modalitiesDict[mv]['all_image'])

if model_is_selected('VMF'):
  for mod in ('visual_max','visual_mean','text'):
    modelsCfg[f'VMF_{mod}'] = gridSearch(VMF,'VMF',mod,GR_VMF,
                                    train_fit_set,
                                    modalitiesDict['concat'][f'{mod}_image'])
  for mv in modalitiesDict:
    if mv == 'concat':continue
    modelsCfg[f'VMF_{mv}'] = gridSearch(VMF,'VMF',mv,GR_VMF,
                                  train_fit_set,
                                  modalitiesDict[mv]['all_image'])

if model_is_selected('AMR'):
  for mod in ('visual_max','visual_mean','text'):
    modelsCfg[f'AMR_{mod}'] = gridSearch(AMR,'AMR',mod,GR_AMR,
                                    train_fit_set,
                                    modalitiesDict['concat'][f'{mod}_image'],
                                    modalitiesDict['concat']['all_feature'])
  for mv in modalitiesDict:
    if mv=='concat':continue
    modelsCfg[f'AMR_{mv}'] = gridSearch(AMR,'AMR',mv,GR_AMR,
                                  train_fit_set,
                                  modalitiesDict[mv]['all_image'],
                                  modalitiesDict[mv]['all_feature'])

print(f"✔ HPO done – {len(modelsCfg)} configs kept")


#  Block F‑2 – REFIT BEST CONFIGS ON FULL TRAIN
# ╔════════════════════════════════════════════════════════════════════════════╗
finalModels = {}

if model_is_selected('TopPop'):
  mp = MostPop(); mp.fit(train_set)
  finalModels[('TopPop','NA')] = mp

for tag,(best_model,cfg) in modelsCfg.items():
  mdl,variant = tag.split('_',1) if '_' in tag else (tag,'NA')
  extras={}
  if USE_GPU_FOR_HPO and 'use_gpu' in inspect.signature(best_model.__class__).parameters:
    extras['use_gpu']=True
  if mdl in {'MF','VAECF'}:
    new=best_model.__class__(seed=SEED,**cfg,**extras)
    _fit_with_modalities(new,train_set)
    finalModels[(mdl,variant)]=new
  else:
    if variant in ('visual','audio','text'):
      img = modalitiesDict['concat'][f'{variant}_image']; feat=None
    else:
      img = modalitiesDict[variant]['all_image']
      feat = modalitiesDict[variant].get('all_feature')
    new = best_model.__class__(seed=SEED,**cfg,**extras)
    _fit_with_modalities(new,train_set,img,feat)
    finalModels[(mdl,variant)] = new
  if VERBOSE: print(f"✔ Re‑fit {mdl}_{variant}")

print(f"✔ Total final models = {len(finalModels)}")

🔄 HPO MF (na) – 5 configs
    ↳ {'k': 32, 'learning_rate': 0.01, 'lambda_reg': 0.01, 'max_iter': 50}  → 0.2039
    ↳ {'k': 32, 'learning_rate': 0.005, 'lambda_reg': 0.01, 'max_iter': 50}  → 0.2227
    ↳ {'k': 64, 'learning_rate': 0.01, 'lambda_reg': 0.01, 'max_iter': 50}  → 0.2258
    ↳ {'k': 64, 'learning_rate': 0.005, 'lambda_reg': 0.01, 'max_iter': 50}  → 0.2341
    ↳ {'k': 128, 'learning_rate': 0.01, 'lambda_reg': 0.01, 'max_iter': 50}  → 0.2451
✔ best MF (na) = {'k': 128, 'learning_rate': 0.01, 'lambda_reg': 0.01, 'max_iter': 50} (0.2451) [163.9s]
✔ HPO done – 1 configs kept
✔ Re‑fit MF_NA
✔ Total final models = 2


In [13]:
# ╔════════════════════════════════════════════════════════════════════════════╗
#  Block G – GENERATE TOP‑N LISTS + PER‑USER METRICS  (adds CR & CV)
# ╚════════════════════════════════════════════════════════════════════════════╝
import collections
topN_k = 10
train_seen = train_df.groupby('userId')['itemId'].apply(set).to_dict()
all_iids, iid_map = train_set.item_ids, train_set.iid_map

def _topN(model, uid, N):
    if uid not in train_set.uid_map:
        return []
    scores = model.score(train_set.uid_map[uid])
    cand = [(it, scores[iid_map[it]]) for it in all_iids
            if it not in train_seen.get(uid, set())]
    cand.sort(key=lambda x: float(x[1]), reverse=True)
    return [c[0] for c in cand[:N]]

# Helpers (with fixed ild)
train_pop = {}
for _, iids_r in train_set.user_data.items():
    for ii in iids_r[0]:
        train_pop[ii] = train_pop.get(ii, 0) + 1
max_pop = max(train_pop.values())

cold_items    = {i for i, c in train_pop.items() if c <= COLD_TH}
coverage_dict = collections.defaultdict(set)

def gini(x):
    if not x: return 0.0
    sx = sorted(x); n = len(sx); tot = sum(sx)
    if tot == 0: return 0.0
    cum = sum((i + 1) * val for i, val in enumerate(sx))
    return (2 * cum) / (n * tot) - (n + 1) / n

def ild(genres_list):
    if len(genres_list) <= 1:
        return 0.0
    pairs = itertools.combinations(genres_list, 2)
    dissim = [
        1 - len(set(a) & set(b)) / len(set(a) | set(b))
        for a, b in pairs
    ]
    return float(np.mean(dissim))

def kl_div(p, q, eps=1e-8):
    keys = set(p) | set(q)
    p_vec = np.array([p.get(k, eps) for k in keys], dtype=float)
    q_vec = np.array([q.get(k, eps) for k in keys], dtype=float)
    p_vec /= p_vec.sum(); q_vec /= q_vec.sum()
    return (p_vec * np.log(p_vec / q_vec)).sum()

# Generate per‑user rows
rows = []
for uid, grp in test_df.groupby('userId'):
    gt          = set(grp.itemId.tolist())
    train_items = train_seen.get(uid, set())
    user_genres = list(itertools.chain(*(mlGenresDict.get(str(it), []) for it in train_items)))
    user_gen_dist = pd.Series(user_genres).value_counts().to_dict()

    r = {
        'userId': uid,
        'train' : list(train_items),
        'gt'    : list(gt)
    }

    for (mdl, scn), mod in finalModels.items():
        rec = _topN(mod, uid, topN_k)
        r[f"rec_{mdl}_{scn}"] = rec

        # Cold‑start Rate
        cold_rate = sum(it in cold_items for it in rec) / len(rec) if rec else 0
        r[f"CR_{mdl}_{scn}"] = cold_rate
        coverage_dict[(mdl, scn)].update(rec)

        pop_bias = np.mean([train_pop.get(it, 0) / max_pop for it in rec]) if rec else 0
        fairness = 1 - gini([train_pop.get(it, 0) for it in rec])
        novelty  = np.mean([-math.log2(train_pop.get(it, 1) /
                                       len(train_set.user_data)) for it in rec]) if rec else 0

        rec_gen = [mlGenresDict.get(str(it), ['(none)']) for it in rec]
        diversity = ild(rec_gen)

        rec_gen_flat = list(itertools.chain(*rec_gen))
        rec_gen_dist = pd.Series(rec_gen_flat).value_counts().to_dict()
        calib = kl_div(user_gen_dist, rec_gen_dist)

        dcg   = sum(1 / math.log2(rnk + 2) for rnk, it in enumerate(rec) if it in gt)
        idcg  = sum(1 / math.log2(rnk + 2) for rnk in range(min(len(gt), 10)))
        ndcg  = dcg / idcg if idcg else 0
        recall = len(set(rec) & gt) / len(gt) if gt else 0

        r.update({
            f"PB_{mdl}_{scn}": pop_bias,
            f"FA_{mdl}_{scn}": fairness,
            f"NO_{mdl}_{scn}": novelty,
            f"DI_{mdl}_{scn}": diversity,
            f"CB_{mdl}_{scn}": calib,
            f"RC_{mdl}_{scn}": recall,
            f"ND_{mdl}_{scn}": ndcg
        })

    rows.append(r)

recs = pd.DataFrame(rows)

# Broadcast coverage columns
for (mdl, scn), items in coverage_dict.items():
    coverage = len(items) / len(all_iids)
    recs[f"CV_{mdl}_{scn}"] = coverage

fn_suffix = f"{ML_VARIANT}_{MODEL_CHOICE}"
recs.to_csv(f"reclist_df_{fn_suffix}.csv", index=False)
print(f"✔ reco lists saved → reclist_df_{fn_suffix}.csv")

display(recs.head(5))

✔ reco lists saved → reclist_df_ml-25m_cf.csv


Unnamed: 0,userId,train,gt,rec_TopPop_NA,CR_TopPop_NA,PB_TopPop_NA,FA_TopPop_NA,NO_TopPop_NA,DI_TopPop_NA,CB_TopPop_NA,...,CR_MF_NA,PB_MF_NA,FA_MF_NA,NO_MF_NA,DI_MF_NA,CB_MF_NA,RC_MF_NA,ND_MF_NA,CV_TopPop_NA,CV_MF_NA
0,2,"[4993, 33794, 356, 5574, 553, 5418, 266, 2028,...","[5952, 480, 4995, 1584, 1201, 4022, 3510]","[296.0, 480.0, 2959.0, 50.0, 5952.0, 608.0, 58...",0.0,0.035538,0.1,15.801615,0.0,18.423243,...,0.0,0.035538,0.1,15.801615,0.0,18.423243,0.285714,0.256174,0.636364,1.0
1,3,"[4993, 4226, 4995, 33794, 778, 2571, 48780, 74...","[122912, 122882, 5956, 114180, 293, 99114, 122...","[318.0, 527.0, 2959.0, 150.0, 608.0, 2028.0, 1...",0.0,0.0,1.0,17.25163,0.0,18.34686,...,0.0,0.027602,0.1,15.838075,0.0,18.34686,0.357143,0.617284,0.636364,1.0
2,4,"[4993, 122882, 33794, 2571, 48780, 7438, 11175...","[5952, 2028, 589, 1036, 7153, 103249, 122904, ...","[318.0, 356.0, 480.0, 527.0, 2959.0, 589.0, 50...",0.0,0.035538,0.1,15.801615,0.0,18.446887,...,0.0,0.0,1.0,17.25163,0.0,18.446887,0.444444,0.393706,0.636364,1.0
3,5,"[608, 356, 231, 296, 553, 50, 150]",[778],"[318.0, 2571.0, 480.0, 527.0, 2959.0, 589.0, 4...",0.0,0.0,1.0,17.25163,0.0,18.525612,...,0.0,0.0,1.0,17.25163,0.0,18.525612,0.0,0.0,0.636364,1.0
4,6,"[608, 1704, 2028, 527, 912, 1203, 1213, 318]",[858],"[356.0, 296.0, 2571.0, 480.0, 2959.0, 589.0, 4...",0.0,0.035538,0.1,15.801615,0.0,19.094225,...,0.0,0.035538,0.1,15.801615,0.0,19.094225,1.0,0.5,0.636364,1.0


In [14]:
# ╔════════════════════════════════════════════════════════════════════════════╗
#  Block H – AGGREGATE METRICS (now includes Cold‑start & Coverage)
# ╚════════════════════════════════════════════════════════════════════════════╝
metricRows = []
for col in [c for c in recs.columns if c.startswith('rec_')]:
    mdl, scn = col.split('_', 2)[1:]

    pb = recs[f"PB_{mdl}_{scn}"].mean()
    fa = recs[f"FA_{mdl}_{scn}"].mean()
    no = recs[f"NO_{mdl}_{scn}"].mean()
    di = recs[f"DI_{mdl}_{scn}"].mean()
    cb = recs[f"CB_{mdl}_{scn}"].mean()
    rc = recs[f"RC_{mdl}_{scn}"].mean()
    nd = recs[f"ND_{mdl}_{scn}"].mean()

    # ---------- NEW metrics ---------------------------------------------------
    cr = recs[f"CR_{mdl}_{scn}"].mean()          # Cold‑start Rate @ 10
    cv = recs[f"CV_{mdl}_{scn}"].mean()          # Catalogue Coverage @ 10
    # -------------------------------------------------------------------------

    metricRows.append({
        'model': mdl, 'scenario': scn,
        'Recall@10': rc, 'NDCG@10': nd,
        'ColdRate@10': cr, 'Coverage@10': cv,
        'PopularityBias': pb, 'Fairness': fa,
        'Novelty': no, 'Diversity': di,
        'CalibrationBias': cb
    })

agg = pd.DataFrame(metricRows)
agg.to_csv(f"agg_metrics_{fn_suffix}.csv", index=False)
print(f"✔ Metrics saved → agg_metrics_{fn_suffix}.csv")

pd.options.display.float_format = lambda x: f"{x:8.3f}"
print("\n═════ FINAL METRICS ═════")
print(agg.sort_values(['model', 'scenario']).to_string(index=False))

✔ Metrics saved → agg_metrics_ml-25m_cf.csv

═════ FINAL METRICS ═════
 model scenario  Recall@10  NDCG@10  ColdRate@10  Coverage@10  PopularityBias  Fairness  Novelty  Diversity  CalibrationBias
    MF       NA      0.303    0.263        0.000        1.000           0.016     0.558   16.307      0.000           18.417
TopPop       NA      0.397    0.324        0.000        0.636           0.024     0.392   16.058      0.000           18.417


In [16]:
# ╔════════════════════════════════════════════════════════════════════════════╗
#  Block H‑2 – SAVE ADDITIONAL METADATA (NEW)
# ╚════════════════════════════════════════════════════════════════════════════╝
mlGenresDF.to_csv("item_metadata_genres.csv", index=False)
merged[['itemId', 'visual_max', 'visual_mean', 'text']].to_csv(
    "item_embeddings_summary.csv", index=False)

display(mlGenresDF)
print("✔ item_metadata_genres.csv and item_embeddings_summary.csv saved")

Unnamed: 0,itemId,genres
0,1,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,"[Adventure, Children, Fantasy]"
2,3,"[Comedy, Romance]"
3,4,"[Comedy, Drama, Romance]"
4,5,[Comedy]
...,...,...
62418,209157,[Drama]
62419,209159,[Documentary]
62420,209163,"[Comedy, Drama]"
62421,209169,[(no genres listed)]


✔ item_metadata_genres.csv and item_embeddings_summary.csv saved
