<a href="https://colab.research.google.com/github/RecSys-lab/MM-VideoRec/blob/main/pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
###############################################################################
# Multimodal MovieLens Pipeline  (2025‑04‑22 ‑ safe loader edition)
###############################################################################

# ╔════════════════════════════════════════════════════════════════════════════╗
#  Block 0 – EXPERIMENT CONFIGURATION
# ╚════════════════════════════════════════════════════════════════════════════╝
MODEL_CHOICE    = 'amr'         # model family to run: 'cf' (MF, VAECF, TopPop) | 'vbpr' | 'amr' | 'vmf'
FAST_Prtye      = False           # fast-prototype switch (name sic): True → n_epochs = 1 in the VBPR/VMF/AMR grids
USE_GPU_FOR_HPO = False           # True → try to import CuPy and pass use_gpu=True to models that accept it
PARALLEL_HPO    = True            # True → evaluate the configs of one grid in a thread pool
SEED            = 42              # shared seed: NumPy, shuffles/splits, PCA and the models
VERBOSE         = True            # print progress lines

# I/O flags
LLM_PREFIX = 'st'           # file-name prefix of the text-embedding parts: openai | st | llama
TEXT_AUGMENTED  = False          # True → load the "<prefix>_enriched_description_part" files
AUDIO_VARIANT   = 'i_ivector'    # 'blf_pca' | 'blf_delta' | 'i_ivector'
VISUAL_VARIANT  = 'mmtf_alexnet_avg' # any key of VIS_MAP: 'mmtf_alexnet_med' | 'mmtf_alexnet_avg' | 'mmtf_avf_med' | 'mmtf_avf_avg'
TEXT_MAX_PARTS  = 15             # upper bound on the number of text CSV parts that are tried
N_EPOCHS        = 20             # n_epochs used in the VBPR and AMR grids when FAST_Prtye is False

# multimodal fusion variants, as (tag, parameter) pairs:
#   'concat' – plain concatenation, no parameter
#   'pca'    – value passed to sklearn PCA as n_components (0.95 → named "pca_95")
#   'cca'    – number of CCA components
MULTI_VARIANTS = [
    ('concat', None),
    ('pca',   0.95),
    ('cca',   40),
]
###############################################################################


# ╔════════════════════════════════════════════════════════════════════════════╗
#  Block A – INSTALLS, IMPORTS, SEEDS
# ╚════════════════════════════════════════════════════════════════════════════╝
!pip install -q numpy==1.25.2 cornac==2.2.1

import io, os, math, random, copy, zipfile, requests, gzip, time, inspect, itertools
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt

from concurrent.futures          import ThreadPoolExecutor
from sklearn.model_selection      import train_test_split
from sklearn.decomposition        import PCA
from sklearn.preprocessing        import StandardScaler
from sklearn.cross_decomposition  import CCA

import cornac
from   cornac.data   import ImageModality, FeatureModality, Dataset
from   cornac.models import MF, VBPR, VMF, AMR, VAECF, MostPop

# Dedicated generator seeded with SEED (not referenced again in the visible cells).
rng = np.random.RandomState(SEED)
# Seed NumPy's global generator; Python's own `random` module is not seeded here.
np.random.seed(SEED)
###############################################################################


# ╔════════════════════════════════════════════════════════════════════════════╗
#  Block B – MOVIELENS DOWNLOAD + SPLIT + GENRES
# ╚════════════════════════════════════════════════════════════════════════════╝
DATASET    = '1m'       # '100k' → MovieLens-100K files; any other value → MovieLens-1M archive
SPLIT_MODE = 'random'   # 'random' | 'temporal'; any other value → hold out each user's latest rating
TEST_RATIO = 0.2        # fraction of rows held out by the 'random' and 'temporal' splits
K_CORE     = 0          # > 0 → iteratively drop users/items with fewer than K_CORE ratings

# Download locations on the GroupLens file server.
ML100K_URL  = "https://files.grouplens.org/datasets/movielens/ml-100k/u.data"
ML100K_ITEM = "https://files.grouplens.org/datasets/movielens/ml-100k/u.item"
ML1M_URL    = "https://files.grouplens.org/datasets/movielens/ml-1m.zip"

def _dl(url, dest):
    """Download ``url`` to the local path ``dest`` unless ``dest`` already exists.

    Args:
        url: HTTP(S) address of the file to fetch.
        dest: Local path the payload is written to.

    Raises:
        requests.HTTPError: if the server answers with an error status. This
            keeps an error page from being cached under the dataset's name.
        requests.Timeout: if the server stops responding for 60 seconds.
    """
    if os.path.exists(dest):
        return
    if VERBOSE: print(f"⏬ Download {dest}")
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    # The context manager closes the handle even if the write fails.
    with open(dest, 'wb') as fh:
        fh.write(response.content)

# Fetch the selected MovieLens release and read its ratings table.
if DATASET == '100k':
    _dl(ML100K_URL,  'u.data')
    _dl(ML100K_ITEM, 'u.item')
    ratings_file, delim, eng = 'u.data', '\t', None
else:
    _dl(ML1M_URL, 'ml-1m.zip')
    if not os.path.exists('ml-1m'):
        # Close the archive handle once extraction is done.
        with zipfile.ZipFile('ml-1m.zip') as archive:
            archive.extractall('ml-1m')
    # The archive may unpack into a nested "ml-1m/ml-1m" folder; accept both layouts.
    ratings_file = ('ml-1m/ml-1m/ratings.dat'
                    if os.path.exists('ml-1m/ml-1m/ratings.dat')
                    else 'ml-1m/ratings.dat')
    # The two-character "::" separator needs the python parser engine.
    delim, eng = '::', 'python'

ratings = pd.read_csv(ratings_file, sep=delim,
                      names=['user_id','item_id','rating','timestamp'],
                      engine=eng, header=None)
if VERBOSE: print(f"✔ Ratings rows = {len(ratings):,}")

def _load_genres():
    """Read the movie file of the selected dataset and return its genres.

    Returns:
        DataFrame with two columns: ``item_id`` (as str) and ``genres``
        (a list of genre names per item).
    """
    if DATASET != '100k':
        # MovieLens-1M: genres come as a single "A|B|C" string per movie.
        movies_path = 'ml-1m/movies.dat'
        if not os.path.exists(movies_path):
            movies_path = 'ml-1m/ml-1m/movies.dat'
        table = pd.read_csv(movies_path, sep='::', engine='python',
                            names=['item_id','title','genres'], encoding='latin-1')
        table['item_id'] = table['item_id'].astype(str)
        table['genres'] = table['genres'].map(
            lambda raw: raw.split('|') if isinstance(raw, str) else [])
        return table[['item_id','genres']]

    # MovieLens-100K: one 0/1 flag column per genre.
    flag_names = [
        "unknown","Action","Adventure","Animation","Children's","Comedy","Crime",
        "Documentary","Drama","Fantasy","Film-Noir","Horror","Musical","Mystery",
        "Romance","Sci-Fi","Thriller","War","Western"
    ]
    header = ["item_id","title","release_date","video_release_date","IMDb_URL"] + flag_names
    table = pd.read_csv('u.item', sep='|', header=None,
                        names=header, encoding='latin-1')
    table['genres'] = table[flag_names].apply(
        lambda row: [name for name in flag_names if row[name]==1], axis=1)
    table['item_id'] = table['item_id'].astype(str)
    return table[['item_id','genres']]

# Item → genre-list lookup; keys are item ids as strings (see _load_genres).
genres_df  = _load_genres()
genre_dict = dict(zip(genres_df.item_id, genres_df.genres))
if VERBOSE: print(f"✔ genres loaded items={len(genres_df):,}")

if K_CORE > 0:
    def _kcore(df, k):
        """Repeatedly drop users, then items, with fewer than ``k`` ratings.

        Stops as soon as a full pass removes no row and returns the
        filtered frame.
        """
        while True:
            rows_before = len(df)
            per_user = df.user_id.value_counts()
            df = df[df.user_id.isin(per_user[per_user >= k].index)]
            per_item = df.item_id.value_counts()
            df = df[df.item_id.isin(per_item[per_item >= k].index)]
            if len(df) >= rows_before:
                return df

    ratings = _kcore(ratings, K_CORE)
    if VERBOSE:
        print(f"✔ After {K_CORE}-core rows={len(ratings):,}")

# Train/test split.
# The ratio-based modes slice at an explicit cut index: with a negative-size
# slice (`iloc[:-sz]` / `iloc[-sz:]`) a test size of 0 would put every row in
# the test set and leave the training set empty.
if SPLIT_MODE=='random':
    ratings=ratings.sample(frac=1,random_state=SEED).reset_index(drop=True)
    sz=int(len(ratings)*TEST_RATIO)
    cut=max(len(ratings)-sz,0)
    train_df,test_df=ratings.iloc[:cut].copy(),ratings.iloc[cut:].copy()
elif SPLIT_MODE=='temporal':
    ratings=ratings.sort_values('timestamp')
    sz=int(len(ratings)*TEST_RATIO)
    cut=max(len(ratings)-sz,0)
    train_df,test_df=ratings.iloc[:cut].copy(),ratings.iloc[cut:].copy()
else:
    # Any other mode: hold out each user's most recent rating.
    trs,tes=[],[]
    for uid,grp in ratings.groupby('user_id'):
        grp=grp.sort_values('timestamp')
        tes.append(grp.iloc[-1]); trs.extend(grp.iloc[:-1].to_dict('records'))
    train_df,test_df=pd.DataFrame(trs),pd.DataFrame(tes)

if VERBOSE: print(f"✔ Split  train={len(train_df):,}  test={len(test_df):,}")
train_set=Dataset.from_uir(train_df[['user_id','item_id','rating']].values.tolist())
###############################################################################


# ╔════════════════════════════════════════════════════════════════════════════╗
#  Block C – LOAD EMBEDDINGS
# ╚════════════════════════════════════════════════════════════════════════════╝
def _parse_safe(s: str) -> np.ndarray:
    """Parse a comma- and/or space-separated number string into a float32 vector.

    Non-finite entries (NaN, +inf, -inf) are replaced by 0.0.
    """
    flattened = str(s).replace(',', ' ')
    vec = np.fromstring(flattened, sep=' ', dtype=np.float32)
    if np.isfinite(vec).all():
        return vec
    return np.nan_to_num(vec, nan=0.0, posinf=0.0, neginf=0.0)


# Short alias used by all embedding loaders.
parse = _parse_safe

# -- VISUAL -------------------------------------------------------------------
# Base URL of the visual embedding CSVs.
VIS_BASE = ("https://raw.githubusercontent.com/RecSys-lab/"
            "reproducibility_data/refs/heads/main/fused_textual_visual/")
# Visual variant name → CSV file name under VIS_BASE.
VIS_MAP = {
    "mmtf_alexnet_med": "fused_llm_mmtf_med.csv",
    "mmtf_alexnet_avg": "fused_llm_mmtf_avg.csv",
    "mmtf_avf_med"    : "fused_llm_mmtf_avf_med.csv",
    "mmtf_avf_avg"    : "fused_llm_mmtf_avf_avg.csv",
}

def load_visual(v=VISUAL_VARIANT, verbose=True):
    """Load the visual embeddings of variant ``v``.

    Args:
        v: Key of ``VIS_MAP`` naming the CSV to read (KeyError if unknown).
        verbose: Print a one-line summary when True.

    Returns:
        DataFrame with columns ``itemId`` and ``visual`` (parsed vector).
    """
    csv_url = VIS_BASE + VIS_MAP[v]
    table = pd.read_csv(csv_url)
    table['visual'] = table.embedding.map(parse)
    if verbose:
        print(f"[Visual] Loaded {v:<18} items={len(table):,}")
    return table[['itemId','visual']]

# -- AUDIO --------------------------------------------------------------------
# Base URL of the audio embedding CSVs.
AUD_BASE = ("https://raw.githubusercontent.com/RecSys-lab/"
            "reproducibility_data/refs/heads/main/fused_textual_audio/")
# Audio feature key → CSV path under AUD_BASE. The four "mmtf_*" files are the
# inputs of the 'blf_pca' variant; 'mmtf_delta' alone backs 'blf_delta'.
AUD_FILE_MAP = {
    "mmtf_corr"  : "fused_llm_mmtf_audio_correlation.csv",
    "mmtf_delta" : "fused_llm_mmtf_audio_delta.csv",
    "mmtf_log"   : "fused_llm_mmtf_audio_log.csv",
    "mmtf_spect" : "fused_llm_mmtf_audio_spectral.csv",
    "i_ivector"  : "i-vector/fused_llm_mmtf_audio_IVec_splitItem_fold_1_gmm_128_tvDim_20.csv",
}

def _read_audio_csv(url):
    """Read one audio CSV and return ``itemId`` plus parsed ``embeddings``.

    The optional ``title``/``genres`` columns are dropped, and a column
    named ``embedding`` is accepted as a synonym for ``embeddings``.
    """
    table = pd.read_csv(url, low_memory=False)
    table = table.drop(columns=['title','genres'], errors='ignore')
    table = table.rename(columns={'embedding':'embeddings'})
    as_text = table['embeddings'].astype(str).str.replace(',',' ')
    table['embeddings'] = as_text.apply(parse)
    return table[['itemId','embeddings']]

def load_audio(variant=AUDIO_VARIANT,pca_ratio=0.95,verbose=True):
    """Load the audio embeddings for ``variant``.

    Args:
        variant: 'blf_delta' or 'i_ivector' (one CSV each), or 'blf_pca'
            (four BLF CSVs joined on itemId, concatenated, standardised
            and reduced with PCA).
        pca_ratio: ``n_components`` handed to PCA for 'blf_pca'.
        verbose: Print a one-line summary when True.

    Returns:
        DataFrame with columns ``itemId`` and ``audio``.

    Raises:
        ValueError: for any other variant name.
    """
    if variant in ('blf_delta', 'i_ivector'):
        file_key = 'mmtf_delta' if variant == 'blf_delta' else 'i_ivector'
        single = _read_audio_csv(AUD_BASE + AUD_FILE_MAP[file_key])
        single = single.rename(columns={'embeddings':'audio'})
        if verbose:
            if variant == 'blf_delta':
                print(f"[Audio] BLF‑delta            items={len(single):,}")
            else:
                print(f"[Audio] i‑vector             items={len(single):,}")
        return single

    if variant != 'blf_pca':
        raise ValueError(f"Unknown audio variant: {variant}")

    blf_keys = ('mmtf_corr','mmtf_delta','mmtf_log','mmtf_spect')
    parts = [
        _read_audio_csv(AUD_BASE + AUD_FILE_MAP[key]).rename(columns={'embeddings':f'{key}_emb'})
        for key in blf_keys
    ]
    # Keep only items present in all four BLF files.
    joined = parts[0]
    for part in parts[1:]:
        joined = joined.merge(part, on='itemId', how='inner')
    joined['concat'] = joined.apply(
        lambda row: np.concatenate([row[f'{key}_emb'] for key in blf_keys]), axis=1)

    stacked = np.vstack(joined['concat'].values)
    scaled = StandardScaler().fit_transform(stacked)
    reducer = PCA(n_components=pca_ratio, svd_solver='full', random_state=SEED)
    reduced = reducer.fit_transform(scaled).astype(np.float32)
    result = pd.DataFrame({'itemId':joined['itemId'],'audio':list(reduced)})
    if verbose:
        print(f"[Audio] BLF‑concat→PCA95 dims={reduced.shape[1]:<4} var={reducer.explained_variance_ratio_.sum():.2f}  items={len(result):,}")
    return result

# -- TEXT ---------------------------------------------------------------------

print(f'Use of LLM Generated Augmentation Data = {TEXT_AUGMENTED}')
# Base URL of the gzipped text-embedding parts.
# NOTE(review): the path points at "ml-latest-small" — confirm its item ids
# match the MovieLens release selected by DATASET.
TXT_BASE_ORIG = ("https://raw.githubusercontent.com/yasdel/Poison-RAG-Plus/"
                 "main/AttackData/Embeddings_from_Augmentation_Attack_Data/"
                 "ml-latest-small/")
# Augmented files live under the same base URL; only the file prefix differs.
TXT_BASE_AUG  = TXT_BASE_ORIG

TXT_PREFIX_ORIG      = f"{LLM_PREFIX}_originalraw_combined_all_part" # e.g. st_originalraw_combined_all_part
TXT_PREFIX_AUGMENTED = f"{LLM_PREFIX}_enriched_description_part"     # e.g. st_enriched_description_part

def load_text(max_parts=TEXT_MAX_PARTS,
              augmented=TEXT_AUGMENTED, verbose=True):
    """Load the text embeddings, which are published as numbered CSV parts.

    Parts ``1 .. max_parts`` are fetched in order. Loading stops at the
    first part that cannot be read, since fewer parts than ``max_parts``
    may exist.

    Args:
        max_parts: Highest part number to try.
        augmented: Use the augmented-text base/prefix instead of the original.
        verbose: Print why loading stopped and a one-line summary.

    Returns:
        DataFrame with columns ``itemId`` and ``text``, one row per item.

    Raises:
        ValueError: if not a single part could be loaded.
    """
    base   = TXT_BASE_AUG  if augmented else TXT_BASE_ORIG
    prefix = TXT_PREFIX_AUGMENTED if augmented else TXT_PREFIX_ORIG
    dfs = []
    for i in range(1, max_parts+1):
        url = f"{base}{prefix}{i}.csv.gz"
        try:
            df = pd.read_csv(url, compression='gzip')
            df['text'] = df.embeddings.map(parse)
            dfs.append(df[['itemId','text']])
        except Exception as exc:
            # Still best-effort, but Ctrl-C is no longer swallowed and the
            # reason for stopping is reported.
            if verbose:
                print(f"[Text]  stopped at part {i}: {type(exc).__name__}: {exc}")
            break
    if not dfs:
        raise ValueError(
            f"No text embedding part could be loaded from {base}{prefix}<n>.csv.gz")
    out = pd.concat(dfs).drop_duplicates('itemId')
    if verbose:
        tag = 'AUG' if augmented else 'ORIG'
        print(f"[Text]  {tag} parts={len(dfs)} items={len(out):,}")
    return out

# -- execute loaders & intersect ---------------------------------------------
# Load the three modalities with the variants chosen in Block 0.
vis_df = load_visual()
aud_df = load_audio()
txt_df = load_text()

# Item ids become strings so the three tables compare consistently.
for df in (vis_df, aud_df, txt_df):
    df['itemId'] = df.itemId.astype(str)

# Keep only items that have all three modalities, then join into one table.
common = set(vis_df.itemId)&set(aud_df.itemId)&set(txt_df.itemId)
vis_df, aud_df, txt_df = [df[df.itemId.isin(common)].reset_index(drop=True)
                          for df in (vis_df, aud_df, txt_df)]
merged = vis_df.merge(aud_df,on='itemId').merge(txt_df,on='itemId')

# guard against NaN/Inf
for col in ('audio','visual','text'):
    merged[col] = merged[col].apply(lambda v: np.nan_to_num(v, nan=0.0, posinf=0.0, neginf=0.0))

# 'all' = audio, visual and text vectors concatenated in that order.
merged['all'] = merged.apply(lambda r: np.hstack([r.audio,r.visual,r.text]), axis=1)

# Restrict the ratings to items that have embeddings and rebuild the train set.
# merged.itemId holds strings, hence the astype(str) on the rating item ids.
keep = set(merged.itemId)
train_df = train_df[train_df.item_id.astype(str).isin(keep)].reset_index(drop=True)
test_df  = test_df [test_df .item_id.astype(str).isin(keep)].reset_index(drop=True)
train_set = Dataset.from_uir(train_df[['user_id','item_id','rating']].values.tolist())

if VERBOSE:
    print(f"✔ Embeddings intersect – items={len(keep):,}")
###############################################################################


# ╔════════════════════════════════════════════════════════════════════════════╗
#  Block D – MULTIMODALITY VARIANTS
# ╚════════════════════════════════════════════════════════════════════════════╝
def _im(col):
    """Wrap column ``col`` of ``merged`` as a cornac ImageModality."""
    stacked = np.vstack(merged[col])
    return ImageModality(features=stacked, ids=merged.itemId, normalized=True)


def _ft(col):
    """Wrap column ``col`` of ``merged`` as a cornac FeatureModality."""
    stacked = np.vstack(merged[col])
    return FeatureModality(features=stacked, ids=merged.itemId, normalized=True)


# Fusion variant → {modality name → modality object}.
modalities_dict = {
    'concat': {
        'audio_image': _im('audio'),
        'visual_image':_im('visual'),
        'text_image':  _im('text'),
        'all_image':   _im('all'),
        'all_feature': _ft('all'),
    },
}
if VERBOSE:
    print("✔ concat ready")

# Build the non-concat fusion variants. Each one adds a column to `merged`
# and an entry (image + feature modality) to `modalities_dict`.
for tag,param in MULTI_VARIANTS:
    if tag=='concat': continue
    if tag=='pca':
        # Standardise the concatenated vectors, then reduce with PCA(n_components=ratio).
        ratio=param; name=f"pca_{int(ratio*100)}"
        mat=StandardScaler().fit_transform(np.vstack(merged['all']))
        mat=PCA(ratio,random_state=SEED).fit_transform(mat)
        merged[name]=list(mat.astype(np.float32))
        modalities_dict[name]={'all_image':_im(name),'all_feature':_ft(name)}
        if VERBOSE: print(f"✔ PCA {int(ratio*100)} dims={mat.shape[1]}")
    elif tag=='cca':
        comps=param; name=f"cca_{comps}"
        # The two CCA views are the first and second half of the concatenated vector.
        # NOTE(review): the split is positional (midpoint), not on a modality
        # boundary — confirm this is intended.
        half=len(merged['all'][0])//2
        big=np.vstack(merged['all']);X,Y=big[:,:half],big[:,half:]
        cca=CCA(n_components=comps).fit(X,Y)
        # Only the first output of transform(X, Y) (the X-side scores) is kept.
        merged[name]=list(cca.transform(X,Y)[0].astype(np.float32))
        modalities_dict[name]={'all_image':_im(name),'all_feature':_ft(name)}
        if VERBOSE: print(f"✔ CCA {comps} dims={comps}")
###############################################################################


# ╔════════════════════════════════════════════════════════════════════════════╗
#  Block E – HELPERS
# ╚════════════════════════════════════════════════════════════════════════════╝
def model_is_selected(tag:str)->bool:
    """Return True when model ``tag`` belongs to the family in MODEL_CHOICE."""
    family_members = {
        'cf':   {'MF', 'VAECF', 'TopPop'},
        'vbpr': {'VBPR'},
        'vmf':  {'VMF'},
        'amr':  {'AMR'},
    }
    return tag in family_members.get(MODEL_CHOICE, set())

def _fit_with_modalities(model, base_ds, item_img=None, item_feat=None):
    """Fit ``model`` on a deep copy of ``base_ds`` with optional modalities attached.

    Args:
        model: Object exposing ``fit(dataset)``.
        base_ds: Dataset to copy; the original is left untouched.
        item_img: If given, assigned to the copy's ``item_image``.
        item_feat: If given, assigned to the copy's ``item_feature``.

    NOTE(review): the modalities are attached as-is, with no step here that
    reorders their rows to the dataset's item indexing — confirm that the
    feature rows line up with the dataset's items.
    """
    dataset = copy.deepcopy(base_ds)
    for attr_name, modality in (('item_image', item_img), ('item_feature', item_feat)):
        if modality is not None:
            setattr(dataset, attr_name, modality)
    model.fit(dataset)
###############################################################################


⏬ Download ml-1m.zip
✔ Ratings rows = 1,000,209
✔ genres loaded items=3,883
✔ Split  train=800,168  test=200,041
Use of LLM Generated Augmentation Data = False
[Visual] Loaded mmtf_alexnet_avg   items=1,807
[Audio] i‑vector             items=1,807
[Text]  ORIG parts=1 items=1,606
✔ Embeddings intersect – items=958
✔ concat ready
✔ PCA 95 dims=171
✔ CCA 40 dims=40


In [3]:
# Sanity check: row count of the text table (txt_df was re-bound to the common-item subset in Block C).
len(txt_df)

958

In [2]:
import scipy.sparse

# Compatibility shim: if csr_matrix has no `.A` attribute, add one as a
# read-only property that returns `toarray()`.
if not hasattr(scipy.sparse.csr_matrix, 'A'):
    setattr(scipy.sparse.csr_matrix, 'A',
            property(lambda matrix: matrix.toarray()))

#  Block F – GRID SEARCH (parallel + GPU)
# ╔════════════════════════════════════════════════════════════════════════════╗
# Probe for CuPy only when GPU-backed HPO was requested; fall back to CPU otherwise.
CUPY = False
if USE_GPU_FOR_HPO:
    try:
        import cupy as cp
    except ImportError:
        print("✖ CuPy not found")
        USE_GPU_FOR_HPO = False
    else:
        CUPY = True
        print("✔ CuPy enabled")

# validation split ------------------------------------------------------------
# Hold out 10 % of the training rows as a validation set for HPO.
train_fit_df,val_df=train_test_split(train_df,test_size=0.1,random_state=SEED)
# user → list of held-out validation items (ground truth for _metric).
val_grp=val_df.groupby('user_id')['item_id'].apply(list).to_dict()
# user → set of items seen in the HPO training part (excluded from ranking).
train_seen=train_fit_df.groupby('user_id')['item_id'].apply(set).to_dict()
train_fit_set=Dataset.from_uir(train_fit_df[['user_id','item_id','rating']].values.tolist())
# Item ids of the HPO dataset and the id → score-index map used by _metric.
all_iids,iid_map=train_fit_set.item_ids,train_fit_set.iid_map

def _metric(model,topN=10):
    """Score a fitted model on the validation split.

    For every validation user known to ``train_fit_set`` the unseen items
    are ranked by ``model.score``, and Recall@topN and NDCG@topN are
    computed against that user's held-out items.

    Args:
        model: Fitted model exposing ``score(user_index)``.
        topN: Length of the evaluated recommendation list.

    Returns:
        Mean of average recall and average NDCG. Returns 0.0 when no
        validation user could be scored; the unguarded mean of an empty
        list is NaN, which would make the ``max()`` in ``_grid`` unreliable.
    """
    rec,ndcg=[],[]
    for uid,gt in val_grp.items():
        if uid not in train_fit_set.uid_map:continue
        uidx=train_fit_set.uid_map[uid]
        scores=model.score(uidx)
        if USE_GPU_FOR_HPO and CUPY:scores=cp.asarray(scores)
        seen=train_seen.get(uid,set())
        cand=[(it,scores[iid_map[it]]) for it in all_iids if it not in seen]
        cand.sort(key=lambda x:float(x[1]),reverse=True)
        top=[c[0] for c in cand[:topN]]
        gt_set=set(gt)  # O(1) membership for the hit tests below
        rec.append(len(set(top)&gt_set)/len(gt) if gt else 0)
        dcg=sum(1/math.log2(r+2) for r,it in enumerate(top) if it in gt_set)
        idcg=sum(1/math.log2(r+2) for r in range(min(len(gt),topN)))
        ndcg.append(dcg/idcg if idcg else 0)
    if not rec:
        return 0.0
    return 0.5*(np.mean(rec)+np.mean(ndcg))

def _grid(cls,name,scenario,param_grid,*fit_args):
    """Grid-search ``cls`` over ``param_grid`` and return the best result.

    Args:
        cls: Model class; instantiated as ``cls(seed=SEED, **params)``.
        name: Model label used in the log lines.
        scenario: Modality/variant label used in the log lines.
        param_grid: List of keyword-argument dicts, one per configuration.
        *fit_args: Forwarded to ``_fit_with_modalities`` after the model.

    Returns:
        ``(best_model, best_params)`` for the configuration with the
        highest ``_metric`` score.
    """
    started = time.time()
    print(f"🔄 HPO {name} {scenario} – {len(param_grid)} configs")

    def _eval(params):
        cfg = params.copy()
        if USE_GPU_FOR_HPO and 'use_gpu' in inspect.signature(cls).parameters:
            cfg['use_gpu'] = True
        model = cls(seed=SEED, **cfg)
        _fit_with_modalities(model, *fit_args)
        score = _metric(model)
        if VERBOSE:
            print(f"    ↳ {cfg}  → {score:.4f}")
        return score, model, cfg

    if PARALLEL_HPO and len(param_grid) > 1:
        n_workers = min(8, len(param_grid))
        with ThreadPoolExecutor(max_workers=n_workers) as pool:
            results = list(pool.map(_eval, param_grid))
    else:
        results = [_eval(params) for params in param_grid]

    best_score, best_model, best_cfg = max(results, key=lambda item: item[0])
    print(f"✔ best {name} {scenario} = {best_cfg} ({best_score:.4f}) "
          f"[{time.time()-started:.1f}s]")
    return best_model, best_cfg

# hyper‑param grids (4‑5 params each) -----------------------------------------
# Every grid is capped at its first five configurations.
GR_MF = [
    {'k': k, 'learning_rate': lr, 'lambda_reg': 0.01, 'max_iter': 50}
    for k in (32, 64, 128)
    for lr in (0.01, 0.005)
][:5]

GR_VAECF = [
    {'k': k, 'learning_rate': lr, 'beta': 0.01}
    for k in (32, 64, 128)
    for lr in (0.001, 0.0005)
][:5]

# VBPR: one epoch in fast-prototype mode, N_EPOCHS otherwise.
GR_VBPR = [
    {'k': k, 'k2': k2, 'learning_rate': lr, 'lambda_w': 0.01, 'lambda_b': 0.01,
     'n_epochs': 1 if FAST_Prtye else N_EPOCHS}
    for k in (32, 64, 128)
    for k2 in (8, 16)
    for lr in (0.001,)
][:5]

# VMF: n_epochs is only set (to 1) in fast-prototype mode; otherwise the key
# is omitted and N_EPOCHS is not applied.
GR_VMF = [
    {'k': k, 'learning_rate': lr, 'n_epochs': 1} if FAST_Prtye
    else {'k': k, 'learning_rate': lr}
    for k in (32, 64, 128)
    for lr in (0.01,)
][:5]

# AMR: one epoch in fast-prototype mode, N_EPOCHS otherwise.
GR_AMR = [
    {'k': k, 'k2': k2, 'learning_rate': lr,
     'n_epochs': 1 if FAST_Prtye else N_EPOCHS}
    for k in (32, 64, 128)
    for k2 in (16, 32)
    for lr in (0.001,)
][:5]

# tag → (best fitted model, best params) as returned by _grid.
# Tags are "<MODEL>" for the CF baselines and "<MODEL>_<scenario>" otherwise.
models_cfg={}

if model_is_selected('MF'):
    models_cfg['MF']=_grid(MF,'MF','(na)',GR_MF,train_fit_set)

if model_is_selected('VAECF'):
    models_cfg['VAECF']=_grid(VAECF,'VAECF','(na)',GR_VAECF,train_fit_set)

if model_is_selected('VBPR'):
    # One search per single modality ...
    for mod in ('visual','audio','text'):
        models_cfg[f'VBPR_{mod}']=_grid(VBPR,'VBPR',mod,GR_VBPR,
                                        train_fit_set,
                                        modalities_dict['concat'][f'{mod}_image'])
    # ... and one per fused variant other than 'concat'.
    for mv in modalities_dict:
        if mv=='concat':continue
        models_cfg[f'VBPR_{mv}']=_grid(VBPR,'VBPR',mv,GR_VBPR,
                                       train_fit_set,
                                       modalities_dict[mv]['all_image'])

if model_is_selected('VMF'):
    for mod in ('visual','audio','text'):
        models_cfg[f'VMF_{mod}']=_grid(VMF,'VMF',mod,GR_VMF,
                                       train_fit_set,
                                       modalities_dict['concat'][f'{mod}_image'])
    for mv in modalities_dict:
        if mv=='concat':continue
        models_cfg[f'VMF_{mv}']=_grid(VMF,'VMF',mv,GR_VMF,
                                      train_fit_set,
                                      modalities_dict[mv]['all_image'])

if model_is_selected('AMR'):
    # NOTE(review): the single-modality AMR runs also receive the concatenated
    # 'all_feature' as item_feature, whereas the refit step passes feat=None
    # for 'visual'/'audio'/'text' — confirm which of the two is intended.
    for mod in ('visual','audio','text'):
        models_cfg[f'AMR_{mod}']=_grid(AMR,'AMR',mod,GR_AMR,
                                       train_fit_set,
                                       modalities_dict['concat'][f'{mod}_image'],
                                       modalities_dict['concat']['all_feature'])
    for mv in modalities_dict:
        if mv=='concat':continue
        models_cfg[f'AMR_{mv}']=_grid(AMR,'AMR',mv,GR_AMR,
                                      train_fit_set,
                                      modalities_dict[mv]['all_image'],
                                      modalities_dict[mv]['all_feature'])

print(f"✔ HPO done – {len(models_cfg)} configs kept")
###############################################################################




#  Block F‑2 – REFIT BEST CONFIGS ON FULL TRAIN
# ╔════════════════════════════════════════════════════════════════════════════╗
# (model, scenario) → model re-fitted on the full training set with the best HPO params.
final_models={}

if model_is_selected('TopPop'):
    mp=MostPop(); mp.fit(train_set)
    final_models[('TopPop','NA')]=mp

for tag,(best_model,cfg) in models_cfg.items():
    # "VBPR_pca_95" → ("VBPR", "pca_95"); tags without "_" get scenario "NA".
    mdl,variant=tag.split('_',1) if '_' in tag else (tag,'NA')
    model_cls=best_model.__class__
    # Merge into ONE kwargs dict: cfg may already carry 'use_gpu' (it is added
    # during HPO), and passing the key through two separate ** expansions
    # raises "got multiple values for keyword argument".
    params=dict(cfg)
    if USE_GPU_FOR_HPO and 'use_gpu' in inspect.signature(model_cls).parameters:
        params['use_gpu']=True
    if mdl in {'MF','VAECF'}:
        # Pure collaborative-filtering models take no modality.
        img=feat=None
    elif variant in ('visual','audio','text'):
        img=modalities_dict['concat'][f'{variant}_image']; feat=None
    else:
        img=modalities_dict[variant]['all_image']
        feat=modalities_dict[variant].get('all_feature')
    new=model_cls(seed=SEED,**params)
    _fit_with_modalities(new,train_set,img,feat)
    final_models[(mdl,variant)]=new
    if VERBOSE: print(f"✔ Re‑fit {mdl}_{variant}")

print(f"✔ Total final models = {len(final_models)}")
###############################################################################




🔄 HPO AMR visual – 5 configs


Epoch 1/20:   0%|          | 0/3188 [00:00<?, ?it/s]

Epoch 1/20:   0%|          | 0/3188 [00:00<?, ?it/s]

Epoch 1/20:   0%|          | 0/3188 [00:00<?, ?it/s]

Epoch 1/20:   0%|          | 0/3188 [00:00<?, ?it/s]

Epoch 1/20:   0%|          | 0/3188 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# ╔════════════════════════════════════════════════════════════════════════════╗
#  Block G – GENERATE TOP‑N LISTS + PER‑USER METRICS  (adds CR & CV)
# ╚════════════════════════════════════════════════════════════════════════════╝
import collections                       # used for the per-model coverage sets below
topN_k = 10                              # length of every recommendation list
# user → set of items rated in the full training set (never recommended again).
train_seen = train_df.groupby('user_id')['item_id'].apply(set).to_dict()
# Re-bind the item list / id → score-index map to the FULL training dataset.
all_iids, iid_map = train_set.item_ids, train_set.iid_map

COLD_TH = 5                              # ≤ 5 ratings = “cold”

def _topN(model, uid, N):
    """Return the ``N`` best-scored unseen items for user ``uid``.

    Users that are not in the training set get an empty list.
    """
    if uid not in train_set.uid_map:
        return []
    scores = model.score(train_set.uid_map[uid])
    already_seen = train_seen.get(uid, set())
    ranked = sorted(
        ((item, scores[iid_map[item]]) for item in all_iids
         if item not in already_seen),
        key=lambda pair: float(pair[1]),
        reverse=True,
    )
    return [item for item, _score in ranked[:N]]

# --- helpers (with fixed ild) -------------------------------------------------
# Item popularity = number of training ratings per item, keyed by RAW item id.
# The recommendation lists hold raw ids (they come from train_set.item_ids), so
# the popularity table must use the same key space. Counting over
# train_set.user_data would key it by the dataset's internal item indices and
# silently look up the wrong items.
train_pop = dict(collections.Counter(train_df['item_id'].tolist()))
max_pop = max(train_pop.values())

# Items with at most COLD_TH training ratings count as "cold".
cold_items    = {i for i, c in train_pop.items() if c <= COLD_TH}
# (model, scenario) → set of all items recommended to any user (for coverage).
coverage_dict = collections.defaultdict(set)

def gini(x):
    """Return the Gini coefficient of the values in list ``x``.

    An empty list or an all-zero list yields 0.0.
    """
    if not x:
        return 0.0
    ordered = sorted(x)
    count = len(ordered)
    total = sum(ordered)
    if total == 0:
        return 0.0
    # Rank-weighted sum over the ascending values, ranks starting at 1.
    weighted = sum(rank * value for rank, value in enumerate(ordered, start=1))
    return (2 * weighted) / (count * total) - (count + 1) / count

def ild(genres_list):
    """Intra-list diversity: mean pairwise Jaccard distance between genre sets.

    Args:
        genres_list: One iterable of genre names per recommended item.

    Returns:
        Mean of ``1 - |A ∩ B| / |A ∪ B|`` over all item pairs, or 0.0 for
        lists with fewer than two items. Two items that both have no genre
        are treated as identical (distance 0.0) instead of dividing by zero.
    """
    if len(genres_list) <= 1:
        return 0.0
    dissim = []
    for a, b in itertools.combinations(genres_list, 2):
        set_a, set_b = set(a), set(b)
        union = set_a | set_b
        dissim.append(1 - len(set_a & set_b) / len(union) if union else 0.0)
    return float(np.mean(dissim))

def kl_div(p, q, eps=1e-8):
    """Return KL(p || q) for two count/probability dicts.

    Both dicts are evaluated over the union of their keys. A key missing
    from one side contributes ``eps`` there. Each side is normalised to
    sum to 1 before the divergence is computed.
    """
    support = list(set(p) | set(q))
    p_raw = np.array([p.get(key, eps) for key in support], dtype=float)
    q_raw = np.array([q.get(key, eps) for key in support], dtype=float)
    p_norm = p_raw / p_raw.sum()
    q_norm = q_raw / q_raw.sum()
    return np.sum(p_norm * np.log(p_norm / q_norm))

# --- generate per‑user rows ---------------------------------------------------
# One output row per test user: the recommendation list of every final model
# plus that list's per-user metrics.
rows = []
for uid, grp in test_df.groupby('user_id'):
    gt          = set(grp.item_id.tolist())
    train_items = train_seen.get(uid, set())
    # Genre histogram of the user's training items (reference for calibration).
    user_genres = list(itertools.chain(*(genre_dict.get(str(it), []) for it in train_items)))
    user_gen_dist = pd.Series(user_genres).value_counts().to_dict()

    r = {
        'userId': uid,
        'train' : list(train_items),
        'gt'    : list(gt)
    }

    for (mdl, scn), mod in final_models.items():
        rec = _topN(mod, uid, topN_k)
        r[f"rec_{mdl}_{scn}"] = rec

        # ---------- Cold‑start Rate ------------------------------------------
        cold_rate = sum(it in cold_items for it in rec) / len(rec) if rec else 0
        r[f"CR_{mdl}_{scn}"] = cold_rate
        coverage_dict[(mdl, scn)].update(rec)
        # ---------------------------------------------------------------------

        # ---------- existing metrics -----------------------------------------
        pop_bias = np.mean([train_pop.get(it, 0) / max_pop for it in rec]) if rec else 0
        fairness = 1 - gini([train_pop.get(it, 0) for it in rec])
        novelty  = np.mean([-math.log2(train_pop.get(it, 1) /
                                       len(train_set.user_data)) for it in rec]) if rec else 0

        rec_gen = [genre_dict.get(str(it), ['(none)']) for it in rec]
        diversity = ild(rec_gen)

        rec_gen_flat = list(itertools.chain(*rec_gen))
        rec_gen_dist = pd.Series(rec_gen_flat).value_counts().to_dict()
        calib = kl_div(user_gen_dist, rec_gen_dist)

        dcg   = sum(1 / math.log2(rnk + 2) for rnk, it in enumerate(rec) if it in gt)
        # The ideal DCG uses the same cut-off as the list length (topN_k), so the
        # two stay consistent if topN_k is ever changed from 10.
        idcg  = sum(1 / math.log2(rnk + 2) for rnk in range(min(len(gt), topN_k)))
        ndcg  = dcg / idcg if idcg else 0
        recall = len(set(rec) & gt) / len(gt) if gt else 0

        r.update({
            f"PB_{mdl}_{scn}": pop_bias,
            f"FA_{mdl}_{scn}": fairness,
            f"NO_{mdl}_{scn}": novelty,
            f"DI_{mdl}_{scn}": diversity,
            f"CB_{mdl}_{scn}": calib,
            f"RC_{mdl}_{scn}": recall,
            f"ND_{mdl}_{scn}": ndcg
        })

    rows.append(r)

recs = pd.DataFrame(rows)

# ---------- broadcast coverage columns ---------------------------------------
# Coverage is a per-model scalar; it is repeated on every user row.
for (mdl, scn), items in coverage_dict.items():
    coverage = len(items) / len(all_iids)
    recs[f"CV_{mdl}_{scn}"] = coverage
# ------------------------------------------------------------------------------

fn_suffix = f"{DATASET}_{MODEL_CHOICE}"
recs.to_csv(f"reclist_df_{fn_suffix}.csv", index=False)
print(f"✔ reco lists saved → reclist_df_{fn_suffix}.csv")

display(recs)


In [None]:
# ╔════════════════════════════════════════════════════════════════════════════╗
#  Block H – AGGREGATE METRICS (now includes Cold‑start & Coverage)
# ╚════════════════════════════════════════════════════════════════════════════╝
# Average every per-user metric column into one row per (model, scenario).
metric_rows = []
for col in [c for c in recs.columns if c.startswith('rec_')]:
    # "rec_<model>_<scenario>"; the scenario itself may contain underscores.
    mdl, scn = col.split('_', 2)[1:]
    suffix = f"{mdl}_{scn}"
    avg = {code: recs[f"{code}_{suffix}"].mean()
           for code in ('PB', 'FA', 'NO', 'DI', 'CB', 'RC', 'ND', 'CR', 'CV')}

    metric_rows.append({
        'model': mdl,
        'scenario': scn,
        'Recall@10': avg['RC'],
        'NDCG@10': avg['ND'],
        'ColdRate@10': avg['CR'],      # Cold‑start Rate @ 10
        'Coverage@10': avg['CV'],      # Catalogue Coverage @ 10
        'PopularityBias': avg['PB'],
        'Fairness': avg['FA'],
        'Novelty': avg['NO'],
        'Diversity': avg['DI'],
        'CalibrationBias': avg['CB'],
    })

agg = pd.DataFrame(metric_rows)
agg.to_csv(f"agg_metrics_{fn_suffix}.csv", index=False)
print(f"✔ metrics saved → agg_metrics_{fn_suffix}.csv")

# Fixed-width, three-decimal float rendering for the printed table.
pd.options.display.float_format = lambda value: f"{value:8.3f}"
print("\n═════ FINAL METRICS ═════")
print(agg.sort_values(['model', 'scenario']).to_string(index=False))


In [None]:
# ╔════════════════════════════════════════════════════════════════════════════╗
#  Block H‑2 – SAVE ADDITIONAL METADATA (NEW)
# ╚════════════════════════════════════════════════════════════════════════════╝
# Persist the item → genres table and the per-item embeddings next to the results.
genres_df.to_csv("item_metadata_genres.csv", index=False)
# NOTE(review): each embedding cell is a NumPy array, so to_csv writes its string
# form; long arrays may be abbreviated by NumPy's print settings — confirm this
# file is only meant as a summary and not as a reloadable copy.
merged[['itemId', 'audio', 'visual', 'text']].to_csv(
    "item_embeddings_summary.csv", index=False)

display(genres_df)
print("✔ item_metadata_genres.csv and item_embeddings_summary.csv saved")


In [None]:
# Render the aggregated metrics table built in Block H.
display(agg)