<a href="https://colab.research.google.com/github/RecSys-lab/movifex_dataset/blob/main/examples/benchmarking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **MoViFex Dataset - Benchmarking (CF-Side)**

🎬 Dataset: [link](https://huggingface.co/datasets/alitourani/MoViFex_Dataset/tree/main)

🎬 Framework: [link](https://github.com/RecSys-lab/MoViFex)

In [21]:
# ╔════════════════════════════════════════════════════════════════════════════╗
#  Block 0 – EXPERIMENT CONFIGURATION
# ╚════════════════════════════════════════════════════════════════════════════╝
# Single place for the settings a reader is expected to tune before running
# the notebook; the cells below read these module-level names directly.

# Experiment Parameters
MODEL_CHOICE    = 'vbpr'          # 'cf' | 'vbpr' | 'amr' | 'vmf'
FAST_Prtye      = True            # Fast prototype → n_epochs = 1  (NOTE(review): name looks like a typo of FAST_PROTOTYPE; not renamed here because other cells may reference it)
USE_GPU_FOR_HPO = False           # GPU-enabled or not
PARALLEL_HPO    = True            # CPU-parallelization flag
SEED            = 42              # Seed for reproducibility
VERBOSE         = True            # Logging level
TEXT_MAX_PARTS  = 15              # presumably the upper bound on text-embedding part files to fetch — confirm against the text loader
N_EPOCHS        = 20              # presumably the number of training epochs for a full (non-fast) run — TODO confirm

# MoViFex Dataset
MOVIE_VARIANT = "full_movies"     # 'full_movies' | 'movie_shots' | 'movie_trailers'
FEATURE_EXTRACTOR = "incp3"       # 'incp3' | 'vgg19'
FEATURE_AGGREGATOR = "max"        # 'max' | 'mean'

# Text-Augmented Dataset
LLM_VARIANT = 'openai'            # 'openai' | 'st' | 'llama'
TEXT_AUGMENTED  = True            # True → use augmented textual path

# MovieLens
ML_VARIANT = "ml-25m"             # 'ml-25m'

## **[Step 1] Clone & Install Libraries**

In [16]:
import os

# Check if we need to clone the helper framework
if os.path.exists('/content/MoViFex'):
  print("✨ The framework is already cloned!")
else:
  # Clone the repo
  print("✨ Cloning the repository")
  !git clone https://github.com/RecSys-lab/MoViFex.git

  # Install the required library
  %cd MoViFex
  !pip install -e .

  # Add the repository to the Python path
  import sys
  sys.path.append('/content/MoViFex')

  # Go back to the root
  %cd ..

✨ The framework is already cloned!


## 🚀 **[Step 2] Load and Prepare Data**

### I. *Load the Dataset Metadata File*

In [35]:
import os
import json
import movifex
import numpy as np  # needed below for np.fromstring; previously only imported by a later cell
import pandas as pd
from movifex.utils import loadJsonFromUrl
from movifex.datasets.movifex.helper_visualfeats_agg import generatedAggFeatureAddresses
from movifex.datasets.movifex.helper_visualfeats_agg import loadAggregatedFeaturesIntoDataFrame

# Variables: dataset description handed to the MoViFex helper functions
configs = {
  "name": "MoViFex-visual",
  "path_metadata": "https://huggingface.co/datasets/alitourani/MoViFex_Dataset/resolve/main/stats.json",
  "path_raw": "https://huggingface.co/datasets/alitourani/MoViFex_Dataset/raw/main/",
  "feature_sources": ["full_movies", "movie_shots", "movie_trailers"],
  "agg_feature_sources": ["full_movies_agg", "movie_shots_agg", "movie_trailers_agg"],
  "feature_models": ["incp3", "vgg19"],
  "aggregation_models": ["Max", "Mean"]
}

# Metadata fetching: download the JSON stats file and normalise the id column
# to an integer `itemId`, the key name used by the other frames in this notebook
datasetMetadataUrl = configs['path_metadata']
print(f"✨ Fetching the dataset metadata from '{datasetMetadataUrl}' ...")
jsonData = loadJsonFromUrl(datasetMetadataUrl)
movifexDF_meta = pd.DataFrame(jsonData)
movifexDF_meta = movifexDF_meta.rename(columns={'id': 'itemId'})
movifexDF_meta['itemId'] = movifexDF_meta['itemId'].astype(str).astype(int)
print("'MoViFex-visual' dataset is loaded into a DataFrame:")
print(movifexDF_meta.head(5))

# Aggregated features fetching: build the address table, then load the pair of
# frames for the movie variant / feature extractor chosen in the config cell
print("\n✨ Preparing the addresses of aggregated features ...")
aggFeatureAddresses = generatedAggFeatureAddresses(configs)
print(f"\n✨ Now, loading the aggregated features into DataFrames for '{MOVIE_VARIANT}' extracted by '{FEATURE_EXTRACTOR}' ...")
movifexDF_featMax, movifexDF_featMean = loadAggregatedFeaturesIntoDataFrame(aggFeatureAddresses[f'{MOVIE_VARIANT}_agg'][FEATURE_EXTRACTOR])
# The loader delivers each embedding as a comma-separated string; convert it
# to a numeric vector so it can be used as a feature
movifexDF_featMax['embedding'] = movifexDF_featMax['embedding'].apply(lambda x: np.fromstring(x, sep=','))
movifexDF_featMean['embedding'] = movifexDF_featMean['embedding'].apply(lambda x: np.fromstring(x, sep=','))
print("\nThe data loaded into DataFrames! Sample of the 'Max' DataFrame:")
movifexDF_featMax.head(3)

✨ Fetching the dataset metadata from 'https://huggingface.co/datasets/alitourani/MoViFex_Dataset/resolve/main/stats.json' ...
'MoViFex-visual' dataset is loaded into a DataFrame:
   itemId                       title  year                      genres
0       6                        Heat  1995   [Action, Crime, Thriller]
1      50         Usual Suspects, The  1995  [Crime, Mystery, Thriller]
2     111                 Taxi Driver  1976    [Crime, Drama, Thriller]
3     150                   Apollo 13  1995    [Adventure, Drama, IMAX]
4     165  Die Hard: With a Vengeance  1995   [Action, Crime, Thriller]

✨ Preparing the addresses of aggregated features ...
- Fetching URL from 'https://huggingface.co/datasets/alitourani/MoViFex_Dataset/resolve/main/stats.json' ...
- Fetching all movie IDs ...
- Found 274 movie IDs ...
- Generating a list of addresses to fetch the aggregated features ...
- Generated 1644 aggregated feature addresses, e.g., https://huggingface.co/datasets/alitourani/MoViF

Unnamed: 0,itemId,embedding
0,6,"[2.20092, 2.158851, 1.767559, 1.588628, 1.9214..."
1,50,"[2.608933, 2.313115, 1.61709, 2.527633, 1.2831..."
2,111,"[2.064346, 1.855269, 1.985471, 2.009896, 1.377..."


### II. *Load MovieLens-25M*

In [29]:
from movifex.utils import loadDataFromCSV
from movifex.datasets.movielens.downloader import downloadMovielens25m

# Variables
downloadDir = "/content/ML25"
movielenzUrl = f"https://files.grouplens.org/datasets/movielens/{ML_VARIANT}.zip"

# Download the MovieLens dataset unless a previous run already created the
# download directory (the check uses the same variable as the download call)
if os.path.exists(downloadDir):
  print("✨ The dataset is already downloaded!")
else:
  print(f"Downloading the '{ML_VARIANT}' dataset from '{movielenzUrl}' ...")
  isDownloadSuccessful = downloadMovielens25m(movielenzUrl, downloadDir)
  if not isDownloadSuccessful:
    print('- Seems like there was a problem while downloading!')

# The CSV files are read from a sub-folder of the download directory.
# NOTE(review): the folder is assumed to be named after ML_VARIANT (it is
# 'ml-25m' for the only supported variant) — confirm before adding variants.
datasetPath = os.path.join(downloadDir, ML_VARIANT)

# Fail early with an actionable message when the expected files are absent
# (e.g. a failed download or a partially extracted archive from an earlier run)
missingFiles = [name for name in ("movies.csv", "ratings.csv")
                if not os.path.exists(os.path.join(datasetPath, name))]
if missingFiles:
  raise FileNotFoundError(
    f"Missing {missingFiles} in '{datasetPath}'. "
    f"Delete '{downloadDir}' and re-run this cell to download the dataset again."
  )

# Load the Files
print(f"\nLoading '{ML_VARIANT}' files from '{datasetPath}' ...")
mlMoviesDF = loadDataFromCSV(os.path.join(datasetPath, "movies.csv"))
mlRatingsDF = loadDataFromCSV(os.path.join(datasetPath, "ratings.csv"))
# Normalization: use `itemId` as the movie key, like the MoViFex frames
mlMoviesDF = mlMoviesDF.rename(columns={'movieId': 'itemId'})
mlRatingsDF = mlRatingsDF.rename(columns={'movieId': 'itemId'})
print(f"{len(mlMoviesDF)} movies and {len(mlRatingsDF)} ratings have been loaded!")
mlMoviesDF.head(5)

✨ The dataset is already downloaded!

Loading 'ml-25m' files from '/content/ML25/ml-25m' ...
62423 movies and 25000095 ratings have been loaded!


Unnamed: 0,itemId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### III. *Load LLM-Augmented Text*

In [28]:
import numpy as np

# Variables
# Base URL of the folder holding the gzipped embedding CSV parts.
# NOTE(review): the path ends in 'ml-latest-small/' while the ratings in this
# notebook come from 'ml-25m' — confirm the item ids of both sources line up.
textAugBase = ("https://raw.githubusercontent.com/yasdel/Poison-RAG-Plus/"
                 "main/AttackData/Embeddings_from_Augmentation_Attack_Data/"
                 "ml-latest-small/")
# File-name stems; the loader prepends '<model>_' and appends '<part>.csv.gz'
textAugPrefix_aug = "enriched_description_part"      # used when augmentation is requested
textAugPrefix_raw = "originalraw_combined_all_part"  # used for the non-augmented text

# Parser Utility
def _parse_safe(s: str) -> np.ndarray:
  """
  Convert a serialized embedding into a float32 vector.

  Accepts numbers separated by commas and/or whitespace, optionally wrapped
  in a single pair of square brackets (e.g. "0.1,0.2" or "[0.1, 0.2]").

  Parameters
  ----------
  s: value to parse; it is passed through `str()` first.

  Returns
  -------
  np.ndarray of dtype float32 in which every NaN / +inf / -inf has been
  replaced by 0.0.
  """
  # Drop surrounding whitespace and list brackets, then unify the separator
  cleaned = str(s).strip().strip('[]').replace(',', ' ')
  vec = np.fromstring(cleaned, sep=' ', dtype=np.float32)
  # Zero out non-finite entries so no NaN/inf is passed on to later steps
  if not np.all(np.isfinite(vec)):
    vec = np.nan_to_num(vec, nan=0.0, posinf=0.0, neginf=0.0)
  return vec

# Public alias used by the loader below
parse = _parse_safe

def loadTextAugmented(model, augmented, max_parts=15, verbose=True):
  """
  Download the chunked text-embedding CSVs of one model and merge them.

  Part files are requested in order as
  `<textAugBase><model>_<prefix><i>.csv.gz` for i = 1..max_parts; loading
  stops at the first part that cannot be fetched or parsed.

  Parameters
  ----------
  model: file-name prefix identifying the embedding model (e.g. 'openai').
  augmented: True → use the augmented-text prefix, False → the raw-text prefix.
  max_parts: highest part index to try.
  verbose: print a summary line (and the reason for an unexpected stop).

  Returns
  -------
  pd.DataFrame with columns `itemId` and `text` (the parsed embedding),
  de-duplicated on `itemId` (first occurrence kept).

  Raises
  ------
  ValueError: if not a single part could be loaded.
  """
  # Local import keeps this cell self-contained
  from urllib.error import HTTPError

  prefix = f'{model}_{textAugPrefix_aug}' if augmented else f'{model}_{textAugPrefix_raw}'
  dfs = []
  for i in range(1, max_parts+1):
    url = f"{textAugBase}{prefix}{i}.csv.gz"
    try:
      df = pd.read_csv(url, compression='gzip')
      df['text'] = df.embeddings.map(parse)
      dfs.append(df[['itemId','text']])
    except Exception as err:
      # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
      # propagate. An HTTP 404 is presumably the normal "no more parts"
      # signal, so only other failures are reported.
      isEndOfParts = isinstance(err, HTTPError) and err.code == 404
      if verbose and not isEndOfParts:
        print(f"[Text]  stopped at part {i}: {type(err).__name__}: {err}")
      break
  if not dfs:
    # Same exception type pd.concat would raise on an empty list, but with a
    # message that points at the actual cause
    raise ValueError(f"No text-embedding parts could be loaded from '{textAugBase}{prefix}*.csv.gz'")
  out = pd.concat(dfs).drop_duplicates('itemId')
  if verbose:
    tag = 'AUG' if augmented else 'ORIG'
    print(f"[Text]  {tag} parts={len(dfs)} items={len(out):,}")
  return out

# Load the text embeddings, honouring the limits set in the configuration cell
print(f"\nLoading Textual '{LLM_VARIANT}' files '{'with' if TEXT_AUGMENTED else 'without'}' augmentation ...")
textAugDF = loadTextAugmented(LLM_VARIANT, TEXT_AUGMENTED, max_parts=TEXT_MAX_PARTS, verbose=VERBOSE)
textAugDF.head(5)


Loading Textual 'openai' files 'with' augmentation ...
[Text]  AUG parts=3 items=1,606


Unnamed: 0,itemId,text
0,1516,"[-0.009714896, -0.024003807, -0.0416483, -0.02..."
1,5952,"[0.0024696812, -0.03361401, -0.019164726, -0.0..."
2,370,"[-0.0020823667, -0.027629452, 0.006294715, -0...."
3,292,"[-0.011372974, -0.038963087, -0.024515806, -0...."
4,1209,"[0.007154904, -0.025495825, -0.011659123, -0.0..."


## **📊 [Step 3] Experiments**

In [None]:
# TODO: placeholder for the experiments. The disabled snippet below would cast
# `itemId` to string on three frames (vis_df, aud_df, txt_df); those names are
# not defined in the cells shown above — confirm where they come from before
# re-enabling it.
# for df in (vis_df, aud_df, txt_df):
#   df['itemId'] = df.itemId.astype(str)