In [35]:
import pandas as pd
import numpy as np
import random
import time
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer
from matplotlib import pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import HDBSCAN
from ollama import chat
import numpy as np
from sentence_transformers import SentenceTransformer
from concurrent.futures import ThreadPoolExecutor, as_completed
from loader_clone import create_loaders

from representant_base import (
    Representant,
    DiversityStimulus,
    RepresentantGenerator,
    GenresDiversityHandler,
    PlotDiversityHandler,
)

In [36]:
_, loader = create_loaders()


2017


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  loader.ratings_df.loc[:, "ratings_per_year"] = loader.ratings_df['movieId'].map(loader.ratings_df['movieId'].value_counts()) / loader.ratings_df['movieId'].map(movies_df_indexed["age"])


Ratings shape after filtering: (3536742, 5), n_users = 9612, n_items = 1525
2017


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  loader.ratings_df.loc[:, "ratings_per_year"] = loader.ratings_df['movieId'].map(loader.ratings_df['movieId'].value_counts()) / loader.ratings_df['movieId'].map(movies_df_indexed["age"])


Ratings shape after filtering: (8146440, 5), n_users = 34683, n_items = 9456


In [37]:
# mapping for filtered dataset...
movie_ids = sorted(loader.ratings_df.movieId.unique())
id2idx = {mid: i for i, mid in enumerate(movie_ids)}
idx2id = {i: mid for mid, i in id2idx.items()}

# rename cols as in EasyStudy and map movieId to indices
ratings_mapped = (
    loader.ratings_df
    .rename(columns={"userId": "user",
                     "movieId": "item",
                     "rating": "rating"})
    .assign(item=lambda x: x["item"].map(id2idx))
    [["user", "item", "rating"]]
    .astype({"user": int, "item": int, "rating": float})
    .reset_index(drop=True)
)

loader.ratings_df = ratings_mapped

print(ratings_mapped.head())


   user  item  rating
0    16    74     1.0
1    16    86     1.5
2    16   213     1.0
3    16   313     2.0
4    16   347     1.0


In [61]:
def apply_quantile_transform(scores, output_distribution="normal"):
    # Map scores to a specified distribution using quantile transformation
    qt = QuantileTransformer(output_distribution=output_distribution, random_state=42)
    scores_q = qt.fit_transform(scores)
    # Return both transformed scores and the transformer object
    return scores_q, qt

class LLMProfiling():

    def __init__(self, loader, positive_threshold, l2, **kwargs):
        self._ratings_df = None
        self._loader = None
        self._all_items = None

        self._model = None

        self._hdbscan_clusterer = None

        self.stimulus_handlers: dict[DiversityStimulus, type[RepresentantGenerator]] = {
            DiversityStimulus.GENRES: GenresDiversityHandler,
            DiversityStimulus.PLOT: PlotDiversityHandler,
        }

        self.diversity_stimulus = None

        self._rating_matrix = None

        self._threshold = positive_threshold
        self._l2 = l2

        self._items_count = None

        self._weights = None
        self._max_clusters = None

    def fit(self, loader):

        self._ratings_df = loader.ratings_df
        self._loader = loader
        self._all_items = self._ratings_df.item.unique()

        self._model = SentenceTransformer('all-MiniLM-L6-v2')

        self._hdbscan_clusterer = HDBSCAN(
            min_cluster_size=3,
            min_samples=None,
            metric='cosine',
        )

        self._rating_matrix = (
            self._loader.ratings_df.pivot(index="user", columns="item", values="rating")
            .fillna(0)
            .values
        )

        self._items_count =  np.shape(self._rating_matrix)[1]

        X = np.where(self._rating_matrix >= self._threshold, 1, 0).astype(np.float32)

        # Compute Gram matrix (G = X^T @ X)
        G = X.T @ X
        G += self._l2 * np.eye(self._items_count)  # Regularization

        # Compute the inverse of G
        P = np.linalg.inv(G)

        # Compute B matrix
        diag_P = np.diag(P)
        B = P / (-diag_P[:, None])  # Normalize rows by diagonal elements
        np.fill_diagonal(B, 0)  # Set diagonal to zero
        print("B matrix computed")
        self._weights = B
        self._max_clusters = 2

    # Predict for the user
    def predict(self, selected_items, filter_out_items, k, dominant_stimulus_weight, dominant_stimulus):
        #print("Selected", selected_items)
        #print("Filter out", filter_out_items)

        indices = list(selected_items)
        user_vector = np.zeros((self._items_count,), dtype=np.float32)
        for i in indices:
            user_vector[i] = 1.0

        relevance_scores = np.dot(user_vector, self._weights)
        relevance_scores = apply_quantile_transform(relevance_scores.reshape(-1, 1), output_distribution='normal')[0].flatten()

        MAX_CLUSTERS = 3
        
        self.diversity_stimulus = dominant_stimulus
        if self.diversity_stimulus == DiversityStimulus.GENRES:
            genre_weight = dominant_stimulus_weight
            plot_weight = 1 - dominant_stimulus_weight
        else:
            plot_weight = dominant_stimulus_weight
            genre_weight = 1 - dominant_stimulus_weight

        # Prepare user-preferred movies based on selected items
        user_preferred_movies = []
        for item in selected_items:
            user_preferred_movies.append(self._loader.movies_df.iloc[item])

        # #print("PREF MOVIES:", user_preferred_movies)

        # Update the final embedding calculation with the genre and plot weights
        final_embedding = genre_weight * self._loader.genres_embeddings + plot_weight * self._loader.plot_embeddings
        #print("final embed shape", final_embedding.shape)
        
        mask = np.ones(final_embedding.shape[0], dtype=bool)
        mask[filter_out_items] = 0
        relevance_scores[filter_out_items] = 0
        original_indices = np.where(mask)[0]
        emb_matrix = final_embedding[mask]
        
        user_genre_embeddings = self._model.encode([movie['genres'] for movie in user_preferred_movies])
        user_plot_embeddings = self._model.encode([movie['plot'] for movie in user_preferred_movies])
        user_embeddings = genre_weight * user_genre_embeddings + plot_weight * user_plot_embeddings
        #print(user_embeddings.shape)

        #print("clustering")
        
        cluster_labels = self._hdbscan_clusterer.fit_predict(user_embeddings)
        #print("clustering DONE")
        print(cluster_labels)
        clusters = {}

        if len(np.unique(cluster_labels)) == 1: # No clusters found sample randomly
            random_indices = np.random.choice(len(user_preferred_movies), size=min(len(user_preferred_movies),self._max_clusters), replace=False)
            for i in random_indices:
                label = "random_" + str(i)
                clusters[label] = user_preferred_movies[i]
        else:
            labels, counts = np.unique(cluster_labels[cluster_labels != -1], return_counts=True)

            # Get index of most and least common clusters
            most_common_cluster = labels[np.argmax(counts)]
            least_common_cluster = labels[np.argmin(counts)]

            # If they are the same (e.g., all clusters have same size), pick a different one for diversity
            if most_common_cluster == least_common_cluster and len(labels) > 1:
                # Exclude the most common cluster and pick randomly from the rest
                alternative_clusters = [label for label in labels if label != most_common_cluster]
                least_common_cluster = np.random.choice(alternative_clusters)

            print(f"Most common cluster: {most_common_cluster}")
            print(f"Least common cluster: {least_common_cluster}")

            selected_clusters = [most_common_cluster, least_common_cluster]

            cluster_mask = ~np.isin(cluster_labels, selected_clusters)
            cluster_labels[cluster_mask] = -1 # Mask remaining clusters as noise
            print(f"Cluster labels: {cluster_labels}")

            for i in range(len(user_preferred_movies)):
                label = cluster_labels[i]
                movie_info = user_preferred_movies[i]
                if label == -1:
                    continue # Skip noise movies
                if str(label) not in clusters:
                    clusters[str(label)] = []
                clusters[str(label)].append(movie_info)

        tasks = list(clusters.items())

        def _produce(label, data):
            if label.startswith("random_"):
                rep = Representant(genres=data["genres"], plot=data["plot"])
            else:
                limit = min(len(data), 10)  # Limit to 10 movies per cluster since we are using Llama3.1:8b
                print(f"limit: {limit}")
                indices = np.random.choice(len(data), size=limit, replace=False)
                cluster_sample = [data[i] for i in indices]
                # print input movies
                for i, movie in enumerate(cluster_sample):
                    print(f"Movie {i}: {movie['title']} ({movie['genres']}) - {movie['plot']}...")
                rep = self._generate_representant(cluster_sample, self.diversity_stimulus)
                # print repreresentant
                if rep:
                    print(f"Representant: {rep.genres} - {rep.plot}...") 

            if not rep:
                return label, None, None

            rep_genre_embeddings = self._model.encode([rep.genres])
            rep_plot_embeddings = self._model.encode([rep.plot])
            rep_embeddings = genre_weight * rep_genre_embeddings + plot_weight * rep_plot_embeddings

            return label, rep, rep_embeddings

        representants = []
        representant_embeddings_dict  = {}

        with ThreadPoolExecutor(max_workers=1) as pool:
            futures = [pool.submit(_produce, lbl, data) for lbl, data in tasks]

            for fut in as_completed(futures):
                label, rep, emb = fut.result()
                if rep is not None:
                    representants.append(rep)
                    representant_embeddings_dict[label] = emb
        
        #print("\n--- Generating Diversity Representant ---")
        div_representant = self._generate_diversity_representant(representants, self.diversity_stimulus)
        if div_representant:
            print(f"Generated diversity representant:", div_representant)
            rep_genre_embeddings = self._model.encode([div_representant.genres])
            rep_plot_embeddings = self._model.encode([div_representant.plot])
            rep_embeddings = genre_weight * rep_genre_embeddings + plot_weight * rep_plot_embeddings
            representant_embeddings_dict["diversity"] = rep_embeddings
        else:
            #print("Could not generate diversity representant.")
            pass

        # Find similar embeddings, movies
        used_items = set()
        cluster_candidates = {}
        for cluster_id, rep_emb in representant_embeddings_dict.items():
            similarities = cosine_similarity(rep_emb.reshape(1, -1), emb_matrix)[0]
            # multiply similarities by preds item wise
            similarities = apply_quantile_transform(similarities.reshape(-1, 1), output_distribution='normal')[0].flatten()
            similarities = similarities * relevance_scores[mask]
            closest_indices = np.argsort(-similarities)[:k]
            top_k_original_indices = original_indices[closest_indices]
            cluster_candidates[cluster_id] = [int(i) for i in top_k_original_indices if int(i) not in used_items]

        # Create result in round-robin way
        # TODO: Use LLM to re-rank the items ? or half of them ?
        cluster_ids = list(cluster_candidates.keys())
        result = []
        i = 0
        while len(result) < k and any(cluster_candidates.values()):
            cluster_id = cluster_ids[i % len(cluster_ids)]
            candidates = cluster_candidates[cluster_id]

            if candidates:
                candidate = candidates.pop(0)
                if candidate not in used_items:
                    result.append(candidate)
                    used_items.add(candidate)
            i += 1

        #print("LLMprofiling done")

        return result[:k]

    def _generate_representant(self, movies_cluster, stimulus: DiversityStimulus):
            
        handler = self.stimulus_handlers[stimulus]()

        return handler.generate_cluster_representant(movies_cluster)
    
    def _generate_diversity_representant(self, representants, stimulus: DiversityStimulus):

        handler = self.stimulus_handlers[stimulus]()

        return handler.generate_diversity_representant(representants)
    

In [62]:
algo = LLMProfiling(loader=loader, positive_threshold=2.5, l2=500)
algo.fit(loader)

B matrix computed


In [63]:
titles = [
    "Interstellar (2014)",
    "The Martian (2015)",
    "Ad Astra (2019)",
    "Hercules (1997)",
    "Percy Jackson & the Olympians: The Lightning Thief (2010)",
    "Clash of the Titans (2010)",
    "Fire Island (2022)",
    "The Menu (2022)",
    "Madagascar (2005)",
    "Incredibles 2 (2018)",
    "Incredibles, The (2004)",
    "Wheelman (2017)",
    "Drive (2011)"
]

In [64]:
# get movie ids from titles
movie_ids = loader.movies_df[loader.movies_df['title'].isin(titles)]['movieId'].index.tolist()
res = algo.predict(
    selected_items=movie_ids,
    filter_out_items=movie_ids,
    k=10,
    dominant_stimulus_weight=0.8,
    dominant_stimulus=DiversityStimulus.PLOT
    )

[ 0 -1 -1  0  0 -1  1  1 -1 -1  1 -1 -1]
Most common cluster: 0
Least common cluster: 1
Cluster labels: [ 0 -1 -1  0  0 -1  1  1 -1 -1  1 -1 -1]
limit: 3
Movie 0: Clash of the Titans (2010) (Action, Adventure, Drama, Fantasy) - Perseus, a demigod and the son of Zeus, battles the minions of Hades and the Underworld in order to stop them from conquering Olympus and Earth....
Movie 1: Percy Jackson & the Olympians: The Lightning Thief (2010) (Adventure, Fantasy) - A teenager discovers he&#x27;s the descendant of a Greek god and sets out on an adventure to settle an on-going battle between the gods....
Movie 2: Hercules (1997) (Adventure, Animation, Children, Comedy, Musical) - The son of Zeus and Hera is stripped of his immortality as an infant and must become a true hero in order to reclaim it....
Representant: Action, Adventure, Drama, Fantasy - In a world where gods and mortals coexist, a young hero discovers they're the last living heir of a powerful deity. As they embark on an epic q

In [60]:
res_movies = loader.movies_df.iloc[res]
print("Recommended movies:")
for i, movie in res_movies.iterrows():
    print(f"{i}: {movie['title']} ({movie['genres']}) - {movie['plot'][:100]}...")

Recommended movies:
6324: Percy Jackson: Sea of Monsters (2013) (Adventure, Children, Fantasy) - In order to restore their dying safe haven, the son of Poseidon and his friends embark on a quest to...
9402: 65 (2023) (Sci-Fi, Thriller) - An astronaut crash lands on a mysterious planet only to discover he&#x27;s not alone....
4563: 300 (2007) (Action, Fantasy, War, IMAX) - In the ancient battle of Thermopylae, King Leonidas and 300 Spartans fight against Xerxes and his ma...
5948: Wrath of the Titans (2012) (Action, Adventure, Fantasy, IMAX) - Perseus braves the treacherous underworld to rescue his father, Zeus, captured by his son, Ares, and...
8396: Roma (2018) (Drama) - A year in the life of a upper-middle-class family&#x27;s maid in Mexico City in the early 1970s....
6672: Nightcrawler (2014) (Crime, Drama, Thriller) - A petty thief desperate for work muscles into the world of crime journalism and becomes the star of ...
7358: Gods of Egypt (2016) (Adventure, Fantasy) - Mortal hero 