In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler


### Load all the data needed

In [7]:
# Movie catalogue, re-keyed so that rows can be looked up by movie_id
movies = pd.read_csv(
    '/home/pokji/vscode-projects/uni/information_search/data/movieapp_movie.csv'
).set_index('movie_id')

# Metadata feature matrix used by the content-based recommender
# (presumably one row per row of `movies`, in the same order — TODO confirm)
content_based_features = np.load('all_features.npy')

# Plot embeddings, one row per movie, keyed by movieId
plot_embeddings = pd.read_csv(
    '/home/pokji/vscode-projects/uni/information_search/first-recommender/movie_plot_embeddings.csv'
).set_index('movieId')

# Poster (CLIP) embeddings: the CSV supplies the movie IDs and the .npy file the
# vectors; row i of the array is stored next to row i of the CSV.
clip_embeddings_df = pd.read_csv(
    '/home/pokji/vscode-projects/uni/information_search/poster-based-recommender/embeddings_movie_id.csv'
)
clip_embeddings = np.load('clip_poster_embeddings.npy')
clip_embeddings_df = clip_embeddings_df.assign(
    embeddings=list(clip_embeddings)
).set_index('movie_id')

In [None]:
# ID of the movie to generate recommendations for.
# This is a movie_id label (a value of the `movies` index), not a row position;
# the content-based cell converts it to a position with `movies.index.get_loc`.
movie_id = 24

### Content based

In [None]:
# Positional row of the target movie. `movies` is keyed by movie_id, whereas
# `content_based_features` is a plain array addressed by row position.
# assumes the feature rows are in the same order as the rows of `movies` — TODO confirm
movie_idx = movies.index.get_loc(movie_id)

# Feature vector of the target movie, shaped (1, n_features) for scikit-learn
target_vector = content_based_features[movie_idx].reshape(1, -1)

# Cosine similarity between the target and every movie (the target included)
similarities = cosine_similarity(target_vector, content_based_features).flatten()

# Rank all movies by similarity and keep the top 5. The target is removed
# explicitly by position instead of assuming it is ranked first, which does not
# hold when another movie has an identical feature vector.
ranked_indices = similarities.argsort()[::-1]
most_similar_indices = ranked_indices[ranked_indices != movie_idx][:5]

# Take into account the rating of the movie.
# `most_similar_indices` are row positions, so they must go through `.iloc`;
# `.loc` would interpret them as movie_id labels and select the wrong rows.
ratings = movies.iloc[most_similar_indices]['avgRating']
ratings_scaled = MinMaxScaler().fit_transform(ratings.values.reshape(-1, 1)).flatten()
# A missing rating would turn the combined score into NaN, and NaN ends up
# first after `argsort()[::-1]`; treat it as the lowest scaled rating instead.
ratings_scaled = np.nan_to_num(ratings_scaled)

# Combine similarity and rating
sim_weight = 0.7
rating_weight = 0.3
combined_score = (similarities[most_similar_indices] * sim_weight) + (ratings_scaled * rating_weight)

# Order the five candidates by combined score, best first
sorted_idx = np.argsort(combined_score)[::-1]

recommended = movies.iloc[most_similar_indices].iloc[sorted_idx]
print(recommended[['title', 'releaseYear', 'avgRating']])

### Plot based

In [None]:
# Title lookup: a Series of movie titles keyed by the index of `movies`
movie_names = movies.loc[:, 'title']

# Every movie ID that has a plot embedding, as a plain Python list
movie_ids = list(plot_embeddings.index)


def get_similar_movies(movie_id, n=5):
    """Return the titles of the N movies with the most similar plot embeddings.

    Similarity is the cosine similarity between the row of the module-level
    ``plot_embeddings`` frame for ``movie_id`` and every other row. Titles are
    looked up in the module-level ``movie_names`` Series.

    Args:
        movie_id: ID of the query movie; must be a label of ``plot_embeddings.index``.
        n: Maximum number of similar movies to return.

    Returns:
        A Series of titles keyed by movie ID, most similar first. If
        ``movie_id`` has no plot embedding, a message string is returned
        instead (kept for backward compatibility with existing callers).

    Raises:
        KeyError: If the query movie or a recommended movie is missing from
            ``movie_names``.
    """
    # Membership test against the index itself (hash lookup) rather than
    # scanning a Python list of all IDs on every call.
    if movie_id not in plot_embeddings.index:
        return f"Movie ID {movie_id} not found in dataset"

    # Embedding of the selected movie, shaped (1, dim) for scikit-learn
    target_embedding = plot_embeddings.loc[movie_id].values.reshape(1, -1)

    # Similarity between the chosen movie and every movie, keyed by movie ID
    similarities = pd.Series(
        cosine_similarity(target_embedding, plot_embeddings)[0],
        index=plot_embeddings.index,
    )

    # Drop the query movie by ID instead of assuming it sorts first (it may not
    # when another movie has an identical embedding), then keep the N largest.
    # `nlargest` keeps the first occurrence among ties.
    top_similar = similarities.drop(movie_id).nlargest(n)

    # The ids and names of recommended movies
    top_movie_names = movie_names.loc[top_similar.index.tolist()]

    print(f"Top {n} most similar movies for {movie_names.loc[movie_id]}")

    return top_movie_names

### Poster based

In [None]:
# Build the frame used by the poster recommender: one row per (movie, genre)
# with the poster embedding attached.
#
# Both `movies` and `clip_embeddings_df` carry movie_id as their INDEX (set in
# the loading cell), and the `embeddings` column is already attached there.
# Selecting `clip_embeddings_df[['movie_id', ...]]` would therefore raise a
# KeyError, so movie_id is turned back into an ordinary column on both sides.
# This also guarantees that the merged frame has the `movie_id` column that
# `recommend_by_poster` filters on.

# Explode the genres column in movies DataFrame
# NOTE(review): `explode` only splits list-like cells. If `genres` was read from
# CSV as a plain string it is left unchanged — confirm the column format.
movies_exploded = movies.explode('genres').reset_index()

# movie_id -> poster embedding mapping as regular columns
poster_lookup = clip_embeddings_df.reset_index()[['movie_id', 'embeddings']]

# Merge the movies DataFrame with the embeddings (inner join: movies without a
# poster embedding are dropped)
movies_with_posters = pd.merge(
    movies_exploded,
    poster_lookup,
    on='movie_id'
)


def recommend_by_poster(movie_id, top_n=5):
    """ Suggest movies whose poster embedding is closest to that of `movie_id`.

        Only movies sharing at least one `genres` value with the target are
        considered. Data comes from the module-level `movies_with_posters` frame.

        ARGS: movie_id (MovieLens), top_n = maximum number of movies to return
        RETURNS: list of MovieLens IDs, most similar first; None (after printing
                 a message) when the target or matching candidates are missing
    """
    is_target = movies_with_posters['movie_id'] == movie_id
    query_rows = movies_with_posters[is_target]
    if query_rows.empty:
        print(f"No movie found with ID {movie_id}.")
        return None

    # All genre values attached to the target movie
    wanted_genres = set(query_rows['genres'])

    # Every exploded row of one movie holds the same embedding; take the first
    query_vec = query_rows.iloc[0]['embeddings'].reshape(1, -1)

    # Candidates: any other movie with a shared genre, one row per movie
    shares_genre = movies_with_posters['genres'].isin(wanted_genres)
    pool = movies_with_posters[~is_target & shares_genre].copy()
    pool = pool.drop_duplicates('movie_id')

    if pool.empty:
        print("No other movies with matching genres.")
        return None

    # Cosine similarity of the target poster against each candidate poster
    pool_matrix = np.stack(pool['embeddings'].values)
    pool['similarity'] = cosine_similarity(query_vec, pool_matrix)[0]

    best = pool.sort_values('similarity', ascending=False).head(top_n)
    return best['movie_id'].tolist()

# Example usage: poster-based recommendations for movie ID 1.
# Prints a list of movie IDs, or None when the movie or candidates are missing.
recommendations = recommend_by_poster(movie_id=1)
print(recommendations)


### Merged recommendations

In [8]:
def merged_recommendations(movie_id, top_n=5, weights=(0.4, 0.3, 0.3)):
    """
    Rank movies by a weighted blend of content, plot and poster similarity.

    Each similarity is the cosine similarity to `movie_id`, aligned to the index
    of `movies` (movies missing a plot or poster embedding get 0 for that
    signal), min-max scaled, and then combined with `weights`.

    weights: (content_weight, plot_weight, poster_weight)

    Returns the `title`, `releaseYear` and `avgRating` columns of the `top_n`
    best-scoring movies (target excluded), or an empty list after printing a
    message when `movie_id` is missing from any of the three sources.
    """
    def _unit_scale(series):
        # Min-max scale one similarity Series into a flat array
        return MinMaxScaler().fit_transform(series.values.reshape(-1, 1)).flatten()

    # --- Content-based: features are addressed by row position in `movies` ---
    try:
        row_pos = movies.index.get_loc(movie_id)
    except KeyError:
        print(f"Movie ID {movie_id} not found in movies.")
        return []
    content_query = content_based_features[row_pos].reshape(1, -1)
    content_raw = cosine_similarity(content_query, content_based_features).flatten()

    # --- Plot-based ---
    if movie_id not in plot_embeddings.index:
        print(f"Movie ID {movie_id} not found in plot embeddings.")
        return []
    plot_query = plot_embeddings.loc[movie_id].values.reshape(1, -1)
    plot_raw = cosine_similarity(plot_query, plot_embeddings)[0]

    # --- Poster-based ---
    poster_match = clip_embeddings_df[clip_embeddings_df.index == movie_id]
    if poster_match.empty:
        print(f"Movie ID {movie_id} not found in poster embeddings.")
        return []
    poster_query = poster_match.iloc[0]['embeddings'].reshape(1, -1)
    poster_raw = cosine_similarity(poster_query, clip_embeddings).flatten()

    # --- Align every signal to the movie IDs of `movies` ---
    catalogue_ids = list(movies.index)
    content_scores = pd.Series(content_raw, index=catalogue_ids)
    plot_scores = pd.Series(plot_raw, index=plot_embeddings.index).reindex(
        catalogue_ids, fill_value=0
    )
    poster_scores = pd.Series(poster_raw, index=clip_embeddings_df.index).reindex(
        catalogue_ids, fill_value=0
    )

    # --- Normalize, then blend ---
    content_norm = _unit_scale(content_scores)
    plot_norm = _unit_scale(plot_scores)
    poster_norm = _unit_scale(poster_scores)
    blended = (
        weights[0] * content_norm +
        weights[1] * plot_norm +
        weights[2] * poster_norm
    )

    # Score table keyed by movie_id, without the target movie itself
    scores = pd.DataFrame({
        'movie_id': catalogue_ids,
        'score': blended
    }).set_index('movie_id')
    scores = scores.drop(movie_id)

    # Top N recommendations
    picks = scores.sort_values('score', ascending=False).head(top_n)
    return movies.loc[picks.index][['title', 'releaseYear', 'avgRating']]

# Example usage: blended recommendations for movie ID 1.
# Prints the title / releaseYear / avgRating rows of the top 5 movies, or an
# empty list when the movie is missing from one of the three sources.
recommendations = merged_recommendations(movie_id=1, top_n=5)
print(recommendations)


                         title  releaseYear  avgRating
movie_id                                              
3114               Toy Story 2       1999.0        3.8
78499              Toy Story 3       2010.0        3.9
95446                  Tin Toy       1988.0        3.2
120468         Partysaurus Rex       2012.0        3.4
106022    Toy Story of Terror!       2013.0        3.4
