In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm

In [None]:
# Load the dataset with low_memory=False to suppress DtypeWarning
file_path = './data/movies_metadata.csv'
movies_metadata = pd.read_csv(file_path, low_memory=False)

# Data Cleaning and Feature Extraction

# Drop rows with missing 'overview' or 'title'.
# The explicit .copy() makes the cleaned frame independent of movies_metadata,
# so the column assignments performed on it in later cells write to a real
# DataFrame instead of a possible view (the source of SettingWithCopyWarning).
movies_metadata_cleaned = movies_metadata.dropna(subset=['overview', 'title']).copy()

In [None]:
# Convert 'genres' from string representation of lists to actual lists
import ast

def parse_genres(genres_str):
    """Turn one raw 'genres' cell into a list of genre names.

    Args:
        genres_str: Normally the string form of a list of dicts that each
            carry a 'name' key, e.g. "[{'id': 16, 'name': 'Animation'}]".
            A list of plain strings (a value this function already produced)
            is passed through unchanged, so re-running the notebook cell does
            not wipe previously parsed genres.

    Returns:
        list: The genre names, or an empty list when the value is missing or
        cannot be parsed into the expected structure.
    """
    # Already parsed (e.g. the cell was executed twice): keep the names as-is.
    if isinstance(genres_str, list) and all(isinstance(g, str) for g in genres_str):
        return list(genres_str)

    try:
        genres_list = ast.literal_eval(genres_str)
        return [genre['name'] for genre in genres_list]
    except (ValueError, SyntaxError, TypeError, KeyError, MemoryError, RecursionError):
        # ValueError/SyntaxError: not a valid Python literal (includes NaN cells);
        # TypeError/KeyError: literal parsed but is not a list of {'name': ...} dicts;
        # MemoryError/RecursionError: pathological input to literal_eval.
        # Deliberately narrower than a bare `except`, which would also swallow
        # KeyboardInterrupt and hide genuine programming errors.
        return []

# Replace the raw 'genres' strings with parsed lists of genre names.
# assign() returns a new DataFrame rather than writing into the existing one;
# a `.loc[:, 'genres'] = ...` assignment on a frame derived from another frame
# can still emit SettingWithCopyWarning, so it does not reliably avoid it.
movies_metadata_cleaned = movies_metadata_cleaned.assign(
    genres=movies_metadata_cleaned['genres'].apply(parse_genres)
)

In [None]:
# Pretrained encoder whose hidden states are pooled into overview embeddings
model = AutoModel.from_pretrained('bert-base-uncased')
# Tokenizer loaded from the same checkpoint name as the encoder
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def get_embeddings(text):
    """Embed ``text`` as the mean of the model's final-layer token vectors.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: Text handed to the tokenizer; it is truncated to at most
            512 tokens.

    Returns:
        numpy.ndarray: The last hidden state averaged over the token axis
        (``dim=1``) and flattened to one dimension.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Inference only: without no_grad() every call records an autograd graph,
    # which costs time and memory across the whole dataset for no benefit.
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
    return embeddings.flatten()  # Flatten the pooled output to a 1-D vector

# Embed every overview (tqdm shows progress) and store the vectors as a column
embeddings = [
    get_embeddings(overview)
    for overview in tqdm(movies_metadata_cleaned['overview'], desc="Generating embeddings")
]

movies_metadata_cleaned['embeddings'] = embeddings

In [None]:
import pickle

# Persist the whole cleaned DataFrame (metadata plus the 'embeddings' column)
# so later cells can reload it instead of recomputing the embeddings
with open('movie_embeddings.pkl', 'wb') as pickle_out:
    pickle.dump(movies_metadata_cleaned, pickle_out)

print("Embeddings saved successfully.")

In [None]:
# Calculate Similarities and Recommend Movies
def recommend_movies(movie_title, num_recommendations=5):
    """Recommend the movies whose embeddings are closest to a given movie's.

    Reads the module-level ``movies_metadata_cleaned`` DataFrame, using its
    'title' and 'embeddings' columns.

    Args:
        movie_title: Title to look up (case-insensitive exact match). When
            several rows share the title, the first one is used.
        num_recommendations: Maximum number of titles to return.

    Returns:
        list: Titles ordered from most to least cosine-similar, never
        including the query row itself.

    Raises:
        IndexError: If no row has the requested title.
    """
    # Work with row *positions* throughout so they line up with the
    # similarity vector and with .iloc, regardless of the index labels.
    title_matches = (movies_metadata_cleaned['title'].str.lower() == movie_title.lower()).to_numpy()
    match_positions = title_matches.nonzero()[0]
    if match_positions.size == 0:
        raise IndexError(f"No movie titled {movie_title!r} found in movies_metadata_cleaned")
    query_pos = int(match_positions[0])
    movie_embedding = movies_metadata_cleaned['embeddings'].iloc[query_pos]

    # Calculate cosine similarity between the given movie and all others
    similarities = cosine_similarity([movie_embedding], list(movies_metadata_cleaned['embeddings']))[0]

    # Rank best-first and drop the query row explicitly. Assuming the query is
    # always the single top hit breaks when another row has an identical
    # embedding: the query could then be returned as its own recommendation.
    ranked = similarities.argsort()[::-1]
    similar_indices = ranked[ranked != query_pos][:num_recommendations]

    # Return the titles of the recommended movies
    return movies_metadata_cleaned.iloc[similar_indices]['title'].tolist()

In [None]:
# Example usage: recommendations for 'Toy Story' with the default count
recommended_movies = recommend_movies(movie_title='Toy Story')
print(recommended_movies)


### Pregenerated Embeddings

In [None]:
import pandas as pd
import pickle

# Reload the pickled DataFrame written by the save cell.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
with open('movie_embeddings.pkl', 'rb') as pickle_in:
    movies_metadata_cleaned = pickle.load(pickle_in)

# Calculate Similarities and Recommend Movies
def recommend_movies(movie_title, num_recommendations=5):
    """Recommend the movies whose embeddings are closest to a given movie's.

    Reads the module-level ``movies_metadata_cleaned`` DataFrame, using its
    'title' and 'embeddings' columns.

    Args:
        movie_title: Title to look up (case-insensitive exact match). When
            several rows share the title, the first one is used.
        num_recommendations: Maximum number of titles to return.

    Returns:
        list: Titles ordered from most to least cosine-similar, never
        including the query row itself.

    Raises:
        IndexError: If no row has the requested title.
    """
    # Work with row *positions* throughout so they line up with the
    # similarity vector and with .iloc, regardless of the index labels.
    title_matches = (movies_metadata_cleaned['title'].str.lower() == movie_title.lower()).to_numpy()
    match_positions = title_matches.nonzero()[0]
    if match_positions.size == 0:
        raise IndexError(f"No movie titled {movie_title!r} found in movies_metadata_cleaned")
    query_pos = int(match_positions[0])
    movie_embedding = movies_metadata_cleaned['embeddings'].iloc[query_pos]

    # Calculate cosine similarity between the given movie and all others
    similarities = cosine_similarity([movie_embedding], list(movies_metadata_cleaned['embeddings']))[0]

    # Rank best-first and drop the query row explicitly. Assuming the query is
    # always the single top hit breaks when another row has an identical
    # embedding: the query could then be returned as its own recommendation.
    ranked = similarities.argsort()[::-1]
    similar_indices = ranked[ranked != query_pos][:num_recommendations]

    # Return the titles of the recommended movies
    return movies_metadata_cleaned.iloc[similar_indices]['title'].tolist()



In [None]:
# Example usage: recommendations for 'Toy Story' from the reloaded embeddings
recommended_movies = recommend_movies(movie_title='Toy Story')
print(recommended_movies)

### Approach Explanation

In [None]:
import pandas as pd
import pickle

# Reload the pickled DataFrame (metadata plus the 'embeddings' column).
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
file_path = 'movie_embeddings.pkl'  # Adjust the path to your local file
with open(file_path, 'rb') as pickle_in:
    movies_metadata_cleaned = pickle.load(pickle_in)

# Rows whose lower-cased title is exactly "toy story" / "toy story 2"
lowercase_titles = movies_metadata_cleaned['title'].str.lower()
toy_story_1 = movies_metadata_cleaned[lowercase_titles == 'toy story']
toy_story_2 = movies_metadata_cleaned[lowercase_titles == 'toy story 2']

# Overview text of the first matching row for each title
toy_story_1_overview = toy_story_1['overview'].iloc[0]
toy_story_2_overview = toy_story_2['overview'].iloc[0]

# Embedding vector of the first matching row for each title
toy_story_1_embedding = toy_story_1['embeddings'].iloc[0]
toy_story_2_embedding = toy_story_2['embeddings'].iloc[0]

# Show each overview next to the vector derived from it
print("Toy Story Overview:", toy_story_1_overview)
print("Toy Story Embedding:", toy_story_1_embedding)
print("Toy Story 2 Overview:", toy_story_2_overview)
print("Toy Story 2 Embedding:", toy_story_2_embedding)


In [None]:
import pandas as pd
import pickle
from sklearn.metrics.pairwise import cosine_similarity

# Reload the pickled DataFrame (metadata plus the 'embeddings' column).
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
file_path = 'movie_embeddings.pkl'  # Adjust the path to your local file
with open(file_path, 'rb') as pickle_in:
    movies_metadata_cleaned = pickle.load(pickle_in)

# Function to recommend movies and print similarities
def recommend_movies_with_similarity(movie_title, num_recommendations=5):
    """Recommend similar movies together with their cosine-similarity scores.

    Reads the module-level ``movies_metadata_cleaned`` DataFrame, using its
    'title' and 'embeddings' columns.

    Args:
        movie_title: Title to look up (case-insensitive exact match). When
            several rows share the title, the first one is used.
        num_recommendations: Maximum number of recommendations to return.

    Returns:
        list: ``(title, similarity)`` tuples ordered from most to least
        similar, never including the query row itself.

    Raises:
        IndexError: If no row has the requested title.
    """
    # Work with row *positions* throughout so they line up with the
    # similarity vector and with .iloc, regardless of the index labels.
    title_matches = (movies_metadata_cleaned['title'].str.lower() == movie_title.lower()).to_numpy()
    match_positions = title_matches.nonzero()[0]
    if match_positions.size == 0:
        raise IndexError(f"No movie titled {movie_title!r} found in movies_metadata_cleaned")
    query_pos = int(match_positions[0])
    movie_embedding = movies_metadata_cleaned['embeddings'].iloc[query_pos]

    # Calculate cosine similarity between the given movie and all others
    similarities = cosine_similarity([movie_embedding], list(movies_metadata_cleaned['embeddings']))[0]

    # Rank best-first and drop the query row explicitly. Assuming the query is
    # always the single top hit breaks when another row has an identical
    # embedding: the query could then be returned as its own recommendation.
    ranked = similarities.argsort()[::-1]
    similar_indices = ranked[ranked != query_pos][:num_recommendations]

    # Get the recommended rows and their similarity scores
    similar_movies = movies_metadata_cleaned.iloc[similar_indices]
    similarity_scores = similarities[similar_indices]

    # Return the titles and similarity scores of the recommended movies
    return list(zip(similar_movies['title'], similarity_scores))

# Example usage: list each recommendation for 'Toy Story' with its similarity score
recommended_movies = recommend_movies_with_similarity(movie_title='Toy Story')
for title, score in recommended_movies:
    print(f"{title}, Similarity: {score:.4f}")


In [None]:
def genre_overlap(genres1, genres2):
    """Return the Jaccard overlap of two genre collections.

    The score is the number of genres present in both collections divided by
    the number of distinct genres across the two. If either collection is
    empty the result is 0.
    """
    if not (genres1 and genres2):
        return 0
    first, second = set(genres1), set(genres2)
    shared = first & second
    combined = first | second
    return len(shared) / len(combined)

# Evaluate recommendations and print overlap percentages
def evaluate_recommendations_content_based(recommendation_func, test_cases, metadata, k=5):
    """Score a recommender by the genre overlap between queries and results.

    For each title in ``test_cases`` the recommender is called, and every
    recommended title's genres are compared with the query's genres via
    ``genre_overlap``. Per-recommendation scores are printed as they are
    computed.

    Args:
        recommendation_func: Callable invoked as ``recommendation_func(title, k)``
            that returns an iterable of recommended titles.
        test_cases: Iterable of query titles.
        metadata: DataFrame with 'title' and 'genres' columns. The query title
            is matched case-insensitively, recommended titles exactly; the
            first matching row supplies the genres.
        k: Number of recommendations requested per query.

    Returns:
        The mean of all overlap scores, or NaN when no score could be
        computed (no test cases, or every one was skipped).
    """
    results = []
    skipped = []

    for movie_title in test_cases:
        try:
            recommended_movies = recommendation_func(movie_title, k)
            original_genres = set(metadata[metadata['title'].str.lower() == movie_title.lower()]['genres'].values[0])
            print(f"Evaluating recommendations for '{movie_title}':")
            for rec_movie in recommended_movies:
                rec_genres = set(metadata[metadata['title'] == rec_movie]['genres'].values[0])
                genre_overlap_score = genre_overlap(original_genres, rec_genres)
                print(f"- Recommended Movie: '{rec_movie}' | Genre Overlap: {genre_overlap_score:.2%}")
                results.append(genre_overlap_score)
            print()  # Print a newline for better readability
        except IndexError:
            # A title lookup came back empty; keep going, but remember the
            # title so the skip is reported instead of silently ignored.
            skipped.append(movie_title)
            continue

    if skipped:
        print(f"Skipped {len(skipped)} title(s) that raised IndexError during lookup: {skipped}")

    # np.mean([]) yields NaN *and* a RuntimeWarning; return NaN explicitly instead.
    if not results:
        return float('nan')
    return np.mean(results)

# Hand-picked query titles for a quick sanity check
test_cases = ['Toy Story', 'The Matrix', 'Inception']

# Score the recommender on those titles and report the mean genre overlap
average_genre_overlap = evaluate_recommendations_content_based(
    recommendation_func=recommend_movies,
    test_cases=test_cases,
    metadata=movies_metadata_cleaned,
    k=5,
)
print(f"Average Genre Overlap: {average_genre_overlap:.2%}")

In [None]:
import random

# Use the first 20 titles of the cleaned dataset as query movies
test_cases = movies_metadata_cleaned['title'].head(20).tolist()

# Score the recommender on those titles and report the mean genre overlap
average_genre_overlap = evaluate_recommendations_content_based(
    recommendation_func=recommend_movies,
    test_cases=test_cases,
    metadata=movies_metadata_cleaned,
    k=5,
)
print(f"Average Genre Overlap: {average_genre_overlap:.2%}")