In [12]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
# Load the dataset with low_memory=False to suppress DtypeWarning
file_path = './data/movies_metadata.csv'
movies_metadata = pd.read_csv(file_path, low_memory=False)

# Data Cleaning and Feature Extraction

# Drop rows with missing 'overview' or 'title'
movies_metadata_cleaned = movies_metadata.dropna(subset=['overview', 'title'])

In [21]:
# Convert 'genres' from string representation of lists to actual lists
import ast

def parse_genres(genres_str):
    try:
        genres_list = ast.literal_eval(genres_str)
        return [genre['name'] for genre in genres_list]
    except:
        return []

# Use .loc to avoid SettingWithCopyWarning
movies_metadata_cleaned.loc[:, 'genres'] = movies_metadata_cleaned['genres'].apply(parse_genres)

In [12]:
# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

def get_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
    return embeddings.flatten()  # Flatten to 2D array

# Generate embeddings for all movie overviews with progress bar
embeddings = []
for overview in tqdm(movies_metadata_cleaned['overview'], desc="Generating embeddings"):
    embeddings.append(get_embeddings(overview))

movies_metadata_cleaned['embeddings'] = embeddings

Generating embeddings: 100%|██████████| 44506/44506 [1:06:48<00:00, 11.10it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_metadata_cleaned['embeddings'] = embeddings


In [15]:
import pickle

# Save embeddings to a file
with open('movie_embeddings.pkl', 'wb') as f:
    pickle.dump(movies_metadata_cleaned, f)

print("Embeddings saved successfully.")

Embeddings saved successfully.


In [13]:
# Calculate Similarities and Recommend Movies
def recommend_movies(movie_title, num_recommendations=5):
    # Get the embeddings for the given movie title
    movie_idx = movies_metadata_cleaned[movies_metadata_cleaned['title'].str.lower() == movie_title.lower()].index[0]
    movie_embedding = movies_metadata_cleaned.loc[movie_idx, 'embeddings']
    
    # Calculate cosine similarity between the given movie and all others
    similarities = cosine_similarity([movie_embedding], list(movies_metadata_cleaned['embeddings']))
    
    # Get indices of the most similar movies
    similar_indices = similarities.argsort()[0][-num_recommendations-1:-1][::-1]
    
    # Return the titles of the recommended movies
    return movies_metadata_cleaned.iloc[similar_indices]['title'].tolist()

In [14]:
# Example usage
recommended_movies = recommend_movies('Toy Story')
print(recommended_movies)


['Toy Story 2', 'Toy Story 3', 'A Fairly Odd Movie: Grow Up, Timmy Turner!', 'Santa Buddies', 'Frankenweenie']


### Pregenerated Embeddings

In [23]:
import pandas as pd
import pickle

# Load the embeddings from the file
with open('movie_embeddings.pkl', 'rb') as f:
    movies_metadata_cleaned = pickle.load(f)

# Calculate Similarities and Recommend Movies
def recommend_movies(movie_title, num_recommendations=5):
    # Get the embeddings for the given movie title
    movie_idx = movies_metadata_cleaned[movies_metadata_cleaned['title'].str.lower() == movie_title.lower()].index[0]
    movie_embedding = movies_metadata_cleaned.loc[movie_idx, 'embeddings']
    
    # Calculate cosine similarity between the given movie and all others
    similarities = cosine_similarity([movie_embedding], list(movies_metadata_cleaned['embeddings']))
    
    # Get indices of the most similar movies
    similar_indices = similarities.argsort()[0][-num_recommendations-1:-1][::-1]
    
    # Return the titles of the recommended movies
    return movies_metadata_cleaned.iloc[similar_indices]['title'].tolist()



In [17]:
# Example usage
recommended_movies = recommend_movies('Toy Story')
print(recommended_movies)

['Toy Story 2', 'Toy Story 3', 'A Fairly Odd Movie: Grow Up, Timmy Turner!', 'Santa Buddies', 'Frankenweenie']


### Approach Explaination

In [4]:
import pandas as pd
import pickle

# Load the embeddings from the file
file_path = 'movie_embeddings.pkl'  # Adjust the path to your local file
with open(file_path, 'rb') as f:
    movies_metadata_cleaned = pickle.load(f)

# Extract overviews and embeddings for "Toy Story" and "Toy Story 2"
toy_story_1 = movies_metadata_cleaned[movies_metadata_cleaned['title'].str.lower() == 'toy story']
toy_story_2 = movies_metadata_cleaned[movies_metadata_cleaned['title'].str.lower() == 'toy story 2']

toy_story_1_overview = toy_story_1['overview'].values[0]
toy_story_2_overview = toy_story_2['overview'].values[0]

toy_story_1_embedding = toy_story_1['embeddings'].values[0]
toy_story_2_embedding = toy_story_2['embeddings'].values[0]

print("Toy Story Overview:", toy_story_1_overview)
print("Toy Story Embedding:", toy_story_1_embedding)
print("Toy Story 2 Overview:", toy_story_2_overview)
print("Toy Story 2 Embedding:", toy_story_2_embedding)


Toy Story Overview: Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.
Toy Story Embedding: [-2.51855224e-01  2.36002162e-01  2.19279557e-01 -1.60503700e-01
  2.77142406e-01 -1.14872968e-02  1.97392583e-01  3.12333763e-01
  1.45756930e-01 -6.70629516e-02 -5.16198315e-02 -4.43411440e-01
 -1.82073027e-01  4.56858307e-01 -1.65228158e-01  4.60242301e-01
  4.79053885e-01  1.02735549e-01  2.77580023e-02  6.01836503e-01
  1.80173457e-01 -1.96500309e-02 -2.46261746e-01  5.22988260e-01
  5.73677480e-01  1.95410609e-01 -1.09145828e-01  9.95385125e-02
  1.43513242e-02  1.33308947e-01  2.66691595e-01 -1.75681412e-01
 -3.87390740e-02 -3.35716724e-01 -5.08531332e-01  7.15702921e-02
  9.36417207e-02 -3.44585776e-01 -5.42960279e-02  9.13154334e-02
 -1.7

In [7]:
import pandas as pd
import pickle
from sklearn.metrics.pairwise import cosine_similarity

# Load the embeddings from the file
file_path = 'movie_embeddings.pkl'  # Adjust the path to your local file
with open(file_path, 'rb') as f:
    movies_metadata_cleaned = pickle.load(f)

# Function to recommend movies and print similarities
def recommend_movies_with_similarity(movie_title, num_recommendations=5):
    # Get the embeddings for the given movie title
    movie_idx = movies_metadata_cleaned[movies_metadata_cleaned['title'].str.lower() == movie_title.lower()].index[0]
    movie_embedding = movies_metadata_cleaned.loc[movie_idx, 'embeddings']
    
    # Calculate cosine similarity between the given movie and all others
    similarities = cosine_similarity([movie_embedding], list(movies_metadata_cleaned['embeddings']))
    
    # Get indices of the most similar movies
    similar_indices = similarities.argsort()[0][-num_recommendations-1:-1][::-1]
    
    # Get the similarity scores of the most similar movies
    similar_movies = movies_metadata_cleaned.iloc[similar_indices]
    similarity_scores = similarities[0][similar_indices]
    
    # Return the titles and similarity scores of the recommended movies
    return list(zip(similar_movies['title'], similarity_scores))

# Example usage
recommended_movies = recommend_movies_with_similarity('Toy Story')
for title, score in recommended_movies:
    print(f"{title}, Similarity: {score:.4f}")


Toy Story 2, Similarity: 0.9195
Toy Story 3, Similarity: 0.8972
A Fairly Odd Movie: Grow Up, Timmy Turner!, Similarity: 0.8906
Santa Buddies, Similarity: 0.8902
Frankenweenie, Similarity: 0.8896


In [19]:
def genre_overlap(genres1, genres2):
    if not genres1 or not genres2:
        return 0
    set1, set2 = set(genres1), set(genres2)
    return len(set1.intersection(set2)) / len(set1.union(set2))

# Evaluate recommendations and print overlap percentages
def evaluate_recommendations_content_based(recommendation_func, test_cases, metadata, k=5):
    results = []
    
    for movie_title in test_cases:
        try:
            recommended_movies = recommendation_func(movie_title, k)
            original_genres = set(metadata[metadata['title'].str.lower() == movie_title.lower()]['genres'].values[0])
            print(f"Evaluating recommendations for '{movie_title}':")
            for rec_movie in recommended_movies:
                rec_genres = set(metadata[metadata['title'] == rec_movie]['genres'].values[0])
                genre_overlap_score = genre_overlap(original_genres, rec_genres)
                print(f"- Recommended Movie: '{rec_movie}' | Genre Overlap: {genre_overlap_score:.2%}")
                results.append(genre_overlap_score)
            print()  # Print a newline for better readability
        except IndexError:
            continue
    
    return np.mean(results)

# Define test cases
test_cases = ['Toy Story', 'The Matrix', 'Inception']

# Evaluate the recommendations
average_genre_overlap = evaluate_recommendations_content_based(recommend_movies, test_cases, movies_metadata_cleaned, k=5)
print(f"Average Genre Overlap: {average_genre_overlap:.2%}")

Evaluating recommendations for 'Toy Story':
- Recommended Movie: 'Toy Story 2' | Genre Overlap: 100.00%
- Recommended Movie: 'Toy Story 3' | Genre Overlap: 100.00%
- Recommended Movie: 'A Fairly Odd Movie: Grow Up, Timmy Turner!' | Genre Overlap: 25.00%
- Recommended Movie: 'Santa Buddies' | Genre Overlap: 0.00%
- Recommended Movie: 'Frankenweenie' | Genre Overlap: 100.00%

Evaluating recommendations for 'The Matrix':
- Recommended Movie: 'Transmorphers' | Genre Overlap: 66.67%
- Recommended Movie: 'Cyborg X' | Genre Overlap: 66.67%
- Recommended Movie: 'Halo: Nightfall' | Genre Overlap: 50.00%
- Recommended Movie: 'Starship Troopers' | Genre Overlap: 50.00%
- Recommended Movie: 'Valerian and the City of a Thousand Planets' | Genre Overlap: 66.67%

Evaluating recommendations for 'Inception':
- Recommended Movie: 'Extracted' | Genre Overlap: 40.00%
- Recommended Movie: 'Red Mist' | Genre Overlap: 16.67%
- Recommended Movie: 'Johnny Mnemonic' | Genre Overlap: 66.67%
- Recommended Movie: 

In [27]:
import random

test_cases = movies_metadata_cleaned['title'].tolist()[:20]

# Evaluate the recommendations
average_genre_overlap = evaluate_recommendations_content_based(recommend_movies, test_cases, movies_metadata_cleaned, k=5)
print(f"Average Genre Overlap: {average_genre_overlap:.2%}")

Evaluating recommendations for 'The American President':
- Recommended Movie: 'Sweet Smell of Success' | Genre Overlap: 33.33%
- Recommended Movie: 'Confess' | Genre Overlap: 20.00%
- Recommended Movie: 'Dave' | Genre Overlap: 33.33%
- Recommended Movie: 'State of Play' | Genre Overlap: 0.00%
- Recommended Movie: 'Wife vs. Secretary' | Genre Overlap: 100.00%

Evaluating recommendations for 'Dracula: Dead and Loving It':
- Recommended Movie: 'Horror of Dracula' | Genre Overlap: 50.00%
- Recommended Movie: 'Daemonium: Soldier of the Underworld' | Genre Overlap: 0.00%
- Recommended Movie: 'Mark of the Vampire' | Genre Overlap: 33.33%
- Recommended Movie: 'Fantômas: Fantômas Against Fantômas' | Genre Overlap: 0.00%
- Recommended Movie: 'Vampire in Brooklyn' | Genre Overlap: 66.67%

Evaluating recommendations for 'Balto':
- Recommended Movie: 'The Thaw' | Genre Overlap: 0.00%
- Recommended Movie: 'Khadak' | Genre Overlap: 0.00%
- Recommended Movie: 'Germ' | Genre Overlap: 0.00%
- Recommende

KeyboardInterrupt: 