In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from itertools import combinations
import time
import os
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
directory = '/Users/pin.lyu//Documents/BC_Folder/NLP/Data/Movie'

# Full file paths

movies_path = os.path.join(directory, "tmdb_5000_movies.csv")

credits_path = os.path.join(directory, "tmdb_5000_credits.csv")

# Load data

movies = pd.read_csv(movies_path)

credits = pd.read_csv(credits_path)

### EDA

In [5]:
movies.head(3)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466


In [6]:
credits.head(3)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."


In [7]:
# The shapes of the two data frames

movies_shape = movies.shape

credits_shape = credits.shape

print("Shape of movies DataFrame:", movies_shape)

print("Shape of credits DataFrame:", credits_shape)

Shape of movies DataFrame: (4803, 20)
Shape of credits DataFrame: (4803, 4)


In [8]:
# Description of movies 

movies['overview'].head(3)

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
Name: overview, dtype: object

### Data Processing

In [10]:
# Merge the 'credits' and 'movies' DataFrames on 'movie_id'

movie_df = credits[['movie_id', 'title']].merge(movies[['id', 'overview']], left_on='movie_id', right_on='id', how='inner')

# Select only the columns needed

movie_df = movie_df[['movie_id', 'title', 'overview']]

movie_df.shape

(4803, 3)

In [11]:
movie_df.head()

Unnamed: 0,movie_id,title,overview
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [12]:
# handle NAs with ''

movie_df['overview'] = movie_df['overview'].fillna('')

In [13]:
def preprocess(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    
    # Tokenize words
    tokens = word_tokenize(text)
    
    # Load stopwords
    stop_words = set(stopwords.words("english"))
    
    # Initialize lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    # Lemmatize and filter tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    
    return tokens

In [14]:
# Apply the preprocessing function to the column 'overview'

movie_df['overview_tokens'] = movie_df['overview'].apply(preprocess)

movie_df['overview_tokens']

0       [century, paraplegic, marine, dispatched, moon...
1       [captain, barbossa, long, believed, dead, come...
2       [cryptic, message, bond, past, sends, trail, u...
3       [following, death, district, attorney, harvey,...
4       [john, carter, warweary, former, military, cap...
                              ...                        
4798    [el, mariachi, want, play, guitar, carry, fami...
4799    [newlywed, couple, honeymoon, upended, arrival...
4800    [signed, sealed, delivered, introduces, dedica...
4801    [ambitious, new, york, attorney, sam, sent, sh...
4802    [ever, since, second, grade, first, saw, et, e...
Name: overview_tokens, Length: 4803, dtype: object

### Cosine Similarity

In [16]:
# Start the timer

start_time = time.time()

In [17]:
# Initialize TF-IDF Vectorizer

tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the 'overview' column into TF-IDF vectors

tfidf_matrix = tfidf.fit_transform(movie_df['overview'])

tfidf_matrix.shape

(4803, 20978)

In [18]:
# Computes the pairwise similarity between every pair of movies in the dataset

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [19]:
# Build the recommendation function

def recommend_movies(title, cosine_sim=cosine_sim, movie_df=movie_df):
    
    # Get the index of the movie that matches the title
    
    idx = movie_df[movie_df['title'] == title].index[0]

    # Get the pairwise similarity scores of all movies with that movie
    
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top 5 most similar movies (excluding itself)
    
    sim_scores = sim_scores[1:6]

    # Get the movie indices
    
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 5 most similar movies
    
    return movie_df['title'].iloc[movie_indices]

# Example usage

print(recommend_movies('Avengers: Age of Ultron'))

# ====================

# End the timer
end_time = time.time()

# Calculate and print the runtime
runtime = end_time - start_time
print(f"Runtime: {runtime:.4f} seconds")

16                   The Avengers
79                     Iron Man 2
68                       Iron Man
26     Captain America: Civil War
227                Knight and Day
Name: title, dtype: object
Runtime: 0.1824 seconds


### Jaccard Similarity 

In [21]:
# Start the timer

start_time = time.time()

In [22]:
#movie_df['overview_tokens'] = movie_df['overview'].apply(lambda x: set(x.lower().split()))

In [23]:
# Calculate Jaccard Similarity

def compute_jaccard_similarity(movie_df):
    
    n_movies = len(movie_df)
    
    jaccard_sim = np.zeros((n_movies, n_movies))

    # Compute Jaccard Similarity for each pair of movies
    
    for i, j in combinations(range(n_movies), 2):
        
        set_i = set(movie_df.loc[i, 'overview_tokens'])
        
        set_j = set(movie_df.loc[j, 'overview_tokens'])
        
        intersection = len(set_i.intersection(set_j))
        
        union = len(set_i.union(set_j))
        
        jaccard_sim[i, j] = intersection / union if union != 0 else 0
        
        jaccard_sim[j, i] = jaccard_sim[i, j]  # Symmetric matrix

    # Fill diagonal with 1 (each movie is perfectly similar to itself)
    
    np.fill_diagonal(jaccard_sim, 1)

    return jaccard_sim

In [24]:
# Compute the Jaccard similarity matrix

jaccard_sim = compute_jaccard_similarity(movie_df)

In [25]:
# Define recommendation function

def recommend_movies_jaccard(title, jaccard_sim=jaccard_sim, movie_df=movie_df):
    
    # Check if the movie title exists in the DataFrame
    
    if title not in movie_df['title'].values:
        
        return f"Movie title '{title}' not found in the dataset."

    # Get the index of the movie
    
    idx = movie_df[movie_df['title'] == title].index[0]

    # Get the similarity scores for that movie
    
    sim_scores = list(enumerate(jaccard_sim[idx]))

    # Sort the movies by similarity
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top 5 most similar movies (excluding itself)
    
    sim_scores = sim_scores[1:6]

    # Get the movie indices
    
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 5 most similar movies
    
    return movie_df['title'].iloc[movie_indices]

# Example 

print(recommend_movies_jaccard('Avengers: Age of Ultron'))

# ====================

# End the timer
end_time = time.time()

# Calculate and print the runtime
runtime = end_time - start_time
print(f"Runtime: {runtime:.4f} seconds")

138          The Last Airbender
344                 Unstoppable
531     The Man from U.N.C.L.E.
2048               Darling Lili
2176                Simon Birch
Name: title, dtype: object
Runtime: 105.2647 seconds


### Conclusion

Cosine Similarity outperforms Jaccard Similarity in recommending similar movies for a user, yielding more accurate suggestions. It is also computationally more efficient, as it avoids set calculations required by Jaccard Similarity, leading to faster execution.

### Advantages of Cosine Similarity

**1. Scale-invariant**: Cosine similarity focuses on the orientation (angle) of the vectors, not their magnitude. This is useful for text data, where the length of documents (overviews) can vary.

**2. Interpretable**: The similarity scores range from 0 to 1, making it easy to interpret and use for recommendations.

**3. Efficient**: Computationally efficient for comparing large numbers of text documents. Much much faster than Jaccard Similarity. 0.18 secs vs 105.36 secs