In [29]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load dataset
df = pd.read_csv("../Data/clean_parsed_tmdb_5000.csv")

In [8]:
df.head()

Unnamed: 0,budget,genres,homepage,keywords,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew
0,237000000,"Action, Adventure, Fantasy, Science Fiction",http://www.avatarmovie.com/,"culture clash, future, space war, space colony...",en,"In the 22nd century, a paraplegic Marine is di...",150.437577,"Ingenious Film Partners, Twentieth Century Fox...","United States of America, United Kingdom",2009-12-10,2787965087,162.0,"English, Español",Released,Enter the World of Pandora.,Avatar,7.2,11800,"Sam Worthington, Zoe Saldana, Sigourney Weaver...","Stephen E. Rivkin (Editor), Rick Carter (Produ..."
1,300000000,"Adventure, Fantasy, Action",http://disney.go.com/disneypictures/pirates/,"ocean, drug abuse, exotic island, east india t...",en,"Captain Barbossa, long believed to be dead, ha...",139.082615,"Walt Disney Pictures, Jerry Bruckheimer Films,...",United States of America,2007-05-19,961000000,169.0,English,Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,"Johnny Depp, Orlando Bloom, Keira Knightley, S...","Dariusz Wolski (Director of Photography), Gore..."
2,245000000,"Action, Adventure, Crime",http://www.sonypictures.com/movies/spectre/,"spy, based on novel, secret agent, sequel, mi6...",en,A cryptic message from Bond’s past sends him o...,107.376788,"Columbia Pictures, Danjaq, B24","United Kingdom, United States of America",2015-10-26,880674609,148.0,"Français, English, Español, Italiano, Deutsch",Released,A Plan No One Escapes,Spectre,6.3,4466,"Daniel Craig, Christoph Waltz, Léa Seydoux, Ra...","Thomas Newman (Original Music Composer), Sam M..."
3,250000000,"Action, Crime, Drama, Thriller",http://www.thedarkknightrises.com/,"dc comics, crime fighter, terrorist, secret id...",en,Following the death of District Attorney Harve...,112.31295,"Legendary Pictures, Warner Bros., DC Entertain...",United States of America,2012-07-16,1084939099,165.0,English,Released,The Legend Ends,The Dark Knight Rises,7.6,9106,"Christian Bale, Michael Caine, Gary Oldman, An...","Hans Zimmer (Original Music Composer), Charles..."
4,260000000,"Action, Adventure, Science Fiction",http://movies.disney.com/john-carter,"based on novel, mars, medallion, space travel,...",en,"John Carter is a war-weary, former military ca...",43.926995,Walt Disney Pictures,United States of America,2012-03-07,284139100,132.0,English,Released,"Lost in our world, found in another.",John Carter,6.1,2124,"Taylor Kitsch, Lynn Collins, Samantha Morton, ...","Andrew Stanton (Screenplay), Andrew Stanton (D..."


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4802 entries, 0 to 4801
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4802 non-null   int64  
 1   genres                4775 non-null   object 
 2   homepage              1712 non-null   object 
 3   keywords              4391 non-null   object 
 4   original_language     4802 non-null   object 
 5   overview              4799 non-null   object 
 6   popularity            4802 non-null   float64
 7   production_companies  4452 non-null   object 
 8   production_countries  4629 non-null   object 
 9   release_date          4802 non-null   object 
 10  revenue               4802 non-null   int64  
 11  runtime               4802 non-null   float64
 12  spoken_languages      4716 non-null   object 
 13  status                4802 non-null   object 
 14  tagline               3959 non-null   object 
 15  title                

In [13]:
# Combine all content into a single text field
df["combined_content"] = (
    df["genres"].astype(str) + " " +
    df["keywords"].astype(str) + " " +
    df["overview"].astype(str) + " " +
    df["production_companies"].astype(str) + " " +
    df["tagline"].astype(str) + " " +
    df["cast"].astype(str) + " " +
    df["crew"].astype(str)
)

In [21]:
# Convert everything to lowercase
df["combined_content"] = df["combined_content"].astype(str).str.lower()

# Replace anything that's not a letter or number with whitespace
df["combined_content"] = df["combined_content"].str.replace(r'[^\w\s]', '', regex=True)

# Strip trailing whitespace
df["combined_content"] = df["combined_content"].str.strip()

In [23]:
df["combined_content"]

0       action adventure fantasy science fiction cultu...
1       adventure fantasy action ocean drug abuse exot...
2       action adventure crime spy based on novel secr...
3       action crime drama thriller dc comics crime fi...
4       action adventure science fiction based on nove...
                              ...                        
4797    action crime thriller united statesmexico barr...
4798    comedy romance nan a newlywed couples honeymoo...
4799    comedy drama romance tv movie date love at fir...
4800    nan nan when ambitious new york attorney sam i...
4801    documentary obsession camcorder crush dream gi...
Name: combined_content, Length: 4802, dtype: object

In [25]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

In [27]:
# Create a matrix by fitting the TF-IDF model
tfidf_matrix = vectorizer.fit_transform(df["combined_content"])

In [31]:
tfidf_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 443871 stored elements and shape (4802, 5000)>

In [30]:
similarity_matrix = cosine_similarity(tfidf_matrix)

In [32]:
similarity_matrix

array([[1.        , 0.19578554, 0.66291794, ..., 0.04589267, 0.01466897,
        0.02389071],
       [0.19578554, 1.        , 0.15682752, ..., 0.07075526, 0.01271746,
        0.02324541],
       [0.66291794, 0.15682752, 1.        , ..., 0.05500419, 0.04229058,
        0.01561895],
       ...,
       [0.04589267, 0.07075526, 0.05500419, ..., 1.        , 0.03705502,
        0.04878338],
       [0.01466897, 0.01271746, 0.04229058, ..., 0.03705502, 1.        ,
        0.02449469],
       [0.02389071, 0.02324541, 0.01561895, ..., 0.04878338, 0.02449469,
        1.        ]], shape=(4802, 4802))

In [33]:
similarity_matrix.shape

(4802, 4802)

In [52]:
movie_index = df[df['title'] == 'Superman'].index[0]

In [53]:
# Compute similarity to chosen movie
scores = similarity_matrix[movie_index]


In [54]:
scores

array([0.05735164, 0.10487115, 0.06439599, ..., 0.06275975, 0.03348045,
       0.06614791], shape=(4802,))

In [55]:
# Find the indices 10 most similar movies
similar_indices = scores.argsort()[::-1][1:11]

In [56]:
similar_indices

array([ 870, 2433, 1296,   10,   14,  511,   65, 1359, 1720,    9])

In [57]:
# Display the most similar movies
df['title'].iloc[similar_indices]

870                            Superman II
2433      Superman IV: The Quest for Peace
1296                          Superman III
10                        Superman Returns
14                            Man of Steel
511                                  X-Men
65                         The Dark Knight
1359                                Batman
1720                              Kick-Ass
9       Batman v Superman: Dawn of Justice
Name: title, dtype: object