In [2]:
# Dataset: "The Movies Dataset" on Kaggle (this notebook reads movies_metadata.csv from it)
# https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset

In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the movie metadata table.
# The path is relative, so movies_metadata.csv must be in the notebook's
# working directory. No index column is specified, so rows get the default
# 0..n-1 integer index that later cells use as row positions.
movie_data = pd.read_csv('movies_metadata.csv')

In [17]:
# Movie summary data, fill null value with empty string so that every row can
# be passed to the vectorizer as text
movie_summary = movie_data['overview'].fillna('')

# Movie title data used when displaying recommendations
# NOTE(review): results are shown from 'original_title' while lookups below are
# keyed on 'title' -- confirm this difference is intentional.
movie_title_data = movie_data['original_title']

# Map movie title to its row index.
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already unique), so it would leave repeated titles in the index and a lookup
# of such a title would return a Series instead of a single label.
# De-duplicate on the index instead, keeping the first row for each title.
movie_to_index = pd.Series(movie_data.index, index=movie_data['title'])
movie_to_index = movie_to_index[~movie_to_index.index.duplicated(keep='first')]

In [18]:
# Build a TF-IDF vectorizer that drops English stop words, then learn the
# vocabulary from all overviews and encode each overview as one row of
# movie_matrix.
tfidf = TfidfVectorizer(stop_words='english')
movie_matrix = tfidf.fit_transform(raw_documents=movie_summary)

In [19]:
# Sanity check: (rows, columns) of the TF-IDF matrix -- one row per overview
# in movie_summary, one column per term kept by the vectorizer.
movie_matrix.shape

(45466, 75827)

In [20]:
# Title used for the step-by-step walkthrough
movie_title = 'Furious 7'

# Look up the row label stored for that title
idx = movie_to_index.loc[movie_title]

# Wrap that movie's overview in a one-element list, because the vectorizer
# is fed a collection of documents rather than a bare string
movie_test_summary = [movie_summary.loc[idx]]

In [21]:
# Fetch the TF-IDF vector of the corresponding movie.
# transform() (not fit_transform()) is used so the already-fitted vocabulary is
# reused and the vector has the same number of columns as movie_matrix.
movie_test_matrix = tfidf.transform(movie_test_summary)
print(movie_test_matrix.shape)

(1, 75827)


In [22]:
# Cosine similarity between the selected movie and every row of movie_matrix.
# The result has a single row (one query movie); keep that row as a plain list
# with one score per movie.
sim_scores = cosine_similarity(movie_test_matrix, movie_matrix)[0].tolist()
print(len(sim_scores))

45466


In [23]:
# Rank (row position, score) pairs from most to least similar.
# NOTE: this cell rebinds sim_scores from a list of floats to a list of
# (position, score) pairs, so re-run the previous cell before re-running it.
ranked_scores = sorted(enumerate(sim_scores), key=lambda pair: pair[1], reverse=True)

# Fetch the top 10 recommended movies, skipping the query movie itself.
# Filtering on idx (instead of slicing off the first entry) stays correct when
# another movie ties with the query at the top of the ranking, e.g. because
# its overview is identical or because the query overview is empty.
sim_scores = [pair for pair in ranked_scores if pair[0] != idx][:10]

In [24]:
# Pull the row positions out of the (position, score) pairs
movie_indexes = [position for position, _score in sim_scores]

# Print the title of recommended movies
print(movie_title_data.loc[movie_indexes].tolist())

['The Fast and the Furious', 'Fast & Furious 6', 'Los violadores', 'Fast Five', 'Genius on Hold', 'Youth Without Youth', 'The Skydivers', 'Aenigma', 'The Cell', 'Urban Justice']


In [25]:
def get_recommendation(movie_title, movie_matrix=movie_matrix, top_n=10):
  """Return the titles of the movies whose overviews are most similar to a given movie.

  Parameters
  ----------
  movie_title : str
      Title to look up in ``movie_to_index``.
  movie_matrix : matrix, optional
      TF-IDF matrix to compare against. Defaults to the notebook-level
      ``movie_matrix`` as it exists when this cell is run. Its rows are
      assumed to line up with ``movie_title_data``.
  top_n : int, optional
      Maximum number of recommendations to return (default 10).

  Returns
  -------
  list
      Up to ``top_n`` entries of ``movie_title_data``, most similar first.
      The queried movie itself is never included.

  Raises
  ------
  KeyError
      If ``movie_title`` is not present in ``movie_to_index``.
  """
  # Get the index of movie title
  try:
    idx = movie_to_index[movie_title]
  except KeyError:
    raise KeyError(f"Movie title not found in movie_to_index: {movie_title!r}") from None

  # A title that appears more than once yields a Series of row labels;
  # fall back to the first matching movie instead of failing further down.
  if isinstance(idx, pd.Series):
    idx = idx.iloc[0]

  # Get the corresponding movie summary
  movie_test_summary = [movie_summary[idx]]

  # Fetch the TF-IDF vector of the corresponding movie
  movie_test_matrix = tfidf.transform(movie_test_summary)

  # Calculate the cosine similarity between the movie and each entry in
  # movie_matrix, then rank (row position, score) pairs from high to low
  sim_scores = cosine_similarity(movie_test_matrix, movie_matrix).tolist()[0]
  ranked_scores = sorted(enumerate(sim_scores), key=lambda pair: pair[1], reverse=True)

  # Fetch the recommended movies' indexes.
  # The query movie is excluded by position rather than by dropping the first
  # ranked entry: with tied scores (identical or empty overviews) the query is
  # not guaranteed to be ranked first.
  movie_indexes = [position for position, _score in ranked_scores if position != idx]
  movie_indexes = movie_indexes[:max(top_n, 0)]

  # Return the title of recommended movies
  return [movie_title_data[i] for i in movie_indexes]

In [26]:
# Example call: the movies whose overviews are most similar to 'Toy Story'
get_recommendation('Toy Story')

['Toy Story 3',
 'Toy Story 2',
 'The 40 Year Old Virgin',
 'Small Fry',
 "Andy Hardy's Blonde Trouble",
 'Hot Splash',
 'Andy Kaufman Plays Carnegie Hall',
 'Superstar: The Life and Times of Andy Warhol',
 'Andy Peters: Exclamation Mark Question Point',
 'The Champ']