In [4]:
%pip install pandas numpy

# Import necessary libraries
import pandas as pd
import numpy as np
import ast

# Load the datasets
credits = pd.read_csv('tmdb_5000_credits.csv')
movies = pd.read_csv('tmdb_5000_movies.csv')

# Merge the two dataframes on the 'title' column
movies = movies.merge(credits, on='title')

Collecting pandas
  Downloading pandas-2.3.2-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Downloading numpy-2.3.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.2-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
    --------------------------------------- 0.3/11.0 MB ? eta -:--:--
   - -------------------------------------- 0.5/11.0 MB 932.9 kB/s eta 0:00:12
   -- ------------------------------------- 0.8/11.0 MB 907.1 kB/s eta 0:00:12
   --- ------------------------------------ 1.0/11.0 MB 1.0 MB/s eta 0:00:10
   ---- ----------------------------------- 1.3/11.0 MB 1.1 MB/s eta 0:00:09
   ---- ------------------------

In [5]:
# Select the key features for the recommendation system
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Define a helper function to convert stringified lists of dictionaries into a list of names
def convert(text):
    L = []
    for i in ast.literal_eval(text):
        L.append(i['name'])
    return L

# Apply the conversion function to the 'genres' and 'keywords' columns
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)

# For the 'cast' column, we'll only keep the top 3 actors
movies['cast'] = movies['cast'].apply(lambda x: [i['name'] for i in ast.literal_eval(x)][:3])

# For the 'crew' column, we'll extract the director's name
def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

movies['crew'] = movies['crew'].apply(fetch_director)

In [6]:
# Join the elements in the lists into single strings
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ","") for i in x])

# Combine the 'overview', 'genres', 'keywords', 'cast', and 'crew' into a single 'tags' column
movies['tags'] = movies['overview'].fillna('') + ' ' + movies['genres'].apply(lambda x: " ".join(x)) + ' ' + movies['keywords'].apply(lambda x: " ".join(x)) + ' ' + movies['cast'].apply(lambda x: " ".join(x)) + ' ' + movies['crew'].apply(lambda x: " ".join(x))

# Create a new DataFrame with just the movie ID, title, and tags
# Use .copy() to ensure you are working on a new DataFrame, not a view
new_df = movies[['movie_id', 'title', 'tags']].copy()

# Convert the tags to lowercase
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

In [8]:
%pip install scikit-learn

from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer with English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the 'tags' column to create the TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(new_df['tags'])

Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.2-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.2-cp312-cp312-win_amd64.whl (8.7 MB)
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   - -------------------------------------- 0.3/8.7 MB ? eta -:--:--
   --- ------------------------------------ 0.8/8.7 MB 2.8 MB/s eta 0:00:03
   -------- ------------------------------- 1.8/8.7 MB 3.1 MB/s eta 0:00:03
   ------------ --------------------------- 2.6/8.7 MB 3.6 MB/s eta 0:00:02
   ------------------ --------------------- 3.9/8.7 MB 4.1 MB/s eta 0:00:02
   ------------------------ --------------- 5.2/8.

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [10]:
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = new_df[new_df['title'] == title].index[0]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 similar movies
    return new_df['title'].iloc[movie_indices]

# Test the function with a movie title
print(get_recommendations('The Dark Knight Rises'))

65                              The Dark Knight
428                              Batman Returns
119                               Batman Begins
299                              Batman Forever
1360                                     Batman
1361                                     Batman
3859    Batman: The Dark Knight Returns, Part 2
210                              Batman & Robin
9            Batman v Superman: Dawn of Justice
2509                                  Slow Burn
Name: title, dtype: object


In [11]:
import pickle

# Save the cosine similarity matrix to a file
pickle.dump(cosine_sim, open('cosine_sim.pkl', 'wb'))
pickle.dump(new_df, open('movies.pkl', 'wb'))