In [1]:
import numpy as np
import pandas as pd
import ast

In [2]:
# Read the movie and credits datasets.
# Both paths are relative, so the CSV files must sit in the working directory.
# NOTE: the name `credits` shadows the interactive `credits` helper that
# Python's site module installs as a builtin.
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
# Preview the first rows of the movies dataframe
movies.head()

In [4]:
# Preview the first rows of the credits dataframe
credits.head()

In [5]:
# Merge credits dataframe into movies dataframe based on the movie title.
# merge() defaults to an inner join, so only titles present in both frames survive.
# NOTE(review): if titles are not unique, every repeated title produces one row
# per matching pair; joining on an id column would avoid that — confirm schema.
# NOTE: this cell rebinds `movies`, so re-running it merges the already-merged
# frame with `credits` a second time.
movies = movies.merge(credits, on='title')

In [6]:
# Select the required columns for the model: the identifier and title, plus the
# five text/metadata columns that are later combined into the 'tags' column.
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

In [7]:
# Check for null values: number of missing entries per column
movies.isnull().sum()

In [8]:
# Drop rows with missing values and renumber the index 0..n-1.
# Later cells build positional arrays (vectors, similarity) from this frame, so
# index labels must line up with row positions; leaving gaps in the index makes
# label-based lookups point at the wrong rows.
# Rebinding (instead of inplace=True) also avoids mutating a column-subset of
# the merged frame in place.
movies = movies.dropna().reset_index(drop=True)

In [9]:
# Check for duplicates: count rows that exactly repeat an earlier row.
# This only reports the count; no rows are removed here.
movies.duplicated().sum()

In [10]:
# Preview the frame after column selection and null removal
movies.head()

In [11]:
# Format the genres section by extracting the genre names
def convert(obj):
    """Return the 'name' value of every entry in a stringified list of dicts.

    `obj` is a string containing a Python literal (a list of dicts). It is
    parsed with ast.literal_eval, which only evaluates literals.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

In [12]:
# Replace each raw genres string with the list of names returned by convert
movies['genres'] = movies['genres'].apply(convert)

In [13]:
# Format the keywords section by extracting the keyword names
# (same parsing helper as used for genres)
movies['keywords'] = movies['keywords'].apply(convert)

In [14]:
# Format the cast section by extracting the cast names (limited to 3 by default)
def convertcast(obj, limit=3):
    """Return the names of the first `limit` entries of a stringified cast list.

    Parameters
    ----------
    obj : str
        String containing a Python literal list of dicts, each with a 'name' key.
    limit : int, default 3
        Maximum number of names to return; must be non-negative.

    Returns
    -------
    list
        Up to `limit` names, in the order they appear in `obj`.

    Raises
    ------
    ValueError
        If `limit` is negative.
    """
    if limit < 0:
        raise ValueError("limit must be non-negative")
    # Slicing caps the result without a manual counter and copes with lists
    # shorter than `limit`.
    return [member['name'] for member in ast.literal_eval(obj)[:limit]]

# Replace each raw cast string with the list of names returned by convertcast
movies['cast'] = movies['cast'].apply(convertcast)

In [15]:
# Fetch the director name from the crew section
def fetch_director(obj):
    """Return a one-element list with the first 'Director' entry's name.

    `obj` is a string containing a Python literal list of dicts with 'job' and
    'name' keys. Entries are scanned in order; the result is an empty list when
    no entry has the job 'Director'.
    """
    for member in ast.literal_eval(obj):
        if member['job'] != 'Director':
            continue
        # Only the first matching entry is reported.
        return [member['name']]
    return []

# Replace each raw crew string with the (zero- or one-element) list returned
# by fetch_director
movies['crew'] = movies['crew'].apply(fetch_director)


In [16]:
# Turn each overview string into a list of whitespace-separated words,
# so it can be concatenated with the other list-valued columns
movies['overview'] = movies['overview'].map(str.split)

In [17]:
# Remove white spaces between words in genres, keywords, cast, and crew sections
def collapse(L):
    """Return a new list with every space removed from each string in `L`."""
    return [item.replace(" ", "") for item in L]

# Run collapse over every list-valued metadata column so that multi-word
# entries become single tokens
for _column in ('genres', 'keywords', 'cast', 'crew'):
    movies[_column] = movies[_column].apply(collapse)

In [18]:
# Create a 'tags' column to merge overview, genres, keywords, cast, and crew sections.
# Every one of these columns holds a Python list per row at this point, so `+`
# concatenates the lists row by row.
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [19]:
# Create a new dataframe with movie_id, title, and tags columns.
# .copy() makes new_df an independent frame; without it, assigning to
# new_df['tags'] afterwards writes to a slice of `movies` and pandas emits
# SettingWithCopyWarning.
new_df = movies[['movie_id', 'title', 'tags']].copy()

In [20]:
# Join each row's list of tag tokens into one space-separated string
new_df['tags'] = new_df['tags'].map(" ".join)

In [21]:
# Normalise the tag strings to lower case via the vectorised string accessor
new_df['tags'] = new_df['tags'].str.lower()
new_df.head()

In [22]:
# Vectorization: bag-of-words counter limited to 5000 features, with
# scikit-learn's built-in English stop-word list removed
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 5000,stop_words= 'english')

In [23]:
# Convert the tags into numerical vectors.
# fit_transform learns the vocabulary and returns a sparse count matrix;
# .toarray() turns it into a dense array with one row per row of new_df.
# NOTE(review): the dense array can be large — confirm memory is acceptable.
vectors = cv.fit_transform(new_df['tags']).toarray()

In [24]:
# Get the feature names, i.e. the vocabulary terms learned by the vectorizer
cv.get_feature_names_out()

In [25]:
# Calculate cosine similarity between the vectors.
# Called with a single matrix, cosine_similarity compares every row with every
# other row, so similarity[i][j] is the score between rows i and j of `vectors`.
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

In [26]:
# Create a recommend function to fetch similar movies
def recommend(movie, top_n=5, data=None, sim=None):
    """Print the titles of the movies most similar to `movie`.

    Parameters
    ----------
    movie : str
        Exact title to look up in the 'title' column.
    top_n : int, default 5
        Number of recommendations to print.
    data : pandas.DataFrame, optional
        Frame with a 'title' column; defaults to the notebook-level `new_df`.
    sim : 2-D array-like, optional
        Similarity matrix whose row order matches the row order of `data`;
        defaults to the notebook-level `similarity`.

    Raises
    ------
    IndexError
        If `movie` is not present in the 'title' column.
    """
    frame = new_df if data is None else data
    matrix = similarity if sim is None else sim

    # Use row positions, not index labels: the similarity matrix is positional,
    # while the frame's labels can have gaps once rows have been dropped.
    positions = np.flatnonzero((frame['title'] == movie).to_numpy())
    if positions.size == 0:
        # IndexError is kept (the original lookup raised it too), now with a
        # message that names the missing title.
        raise IndexError(f"Movie {movie!r} not found in the dataset")
    movie_pos = int(positions[0])

    scores = matrix[movie_pos]
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie explicitly instead of assuming it sorts first;
    # another row with an identical vector could otherwise take its place.
    picks = [pos for pos, _ in ranked if pos != movie_pos][:top_n]

    for pos in picks:
        print(frame['title'].iloc[pos])

In [27]:
# Example usage: prints the titles recommend ranks as most similar to
# 'Inception' (the title must match an entry in new_df['title'] exactly)
recommend('Inception')