In [3]:
import numpy as np
import pandas as pd
import ast
import os

# Load the datasets
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Merge the datasets on the 'title' column
movies = movies.merge(credits, on='title')


In [5]:
# Keep only the necessary columns
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Drop rows with missing values
movies.dropna(inplace=True)

# Convert JSON columns to a list of names
def convert(text):
    return [i['name'] for i in ast.literal_eval(text)]

movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)

# Convert cast and crew columns to a list of names, limiting cast to top 3 actors
def convert_cast(text):
    return [i['name'] for i in ast.literal_eval(text)[:3]]

movies['cast'] = movies['cast'].apply(convert_cast)

# Extract the director's name from the crew column
def fetch_director(text):
    return [i['name'] for i in ast.literal_eval(text) if i['job'] == 'Director']

movies['crew'] = movies['crew'].apply(fetch_director)

# Remove spaces in names for consistency
def collapse(L):
    return [i.replace(" ", "") for i in L]

movies['genres'] = movies['genres'].apply(collapse)
movies['keywords'] = movies['keywords'].apply(collapse)
movies['cast'] = movies['cast'].apply(collapse)
movies['crew'] = movies['crew'].apply(collapse)

# Convert overview to a list of words
movies['overview'] = movies['overview'].apply(lambda x: x.split())

# Create a 'tags' column by combining all relevant columns
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# Drop unnecessary columns and join the 'tags' column into a single string
final_df = movies[['movie_id', 'title']]
final_df['tags'] = movies['tags'].apply(lambda x: " ".join(x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['tags'] = movies['tags'].apply(lambda x: " ".join(x))


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Convert the tags into a matrix of token counts
cv = CountVectorizer(max_features=5000, stop_words='english')
vector = cv.fit_transform(final_df['tags']).toarray()

# Compute the cosine similarity matrix
similarity = cosine_similarity(vector)


In [9]:
def recommend(movie):
    try:
        index = final_df[final_df['title'] == movie].index[0]
        distances = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda x: x[1])
        recommended_movies = [final_df.iloc[i[0]].title for i in distances[1:6]]
        return recommended_movies
    except IndexError:
        return "Movie not found!"

# Test the recommendation system
recommendations = recommend('Batman Begins')
print(recommendations)


['The Dark Knight', 'The Dark Knight Rises', 'Batman', 'Batman & Robin', 'Batman']


In [11]:
import pickle

# Save the final dataframe and similarity matrix for later use
pickle.dump(final_df, open('movie_dict.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))
