In [1]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Load datasets
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Merge datasets on the title
movies = movies.merge(credits, on='title')


In [3]:
movies = movies[['movie_id', 'title', 'overview', 'keywords', 'genres', 'cast', 'crew']]

In [4]:
movies.dropna(inplace=True)

In [5]:
# Convert JSON-like strings in genres and keywords columns into lists of genre/keyword names
def convert(obj):
    return [i['name'] for i in ast.literal_eval(obj)]

movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)


In [6]:
#Extract up to 3 main cast members.
def convert3(obj):
    L = []
    for i in ast.literal_eval(obj):
        if len(L) < 3:
            L.append(i['name'])
    return L

movies['cast'] = movies['cast'].apply(convert3)


In [7]:
# Extract the name of the director.
def fetch_director(obj):
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            return [i['name']]
    return []

movies['crew'] = movies['crew'].apply(fetch_director)


In [8]:
# Split overview into a list of words and remove spaces in all lists.
movies['overview'] = movies['overview'].apply(lambda x: x.split())
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])


In [9]:
# Combine all relevant columns (overview, genres, keywords, cast, and crew) into a single tags
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
new_df = movies[['movie_id', 'title', 'tags']]  # Create a new DataFrame with only necessary columns

In [10]:
# Save the processed data to a CSV file for deployment.

# Join the tags into a single string and convert to lowercase
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

# Save the processed data to a CSV file
new_df.to_csv('movies_data.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())


In [11]:
# Use CountVectorizer to convert the tags column into feature vectors.
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(new_df['tags']).toarray()

# Stemming
ps = PorterStemmer()

def stem(text):
    return " ".join([ps.stem(i) for i in text.split()])

new_df['tags'] = new_df['tags'].apply(stem)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [12]:
# Calculate the cosine similarity between all movies.
similarity = cosine_similarity(vectors)

In [13]:
# Instead of storing the full similarity matrix, save the top 5 recommendations for each movie.
recommendations = []

for i in range(len(new_df)):
    distances = similarity[i]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
    recommendations.append([new_df.iloc[i]['title']] + [new_df.iloc[j[0]]['title'] for j in movies_list])

# Create a DataFrame for recommendations and save it
recommendations_df = pd.DataFrame(recommendations, columns=['movie'] + [f'recommendation_{i+1}' for i in range(5)])
recommendations_df.to_csv('recommendations.csv', index=False)
