In [45]:
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [46]:
# Load the datasets
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [47]:
# Merge the movies and credits data on 'title'
movies = movies.merge(credits, on='title')

In [48]:
# Keep only the necessary columns
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew', 'popularity', 'vote_average', 'vote_count']]

In [49]:
# Drop rows with missing values
movies.dropna(inplace=True)


In [50]:
# Convert JSON-like string columns into lists
def convert(text):
    L = []
    for i in ast.literal_eval(text):
        L.append(i['name'])
    return L

In [51]:
def convertcast(text):
    L = []
    counter = 0
    for i in ast.literal_eval(text):
        if counter < 3:  # Considering top 3 cast members
            L.append(i['name'])
        counter += 1
    return L


In [52]:
def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L

In [53]:
def collapse(L):
    L1 = []
    for i in L:
        L1.append(i.replace(" ", ""))
    return L1

In [54]:
# Apply the functions to extract genres, keywords, cast, and crew
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(convertcast)
movies['crew'] = movies['crew'].apply(fetch_director)

In [55]:
# Remove spaces in names for better matching
movies['genres'] = movies['genres'].apply(collapse)
movies['keywords'] = movies['keywords'].apply(collapse)
movies['cast'] = movies['cast'].apply(collapse)
movies['crew'] = movies['crew'].apply(collapse)

In [56]:
# Process the 'overview' column and create 'tags' column
movies['overview'] = movies['overview'].apply(lambda x: x.split())
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [57]:
# Create a new dataframe with movie_id, title, and tags
new = movies[['movie_id', 'title', 'tags', 'popularity', 'vote_average', 'vote_count']]
new['tags'] = new['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['tags'] = new['tags'].apply(lambda x: " ".join(x))


In [58]:
# Calculate movie significance score
new['weighted_rating'] = (new['vote_count'] / (new['vote_count'].mean() + 1)) * new['vote_average']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['weighted_rating'] = (new['vote_count'] / (new['vote_count'].mean() + 1)) * new['vote_average']


In [59]:
# Use TfidfVectorizer instead of CountVectorizer
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
vector = tfidf.fit_transform(new['tags']).toarray()

In [60]:
# Calculate cosine similarity between movies
similarity = cosine_similarity(vector)


In [79]:
# Recommendation function considering similarity and weighted rating
def recommend(movie, num_recommendations=5):
    if movie not in new['title'].values:
        return f"'{movie}' is not found in the dataset. Please try another movie."

    index = new[new['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda x: (x[1], new.iloc[x[0]]['weighted_rating']))

    recommended_movies = []
    for i in distances[1:num_recommendations + 1]:
        movie_title = new.iloc[i[0]].title
        recommended_movies.append(movie_title)
    
    return recommended_movies


In [82]:
# Test the recommendation function
print(recommend('Spectre'))

['Quantum of Solace', 'Never Say Never Again', 'Skyfall', 'Thunderball', 'Restless']


In [81]:
# Save the processed data for the Streamlit app
pickle.dump(new, open('gen_movie_list.pkl', 'wb'))
pickle.dump(similarity, open('gen_similarity.pkl', 'wb'))