In [2]:
import numpy as np
import pandas as pd
import ast
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Read the movie metadata and the credits tables from CSV files located in
# the current working directory (relative paths).
movies, credits = map(pd.read_csv, ('tmdb_5000_movies.csv', 'tmdb_5000_credits.csv'))

In [4]:
# Merge datasets on the movie id instead of the title.
# Titles are not unique, so a join on 'title' pairs every same-titled movie
# with every same-titled credits row: rows get duplicated and some end up
# with another film's cast/crew. The credits copy of 'title' is dropped so
# the merged frame keeps a single 'title' column, as before.
# NOTE(review): assumes the movies CSV exposes the key as `id` and the credits
# CSV as `movie_id` (standard TMDB 5000 layout) — confirm against the headers.
movies = movies.merge(
    credits.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
)

In [5]:
# Select relevant columns: the identifier, the title and the feature columns
# that are later combined into tags.
# .copy() makes the result an independent frame, so subsequent in-place edits
# and column assignments do not act on a slice of the merged frame (which
# raises SettingWithCopyWarning).
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']].copy()

In [6]:
# Remove rows with missing values, then renumber the rows 0..n-1.
# Dropping rows leaves gaps in the index. Any code that finds a movie through
# `.index` and then uses that value as a row position in the similarity
# matrix needs labels and positions to coincide, so the index is rebuilt.
# Re-binding (instead of inplace=True) also avoids mutating a slice in place.
movies = movies.dropna().reset_index(drop=True)

In [7]:
# Helper function to extract names from JSON-like strings
def convert(text):
    """Return the 'name' value of every record in a stringified list of dicts.

    `text` is parsed with ast.literal_eval, so it must be a valid Python
    literal (a list of dicts that each contain a 'name' key). Order is kept.
    """
    return [record['name'] for record in ast.literal_eval(text)]


In [8]:
# Helper function to extract director names from crew
def fetch_director(text):
    """Return the names of all crew records whose 'job' is exactly 'Director'.

    `text` is a stringified list of dicts parsed with ast.literal_eval; every
    record must have 'job' and 'name' keys. The result is a list (possibly
    empty, possibly with several names) in the order the records appear.
    """
    return [
        member['name']
        for member in ast.literal_eval(text)
        if member['job'] == 'Director'
    ]

In [9]:
# Helper function to remove spaces from names
def collapse(L):
    """Return a new list with every space character removed from each string.

    Only the plain space " " is stripped; the input list is not modified.
    """
    return [name.replace(" ", "") for name in L]

In [10]:
# Process genres and keywords: replace each stringified record list with the
# plain list of names it contains
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(convert)

In [11]:
# Process cast: parse the records and keep at most the first three names
# listed for each movie
movies['cast'] = movies['cast'].apply(lambda raw: convert(raw)[:3])

In [12]:
# Process crew: replace each stringified crew record list with the list of
# names whose job is 'Director' (the list may be empty or hold several names)
movies['crew'] = movies['crew'].apply(fetch_director)


In [13]:
# Remove spaces from all list items so that a multi-word name becomes a
# single token (e.g. "Sam Worthington" -> "SamWorthington")
for column in ('cast', 'crew', 'genres', 'keywords'):
    movies[column] = movies[column].apply(collapse)

In [14]:
# Convert overview to a list of words by splitting on whitespace
movies['overview'] = movies['overview'].apply(str.split)

In [15]:
# Combine all features into tags: every column below holds one list per
# movie, so `+` concatenates those lists row by row, in this order
movies['tags'] = (
    movies['overview']
    + movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['crew']
)

In [16]:
# Create the final dataframe: a new frame without the individual feature
# columns, which are now folded into 'tags'
new = movies.drop(['overview', 'genres', 'keywords', 'cast', 'crew'], axis=1)

In [17]:
# Convert each tags list to one space-separated string
new['tags'] = new['tags'].apply(" ".join)

In [18]:
# Create a count vectorizer limited to 5000 features, with the built-in
# English stop-word list removed
cv = CountVectorizer(stop_words='english', max_features=5000)
# Fit on the tag strings and convert the resulting sparse matrix to a dense array
vector = cv.fit_transform(new['tags']).toarray()

In [19]:
# Calculate cosine similarity between every pair of rows of `vector`;
# similarity[i][j] is the score between row i and row j (by row position)
similarity = cosine_similarity(vector)

In [20]:
# Recommendation function
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    Args:
        movie: Exact title to look up in ``new['title']``. If several rows
            share the title, the first matching row is used.
        top_n: Number of recommendations to print (default 5).

    Raises:
        IndexError: If no row of ``new`` has the given title.
    """
    # Use the row *position*, not the index label: `similarity` is addressed
    # by position, and labels differ from positions whenever the frame's
    # index has gaps (e.g. after rows were dropped).
    matches = np.flatnonzero((new['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError("Movie {!r} not found in the movie list".format(movie))
    position = int(matches[0])

    scores = similarity[position]
    # Stable descending sort: ties keep their original row order.
    ranked = sorted(range(len(scores)), key=lambda j: scores[j], reverse=True)
    # Exclude the query row explicitly instead of assuming it sorts first;
    # a row with identical tags could otherwise push it out of slot 0.
    picks = [j for j in ranked if j != position][:top_n]

    for j in picks:
        print(new['title'].iloc[j])

In [21]:
# Test the recommendation system: print the titles recommended for 'Avatar'
recommend('Avatar')

Titan A.E.
Small Soldiers
Ender's Game
Aliens vs Predator: Requiem
Independence Day


In [23]:
# Save the model and data.
# Each file is opened in a `with` block so the handle is flushed and closed
# even if pickling fails; a bare open() passed to pickle.dump is never closed
# explicitly.
with open('movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(new, movie_list_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)