In [1]:
# Importing necessary libraries
import numpy as np 
import pandas as pd
import ast  
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem import PorterStemmer
import pickle
import os
import lzma

In [2]:
# Reading the two TMDB CSV files (movies and credits) from the local data/ folder
movies = pd.read_csv(filepath_or_buffer='data/tmdb_5000_movies.csv')
credits = pd.read_csv(filepath_or_buffer='data/tmdb_5000_credits.csv')


In [3]:
# Merging the credits into the movies table.
# Titles are not guaranteed to be unique, and merging on a non-unique key pairs every
# movie with every credits row that shares its title (wrong cast/crew, duplicated rows).
# Prefer the numeric TMDB id when both frames expose it.
# NOTE(review): assumes movies['id'] and credits['movie_id'] hold the same TMDB id — confirm.
if 'id' in movies.columns and 'movie_id' in credits.columns:
    # Drop the credits copy of 'title' so the merged frame keeps a single 'title' column
    movies = movies.merge(credits.drop(columns='title'), left_on='id', right_on='movie_id')
else:
    # Schema differs from what is expected: fall back to the title merge
    movies = movies.merge(credits, on='title')

In [4]:
# Keeping only the first 4500 rows so the saved artifacts stay small enough to share
movies = movies.iloc[:4500]

In [5]:
# Keeping only the important columns for recommendation.
# .copy() makes this an independent frame, so the column assignments in the following
# cells do not trigger pandas' SettingWithCopyWarning
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']].copy()

In [6]:
# Dropping rows with missing values.
# Reassign instead of mutating in place (movies is a column subset of the merged frame),
# and renumber the index so row labels stay contiguous and equal to row positions
movies = movies.dropna().reset_index(drop=True)

In [7]:
# Function to extract the names from the genres and keywords columns
def convert(text):
    """Return the 'name' value of every entry in a stringified list of dicts.

    ``text`` is parsed with ``ast.literal_eval``; each parsed entry must support
    ``entry['name']``. The names are returned in their original order.
    """
    return [entry['name'] for entry in ast.literal_eval(text)]


In [8]:
# Replacing the stringified genres / keywords with plain lists of names
movies['genres'] = movies['genres'].map(convert)
movies['keywords'] = movies['keywords'].map(convert)

In [9]:
# Function to get top 3 cast members
def convert_cast(text, limit=3):
    """Return the names of the first ``limit`` entries of a stringified cast list.

    Parameters
    ----------
    text : str
        Python-literal string that ``ast.literal_eval`` parses into an iterable
        of dicts, each carrying a 'name' key.
    limit : int, default 3
        Maximum number of names to return.

    Returns
    -------
    list
        At most ``limit`` names, in their original order.
    """
    names = []
    for member in ast.literal_eval(text):
        # Stop as soon as enough names are collected instead of walking the whole cast
        if len(names) >= limit:
            break
        names.append(member['name'])
    return names

In [10]:
# Reducing each cast entry to the list of names returned by convert_cast
movies['cast'] = movies['cast'].map(convert_cast)

In [11]:
# Function to get the director from crew
def fetch_director(text):
    """Return a one-element list with the first director's name, or an empty list.

    ``text`` is parsed with ``ast.literal_eval``; entries are scanned in order and the
    scan stops at the first one whose 'job' equals 'Director'.
    """
    crew_members = ast.literal_eval(text)
    # next() consumes the generator only up to the first match; [] is the no-director default
    return next(
        ([member['name']] for member in crew_members if member['job'] == 'Director'),
        [],
    )

In [12]:
# Reducing each crew entry to the (possibly empty) list returned by fetch_director
movies['crew'] = movies['crew'].map(fetch_director)

In [13]:
# Splitting each overview on whitespace so it is a list of words, like the other columns
movies['overview'] = movies['overview'].map(lambda overview_text: overview_text.split())

In [14]:
# Function to remove spaces in names
def remove_space(L):
    """Return a new list with every space character removed from each string in ``L``.

    Only the plain space ``" "`` is removed; other whitespace (tabs, newlines) is kept.
    """
    collapsed = []
    for item in L:
        collapsed.append(item.replace(" ", ""))
    return collapsed

In [15]:
# Collapsing multi-word names into single tokens in every list-valued column
movies = movies.assign(**{
    column: movies[column].map(remove_space)
    for column in ('cast', 'crew', 'genres', 'keywords')
})

In [16]:
# Building the 'tags' column: concatenate the five token lists row by row,
# then join each combined list into one lowercase, space-separated string
movies['tags'] = (
    movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
).map(lambda tokens: " ".join(tokens).lower())

In [17]:
# Initializing a PorterStemmer (NLTK); the stems() helper calls ps.stem on every word
ps = PorterStemmer()

In [18]:
# Function to apply stemming to each word in the tags
def stems(text):
    """Stem every whitespace-separated word of ``text`` with the module-level ``ps``.

    The stemmed words are joined back with single spaces, so runs of whitespace in
    the input collapse to one space in the result.
    """
    return " ".join(map(ps.stem, text.split()))

In [19]:
# Replacing every tags string with its stemmed version
movies['tags'] = movies['tags'].map(stems)

In [20]:
# Final dataset with important columns.
# The index is reset so that row labels equal row positions: the similarity matrix
# built from this frame is addressed by position, while earlier filtering (e.g. dropna)
# can leave gaps in the labels
new_df = movies[['movie_id', 'title', 'tags']].reset_index(drop=True)

In [21]:
# Turning the tags into word-count vectors: English stop words are ignored and the
# vocabulary is capped at 5000 features; the result is materialized as a dense array
cv = CountVectorizer(
    stop_words='english',
    max_features=5000,
)
vector = cv.fit_transform(new_df['tags']).toarray()

In [22]:
# Computing pairwise cosine similarity between the rows of `vector`;
# similarity[i][j] is the score between row i and row j of the vectorized tags
similarity = cosine_similarity(vector)

In [23]:
# Recommendation function based on cosine similarity
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Title to look up in ``new_df['title']`` (exact match). If several rows
        share the title, the first one is used.
    top_n : int, default 5
        Maximum number of titles to print.

    Raises
    ------
    IndexError
        If no row of ``new_df`` carries the given title.
    """
    # `similarity` is addressed by row position, so look up the row *position* of the
    # title, not its index label (labels and positions differ once rows were dropped).
    positions = [pos for pos, title in enumerate(new_df['title']) if title == movie]
    if not positions:
        raise IndexError(f"movie {movie!r} not found in new_df['title']")
    position = positions[0]

    ranked = sorted(enumerate(similarity[position]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query row explicitly: with tied scores it is not guaranteed to sort first
    neighbours = [row for row, _score in ranked if row != position][:top_n]
    for row in neighbours:
        print(new_df['title'].iloc[row])

In [24]:
# Example: print the titles recommended for 'Iron Man'
recommend('Iron Man')

Iron Man 3
Iron Man 2
Avengers: Age of Ultron
The Avengers
Captain America: Civil War


In [25]:
# Saving the necessary objects to pickle files for future use.
# Context managers make sure each file is flushed and closed right after writing
with open('movie_list.pkl', 'wb') as f:
    pickle.dump(new_df, f)

with open('similarity.pkl', 'wb') as f:
    pickle.dump(similarity, f)

In [26]:
# Writing LZMA-compressed pickles so the artifacts are small enough to share
with lzma.open('movie_list.pkl.xz', 'wb') as f:
    # Dump new_df, the frame that movie_list.pkl holds and that recommend() reads,
    # not the wider intermediate `movies` frame
    pickle.dump(new_df, f)

with lzma.open('similarity.pkl.xz', 'wb') as f:
    pickle.dump(similarity, f)

In [27]:
# Removing the uncompressed .pkl files, skipping any that do not exist
for stale_pickle in ('movie_list.pkl', 'similarity.pkl'):
    if os.path.exists(stale_pickle):
        os.remove(stale_pickle)