In [None]:
import numpy as np
import pandas as pd

## Dataset
The two datasets used in this project are the tmdb 5000 movies and the tmdb 5000 credits. 
These datasets contain movie metadata such as titles, genres, keywords, cast, and overviews. 

This data is used to construct feature vectors for each movie, which 
are then compared using cosine similarity to identify similar movies.

In [None]:
movies = pd.read_csv('tmdb_5000_movies.csv.zip')
credits = pd.read_csv('tmdb_5000_credits.csv.zip')

In [None]:
movies.head()

In [None]:
credits.head(1)

In [None]:
movies = movies.merge(credits,on='title')

In [None]:
movies.info()

## Data Cleaning

In [None]:
#choosing the columns that matter
movies = movies[['movie_id','genres','keywords','title','overview','cast','crew']]

In [None]:
movies.isnull().sum()

In [None]:
movies.dropna(inplace=True) #clearing missing data

In [None]:
movies.duplicated().sum() #no duplicate data in this dataset

In [None]:
movies.iloc[0].genres

In [None]:
#converting the list of dictionaries to a plain python list
def convert(obj):
    l = []
    for i in ast.literal_eval(obj):
        l.append(i['name'])
    return l

In [None]:
import ast

In [None]:
movies['genres']=movies['genres'].apply(convert)

In [None]:
movies.head()

In [None]:
movies['keywords']=movies['keywords'].apply(convert)

In [None]:
movies.head()

In [None]:
#creating a list of only "character" values(names) for the cast column
def convert2(obj):
    l = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            l.append(i['name'])
            counter+=1
        else:
            break
    return l

In [None]:
movies['cast']=movies['cast'].apply(convert2)

In [None]:
movies.head()

In [None]:
#fetching only directors from the crew column
def fetch_director(obj):
    l = []
    for i in ast.literal_eval(obj):
        if i['job']=='Director':
            l.append(i['name'])
            break
    return l

In [None]:
movies['crew']=movies['crew'].apply(fetch_director)

In [None]:
movies.head()

In [None]:
 movies['overview'][0]

In [None]:
movies['overview']=movies['overview'].apply(lambda x:x.split())

In [None]:
#removing whitespaces
movies['genres']=movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords']=movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast']=movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew']=movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [None]:
movies.head()

In [None]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [None]:
new_df = movies[['movie_id','title','tags']]

In [None]:
new_df

In [None]:
new_df['tags']=new_df['tags'].apply(lambda x:" ".join(x))

In [None]:
new_df.head()

In [None]:
new_df['tags'][0]

In [None]:
#converting to lowercase
new_df['tags']=new_df['tags'].apply(lambda x: x.lower())

In [None]:
new_df.head()

In [None]:
import nltk #Natural Language Toolkit

In [None]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [None]:
#reducing each word to its root form
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

## Vectorisation

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [None]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [None]:
vectors[0]

In [None]:
cv.get_feature_names_out()

In [None]:
new_df["tags"] = new_df["tags"].apply(stem)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
similarity = cosine_similarity(vectors)

In [None]:
similarity[0]

In [None]:
#final function
def recommend(movie):
    movie_index = new_df[new_df["title"]==movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]
    for i in movies_list:
        print(new_df.iloc[i[0]].title)

## Example Recommendations

In [102]:
recommend("Avatar")

Titan A.E.
Small Soldiers
Independence Day
Ender's Game
Aliens vs Predator: Requiem


In [104]:
recommend("Titanic")

Raise the Titanic
Captain Phillips
The Notebook
In the Heart of the Sea
Ghost Ship
