In [None]:
import numpy as np
import pandas as pd
import ast
import nltk
import random

In [None]:
# Read the TMDB 5000 movies and credits CSV files from the working directory.
movies, credits = map(pd.read_csv, ("tmdb_5000_movies.csv", "tmdb_5000_credits.csv"))

In [None]:
# Print a summary (columns, dtypes, non-null counts) of each raw frame.
movies.info()
credits.info()

In [None]:
# Preprocessing
# Reshape the raw data into the form the recommender needs.

# Columns we keep:
# genres, title, id, keywords, overview, cast, crew

# <----merging the two data sets and keeping relevant columns---->
# NOTE(review): the join key is "title", so two different movies sharing a title
# are cross-matched; joining on the movie id would be stricter - confirm before changing.
movies=movies.merge(credits, on="title")
# .copy() gives new_movies its own data, so the column assignments in later cells
# do not write into a slice of `movies` (avoids SettingWithCopyWarning).
new_movies=movies[["genres","title", "id", "keywords", "overview", "cast", "crew"]].copy()

In [None]:
# Show the first row of the trimmed frame.
new_movies.head(1)

In [None]:
# Count the missing values in every column.
# Rows containing missing values are dropped in the next cell,
# after which the same count is run again to confirm none remain.

new_movies.isna().sum()

In [None]:
# Drop rows with missing values, then renumber the index 0..n-1 so that index
# labels keep matching row positions (later cells mix label and positional lookups).
new_movies=new_movies.dropna().reset_index(drop=True)

In [None]:
# Re-check the per-column missing-value counts after dropping rows.
new_movies.isna().sum()

In [None]:
# checking if there are any duplicated rows
new_movies.duplicated().sum()

# no duplicate data so continue

In [None]:
# now we are going to convert genres, keywords, cast and crew to the required form
# genres and keywords each hold a list of dictionaries, and we only need the names of the genres and keywords
new_movies.iloc[0].genres


# the value shown above is a string, so it has to be parsed into a list of dictionaries first

In [None]:
def convert_dictionary_to_names(obj):
    """Return the 'name' of every dictionary encoded in the string ``obj``.

    ``obj`` is the text form of a list of dictionaries; ``ast.literal_eval``
    parses it back into Python objects before the names are collected.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

In [None]:
# genres column before applying convert_dictionary_to_names
new_movies['genres']

In [None]:
# keywords column before applying convert_dictionary_to_names
new_movies['keywords']

In [None]:
# Replace the encoded dictionaries in both columns with plain lists of names.
new_movies['genres'] = new_movies['genres'].map(convert_dictionary_to_names)
new_movies['keywords'] = new_movies['keywords'].map(convert_dictionary_to_names)

In [None]:
# genres column after applying convert_dictionary_to_names
new_movies['genres']

In [None]:
# keywords column after applying convert_dictionary_to_names
new_movies['keywords']

In [None]:
# Raw cast value of the first row; positional .iloc[0] works even when the row
# labelled 0 is no longer present after dropna().
new_movies['cast'].iloc[0]

# from this we only require the names of the first 3 actors/actresses as they are the most relevant

In [None]:
# gives back names of top 3 cast members
def convert_cast(obj, top_n=3):
    """Return the names of the first ``top_n`` cast members encoded in ``obj``.

    Parameters
    ----------
    obj : str
        Text form of a list of dictionaries, each carrying a 'name' key.
        It is parsed with ``ast.literal_eval``.
    top_n : int, optional
        Maximum number of names to return (default 3). Values below zero
        are treated as zero.

    Returns
    -------
    list
        Up to ``top_n`` names, in the order they appear in ``obj``.
    """
    cast_members = ast.literal_eval(obj)
    # Only the leading entries are touched, so later entries are never inspected.
    return [member['name'] for member in cast_members[:max(top_n, 0)]]

In [None]:
# Keep only the leading cast names for every movie.
new_movies['cast'] = new_movies['cast'].map(convert_cast)

In [None]:
# Converted cast of the first row (positional lookup, independent of index labels).
new_movies['cast'].iloc[0]

In [None]:
# Raw crew value of the first row (positional lookup, independent of index labels).
new_movies['crew'].iloc[0]

# we only require the name of the DIRECTOR

In [None]:
# gives back the name of the director
def convert_crew(obj):
    """Return a one-element list with the first director's name, or an empty list.

    ``obj`` is the text form of a list of crew dictionaries; it is parsed with
    ``ast.literal_eval`` and scanned in order for an entry whose 'job' is 'Director'.
    """
    for member in ast.literal_eval(obj):
        if member['job'] != 'Director':
            continue
        # Stop at the first director found.
        return [member['name']]
    return []

In [None]:
# Reduce every crew entry to the director's name (as a list).
new_movies['crew'] = new_movies['crew'].map(convert_crew)

In [None]:
# Converted crew of the first row (positional lookup, independent of index labels).
new_movies['crew'].iloc[0]

In [None]:
# Overview text of the first row (positional lookup, independent of index labels).
new_movies['overview'].iloc[0]

In [None]:
# initially the overview is a string, we convert it into a list of words
# split() without an argument breaks on any run of whitespace, so double spaces
# do not produce empty entries and newlines/tabs do not stay inside a word
new_movies['overview'] = new_movies['overview'].apply(lambda text: text.split())

In [None]:
# Preview: genres, keywords, overview, cast and crew have all been converted to lists.
new_movies.head()

In [None]:
# Strip the spaces inside every element of genres, keywords, overview, cast and crew,
# for example "Zoe Saldana" becomes "ZoeSaldana".
# Without this, a bare "Chris" tag would be shared by Chris Evans and Chris Pratt;
# as "ChrisEvans" and "ChrisPratt" they are completely different tags.


for _column in ('genres', 'keywords', 'overview', 'cast', 'crew'):
    new_movies[_column] = new_movies[_column].map(
        lambda items: [item.replace(" ", "") for item in items]
    )

In [None]:
# Preview after removing the spaces inside each list element.
new_movies.head()

In [None]:
# Build the tags column by concatenating the per-movie lists, in this order:
# keywords, genres, overview, cast, crew.
new_movies['tags'] = (
    new_movies['keywords']
    + new_movies['genres']
    + new_movies['overview']
    + new_movies['cast']
    + new_movies['crew']
)

In [None]:
# Preview with the new tags column.
new_movies.head()

In [None]:
# Turn each tags list into one space-separated string.

new_movies['tags'] = new_movies['tags'].map(" ".join)


In [None]:
# Lower-case the tags so the same word in different cases is treated as one token.

new_movies['tags'] = new_movies['tags'].str.lower()

In [None]:
# Tags string of the first row (positional lookup, independent of index labels).
new_movies['tags'].iloc[0]

In [None]:
# PorterStemmer is used to perform stemming over the tags.
# Stemming reduces words to their root form, in order to avoid counting words
# that represent the same thing as different tags.
# For example love, loved, loving and loves are different words; after stemming
# they are all converted to love.

from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [None]:
def stemi(text):
    """Return ``text`` with every whitespace-separated word replaced by its stem.

    Each word is stemmed with the module-level ``ps`` stemmer and the results
    are joined back together with single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())

In [None]:
# Stem every word of every tags string.
new_movies['tags'] = new_movies['tags'].map(stemi)

In [None]:
# Stemmed tags of the first row (positional lookup, independent of index labels).
new_movies['tags'].iloc[0]

In [None]:
# Preview with the stemmed tags column.
new_movies.head()

In [None]:
# Final frame the model works on: only title, id and tags are kept.
final_movies = new_movies.loc[:, ['title', 'id', 'tags']]

In [None]:
# Preview of the final title / id / tags frame.
final_movies.head()

In [None]:
# sklearn class used to vectorize the tags.
# It keeps a fixed number of words: the ones that appear most frequently across all tags.
# Stop words such as is, am, the etc. are not included.
# Based on the kept words, every movie is converted into a vector with that many elements (here we take 5000 words).
# NOTE(review): the tags were stemmed in an earlier cell, so stop words whose stem
# differs from the original word may slip past the 'english' stop list - confirm.

from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000, stop_words='english')

In [None]:
# Learn the vocabulary from the tags and build a dense word-count array
# (one row per movie, one column per kept word).
vectors = cv.fit_transform(final_movies['tags']).toarray()

In [None]:
# (number of movies, number of vocabulary words)
vectors.shape

In [None]:
# Words kept in the vectorizer's vocabulary.
cv.get_feature_names_out()

In [None]:
# this function calculates the similarity of every vector with every other vector
# this function uses the angle between two vectors to denote their similarity (the inverse of distance), rather than the Euclidean distance
# similarity ranges from 0 to 1
# the higher the number, the more similar the movies

from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between all movie vectors;
# row i holds the similarity of movie i to every movie.
similarity = cosine_similarity(vectors)

In [None]:
# Similarity scores of the movie at position 1 against every movie.
similarity[1]

In [None]:
# This function takes the movie name (case sensitive) and provide 5 similar movies
def recommend(movie, top_n=5):
    """Print the titles of the ``top_n`` movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact (case-sensitive) title to look up in ``final_movies['title']``.
    top_n : int, optional
        Number of similar titles to print (default 5).

    Returns
    -------
    None
        Titles are printed, one per line. If the title is not present a
        message is printed instead.
    """
    # Rows of `similarity` are addressed by position, so look the title up by
    # position too; index labels differ from positions once rows were dropped.
    matches = np.flatnonzero((final_movies['title'] == movie).to_numpy())
    if matches.size == 0:
        print(f"Movie {movie!r} was not found in the dataset")
        return
    position = int(matches[0])

    scores = similarity[position]
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie explicitly instead of assuming it is ranked first
    # (another movie with identical tags can tie with it).
    neighbours = [idx for idx, _ in ranked if idx != position][:max(top_n, 0)]
    for idx in neighbours:
        print(final_movies['title'].iloc[idx])


In [None]:
# This function takes an array of all movie titles as argument and provides 5 movies at random
def get_random(movies, k=5):
    """Return up to ``k`` distinct entries picked at random from ``movies``.

    Parameters
    ----------
    movies : iterable
        Collection of movie titles (list, NumPy array, ...).
    k : int, optional
        Number of titles wanted (default 5). When fewer than ``k`` titles are
        available, all of them are returned in random order.

    Returns
    -------
    list
        The selected titles; no position of ``movies`` is picked twice.
    """
    # random.sample needs a real sequence, so materialise the input first.
    pool = list(movies)
    # Sampling without replacement: random.choices could return the same movie twice.
    return random.sample(pool, k=max(0, min(k, len(pool))))

In [None]:
# This is the main driver code
# Interactive loop: the user asks either for random titles or for titles similar
# to a given movie, then chooses whether to go round again.

print("Welcome to the Movie recommender system!!")

to_continue=True
while to_continue:
    # strip() so that accidental leading/trailing spaces do not break the command match
    choice=input("Type Random to get 5 random movies, or type Recommend to get similar movies to your favorite movie ").strip().lower()
    print()
    if choice=="random":
        movie_titles=final_movies['title'].to_numpy()
        y=get_random(movie_titles)
        for i in y:
            print(i)

    elif choice=="recommend":
        my_movie=input("Type your favorite movie (Case Sensitive) ").strip()
        print()
        try:
            recommend(my_movie)
        except IndexError:
            # Guard against a title lookup that matches no row, so a typo
            # does not abort the whole session.
            print("Couldn't find that movie, please check the title ")
            print()

    else:
        print("Couldn't understand your command ")
        print()

    cont=input("press Y to run again, N to quit ").strip().lower()
    print()
    print()
    if cont=='n':
        to_continue=False
        print("Thank you for your time, have a nice day ")



