In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the two TMDB CSV exports (movies and credits) from the working directory.
movies, credits = (
    pd.read_csv(csv_path)
    for csv_path in ('tmdb_5000_movies.csv', 'tmdb_5000_credits.csv')
)

In [None]:
movies.head(1)

In [None]:
credits.head(1)

In [None]:
credits.head(1)['crew'].values

In [None]:
# Join the credits columns onto the movie table, matching rows by title.
# NOTE(review): titles are not guaranteed to be unique, so films that share a
# title would be cross-matched — confirm whether an id-based key is available.
movies = pd.merge(movies, credits, on='title')

In [None]:
movies.info()

In [None]:
# Keep only the columns used to build the recommender:
# movie_id, title, overview, genres, keywords, cast, crew.
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

In [None]:
movies.isnull().sum()

In [None]:
# Drop every row that has a missing value in any of the selected columns, then
# renumber the index 0..n-1. Later cells look up rows of the positional
# similarity matrix using index labels, so gaps left by dropped rows would
# shift those lookups onto the wrong movie.
movies = movies.dropna().reset_index(drop=True)

In [None]:
movies.isnull().sum()

In [None]:
movies.duplicated().sum()

In [None]:
movies.iloc[0].genres

In [None]:
# '[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'
# convert into ['Action', 'Adventure', 'Fantasy', 'Science Fiction']

In [None]:
import ast
# ast.literal_eval
# converts the string representation of a list into an actual list

In [None]:
def convert(obj):
    """Parse a stringified list of dicts and return each entry's 'name' value.

    `obj` is a string holding a Python-literal list such as
    '[{"id": 28, "name": "Action"}]'; for that input the result is ['Action'].
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

In [None]:
# example call: the output is the list of genre names
convert('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')

In [None]:
# Replace each raw genres string with the list of genre names it contains.
movies['genres'] = movies['genres'].map(convert)

In [None]:
movies['genres']

In [None]:
# Replace each raw keywords string with the list of keyword names it contains.
movies['keywords'] = movies['keywords'].map(convert)

In [None]:
movies['keywords']

In [None]:
movies['cast'][0]

In [None]:
def convert3(obj, limit=3):
    """Return the 'name' of the first `limit` entries of a stringified list of dicts.

    Args:
        obj: String containing a Python-literal list of dicts, each with a
            'name' key.
        limit: Maximum number of names to keep, counted from the start of the
            list. Defaults to 3, the previously hard-coded value.

    Returns:
        A list with at most `limit` names, in their original order.
    """
    # Slicing replaces the manual counter/break loop and handles lists shorter
    # than `limit` without any special casing.
    return [entry['name'] for entry in ast.literal_eval(obj)[:limit]]

In [None]:
# Replace each raw cast string with the names of its first three entries.
movies['cast'] = movies['cast'].map(convert3)

In [None]:
movies['cast']

In [None]:
movies['crew'][0]

In [None]:
def fetch_Director(obj):
    """Return a one-element list with the first crew member whose job is 'Director'.

    `obj` is a string holding a Python-literal list of dicts with 'job' and
    'name' keys. An empty list is returned when no entry has job 'Director'.
    """
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            # Only the first director is kept; later ones are ignored.
            return [member['name']]
    return []

In [None]:
# Replace each raw crew string with a list holding the director's name (empty if none).
movies['crew'] = movies['crew'].map(fetch_Director)

In [None]:
movies['crew']

In [None]:
# String
movies['overview'][0]

In [None]:
# Split each overview string into a list of words so it can be concatenated
# with the other list-valued columns.
movies['overview'] = movies['overview'].map(str.split)

In [None]:
movies['overview']

In [None]:
# clean data
movies.head()

In [None]:
# Remove the spaces inside every entry ('Sam Worthington' -> 'SamWorthington')
# so that a multi-word name or phrase becomes a single token. Otherwise
# 'Sam Worthington' and 'Sam Mendes' would share the token 'Sam' and skew the
# recommendations.
for column in ('genres', 'keywords', 'cast', 'crew'):
    movies[column] = movies[column].apply(
        lambda items: [item.replace(" ", "") for item in items]
    )

In [None]:
# data without space
movies.head()

In [None]:
# Concatenate the five per-movie lists (list + list, row by row) into a single
# 'tags' list for each movie.
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] +movies['crew']

In [None]:
movies.head()

In [None]:
# Build the frame used for vectorisation. .copy() makes it independent of
# `movies`, so the later assignments to new_df['tags'] modify a real frame
# rather than a slice (which triggers pandas' SettingWithCopyWarning).
new_df = movies[['movie_id', 'title', 'tags']].copy()

In [None]:
new_df

In [None]:
# Collapse each list of tokens into one space-separated string.
new_df['tags'] = new_df['tags'].map(" ".join)

In [None]:
new_df.head()

In [None]:
new_df['tags'][0]

In [None]:
# Lower-case the tags and store the result; without the assignment the
# lower-cased Series is only displayed and then discarded.
new_df['tags'] = new_df['tags'].str.lower()

In [None]:
new_df.head()

In [None]:
# Problem: inflected forms of the same word (actor, actors) are counted as different terms.
# Solution: stemming, which reduces each word to its stem.
# ['play', 'playing', 'played'] becomes ['play', 'play', 'play'] after stemming

In [None]:
import nltk

In [None]:
from nltk.stem.porter import PorterStemmer
# Single stemmer instance, reused by stem() for every word.
ps = PorterStemmer()

In [None]:
def stem(text):
    """Stem every whitespace-separated word of `text` and re-join them with spaces.

    Uses the notebook-level PorterStemmer instance `ps`.
    """
    return " ".join(ps.stem(word) for word in text.split())

In [None]:
# Stem every word of every tags string.
new_df['tags'] = new_df['tags'].map(stem)

In [None]:
new_df['tags'][0]

In [None]:
# Vectorisation: turn each tags string into a bag-of-words count vector.
# max_features=5000 keeps only the 5000 most frequent terms; stop_words='english'
# drops common English words such as "of", "in" and "and".
# NOTE(review): the tags are stemmed before this step, so a stop word whose
# stem differs from its listed form may slip through the filter — confirm.

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

In [None]:
# Learn the vocabulary from the tags and encode every movie as a row of term
# counts; .toarray() converts the result into a dense array.
vectors = cv.fit_transform(new_df['tags']).toarray()

In [None]:
vectors

In [None]:
# Vocabulary kept by the vectorizer (at most max_features terms).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
cv.get_feature_names_out()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between all movie vectors: row i holds movie i's
# similarity to every movie (including itself).
similarity = cosine_similarity(vectors)

In [None]:
similarity[1]

In [None]:
def recommend(movie, top_n=5, data=None, scores=None):
    """Print the titles of the movies most similar to `movie`.

    Args:
        movie: Exact title to look up in the 'title' column.
        top_n: Number of recommendations to print. Defaults to 5, the
            previously hard-coded count.
        data: DataFrame with a 'title' column. Defaults to the notebook-level
            `new_df`.
        scores: Similarity matrix whose row/column order matches the row order
            of `data`. Defaults to the notebook-level `similarity`.

    Raises:
        IndexError: If no row has the requested title.
    """
    frame = new_df if data is None else data
    matrix = similarity if scores is None else scores

    # Use the row *position*, not the index label: `matrix` is ordered by
    # position, while index labels have gaps once rows were dropped, so a label
    # would select another movie's similarity row (or run past the end).
    matches = (frame['title'] == movie).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"no movie titled {movie!r} in the data")
    movie_pos = int(matches[0])

    # enumerate() keeps each score paired with its row position through the sort.
    ranked = sorted(enumerate(matrix[movie_pos]), key=lambda pair: pair[1], reverse=True)

    # Exclude the query movie by position instead of assuming it sorts first;
    # an exact duplicate can tie with it at the top score.
    neighbours = [pos for pos, _ in ranked if pos != movie_pos][:top_n]
    for pos in neighbours:
        print(frame.iloc[pos].title)

In [None]:
recommend('Batman')

In [None]:
import pickle #for sending the data from here to webpage

In [None]:
new_df.to_dict()  #send data in form of dictionary

In [None]:
# Save the movie table as a plain dict for the web page to load. The context
# manager closes the file handle, so the pickle is fully flushed to disk.
with open('movie_dict.pkl', 'wb') as movie_dict_file:
    pickle.dump(new_df.to_dict(), movie_dict_file)

In [None]:
# Save the similarity matrix. The context manager closes the file handle, so
# the pickle is fully flushed to disk.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)