# Movie Recommendation System


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the two raw TMDB 5000 CSV files from the working directory.
movies = pd.read_csv('tmdb_5000_movies.csv')    # one row per movie (metadata)
credits = pd.read_csv('tmdb_5000_credits.csv')  # cast / crew per movie

In [3]:
# Show the column names of both tables, credits first, one Index per line.
print(credits.columns, movies.columns, sep='\n')

Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')
Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')


In [4]:
# Re-read the same two CSV files as the first load cell, rebinding `movies`
# and `credits` to freshly loaded frames before the cleaning steps below.
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# EDA:
1. Removing unnecessary columns 
2. Removing rows in which any field is NaN

In [5]:
# Join each movie to its credits on the movie id rather than on the title.
# Titles are not guaranteed to be unique, and a join on a non-unique column
# pairs every same-titled movie with every same-titled credits row, which
# silently adds spurious rows.
# NOTE(review): assumes credits.movie_id holds the same id as movies.id -
# confirm against the dataset description.
# The credits copy of 'title' is dropped first so the merged frame keeps a
# single, unsuffixed 'title' column for the cells below.
movies = movies.merge(
    credits.drop(columns=['title']),
    left_on='id',
    right_on='movie_id',
)

In [6]:
# Keep only the identifier, the title and the five columns that are later
# combined into the 'tags' text.
movies = movies[
    [
        'id',
        'title',
        'genres',
        'keywords',
        'overview',
        'cast',
        'crew',
    ]
]

In [7]:
# Drop every row that has a missing value, then renumber the index 0..n-1.
# recommend() takes a row's index label and uses it as a row number into
# `similarity`, so labels must equal row positions once rows are removed.
movies = movies.dropna().reset_index(drop=True)

In [8]:
# Count fully duplicated rows (this only counts them; nothing is removed)
movies.duplicated().sum()
# The recorded output is 0, i.e. no duplicated rows


0

In [10]:
import ast

In [11]:
def convertG(obj):
    """Return the "name" value of every entry in a stringified list of dicts.

    `obj` is a string containing a Python literal; it is parsed with
    ast.literal_eval and each parsed entry is indexed with the key "name".
    The names are returned as a list, in their original order.
    """
    return [entry["name"] for entry in ast.literal_eval(obj)]

In [12]:
# Replace each raw genres string with its list of genre names.
movies['genres'] = movies['genres'].map(convertG)

In [13]:
# Replace each raw keywords string with its list of keyword names
# (same "name" extraction as used for genres).
movies['keywords'] = movies['keywords'].map(convertG)

In [14]:
def convertC(obj, limit=5):
    """Return the names of the first `limit` entries of a stringified list.

    Parameters
    ----------
    obj : str
        String containing a Python literal list of dicts, each with a
        'name' key. It is parsed with ast.literal_eval.
    limit : int or None, default 5
        Maximum number of names to return, counted from the start of the
        list. Pass None to return all names. Expected to be non-negative.

    Returns
    -------
    list
        The 'name' values of at most `limit` leading entries, in order.
    """
    entries = ast.literal_eval(obj)
    # Slicing copes with lists shorter than `limit` without a manual counter.
    return [entry['name'] for entry in entries[:limit]]

In [15]:
# Replace each raw cast string with the names of its leading cast entries.
movies['cast'] = movies['cast'].map(convertC)

In [16]:
def convertCrew(obj):
    """Return the names of crew entries whose job is Director or Producer.

    `obj` is a string containing a Python literal list of dicts, each with
    'job' and 'name' keys. Names are returned in list order; a name that
    matches more than once appears more than once.
    """
    wanted_jobs = ("Director", "Producer")
    return [
        member['name']
        for member in ast.literal_eval(obj)
        if member['job'] in wanted_jobs
    ]

In [17]:
def removeDuplicates(obj):
    """Return the distinct items of `obj` as a list, in first-seen order.

    A plain list(set(obj)) yields an arbitrary order that can differ from
    one interpreter run to the next for strings; dict keys keep insertion
    order, so the result here is deterministic.

    Items must be hashable.
    """
    return list(dict.fromkeys(obj))

In [18]:
# Per row: extract the Director/Producer names, then de-duplicate them.
movies['crew'] = movies['crew'].apply(
    lambda raw_crew: removeDuplicates(convertCrew(raw_crew))
)

In [19]:
# Turn each overview string into a list of whitespace-separated words so it
# can be concatenated with the other list-valued columns.
movies['overview'] = movies['overview'].apply(str.split)

In [21]:
# Remove the spaces inside every name so that a multi-word name becomes a
# single token (e.g. 'Sam Worthington' -> 'SamWorthington').
for column in ('cast', 'genres', 'keywords', 'crew'):
    movies[column] = movies[column].apply(
        lambda names: [name.replace(' ', '') for name in names]
    )

In [22]:
# Concatenate the five list-valued columns into one token list per movie.
movies['tags'] = (
    movies['overview']
    + movies['genres']
    + movies['cast']
    + movies['crew']
    + movies['keywords']
)

In [25]:
# The source columns now live on in 'tags'; remove them from the frame.
movies = movies.drop(
    columns=['overview', 'genres', 'cast', 'crew', 'keywords']
)

In [26]:
# Collapse each token list into one space-separated string.
movies['tags'] = movies['tags'].apply(" ".join)

In [28]:
# Lower-case every tag string.
movies['tags'] = movies['tags'].str.lower()

In [30]:
import nltk
from nltk.stem import PorterStemmer
# Shared stemmer instance; stem() calls ps.stem on every word.
ps = PorterStemmer()

In [31]:
def stem(text):
    """Return `text` with every whitespace-separated word passed through ps.stem.

    The stemmed words are joined back together with single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())


In [32]:
# Stem every word of every tag string.
movies['tags'] = movies['tags'].map(stem)

In [33]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer configuration used for the tag strings.
cv = CountVectorizer(
    stop_words='english',  # use the built-in English stop-word list
    max_features=5000,     # cap on the vocabulary size
)

In [36]:
# Fit the vectorizer on the tag strings and transform them in one step, then
# convert the result to a dense array via .toarray(): one row per movie.
vectors = cv.fit_transform(movies['tags']).toarray()

In [37]:
from sklearn.metrics.pairwise import cosine_similarity

In [38]:
# Cosine similarity computed from `vectors` alone (no second argument);
# recommend() reads one row of this result per query movie.
similarity = cosine_similarity(vectors)

In [102]:
def recommend(movie, n=9):
    """Print the titles of the `n` movies most similar to `movie`.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``movies['title']``. If several rows
        share the title, the first one is used.
    n : int, default 9
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no row of ``movies`` has the given title.
    """
    # Work with the row *position* of the match: `similarity` is addressed
    # by position, whereas movies.index holds labels that stop matching
    # positions once rows have been dropped without resetting the index.
    matches = (movies['title'] == movie).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"no movie titled {movie!r} in movies['title']")
    movie_pos = int(matches[0])

    scores = similarity[movie_pos]
    # Leave the query row out explicitly instead of assuming it sorts first
    # (another row can tie with it).
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(scores) if pos != movie_pos),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for pos, _score in ranked[:n]:
        print(movies['title'].iloc[pos])

In [106]:
# Example query: print the recommendations for one title.
recommend('Finding Nemo')

The Peanuts Movie
Bolt
Wish I Was Here
Dude Where's My Dog?
Khumba
The Rugrats Movie
Apocalypto
Howl's Moving Castle
Should've Been Romeo


# Now for Deployment:

In [107]:
import pickle

In [108]:
# Write the processed movies frame to movies.pkl. The context manager closes
# (and thereby flushes) the file as soon as the dump finishes, instead of
# leaving an open handle behind.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(movies, movies_file)

In [None]:
# Write the similarity matrix to similarity.pkl. The context manager closes
# (and thereby flushes) the file as soon as the dump finishes, instead of
# leaving an open handle behind.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)