In [1]:
# This notebook produces two important files:
# 1. The processed movie data, in the required form, built from the data downloaded from the tmdb website (around 5000 movies)
# 2. A similarity matrix that holds the cosine similarity of each movie with every other movie
# Both files are used by our Streamlit web application.
# Using the cosine similarity matrix, we find the movies closest to the input movie by sorting all other movies in
# descending order of their cosine similarity with the input movie, and then recommend the ones with the highest
# cosine similarity values.

In [2]:
# Importing Libraries
import numpy as np
import pandas as pd
import nltk  # To apply stemming to the tags
from sklearn.feature_extraction.text import CountVectorizer # We will be Using this class for vectorization of tags(texual data)

In [3]:
# Load the two tmdb CSV files. Both paths are relative, so the files are looked up
# in the notebook's current working directory.
movies=pd.read_csv('tmdb_5000_movies.csv')
credits=pd.read_csv('tmdb_5000_credits.csv')

In [4]:
# Join the credits columns onto the movies frame using the shared 'title' column,
# and rebind `movies` to the merged result.
# NOTE(review): this assumes titles are unique in both files; any repeated title
# would produce extra rows in the merge -- confirm, or join on the movie id instead.
movies = movies.merge(credits,on='title')

In [5]:
# Columns we keep: they are used to build the tag of each movie and to produce the recommendations.
# movie_id
# title
# overview
# genres
# keywords
# cast
# crew
selected_col=['movie_id','title','overview','genres','keywords','cast','crew']

In [6]:
# Keep only the selected columns. Reusing `selected_col` avoids a second copy of the
# column list drifting out of sync, and `.copy()` yields an independent frame so that
# later column assignments do not trigger SettingWithCopyWarning.
movies = movies[selected_col].copy()

In [8]:
# Replace any missing value in the selected columns with the empty string.
# A single fillna call returns a new frame, so nothing is written column by column
# into a frame that was obtained by column selection (which is what triggers
# SettingWithCopyWarning).
movies = movies.fillna({feature: '' for feature in selected_col})

In [9]:
# Look at the raw genres value of the first row
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [10]:
# As we can see genres is in the form of string so we will convert it into the format like 
# ['Action','Adventure','Fantasy','Science Fiction']

In [11]:
import ast

In [12]:
def convert(obj):
    """Return the 'name' of every entry in a stringified list of dicts.

    Args:
        obj: String holding a list literal of dicts that each have a 'name'
            key, e.g. '[{"id": 28, "name": "Action"}]'.

    Returns:
        List of the 'name' values in their original order. An empty or
        whitespace-only string yields an empty list.
    """
    # Missing values are replaced with '' earlier in the notebook, and
    # ast.literal_eval('') raises SyntaxError, so treat blank input as "no entries".
    if not obj or not obj.strip():
        return []
    # ast.literal_eval turns the string into the list of dicts it describes
    return [entry['name'] for entry in ast.literal_eval(obj)]

In [13]:
# Turn every stringified genres list into a plain list of genre names
genre_names = movies['genres'].map(convert)
movies['genres'] = genre_names

In [14]:
# The genres column is now in the required form, as shown below
movies['genres']

0       [Action, Adventure, Fantasy, Science Fiction]
1                        [Adventure, Fantasy, Action]
2                          [Action, Adventure, Crime]
3                    [Action, Crime, Drama, Thriller]
4                [Action, Adventure, Science Fiction]
                            ...                      
4804                        [Action, Crime, Thriller]
4805                                [Comedy, Romance]
4806               [Comedy, Drama, Romance, TV Movie]
4807                                               []
4808                                    [Documentary]
Name: genres, Length: 4809, dtype: object

In [15]:
# The keywords column has the same stringified form as genres, so it gets the same processing.
# (The bare `movies['keywords']` expression that used to precede this line was never
# displayed because it was not the last line of the cell, so it has been dropped.)
movies['keywords'] = movies['keywords'].apply(convert)

In [16]:
# Show the first rows to check the current state of the data
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [17]:
# Formatting the cast col
# here we will be picking only the top 4 cast of the movie 
def convertCast(obj, limit=4):
    """Return the names of the first `limit` entries of a stringified cast list.

    Args:
        obj: String holding a list literal of dicts that each have a 'name' key.
        limit: Maximum number of names to return, taken from the front of the
            list. Defaults to 4.

    Returns:
        List of at most `limit` names in their original order. An empty or
        whitespace-only string yields an empty list.
    """
    # Missing values are replaced with '' earlier in the notebook, and
    # ast.literal_eval('') raises SyntaxError, so treat blank input as "no cast".
    if not obj or not obj.strip():
        return []
    # Slicing before reading 'name' means entries past the limit are never touched
    return [member['name'] for member in ast.literal_eval(obj)[:limit]]

In [18]:
# Reduce every stringified cast list to the names of its leading cast members
leading_cast = movies['cast'].map(convertCast)
movies['cast'] = leading_cast

In [19]:
# Check the cast column after the conversion
movies['cast']

0       [Sam Worthington, Zoe Saldana, Sigourney Weave...
1       [Johnny Depp, Orlando Bloom, Keira Knightley, ...
2       [Daniel Craig, Christoph Waltz, Léa Seydoux, R...
3       [Christian Bale, Michael Caine, Gary Oldman, A...
4       [Taylor Kitsch, Lynn Collins, Samantha Morton,...
                              ...                        
4804    [Carlos Gallardo, Jaime de Hoyos, Peter Marqua...
4805    [Edward Burns, Kerry Bishé, Marsha Dietlein, C...
4806    [Eric Mabius, Kristin Booth, Crystal Lowe, Geo...
4807    [Daniel Henney, Eliza Coupe, Bill Paxton, Alan...
4808    [Drew Barrymore, Brian Herzlinger, Corey Feldm...
Name: cast, Length: 4809, dtype: object

In [20]:
# So we have formatted three col genres, keywords and cast now we are left with crew 
# from crew we are only going to extract director name
def fetch_director(text):
    """Return the names of all crew entries whose job is 'Director'.

    Args:
        text: String holding a list literal of dicts that each have 'job' and
            'name' keys.

    Returns:
        List of director names in their original order; empty when there is
        no director entry or when `text` is empty or whitespace-only.
    """
    # Missing values are replaced with '' earlier in the notebook, and
    # ast.literal_eval('') raises SyntaxError, so treat blank input as "no crew".
    if not text or not text.strip():
        return []
    return [member['name'] for member in ast.literal_eval(text) if member['job'] == 'Director']

In [21]:
# Keep only the director names from every stringified crew list
directors = movies['crew'].map(fetch_director)
movies['crew'] = directors

In [22]:
# Now let's have a look at our data
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]


In [23]:
# One more transformation that we need to apply to these columns is to remove the spaces inside a single entity. For example,
# Sam Worthington is the name of a single person; if we do not remove the space, Sam will become one tag and Worthington will
# become another, so to avoid this discrepancy we need to apply this transformation.

In [24]:
def removeSpace(L):
    """Return a new list with every space character stripped out of each string in `L`."""
    return [item.replace(" ", "") for item in L]

In [25]:
# Collapse the spaces inside every entry of the four list-valued columns,
# in the same order as before: cast, crew, genres, keywords
for column in ('cast', 'crew', 'genres', 'keywords'):
    movies[column] = movies[column].apply(removeSpace)

In [26]:
# overview col consists of string so we will first convert it into list then concatenate it with other 4 col and then
# convert the combined list back into the string

In [27]:
# Break every overview into its whitespace-separated words (a list), as shown below
overview_words = movies['overview'].map(lambda text: text.split())
movies['overview'] = overview_words
movies['overview']

0       [In, the, 22nd, century,, a, paraplegic, Marin...
1       [Captain, Barbossa,, long, believed, to, be, d...
2       [A, cryptic, message, from, Bond’s, past, send...
3       [Following, the, death, of, District, Attorney...
4       [John, Carter, is, a, war-weary,, former, mili...
                              ...                        
4804    [El, Mariachi, just, wants, to, play, his, gui...
4805    [A, newlywed, couple's, honeymoon, is, upended...
4806    ["Signed,, Sealed,, Delivered", introduces, a,...
4807    [When, ambitious, New, York, attorney, Sam, is...
4808    [Ever, since, the, second, grade, when, he, fi...
Name: overview, Length: 4809, dtype: object

In [28]:
# Build the tags of each movie by concatenating, row by row, the lists held in the
# columns that help in recommending movies
tags = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
movies['overview'] = movies['overview'].apply(lambda x: " ".join(x)) # Turn overview back into a string; words are re-joined with single spaces, so the original whitespace is not kept

In [29]:
# Collapse each list of tags into one space-separated string
tags = tags.map(" ".join)

In [30]:
# This is what our final tags data looks like
tags

0       In the 22nd century, a paraplegic Marine is di...
1       Captain Barbossa, long believed to be dead, ha...
2       A cryptic message from Bond’s past sends him o...
3       Following the death of District Attorney Harve...
4       John Carter is a war-weary, former military ca...
                              ...                        
4804    El Mariachi just wants to play his guitar and ...
4805    A newlywed couple's honeymoon is upended by th...
4806    "Signed, Sealed, Delivered" introduces a dedic...
4807    When ambitious New York attorney Sam is sent t...
4808    Ever since the second grade when he first saw ...
Length: 4809, dtype: object

In [31]:
from nltk.stem.porter import PorterStemmer
# Single Porter stemmer instance, used by stem() for every word
ps=PorterStemmer()

In [32]:
def stem(text):
    """Stem every whitespace-separated word of `text` with the module-level stemmer `ps`.

    Returns the stemmed words joined back together with single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())

In [33]:
# Replace every tag string by its stemmed version
tags=tags.apply(stem)

In [34]:
# Now we have tags in our hand we will be using cosine similarity to make our recommendation system

In [35]:
# Count-based vectorizer capped at 5000 features (max_features) and using the
# built-in English stop-word list (stop_words='english')
cv = CountVectorizer(max_features=5000,stop_words='english') 

In [36]:
vectors = cv.fit_transform(tags).toarray() # dense numpy array built from the sparse matrix returned by cv.fit_transform()

In [37]:
from sklearn.metrics.pairwise import cosine_similarity  # for finding similarity between movies

In [38]:
# Pairwise cosine similarity between the tag vectors of all movies
similarity = cosine_similarity(vectors)

In [39]:
# Display the similarity matrix
similarity 

array([[1.        , 0.08226127, 0.08471737, ..., 0.04543109, 0.        ,
        0.        ],
       [0.08226127, 1.        , 0.05884899, ..., 0.02366905, 0.        ,
        0.02548236],
       [0.08471737, 0.05884899, 1.        , ..., 0.02437575, 0.        ,
        0.        ],
       ...,
       [0.04543109, 0.02366905, 0.02437575, ..., 1.        , 0.0404226 ,
        0.04222003],
       [0.        , 0.        , 0.        , ..., 0.0404226 , 1.        ,
        0.08703883],
       [0.        , 0.02548236, 0.        , ..., 0.04222003, 0.08703883,
        1.        ]])

In [40]:
# This function will sort movies as per similarity with the given movie
def provideClosestMovies(index):
    """Rank all movies by their similarity to the movie at row `index`.

    Returns a list of (position, score) tuples built from row `index` of the
    module-level `similarity` matrix, ordered from highest to lowest score.
    """
    scored = list(enumerate(similarity[index]))
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored

In [43]:
# This is the main function that recommends movies: it uses the similarity matrix and the sorting done by the
# function above.

def recommend(movie, top_n=7):
    """Print the titles of the movies most similar to `movie`.

    Args:
        movie: Exact title to look up in the module-level `movies` frame. If
            several rows share the title, the first one is used.
        top_n: Number of recommendations to print. Defaults to 7, the number
            the original fixed slice produced.

    Raises:
        IndexError: If no row of `movies` has this title.
    """
    # Work with row positions throughout: the rows of `similarity` and
    # `movies.iloc` are both positional, so a label taken from `.index` would
    # only be correct while the frame keeps its default 0..n-1 index.
    positions = np.flatnonzero((movies['title'] == movie).to_numpy())
    if positions.size == 0:
        raise IndexError(f"movie title not found: {movie!r}")
    index = int(positions[0])
    # Drop the queried movie explicitly instead of assuming it is ranked first;
    # another row with an equal score could otherwise take its place.
    closest = [pos for pos, _ in provideClosestMovies(index) if pos != index]
    for pos in closest[:top_n]:
        print(movies.iloc[pos].title)

In [44]:
recommend('Superman Returns')
# One example of how our function works.
# NOTE(review): the saved output below lists nine titles, while recommend() prints
# seven; the output looks stale -- re-run this cell to refresh it.

Superman II
Superman III
Superman IV: The Quest for Peace
Superman
The Wolverine
Iron Man 2
Man of Steel
The Beastmaster
Krrish


In [45]:
# for using these two files in our final streamlit app
import pickle

In [46]:
# Write through a context manager so the file is flushed and closed once the dump is done
with open('movies_dict.pkl','wb') as movies_file:
    pickle.dump(movies.to_dict(), movies_file)

In [47]:
# Write through a context manager so the file is flushed and closed once the dump is done
with open('similarity.pkl','wb') as similarity_file:
    pickle.dump(similarity, similarity_file)