In [1]:
import numpy as np
import pandas as pd

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [5]:
## Since there are two datasets taking about the same data (Movies) we are going to merge both.

movies = movies.merge(credits, on='title')
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


### Data Preprocessing

In [6]:
## Filtering important columns
# genres
# id
# keywords
# title
# overview
# cast
# crew

movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']] 

In [7]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [8]:
## Checking for missing data

movies.isnull().sum() ## Turns out there are 3 movies which have no overview, since it's not a big number we will drop those rows

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [9]:
 ## Dropping missing values

movies.dropna(inplace=True)
movies.isnull().sum()    ## So no more missing values

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [10]:
## Checking for duplicate data
movies.duplicated().sum()  # no duplicate data

0

Creating Tags column out of the (overview, genres, keyword, cast, crew)

In [11]:
## Processing genres column
import ast

movies.iloc[0].genres # string of list of dictionaries (containing ids and names of genres)

def convert(obj):       # function to create a list of genres in this movies
    L = []
    obj = ast.literal_eval(obj)     # converting the string of list to just list so that we can loop over it!
    for i in obj:
        L.append(i['name'])
    return L


movies['genres'] = movies['genres'].apply(convert)     # applying the conversion function to the dataframe for genres column

In [12]:
## Processing keywords column

movies['keywords'] = movies['keywords'].apply(convert) # applying the same conversion function to the dataframe for keywords column

In [13]:
## Getting top three from casts column as they are most popular/ relevant for movies

def convert_cast(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            L.append(i['name'])
            counter += 1
        else:
            break
    return L

movies['cast'] = movies['cast'].apply(convert_cast)

In [14]:
## Getting director from crew column as director is most relevant from crew when recommending movies

def get_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

movies['crew'] = movies['crew'].apply(get_director)

In [15]:
## Overview column is a string we will convert it to lists to so that we can concatenate it with other columns

movies['overview'] = movies['overview'].apply(lambda x:x.split())

 Now the problem is, there is a spacing between words in colums like Sam Worthington instead of SamWorthington. This can create a problem that it will deal with Sam and Worthington as two different entities and Sam might get mixed up with other instances in this data such as Sam Mendes. This can create inaccuracies in our model so transforming these to remove spacings might help.

In [16]:
## Removing spacings between words

movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])

In [17]:
## Now we will concatenate the columns overview, genres, keywords, cast, and crew to make a list of all tags and then convert it to a paragraph. which can be used to recommend movies. 

movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# as we don't need the other columns anymore we will create a new dataframe without these
new_movies = movies[['movie_id', 'title', 'tags']]

# converting tags column to paragraph
new_movies['tags'] = new_movies['tags'].apply(lambda x: ' '.join(x))

# changing to lowercase
new_movies['tags'] = new_movies['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies['tags'] = new_movies['tags'].apply(lambda x: ' '.join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies['tags'] = new_movies['tags'].apply(lambda x: x.lower())


There are some words which have same meaning but act different like actor/actors, able/abilities/ability etc. so we will apply stemming to make them same

In [28]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [32]:
def stem(text):
    y = []

    for i in text.split():
        y.append(ps.stem(i))

    return ' '.join(y)

new_movies['tags'] = new_movies['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies['tags'] = new_movies['tags'].apply(stem)


### Vectorization
In this part we will change the tags to vectors to calculate the similarities between different tags from different movies.

In [33]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=5000, stop_words='english')

In [34]:
vectors = cv.fit_transform(new_movies['tags']).toarray()

In [36]:
from sklearn.metrics.pairwise import cosine_similarity

Calculating similarity matrix of movies

In [44]:
similarity = cosine_similarity(vectors)

[(1216, 0.28676966733820225),
 (2409, 0.26901379342448517),
 (3730, 0.2605130246476754),
 (507, 0.255608593705383),
 (539, 0.25038669783359574)]

### Recommender Function

In [49]:
def recommend(movie):
    movie_index = new_movies[new_movies['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse = True, key=lambda x: x[1])[1:6]

    for i in movies_list:
        print(new_movies.iloc[i[0]].title)


    

In [59]:
recommend('The Dark Knight')

The Dark Knight Rises
Batman Begins
Batman Returns
Batman Forever
Batman
