# Import Libraries

In [1]:
import pandas as pd
import numpy as np

# Simple Recommender

In [2]:
#load data
data = pd.read_csv('movies_metadata.csv', low_memory=False) #low memory because different dtypes

data.sample(5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
25293,False,,0,"[{'id': 27, 'name': 'Horror'}, {'id': 35, 'nam...",http://wolfcop.com/,262840,tt2781516,en,WolfCop,It's not unusual for alcoholic cop Lou to blac...,...,2014-06-06,0.0,79.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Here Comes The Fuzz,WolfCop,False,5.2,60.0
37192,False,,0,[],,177216,tt0105667,en,Twist,The history of post-World War II popular dance...,...,1992-09-18,0.0,74.0,[],Released,,Twist,False,0.0,0.0
34146,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",http://www.sayonara-kabukicho.com/,287493,tt3876372,ja,さよなら歌舞伎町,This erotically charged drama from Japanese di...,...,2014-09-07,0.0,135.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,,Kabukicho Love Hotel,False,6.4,11.0
16028,False,,0,"[{'id': 18, 'name': 'Drama'}]",,47792,tt0088241,el,Ταξίδι στα Κύθηρα,An old communist returns to Greece after 32 ye...,...,1984-04-21,0.0,120.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Voyage to Cythera,False,5.9,10.0
3545,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 80, 'nam...",,80287,tt0095977,en,Rent-a-Cop,When call-girl Della gets caught in the middle...,...,1987-11-28,0.0,96.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"Deadlier than Dirty Harry, faster than Cobra",Rent-a-Cop,False,4.6,9.0


Ratings can be skewed given the number of voters and thus dont represent popularity so will use the following equation:

Weighted Rating (WR) = (v/(v+m) * R) + (m/(m+v) * C)
    - v = Number of Votes
    - m = minimum votes to be listed in chart
    - R = averge rating
    - C = mean vote across whole report

In [3]:
#get columns 
data.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

will need vote_vount(v) amd vote_average(R), m is subjective, but will use 90%

In [4]:
#calc mean for vote_average(C) for whole report
C = data['vote_average'].mean()
print(C)

5.618207215134185


In [5]:
#calculate m, m=90th percentile
m = data['vote_count'].quantile(0.9)
print(m)

160.0


In [6]:
#make copy
movies = data.copy()

#filter movies for vote count above 90th percentile
movies = movies.loc[movies['vote_count'] >= m]

print(f"movies: {movies.shape},\noriginal: {data.shape}")

movies: (4555, 24),
original: (45466, 24)


In [10]:
#calc weighted average rating
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

In [11]:
# Define a new feature 'score' and calculate its value with `weighted_rating()`
movies['score'] = movies.apply(weighted_rating, axis=1)

In [12]:
#Sort movies based on score calculated above
movies = movies.sort_values('score', ascending=False)

#Print the top 15 movies
movies[['title', 'vote_count', 'vote_average', 'score']].head(15)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


# Content Based Recommender

Recommend based on similarity. Need to compute pairwise cosine similarity scores based on plot description

words...so NLP comes into play

## Recommender based on Plot

In [38]:
#the data is huge, so going to take a random sample of 10,000 to start then go from there
movies_10k = data.sample(10000)
movies_10k.shape

(10000, 24)

In [8]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
movies_10k['overview'] = movies_10k['overview'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movies_10k['overview'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(10000, 35508)

In [12]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


In [16]:
cosine_sim.shape

(10000, 10000)

In [17]:
cosine_sim[0]

array([1.        , 0.        , 0.        , ..., 0.        , 0.01323476,
       0.        ])

In [18]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(movies_10k.index, index=movies_10k['title']).drop_duplicates()
# yhis will help id index of movie title

In [19]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies_10k['title'].iloc[movie_indices]


In [21]:
#generate random title to test recommender
movies_10k['title'].sample()

8102    Benji: Off the Leash!
Name: title, dtype: object

In [23]:
#test
get_recommendations('Benji: Off the Leash!')

39791                            Snowtime!
36769                            Pirosmani
4713                    The Learning Curve
25816                           Girl Happy
6283                            Battle Cry
17675                    The Last Frontier
1446                        A Chef in Love
31647                      Brooklyn Boheme
27395    A Reason to Live, a Reason to Die
20495          Starship Troopers: Invasion
Name: title, dtype: object

## Recommender based on Credits, Genres, Keywords

In [42]:
movies_10k.nunique()

adult                       3
belongs_to_collection     738
budget                    446
genres                   1633
homepage                 1737
id                       9998
imdb_id                  9996
original_language          70
original_title           9868
overview                 9758
popularity               9812
poster_path              9918
production_companies     5666
production_countries      792
release_date             6823
revenue                  1503
runtime                   246
spoken_languages          679
status                      5
tagline                  4459
title                    9787
video                       2
vote_average               90
vote_count                789
dtype: int64

In [44]:
# Load keywords and credits
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

# Remove rows with bad IDs - skipping below because I'm only working with movies_10k. [19730, 29503, 35587] these numbers are rows with bad id's drop all or some as movies_10k is random
movies_10k = movies_10k.drop(35587)

# Convert IDs to int. Required for merging
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
movies_10k['id'] = movies_10k['id'].astype('int')

# Merge keywords and credits into your main metadata dataframe
movies_10k = movies_10k.merge(credits, on='id')
movies_10k = movies_10k.merge(keywords, on='id')


In [45]:
# Parse the stringified features into their corresponding python objects
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    movies_10k[feature] = movies_10k[feature].apply(literal_eval)


In [46]:
#get director's name 

def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan


In [47]:
#returns top 3 elements or entire list whichever is longer
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []


In [48]:
# Define new director, cast, genres and keywords features that are in a suitable form.
movies_10k['director'] = movies_10k['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    movies_10k[feature] = movies_10k[feature].apply(get_list)


In [54]:
# Print the new features of the first 3 films
movies_10k[['title', 'cast', 'director', 'keywords', 'genres']].head(3)


Unnamed: 0,title,cast,director,keywords,genres
0,Christmas Oranges,"[Bruce Newbold, Edward Herrmann, Bailee Michel...",John Lyde,[christmas],[Family]
1,10th & Wolf,"[James Marsden, Brian Dennehy, Leo Rossi]",Robert Moresco,"[undercover, mafia, mobster]","[Action, Crime, Drama]"
2,Cut Snake,"[Sullivan Stapleton, Jessica De Gouw, Alex Rus...",Tony Ayres,[],[Thriller]


In [55]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''


In [56]:
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    movies_10k[feature] = movies_10k[feature].apply(clean_data)


In [57]:
#create "soup" that contains a string of all important data you want to feed to vectorizer
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])


In [58]:
# Create a new soup feature
movies_10k['soup'] = movies_10k.apply(create_soup, axis=1)
movies_10k['soup'].head()


0    christmas brucenewbold edwardherrmann baileemi...
1    undercover mafia mobster jamesmarsden brianden...
2     sullivanstapleton jessicadegouw alexrussell t...
3    ruralsetting farmer lorettayoung josephcotten ...
4     lucabarbareschi zhangjingchu carlng lucabarba...
Name: soup, dtype: object

In [59]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(movies_10k['soup'])


In [60]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)


In [61]:
# Reset index of your main DataFrame and construct reverse mapping as before
movies_10k = movies_10k.reset_index()
indices = pd.Series(movies_10k.index, index=movies_10k['title'])


In [64]:
#get random soup to test
movies_10k['title'].sample()

7262    Elvis & Nixon
Name: title, dtype: object

In [65]:
#try a recommendation using function from above
get_recommendations('Elvis & Nixon', cosine_sim2)

2391                                     Puolin ja toisin
5062                                       Primary Colors
9612                                            The Guide
2912                                   The Abduction Club
5884                      Scipione detto anche l'africano
9578                                               Return
4188                                          Army of One
5645                                       Carry On Henry
5646                                       Carry On Henry
8012    Drunk Stoned Brilliant Dead: The Story of the ...
Name: title, dtype: object