In [1]:
import pandas as pd
import numpy as np

### Load Data

In [2]:
movies = pd.read_csv('data/movies.csv')
credits = pd.read_csv('data/credits.csv')
ratings = pd.read_csv('data/ratings.csv')

In [3]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [4]:
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


### Popularity-Based Recommendation

Recommend movies based on ratings.

#### Calculate Weighted Rating

W = ((R * v) + (C * m)) / (v + m)

where:
- W is the weighted rating;
- R is the mean rating for the movie, from 1 to 10;
- v is the number of votes for the movie;
- m is the minimum votes required;
- C is the mean vote across all movies.

In [6]:
# here use 90 percentile of vote count as the minimum required number
m = movies['vote_count'].quantile(0.9)

In [7]:
C = movies['vote_average'].mean()

In [8]:
def weighted_rating(df, m=m ,C=C):
    R = df['vote_average']
    v = df['vote_count']
    wr = ((R * v) + (C * m)) / (v + m)
    return wr

In [9]:
movies['weighted_rating'] = movies.apply(weighted_rating, axis=1)

In [10]:
# showing top 10 movies
movies.sort_values('weighted_rating', ascending=False)[['title', 'weighted_rating']].head(10)

Unnamed: 0,title,weighted_rating
1881,The Shawshank Redemption,8.059258
662,Fight Club,7.939256
65,The Dark Knight,7.92002
3232,Pulp Fiction,7.904645
96,Inception,7.863239
3337,The Godfather,7.851236
95,Interstellar,7.809479
809,Forrest Gump,7.803188
329,The Lord of the Rings: The Return of the King,7.727243
1990,The Empire Strikes Back,7.697884


### Content-Based Recommendation

Recommend movies which are most similar to certain movies.

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
tfidf = TfidfVectorizer(stop_words = 'english')
movies['overview'] = movies['overview'].fillna("") # fill null with empty string
tfidf_matrix = tfidf.fit_transform(movies['overview']) # generate tfidf matrix

In [13]:
# showing tfidf matrix
# the coefficient is called term frequency inverse document frequency
# it indicates the frequency of the word in that movie, but also the rarity of that word
pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

Unnamed: 0,00,000,007,07am,10,100,1000,101,108,10th,...,zuckerberg,zula,zuzu,zyklon,æon,éloigne,émigré,été,única,über
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Similarity Matrix

The similarity matrix indicates the similarity of each pair of movies.

In [14]:
from sklearn.metrics.pairwise import linear_kernel

In [15]:
similarity_matrix = linear_kernel(tfidf_matrix, tfidf_matrix)

In [16]:
similarity_matrix

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.02160533, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.01488159, 0.        ,
        0.        ],
       ...,
       [0.        , 0.02160533, 0.01488159, ..., 1.        , 0.01609091,
        0.00701914],
       [0.        , 0.        , 0.        , ..., 0.01609091, 1.        ,
        0.01171696],
       [0.        , 0.        , 0.        , ..., 0.00701914, 0.01171696,
        1.        ]])

#### Get the most similar movies to a certain movie

In [17]:
def similar_movies(movie_title, n_movies):
    """
    movie_title: title of the movie you want to find similar movies to
    n_movies: number of the most similar movies you want to get
    """
    
    idx = movies.loc[movies['title']==movie_title].index[0] # get the index of the movie you want to find similar movies to
    scores = list(enumerate(similarity_matrix[idx])) # get the list of similarty coefficients
    scores = sorted(scores, key=lambda x: x[1], reverse=True) # sort the scores
    movies_indices = [m[0] for m in scores[1:n_movies+1]] # get the indices of the most similar n movies
    similar_movies = list(movies['title'].iloc[movies_indices]) #get the list of the most similar n movies
    return similar_movies

In [18]:
# get the top 5 similar movies 
similar_movies('Interstellar', 5)

['Space Pirate Captain Harlock',
 'Starship Troopers',
 'The Green Inferno',
 'Prometheus',
 'All Good Things']

### Collaborative-Based Recommendation

Recommend movies by using Singular Value Decomposition (SVD) of users'ratings.

In [19]:
ratings = ratings[['userId','movieId','rating']]

In [20]:
# number of unique users
len(list(ratings['userId'].unique()))

671

In [21]:
# number od unique movies
len(list(ratings['movieId'].unique()))

9066

In [22]:
# Look for the users who rated more than 50 movies
usercount = ratings[['userId','movieId']].groupby("userId").count()
usercount = usercount[usercount["movieId"] >= 50]
print(usercount.shape)

(427, 1)


In [23]:
# Look for the movies reviewed by more than 50 users
moviecount = ratings[['userId','movieId']].groupby("movieId").count()
moviecount = moviecount[moviecount["userId"] >= 50]
print(moviecount.shape)

(453, 1)


In [24]:
# Keep only the popular movies and active users
ratings_filtered = ratings[ratings["userId"].isin(usercount.index) & ratings["movieId"].isin(moviecount.index)]
#print(ratings)

In [25]:
# Convert records into user-movie rating score matrix
rating_matrix = ratings_filtered.pivot(index="userId", columns="movieId", values="rating").fillna(0)
matrix = rating_matrix.values

In [26]:
# Singular value decomposition
u, s, vh = np.linalg.svd(matrix, full_matrices=False)

In [27]:
# calculate the similarity
def cosine_similarity(v,u):
    return (v @ u)/ (np.linalg.norm(v) * np.linalg.norm(u))

In [28]:
vh.shape

(427, 453)

In [29]:
def recommend_movies(movie_title, n_movies):
    """
    movie_title: title of the movie 
    n_movies: number of the most similar movies 
    """
    
    idx = movies.loc[movies['title']==movie_title].index[0] # get the index of the movie 
    scores = []
    for col in range(vh.shape[1]):
        if col != idx:
            similarity = cosine_similarity(vh[:,idx], vh[:,col])
            scores.append((col, similarity)) # get the list of similarty coefficients
    scores = sorted(scores, key=lambda x: x[1], reverse=True) # sort the scores
    movies_indices = [m[0] for m in scores[:n_movies]] # get the indices of the most similar n movies
    recommend_movies = list(movies['title'].iloc[movies_indices]) #get the list of the most similar n movies
    return recommend_movies

In [30]:
# get the top 5 similar movies 
recommend_movies('Interstellar', 5)

['RoboCop', 'Cast Away', 'Pearl Harbor', 'Evan Almighty', 'Bedtime Stories']