In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

In [2]:
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')
tags = pd.read_csv('tags.csv')
ids = pd.read_csv('links.csv')

In [3]:
movies = movies.merge(ids,left_on='movieId',right_on='movieId')

In [4]:
movies['genres'] = movies['genres'].apply(lambda x: x.replace('|',','))

In [None]:
'''
<============== top rated recommendations ==============>
'''

In [5]:
def weighted_rating(v,m,R,C):
    '''
    Calculate the weighted rating
    
    v -> average rating for each item (float)
    m -> minimum votes required to be classified as popular (float)
    R -> average rating for the item (pd.Series)
    C -> average rating for the whole dataset (pd.Series)
    '''
    return ( (v / (v + m)) * R) + ( (m / (v + m)) * C )

In [6]:
def get_highly_rated(rating_df, item_df):
    
    # pre processing
    vote_count = (
        rating_df
        .groupby('movieId',as_index=False)
        .agg( {'userId':'count', 'rating':'mean'} )
        )
    vote_count.columns = ['movieId', 'vote_count', 'avg_rating']
    
    # calcuate input parameters
    C = np.mean(vote_count['avg_rating'])
    m = np.percentile(vote_count['vote_count'], 70)
    vote_count = vote_count[vote_count['vote_count'] >= m]
    R = vote_count['avg_rating']
    v = vote_count['vote_count']
    vote_count['weighted_rating'] = weighted_rating(v,m,R,C)
    
    # post processing
    vote_count = vote_count.merge(item_df, on = ['movieId'], how = 'left')
    popular_items = vote_count.loc[:,['movieId', 'genres', 'vote_count', 'avg_rating', 'weighted_rating']]
    popular_items = popular_items.sort_values('weighted_rating',ascending=False)
    popular_items = popular_items.head(10)
    return popular_items

In [7]:
get_highly_rated(ratings,movies)

Unnamed: 0,movieId,genres,vote_count,avg_rating,weighted_rating
183,318,"Crime,Drama",317,4.429022,4.403818
408,858,"Crime,Drama",192,4.289062,4.25295
1276,2959,"Action,Crime,Drama,Thriller",218,4.272936,4.241498
569,1221,"Crime,Drama",129,4.25969,4.208361
42,50,"Crime,Mystery,Thriller",204,4.237745,4.205389
152,260,"Action,Adventure,Sci-Fi",251,4.231076,4.204795
374,750,"Comedy,War",97,4.268041,4.200357
561,1213,"Crime,Drama",126,4.25,4.198024
294,527,"Drama,War",220,4.225,4.195318
2480,58559,"Action,Crime,Drama,IMAX",149,4.238255,4.194469


In [8]:
'''
<============== content based recommendations ==============>
'''

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
# preprocessing
ratedMovies = movies.loc[movies['movieId'].isin(ratings['movieId'])].copy()

In [11]:
# extract the genre
genre = ratedMovies['genres'].str.split(",", expand=True)

# get all possible genre
all_genre = set()
for c in genre.columns:
    distinct_genre = genre[c].str.lower().str.strip().unique()
    all_genre.update(distinct_genre)
all_genre.remove(None)
all_genre.remove('(no genres listed)')

In [12]:
# create item-genre matrix
item_genre_mat = ratedMovies[['movieId', 'genres']].copy()
item_genre_mat['genres'] = item_genre_mat['genres'].str.lower().str.strip()

# OHE the genres column
for genre in all_genre:
    item_genre_mat[genre] = np.where(item_genre_mat['genres'].str.contains(genre), 1, 0)
item_genre_mat = item_genre_mat.drop(['genres'], axis=1)
item_genre_mat = item_genre_mat.set_index('movieId')

# compute similarity matix
corr_mat = cosine_similarity(item_genre_mat)

In [13]:
def top_k_items(item_id, top_k, corr_mat, map_name):
    
    # sort correlation value ascendingly and select top_k item_id
    top_items = corr_mat[item_id,:].argsort()[-top_k:][::-1] 
    top_items = [map_name[e] for e in top_items] 

    return top_items

In [14]:
# get top-k similar items
ind2name = {ind:name for ind,name in enumerate(item_genre_mat.index)}
name2ind = {v:k for k,v in ind2name.items()}

In [30]:
movieId = 110102

In [31]:
similar_items = top_k_items(name2ind[movieId],
                            top_k = 10,
                            corr_mat = corr_mat,
                            map_name = ind2name)
similar_items.remove(movieId)

In [32]:
similar_items

[5378, 106487, 46530, 102445, 102880, 101864, 68358, 103228, 72998]

In [33]:
display(movies.loc[movies['movieId'].isin(similar_items)])

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
3832,5378,Star Wars: Episode II - Attack of the Clones (...,"Action,Adventure,Sci-Fi,IMAX",121765,1894.0
6238,46530,Superman Returns (2006),"Action,Adventure,Sci-Fi,IMAX",348150,1452.0
7018,68358,Star Trek (2009),"Action,Adventure,Sci-Fi,IMAX",796366,13475.0
7212,72998,Avatar (2009),"Action,Adventure,Sci-Fi,IMAX",499549,19995.0
8137,101864,Oblivion (2013),"Action,Adventure,Sci-Fi,IMAX",1483013,75612.0
8159,102445,Star Trek Into Darkness (2013),"Action,Adventure,Sci-Fi,IMAX",1408101,54138.0
8178,102880,After Earth (2013),"Action,Adventure,Sci-Fi,IMAX",1815862,82700.0
8194,103228,Pacific Rim (2013),"Action,Adventure,Sci-Fi,IMAX",1663662,68726.0
8295,106487,The Hunger Games: Catching Fire (2013),"Action,Adventure,Sci-Fi,IMAX",1951264,101299.0
