In [151]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

In [152]:
# Load the three input tables from CSV files in the current working directory.
# ratings: consumed below via its 'movieId', 'userId' and 'rating' columns.
ratings = pd.read_csv('ratings.csv')
# movies: item metadata ('movieId', 'title', 'genres', 'overview', 'poster_path', ...).
# NOTE(review): the displayed frame also carries 'Unnamed: 0' / 'Unnamed: 0.1'
# columns - presumably an index written out by an earlier to_csv(); confirm.
movies = pd.read_csv('movies.csv')
# tags: loaded here but not referenced by any of the cells visible in this notebook section.
tags = pd.read_csv('tags.csv')

In [153]:
# Normalize the genre separator from '|' to ','.
# The vectorized .str accessor keeps missing genres as NaN, whereas calling
# x.replace() inside apply() raises AttributeError on a float NaN.
# regex=False is required: '|' is the regex alternation operator, so a regex
# replacement would match the empty string instead of the literal pipe.
movies['genres'] = movies['genres'].str.replace('|', ',', regex=False)

In [154]:
# Inspect the movies table after the genre separator has been normalized
# (the notebook front-end truncates the rendering of long frames).
movies

Unnamed: 0.1,Unnamed: 0,movieId,title,genres,imdbId,tmdbId,g2,overview,poster_path,tagline
0,0,1,Toy Story (1995),"Adventure,Animation,Children,Comedy,Fantasy",114709,862.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...",/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,
1,1,2,Jumanji (1995),"Adventure,Children,Fantasy",113497,8844.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,Roll the dice and unleash the excitement!
2,2,3,Grumpier Old Men (1995),"Comedy,Romance",113228,15602.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,Still Yelling. Still Fighting. Still Ready for...
3,3,4,Waiting to Exhale (1995),"Comedy,Drama,Romance",114885,31357.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...",/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,Friends are the people who let you be yourself...
4,4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,/e64sOI48hQXyru7naBFyssKFxVd.jpg,Just When His World Is Back To Normal... He's ...
...,...,...,...,...,...,...,...,...,...,...
9550,9550,175705,Themroc (1973),"Comedy,Horror",69369,7014.0,"[{'id': 35, 'name': 'Comedy'}]","Made without proper language, just gibberish a...",/8dAv4QXGhMsJ6rGuDgW9f5nWtVk.jpg,
9551,9551,175707,A German Life (2016),Documentary,5135434,411516.0,"[{'id': 99, 'name': 'Documentary'}]","Brunhilde Pomsel describes herself as an ""apol...",/632Or4YErBU2LIwqeAmG8TUDR9d.jpg,
9552,9552,175743,Self-criticism of a Bourgeois Dog (2017),Comedy,6354108,433410.0,"[{'id': 35, 'name': 'Comedy'}]",A bourgeois dog confesses how he was transform...,/b9tECZJXzPTCepGulfuA5uGEuIo.jpg,
9553,9553,175781,Der Herr Karl (1961),Comedy,273646,28469.0,"[{'id': 35, 'name': 'Comedy'}]",No overview found.,/hUeS1PrPH3Um9idkP42SqibNY5Z.jpg,


In [141]:
'''
<============== top rated recommendations ==============>
'''



In [142]:
def weighted_rating(v,m,R,C):
    '''
    Blend an item's own mean rating with the dataset-wide mean rating.

    The more votes an item has relative to m, the closer the result is to R;
    items with few votes are pulled towards C.

    v -> number of votes received by the item (scalar or pd.Series)
    m -> minimum votes required to be classified as popular (float)
    R -> average rating of the item (scalar or pd.Series)
    C -> average rating over the whole dataset (float)

    Returns (v / (v + m)) * R + (m / (v + m)) * C, with the same shape as v / R.
    '''
    total_votes = v + m
    item_weight = v / total_votes
    prior_weight = m / total_votes
    return (item_weight * R) + (prior_weight * C)

In [157]:
def get_highly_rated(rating_df, item_df, percentile=70, top_n=10):
    '''
    Return the items with the highest weighted rating.

    rating_df  -> ratings with 'movieId', 'userId' and 'rating' columns (pd.DataFrame)
    item_df    -> item metadata with 'movieId', 'title', 'genres', 'overview'
                  and 'poster_path' columns (pd.DataFrame)
    percentile -> percentile of the per-item vote counts used as the minimum
                  number of votes an item needs to be ranked (float, default 70)
    top_n      -> maximum number of rows to return (int, default 10)

    Returns a pd.DataFrame with the columns 'movieId', 'title', 'genres',
    'overview', 'poster_path', 'vote_count', 'avg_rating' and
    'weighted_rating', sorted by 'weighted_rating' in descending order.
    '''

    # pre processing: one row per movie with its vote count and mean rating
    vote_count = (
        rating_df
        .groupby('movieId',as_index=False)
        .agg( {'userId':'count', 'rating':'mean'} )
        )
    vote_count.columns = ['movieId', 'vote_count', 'avg_rating']

    # calculate input parameters
    C = vote_count['avg_rating'].mean()
    # np.percentile cannot reduce an empty array, so fall back to a threshold of 0
    m = np.percentile(vote_count['vote_count'], percentile) if len(vote_count) else 0.0

    # .copy() makes the filtered frame independent of vote_count, so adding a
    # column below does not trigger pandas' SettingWithCopyWarning
    popular = vote_count.loc[vote_count['vote_count'] >= m].copy()
    popular['weighted_rating'] = weighted_rating(
        popular['vote_count'], m, popular['avg_rating'], C
    )

    # post processing: attach the metadata, keep the presentation columns, rank
    popular = popular.merge(item_df, on = ['movieId'], how = 'left')
    popular_items = popular.loc[:,['movieId','title', 'genres', 'overview','poster_path','vote_count', 'avg_rating', 'weighted_rating']]
    popular_items = popular_items.sort_values('weighted_rating',ascending=False)
    return popular_items.head(top_n)

In [158]:
# Show the highest-rated movies according to the weighted rating.
get_highly_rated(rating_df=ratings, item_df=movies)

Unnamed: 0,movieId,title,genres,overview,poster_path,vote_count,avg_rating,weighted_rating
183,318,"Shawshank Redemption, The (1994)","Crime,Drama",Framed in the 1940s for the double murder of h...,/9O7gLzmreU0nGkIB6K3BsJbzvNv.jpg,317,4.429022,4.403818
408,858,"Godfather, The (1972)","Crime,Drama","Spanning the years 1945 to 1955, a chronicle o...",/rPdtLWNsZmAtoZl9PK7S2wE3qiS.jpg,192,4.289062,4.25295
1276,2959,Fight Club (1999),"Action,Crime,Drama,Thriller",A ticking-time-bomb insomniac and a slippery s...,/adw6Lq9FiC9zjYEpOqfq03ituwp.jpg,218,4.272936,4.241498
569,1221,"Godfather: Part II, The (1974)","Crime,Drama",In the continuing saga of the Corleone crime f...,/bVq65huQ8vHDd1a4Z37QtuyEvpA.jpg,129,4.25969,4.208361
42,50,"Usual Suspects, The (1995)","Crime,Mystery,Thriller","Held in an L.A. interrogation room, Verbal Kin...",/jgJoRWltoS17nD5MAQ1yK2Ztefw.jpg,204,4.237745,4.205389
152,260,Star Wars: Episode IV - A New Hope (1977),"Action,Adventure,Sci-Fi",Princess Leia is captured and held hostage by ...,/btTdmkgIvOi0FFip1sPuZI2oQG6.jpg,251,4.231076,4.204795
374,750,Dr. Strangelove or: How I Learned to Stop Worr...,"Comedy,War",Insane General Jack D. Ripper initiates a nucl...,/tviJ68Wj4glQk3CPMvdvExYHxX.jpg,97,4.268041,4.200357
561,1213,Goodfellas (1990),"Crime,Drama","The true story of Henry Hill, a half-Irish, ha...",/hAPeXBdGDGmXRPj4OZZ0poH65Iu.jpg,126,4.25,4.198024
294,527,Schindler's List (1993),"Drama,War",The true story of how businessman Oskar Schind...,/yPisjyLweCl1tbgwgtzBCNCBle.jpg,220,4.225,4.195318
2481,58559,"Dark Knight, The (2008)","Action,Crime,Drama,IMAX",Batman raises the stakes in his war on crime. ...,/1hRoyzDtpgMU7Dz4JF22RANzQO7.jpg,149,4.238255,4.194469


In [145]:
'''
<============== content based recommendations ==============>
'''



In [9]:
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
# preprocessing: keep only the movies whose movieId occurs in the ratings table;
# .copy() gives an independent frame
ratedMovies = movies[movies['movieId'].isin(ratings['movieId'])].copy()

In [11]:
# extract the genre: one column per position in the comma-separated list;
# rows with fewer genres are padded with missing values
genre = ratedMovies['genres'].str.split(",", expand=True)

# get all possible genre (lower-cased, surrounding whitespace removed)
all_genre = set()
for c in genre.columns:
    # dropna() removes the padding up front, whether pandas represents it as
    # None or NaN, so no missing-value marker ever enters the set
    distinct_genre = genre[c].dropna().str.lower().str.strip().unique()
    all_genre.update(distinct_genre)
# the placeholder is not a real genre; discard() does not raise KeyError when
# the placeholder is absent from the data
all_genre.discard('(no genres listed)')

In [12]:
# create item-genre matrix
item_genre_mat = ratedMovies[['movieId', 'genres']].copy()
item_genre_mat['genres'] = item_genre_mat['genres'].str.lower().str.strip()

# OHE the genres column
# - sorted(): a set has no stable iteration order, sorting fixes the column order
# - regex=False: match the genre name literally instead of as a regex pattern
# - na=False: a missing genres value means "has no genre"; without it
#   str.contains returns NaN, which np.where treats as truthy
# - genre_name: avoids rebinding the `genre` DataFrame built in the previous cell
for genre_name in sorted(all_genre):
    item_genre_mat[genre_name] = (
        item_genre_mat['genres']
        .str.contains(genre_name, regex=False, na=False)
        .astype(int)
    )
item_genre_mat = item_genre_mat.drop(['genres'], axis=1)
item_genre_mat = item_genre_mat.set_index('movieId')

# compute similarity matrix: pairwise cosine similarity between the genre
# vectors of the rows (movies) of item_genre_mat
corr_mat = cosine_similarity(item_genre_mat)

In [13]:
def top_k_items(item_id, top_k, corr_mat, map_name):
    '''
    Return the labels of the top_k items most similar to item_id.

    item_id  -> row position of the query item in corr_mat (int)
    top_k    -> number of items to return; values <= 0 give an empty list (int)
    corr_mat -> item-item similarity matrix (2-D np.ndarray)
    map_name -> mapping from row/column position to item label (dict)

    Returns a list of labels ordered from most to least similar. The query
    item itself is not filtered out, so it may be part of the result.
    '''
    # a[-0:] selects the whole array, so without this guard top_k == 0 would
    # return every item (and a negative top_k an arbitrary tail)
    if top_k <= 0:
        return []

    # argsort is ascending: take the last top_k positions and reverse them
    top_items = corr_mat[item_id,:].argsort()[-top_k:][::-1]
    return [map_name[e] for e in top_items]

In [14]:
# lookups between a row position of item_genre_mat (and hence of the similarity
# matrix computed from it) and the movieId stored in its index
ind2name = dict(enumerate(item_genre_mat.index))
name2ind = {name: ind for ind, name in ind2name.items()}

In [30]:
# movieId of the query movie for which similar items are looked up below;
# it must be a key of name2ind (i.e. a movie present in item_genre_mat)
movieId = 110102

In [31]:
# get the 10 movies whose genre vector is most similar to the query movie
similar_items = top_k_items(name2ind[movieId],
                            top_k = 10,
                            corr_mat = corr_mat,
                            map_name = ind2name)
# Drop the query movie itself from the recommendations. list.remove() raises
# ValueError when the query is not among the returned ids, which can happen
# when more than top_k movies tie at the highest similarity, so filter instead.
# The list holds top_k - 1 ids when the query was present, top_k otherwise.
similar_items = [item for item in similar_items if item != movieId]

In [32]:
# movieIds of the recommended movies, most similar first
similar_items

[5378, 106487, 46530, 102445, 102880, 101864, 68358, 103228, 72998]

In [33]:
# Show the metadata of the recommended movies.
# NOTE: filtering with isin() keeps the row order of `movies`, so the rows are
# not listed in the similarity order of `similar_items`.
display(movies[movies['movieId'].isin(similar_items)])

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
3832,5378,Star Wars: Episode II - Attack of the Clones (...,"Action,Adventure,Sci-Fi,IMAX",121765,1894.0
6238,46530,Superman Returns (2006),"Action,Adventure,Sci-Fi,IMAX",348150,1452.0
7018,68358,Star Trek (2009),"Action,Adventure,Sci-Fi,IMAX",796366,13475.0
7212,72998,Avatar (2009),"Action,Adventure,Sci-Fi,IMAX",499549,19995.0
8137,101864,Oblivion (2013),"Action,Adventure,Sci-Fi,IMAX",1483013,75612.0
8159,102445,Star Trek Into Darkness (2013),"Action,Adventure,Sci-Fi,IMAX",1408101,54138.0
8178,102880,After Earth (2013),"Action,Adventure,Sci-Fi,IMAX",1815862,82700.0
8194,103228,Pacific Rim (2013),"Action,Adventure,Sci-Fi,IMAX",1663662,68726.0
8295,106487,The Hunger Games: Catching Fire (2013),"Action,Adventure,Sci-Fi,IMAX",1951264,101299.0
