In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

In [6]:
# Load the four CSV inputs in one pass; `ids` holds the contents of links.csv.
ratings, movies, tags, ids = map(
    pd.read_csv,
    ('ratings.csv', 'movies.csv', 'tags.csv', 'links.csv'),
)

In [102]:
# Attach the columns of `ids` to `movies`, joining on 'movieId'.
# Only columns that `movies` does not already have are merged, so re-running
# this cell does not create suffixed duplicates (e.g. imdbId_x / imdbId_y).
# NOTE(review): a re-run leaves `movies` unchanged only if 'movieId' is unique
# in `ids` - confirm against links.csv.
link_cols = ['movieId'] + [col for col in ids.columns if col not in movies.columns]
movies = movies.merge(ids[link_cols], on='movieId')

In [103]:
# Display the merged frame (movie columns plus the id columns from `ids`).
movies

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0
...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,5476944,432131.0
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,5914996,445030.0
9739,193585,Flint (2017),Drama,6397426,479308.0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,8391976,483455.0


In [104]:
def weighted_rating(v,m,R,C):
    '''
    Blend an item's own rating with a baseline rating, weighted by vote count.

    The result is (v / (v + m)) * R + (m / (v + m)) * C: the more votes an item
    has relative to m, the closer the result is to R; otherwise it is pulled
    towards C.

    v -> number of votes for the item (number or pd.Series)
    m -> vote-count weight given to the baseline rating C (number)
    R -> rating of the item (number or pd.Series)
    C -> baseline rating that low-vote items are pulled towards (number)
    '''
    total_weight = v + m
    item_share = v / total_weight
    baseline_share = m / total_weight
    return (item_share * R) + (baseline_share * C)

In [105]:
def get_highly_rated(rating_df, item_df, percentile=70, top_n=10):
    '''
    Return the highest weighted-rated movies among those with enough votes.

    Ratings are aggregated per movie, movies whose vote count is below the
    `percentile`-th percentile of all per-movie vote counts are dropped, and
    the remaining movies are scored with `weighted_rating` and ranked.

    rating_df  -> ratings with 'movieId', 'userId' and 'rating' columns (pd.DataFrame)
    item_df    -> item metadata with 'movieId' and 'genres' columns (pd.DataFrame)
    percentile -> percentile of per-movie vote counts used as the minimum
                  vote threshold m (float, default 70)
    top_n      -> number of rows to return (int, default 10)

    Returns a pd.DataFrame with the columns 'movieId', 'genres', 'vote_count',
    'avg_rating' and 'weighted_rating', sorted by 'weighted_rating' descending
    and limited to the first `top_n` rows.
    '''

    # pre processing: one row per movie with its vote count and mean rating
    # (named aggregation sets the output column names explicitly)
    vote_count = (
        rating_df
        .groupby('movieId', as_index=False)
        .agg(vote_count=('userId', 'count'), avg_rating=('rating', 'mean'))
        )

    # calculate input parameters
    # C is the mean of the per-movie mean ratings, taken over ALL movies
    # (before the vote-count filter is applied)
    C = np.mean(vote_count['avg_rating'])
    m = np.percentile(vote_count['vote_count'], percentile)

    # .copy() makes the filtered frame independent of `vote_count`, so adding
    # a column below does not trigger SettingWithCopyWarning
    popular = vote_count[vote_count['vote_count'] >= m].copy()
    popular['weighted_rating'] = weighted_rating(
        popular['vote_count'], m, popular['avg_rating'], C
        )

    # post processing: attach item metadata, keep the report columns, rank
    popular_items = (
        popular
        .merge(item_df, on='movieId', how='left')
        .loc[:, ['movieId', 'genres', 'vote_count', 'avg_rating', 'weighted_rating']]
        .sort_values('weighted_rating', ascending=False)
        .head(top_n)
        )
    return popular_items

In [106]:
# Rank the movies using the ratings table and the merged movie metadata.
get_highly_rated(rating_df=ratings, item_df=movies)

Unnamed: 0,movieId,genres,vote_count,avg_rating,weighted_rating
183,318,Crime|Drama,317,4.429022,4.403818
408,858,Crime|Drama,192,4.289062,4.25295
1276,2959,Action|Crime|Drama|Thriller,218,4.272936,4.241498
569,1221,Crime|Drama,129,4.25969,4.208361
42,50,Crime|Mystery|Thriller,204,4.237745,4.205389
152,260,Action|Adventure|Sci-Fi,251,4.231076,4.204795
374,750,Comedy|War,97,4.268041,4.200357
561,1213,Crime|Drama,126,4.25,4.198024
294,527,Drama|War,220,4.225,4.195318
2480,58559,Action|Crime|Drama|IMAX,149,4.238255,4.194469
