In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from operator import itemgetter
from typing import List


In [46]:
class ItemRank(object):
    """
    Rank items by the ratings they received.

    The input frame is grouped by ``df_key`` (one group per item) and the
    ``rating`` column of each group is summarised either by its arithmetic
    mean or by a Bayesian mean that shrinks the estimate towards a prior:

        bayes = (C * m + sum(ratings)) / (C + count(ratings))

    Parameters
    ----------
    dataframe : pandas.DataFrame
        One row per individual rating.
    df_key : list of str
        Column name(s) identifying an item; passed to ``DataFrame.groupby``.
    rating : str
        Name of the column holding the rating values.
    m : number, optional
        Prior mean used by the Bayesian estimate. Zero is a valid value.
    C : number, optional
        Weight of the prior, expressed as a number of pseudo-ratings.
        Zero is a valid value (the estimate then equals the plain mean).
    **args
        Accepted for backward compatibility and ignored.
    """

    def __init__(self,
                 dataframe: pd.DataFrame = None,
                 df_key: List[str] = None,
                 rating: str = None,
                 m: float = None,
                 C: float = None, **args):
        self.data = dataframe
        self.df_key = df_key
        self.rating = rating
        self.prior = m
        self.confidence = C

    @property
    def items(self):
        """
        Returns the data grouped by items.

        Raises
        ------
        TypeError
            If the instance has no dataframe or no grouping key.
        """
        if self.data is None or self.df_key is None:
            raise TypeError("ItemRank needs both a dataframe and a df_key")
        # The groupby is rebuilt on every access so it always reflects the
        # current self.data / self.df_key.
        return self.data.groupby(self.df_key)

    @property
    def _ratings(self):
        """Rating column of the grouped data (one group per item)."""
        if self.rating is None:
            raise TypeError("ItemRank needs the name of the rating column")
        return self.items[self.rating]

    def _check_bayes_params(self):
        """Raise TypeError unless both the prior (m) and its weight (C) are set."""
        # Explicit None checks: a truthiness test would reject m=0 or C=0,
        # which are legitimate values.
        if self.prior is None or self.confidence is None:
            raise TypeError("Bayesian mean must be computed with m and C")

    def get_means(self):
        """Return the arithmetic mean rating of every item."""
        return self._ratings.mean()

    def get_counts(self):
        """Return the number of non-null ratings of every item."""
        return self._ratings.count()

    def plot_mean_frequency(self):
        """
        Show a hexbin plot of mean rating against number of reviews
        (log-scaled x axis). Displays the figure and returns None.
        """
        grid = pd.DataFrame({
            'Mean Rating': self.get_means(),
            'Number of Reviews': self.get_counts()
        })
        grid.plot(x='Number of Reviews', y='Mean Rating', kind='hexbin',
                  xscale='log', cmap='YlGnBu', gridsize=12, mincnt=1,
                  title="Ratings by Simple Mean")
        plt.show()

    def bayesian_mean(self, arr):
        """
        Return the Bayesian mean of the ratings in ``arr``.

        ``arr`` must provide ``sum()`` and ``count()`` (e.g. a pandas Series).

        Raises
        ------
        TypeError
            If ``m`` or ``C`` was not supplied to the constructor.
        """
        self._check_bayes_params()

        return ((self.confidence * self.prior + arr.sum()) /
                (self.confidence + arr.count()))

    def get_bayesian_estimates(self):
        """
        Return the Bayesian mean rating of every item.

        Raises
        ------
        TypeError
            If ``m`` or ``C`` was not supplied to the constructor.
        """
        # Fail fast, before iterating over the groups.
        self._check_bayes_params()
        return self._ratings.agg(self.bayesian_mean)

    def top_items(self, n=10):
        """
        Return the ``n`` items with the highest Bayesian estimate, together
        with their plain mean ('mean') and rating count ('count').
        """
        table = pd.DataFrame({
            'mean': self.get_means(),
            'count': self.get_counts(),
            'bayes': self.get_bayesian_estimates()
        })
        return table.sort_values('bayes', ascending=False)[:n]

    def get_rank(self, rating_method='avg', ascending=True):
        """
        Return a table with 'count', 'rating' and 'rank' per item, sorted by rank.

        Parameters
        ----------
        rating_method : {'avg', 'bayes'}
            'avg' ranks by the arithmetic mean, 'bayes' by the Bayesian mean.
        ascending : bool
            Passed to ``Series.rank``. With the default ``True`` the
            lowest-rated item receives rank 1; pass ``False`` so that the
            highest-rated item receives rank 1.

        Raises
        ------
        ValueError
            If ``rating_method`` is not one of the supported names.
        """
        if rating_method == 'bayes':
            rating = self.get_bayesian_estimates()
        elif rating_method == 'avg':
            rating = self.get_means()
        else:
            # Without this branch an unknown method would only surface as an
            # UnboundLocalError further down.
            raise ValueError(
                "rating_method must be 'avg' or 'bayes', got %r" % (rating_method,))

        table = pd.DataFrame({
            'count': self.get_counts(),
            'rating': rating
        })
        table['rank'] = table['rating'].rank(ascending=ascending)
        return table.sort_values('rank')

In [3]:
# Relative path of the CSV file that is read into `df` in a later cell.
bgg = 'data/bgg.csv'

In [4]:
# Load the CSV file at `bgg` into a DataFrame.
df = pd.read_csv(bgg)

In [48]:
# Group the ratings by (game, title); m is the prior mean and C the prior
# weight used for the Bayesian estimates.
ratings = ItemRank(
    df,
    df_key=['game', 'title'],
    rating='rating',
    m=6.3,
    C=30,
)

In [None]:
# Show the 100 items with the highest Bayesian estimate.
print(ratings.top_items(n=100))
# plot_mean_frequency() displays the figure itself and returns None, so it is
# called directly; wrapping it in print() only emitted a stray "None".
ratings.plot_mean_frequency()

In [49]:
# ascending=False gives rank 1 to the item with the highest Bayesian estimate.
bayes_rank = ratings.get_rank(
    rating_method='bayes',
    ascending=False,
)

In [50]:
# Preview the first rows of the rank-sorted table (head() shows 5 by default).
bayes_rank.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,rating,rank
game,title,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
261393,Dungeon Universalis,79,8.739817,1.0
228370,TerroriXico,43,8.438356,2.0
240271,Core Space,76,8.338679,3.0
259061,Skytear,69,8.317172,4.0
245240,Goblin Grapple,88,8.226271,5.0
