In [1]:
import pandas as pd
import numpy as np

In [2]:
# Parse the beer comparison CSV; read_csv already yields a DataFrame.
# NOTE(review): the path is absolute and machine-specific — confirm whether it
# should be made relative/configurable before sharing this notebook.
comp_beer_csv = pd.read_csv(
    '/home/grimoire/Projects/BeerRatings/comparison_beer.csv'
)
# Re-wrap the parsed table under the name the rest of the notebook works with.
beer_comp = pd.DataFrame(data=comp_beer_csv)

In [3]:
# Print the frame's column names, non-null counts, dtypes and memory usage.
beer_comp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56857 entries, 0 to 56856
Data columns (total 7 columns):
beer_name            56857 non-null object
review_overall       56857 non-null float64
review_taste         56857 non-null float64
review_appearance    56857 non-null float64
review_palate        56857 non-null float64
review_aroma         56857 non-null float64
total_reviews        56857 non-null int64
dtypes: float64(5), int64(1), object(1)
memory usage: 3.0+ MB


In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) of the
# total_reviews column.
beer_comp['total_reviews'].describe()

count    56857.000000
mean        27.899221
std        122.167414
min          0.000000
25%          1.000000
50%          3.000000
75%          9.000000
max       3289.000000
Name: total_reviews, dtype: float64

To mitigate the effect of any accidental entries in the overall rating, each beer is scored by the average of all of its rating categories rather than by the overall rating alone. Averaging helps to smooth out issues that could arise from accidental ratings. This approach also allows for more flexibility in rating increments: instead of 0.5-point increments, the averaged rating spans the full decimal range.

In [5]:
# Per-beer mean of the five review categories, stored as a new 'averaged'
# column on beer_comp. The categories are summed in a fixed order and divided
# by their count (5).
beer_comp['averaged'] = (
    beer_comp['review_appearance']
    + beer_comp['review_aroma']
    + beer_comp['review_palate']
    + beer_comp['review_taste']
    + beer_comp['review_overall']
) / 5

In [6]:
# Global mean of the per-beer 'averaged' score, taken over every row of
# beer_comp (each beer counts once, whatever its number of reviews).
C = beer_comp['averaged'].mean()

In [7]:
# Report the global mean that weighted_rating uses as its default C.
print('The mean rating for beer is: ', C)

The mean rating for beer is:  3.6135706681114668


In [8]:
# Minimum number of reviews a beer needs to be included in the ranking.
# 3 equals the median (50%) of total_reviews reported by describe() earlier;
# presumably that is why it was chosen — confirm.
m = 3
# Filter first, then copy: only the qualifying rows are duplicated (rather
# than the whole frame), and q_ratings is an independent frame that can take
# new columns without touching beer_comp.
q_ratings = beer_comp.loc[beer_comp['total_reviews'] >= m].copy()

In [9]:
def weighted_rating(df, m=m, C=C):
    """Return the review-count-weighted rating for ``df``.

    Blends an item's own average rating (R) with the overall mean rating (C),
    weighting R by ``v / (v + m)`` and C by ``m / (m + v)``, where ``v`` is
    the item's number of reviews. Calculation based on the IMDB formula.

    Parameters
    ----------
    df
        Object indexable by ``'total_reviews'`` (v, number of ratings/votes)
        and ``'averaged'`` (R, average rating of the feature).
    m
        Minimum votes/ratings required to be listed. Defaults to the value
        the module-level ``m`` had when this function was defined.
    C
        Mean rating/vote across the whole dataframe. Defaults to the value
        the module-level ``C`` had when this function was defined.

    Returns
    -------
    The weighted rating ``(v / (v + m)) * R + (m / (m + v)) * C``.
    """
    review_count = df['total_reviews']  # v
    item_mean = df['averaged']          # R

    # Both weights share the same denominator; together they sum to 1.
    denominator = review_count + m
    own_share = review_count / denominator
    prior_share = m / denominator

    return (own_share * item_mean) + (prior_share * C)

In [10]:
# weighted_rating only indexes two columns and does element-wise arithmetic,
# so it can take the whole frame in one call instead of being applied to each
# row separately; the resulting values are the same.
q_ratings['weighted_averaged'] = weighted_rating(q_ratings)

In [11]:
# Rank the qualifying beers from highest to lowest weighted rating.
q_ratings = q_ratings.sort_values(by='weighted_averaged', ascending=False)

In [12]:
# Show only the top of the ranking rather than rendering the whole frame.
q_ratings.head(10)

Unnamed: 0,beer_name,review_overall,review_taste,review_appearance,review_palate,review_aroma,total_reviews,averaged,weighted_averaged
41268,Rare D.O.S.,4.848485,4.848485,4.469697,4.803030,4.757576,33,4.745455,4.651131
31374,M Belgian-Style Barleywine,4.750000,4.857143,4.482143,4.803571,4.785714,28,4.735714,4.627120
3109,Armand'4 Oude Geuze Lente (Spring),4.730769,4.730769,4.523077,4.669231,4.715385,65,4.673846,4.627069
39650,Pliny The Younger,4.600000,4.724590,4.482787,4.612295,4.723770,610,4.628689,4.623721
51855,Trappist Westvleteren 12,4.617925,4.718553,4.454009,4.633255,4.583333,1272,4.601415,4.599091
53223,Vanilla Bean Aged Dark Lord,4.476974,4.710526,4.450658,4.674342,4.717105,152,4.605921,4.586714
19339,Founders CBS Imperial Stout,4.591052,4.697017,4.457614,4.579278,4.558085,637,4.576609,4.572095
10757,Cantillon Blåbær Lambik,4.628205,4.628205,4.644231,4.493590,4.528846,156,4.584615,4.566294
15244,Dirty Horse,4.820513,4.743590,4.423077,4.576923,4.615385,39,4.635897,4.562874
41266,Rare Bourbon County Stout,4.544177,4.767068,4.269076,4.594378,4.658635,249,4.566667,4.555320
