In [1]:
# %load get_data.py
def get_data(path="/home/grimoire/Projects/BeerRatings/rating_update.csv"):
    """Load the beer review table from a CSV file.

    The default file, 'rating_update.csv', is the original 'beer_review.csv'
    data with beer_style updated to match the styles given in
    'beer_description.csv'.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file to read. Defaults to the project's
        'rating_update.csv' so existing `get_data()` calls keep working.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    # Imported locally so the function is self-contained when pulled into a
    # notebook with %load, before the notebook's own import cell has run.
    import pandas as pd

    # read_csv already returns a DataFrame; no extra wrapping is required.
    return pd.read_csv(path)


In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the full review table using the get_data helper defined in the first cell.
ratings = get_data()

In [4]:
# List the column labels available in the raw review table.
ratings.columns

Index(['brewery_id', 'brewery_name', 'review_time', 'review_overall',
       'review_aroma', 'review_appearance', 'review_profilename', 'beer_style',
       'review_palate', 'review_taste', 'beer_name', 'beer_abv',
       'beer_beerid'],
      dtype='object')

In [5]:
# Keep only the reviewer, the beer name and the five review scores;
# everything listed here is not used by the rest of the notebook.
unused_columns = [
    'brewery_id', 'brewery_name',
    'review_time', 'beer_abv',
    'beer_beerid', 'beer_style',
]
ratings = ratings.drop(columns=unused_columns)

In [6]:
# Preview the first rows of the table after the unused columns were dropped.
ratings.head()

Unnamed: 0,review_overall,review_aroma,review_appearance,review_profilename,review_palate,review_taste,beer_name
0,1.5,2.0,2.5,stcules,1.5,1.5,Sausa Weizen
1,3.0,2.5,3.0,stcules,3.0,3.0,Red Moon
2,3.0,2.5,3.0,stcules,3.0,3.0,Black Horse Black Beer
3,3.0,3.0,3.5,stcules,2.5,3.0,Sausa Pils
4,4.0,4.5,4.0,johnmichaelsen,4.0,4.5,Cauldron DIPA


In [7]:
# Discard reviews whose profile name is the literal 'UNKNOWN'
# (presumably a placeholder for a missing reviewer -- confirm against the CSV).
has_named_profile = ratings['review_profilename'].ne('UNKNOWN')
ratings = ratings[has_named_profile]

In [8]:
# 'combined' is the unweighted mean of the five review scores for each row.
# The columns are summed left to right in this order, then divided by 5.
score_columns = ['review_overall', 'review_appearance', 'review_aroma',
                 'review_palate', 'review_taste']
score_total = sum(ratings[column].values for column in score_columns)
ratings['combined'] = score_total / 5

In [9]:
# The individual scores are no longer needed once 'combined' exists.
# Note: after this cell runs, the cell that computes 'combined' cannot be
# re-run without reloading the data, because its inputs are gone.
individual_scores = [
    'review_overall', 'review_aroma',
    'review_appearance', 'review_palate',
    'review_taste',
]
ratings = ratings.drop(columns=individual_scores)

In [10]:
# Number of reviews per reviewer and per beer name, each sorted by
# value_counts() from most to fewest reviews.
profile_RatingCount, beer_RatingCount = (
    ratings[column].value_counts()
    for column in ('review_profilename', 'beer_name')
)

In [11]:
# Render every float pandas displays with exactly three decimals.
pd.set_option('display.float_format', '{:.3f}'.format)

# Summary statistics of reviews-per-reviewer, then reviews-per-beer.
print('Profile Rating Counts', profile_RatingCount.describe(), sep='\n')
print('\nBeer Rating Counts', beer_RatingCount.describe(), sep='\n')

Profile Rating Counts
count   33387.000
mean       47.511
std       182.600
min         1.000
25%         1.000
50%         3.000
75%        16.000
max      5817.000
Name: review_profilename, dtype: float64

Beer Rating Counts
count   56856.000
mean       27.900
std       122.168
min         1.000
25%         1.000
50%         3.000
75%         9.000
max      3289.000
Name: beer_name, dtype: float64


In [12]:
# Same summaries, restricted to the candidate cut-offs:
# reviewers with fewer than 100 reviews, beers with at least 10 reviews.
profiles_under_100 = profile_RatingCount[profile_RatingCount < 100]
beers_with_10_plus = beer_RatingCount[beer_RatingCount >= 10]

print('Profile Rating Counts', profiles_under_100.describe(), sep='\n')
print('\nBeer Rating Counts', beers_with_10_plus.describe(), sep='\n')

Profile Rating Counts
count   30231.000
mean       10.122
std        17.505
min         1.000
25%         1.000
50%         3.000
75%         9.000
max        99.000
Name: review_profilename, dtype: float64

Beer Rating Counts
count   13831.000
mean      106.587
std       230.566
min        10.000
25%        16.000
50%        31.000
75%        86.000
max      3289.000
Name: beer_name, dtype: float64


## Keep profiles with more than 1 and fewer than 100 ratings
## Keep beers with at least 10 ratings

In [13]:
# Both count tables were computed in an earlier cell, before any of these
# filters, so the thresholds refer to counts in the unfiltered review table.

# Reviewers to keep: more than 1 and fewer than 100 reviews.
profile_in_range = (profile_RatingCount > 1) & (profile_RatingCount < 100)
kept_profiles = profile_RatingCount[profile_in_range].index

# Beers to keep: at least 10 reviews.
kept_beers = beer_RatingCount[beer_RatingCount >= 10].index

# A row survives only if both its reviewer and its beer are kept.
ratings = ratings[ratings['review_profilename'].isin(kept_profiles)
                  & ratings['beer_name'].isin(kept_beers)]

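## Only interested in beers the user actually LIKED (the `combined >= 3.5` filter in the next cell is currently commented out, so all remaining ratings are kept)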

In [14]:
# ratings = ratings[ratings['combined']>=3.5]

In [15]:
# Reviewer x beer matrix: one row per profile name, one column per beer name.
# Each cell holds the COUNT of that reviewer's non-null 'combined' scores for
# the beer (not the score itself); pairs without any review are filled with 0.
ratings_pivot = pd.pivot_table(
    ratings,
    values='combined',
    index='review_profilename',
    columns='beer_name',
    aggfunc='count',
    fill_value=0,
)

In [16]:
# Preview the reviewer x beer matrix; values are review counts, 0 where the
# reviewer has no review for that beer.
ratings_pivot.head()

beer_name,"""400"" Ale","""Hop Obama"" Ale","""Old Yeltsin"" Imperial Stout","""Shabadoo"" Black & Tan Ale","""The Wind Cried Mari..."" Scottish Heather Ale","""True Blue"" Blueberry Ale",# 100,#'s Ale,#9,$ellout $tout,...,Épluche-Culotte,Équinoxe Du Printemps,Ölsch,Örebro Bitter,Ølfabrikken 100 Gram India Pale Ale,Ølfabrikken Jule Ale,Ølfabrikken Kloster Jul,Ølfabrikken Porter,Über Pils,ÜberSun (Imperial Summer Wheat Beer)
review_profilename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
02maxima,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
03SVTCobra,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
04101Brewer,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0beerguy0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0runkp0s,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Persist the reviewer x beer matrix as CSV (the index of profile names is
# written as the first column).
# NOTE(review): absolute, machine-specific path -- consider a configurable
# data directory.
matrix_csv_path = '/home/grimoire/Projects/BeerRatings/test_matrix.csv'
ratings_pivot.to_csv(matrix_csv_path)