## Imports and Preprocessing

In [1]:
%matplotlib inline
import os
import numpy as np
import sklearn
import pandas as pd
import seaborn as sns
sns.set()

# Import CSV
dir_path = os.path.dirname(os.path.abspath('__file__'))
file_path = os.path.join(dir_path, 'beer_reviews.csv')
beer_data = pd.read_csv(file_path, delimiter=',', encoding='utf-8')
# Drop any column that is not very informative
beer_data.drop(['brewery_id', 'review_time'], axis=1, inplace=True)

# Split data frame into individual frames
reviews = dict()
for col in beer_data.columns:
    reviews.update({
            col:beer_data[col]
        })

## General Recommendation

### Recommend beers based on overall reviews from all reviewers

Ordering of overall rating ("review_overall") for beers returns a list of beers and their corresponding ranking in the list. However, general sorting and ranking is not statistically correct beause,
- the overall ratings are distributed around a mean value 
- all beers are not rated by all the users
- some beers have only one or very few ratings and some others have few hundreds of ratings

Hence, we adopt the following steps to clean and standardize the data before sorting and ranking the reviews.

### Step 1: Data Cleaning

- Group reviews by beer identifiers
- Drop duplicate reviews by the same reviewer for same beer
- Drop any samples with undefined values (NaNs)

### Step 2: Redefining Sample Data

- Calculate sample means of overall ratings for a given beer 
- Include only the beers where we can calculate the sample mean within a certain margin of error.

Chose the beers with reviews greater than a certain threshold number of reviews (min number of samples) that are required to predict the sample mean with 95% confidence interval. 

We use this formula: 

($\frac{\sigma^2 * Z^2}{m^2}$), 

where $\sigma$ is the standard deviation of the sample, 
Z-score for a confidence interval of 95% is 1.96 
and m is the allowed margin of error. 

- Reduce the sample set by assign the mean value as the overall rating for that beer

### Step 3: Sort and Rank
- Sort and rank the ratings to pick the top few beers as recommendations based on this data.
- Display recommendations based on
    - Overall Rating
    - Aroma
    - Apprearance
    - Palate
    - Taste

In [2]:
# Define Margin of Error and Z-score for 95% confidence interval
mError = 0.1
zScore = 1.96

def prep_data_frame(_list):
    """
    _list: list of column headers. Example: ["beer_beerid","beer_name",..]
    """
    _dict = {header: reviews[header] for header in _list}
    return pd.DataFrame.from_dict(_dict)
    
def calculate_stats(key, data_frame):
    _df = data_frame.groupby(level=0)
    samples = _df.count().rename(columns={key: 'count'})
    means = _df.mean().rename(columns={key: 'mean'})
    std = _df.std().rename(columns={key:'std'})
    return pd.concat([samples, means, std], axis=1)
    

beer_identifiers = beer_data[['beer_beerid','beer_name', 'beer_style', 'review_profilename']]
reviews_means = dict()
for key in ['review_overall', 'review_aroma', 'review_taste', 'review_appearance', 'review_palate']:
    
    # Prepare Data Frame for each review
    ids = prep_data_frame(beer_identifiers)
    review = prep_data_frame([key])
    data_frame = pd.concat([ids, review], axis=1).drop_duplicates(['beer_beerid','review_profilename'])
    
    # Filter rows if number of reviews meet certain criteria
    stats = calculate_stats(key, data_frame.set_index(["beer_beerid","beer_name"]))
    stats = stats[stats['std'] != 0] # Remove rows with zero std dev
    stats['required'] = stats['std'].map(lambda x:(x *zScore/mError)**2) # Add a new row with required num samples
    beer_ids = [idx for idx in stats.index if stats.loc[idx, 'count'] > stats.loc[idx, 'required']]
    mean_values = [stats.loc[idx, 'mean'] for idx in beer_ids]

    # Drop duplicate beerids and reviewer profilenames 
    data_frame = data_frame.drop_duplicates(['beer_beerid']).drop('review_profilename', axis=1)

    # Keep the beers that have minimum number of reviews to predict ratings with 95% confidence interval
    review_data_frame = data_frame.set_index(['beer_beerid'])

    review_data_frame = review_data_frame.drop([Id for Id in review_data_frame.index if Id not in beer_ids])

    
    #Add DataFrames for each attribute reviews
    reviews_means.update({
        key : review_data_frame.reset_index()
    })

## Save to disk
- Convert Pandas DataFrame to JSON and save to disk

In [3]:
import json

# Make a copy of the dictionary
_copy = dict()
for key, value in reviews_means.items():
    _copy[key] = value.to_json(orient='index')

# Get the directory path
out_file = os.path.join(dir_path, 'reviews_means.json' )
with open(out_file, 'w') as f:
    json.dump(_copy, f)

### Mean Overall Rating

In [4]:
dF = reviews_means['review_overall'].set_index(['beer_beerid'])
dF = dF.sort_values(by='review_overall', ascending=False)
dF.head(10)

Unnamed: 0_level_0,beer_name,beer_style,review_overall
beer_beerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
34909,Stone Old Guardian Barley Wine Style Ale 2007,American Barleywine,5.0
4083,Stone Ruination IPA,American Double / Imperial IPA,5.0
2654,Left Hand Imperial Stout,Russian Imperial Stout,5.0
53433,Unplugged Cranbic Ale,American Wild Ale,5.0
32360,Stone 10th Anniversary IPA,American Double / Imperial IPA,5.0
637,Old Speckled Hen,English Pale Ale,5.0
411,Pranqster,Belgian Strong Pale Ale,5.0
64545,Double Sunshine IPA,American Double / Imperial IPA,5.0
41043,Double Dead Guy Ale,American Strong Ale,5.0
66036,Stone Old Guardian BELGO Barleywine,American Barleywine,5.0


### Aroma

In [5]:
dF = reviews_means['review_aroma'].set_index(['beer_beerid'])
dF = dF.sort_values(by='review_aroma', ascending=False)
dF.head(10)

Unnamed: 0_level_0,beer_name,beer_style,review_aroma
beer_beerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1440,Bailey's Blonde Ale,American Blonde Ale,5.0
705,J.W. Lees Vintage Harvest Ale,English Barleywine,5.0
46984,John Henry 3 Lick Spiker Ale,American Strong Ale,5.0
6305,Drie Fonteinen Oude Geuze,Gueuze,5.0
42349,Vanilla Bean Aged Dark Lord,Russian Imperial Stout,5.0
645,Trappistes Rochefort 10,Quadrupel (Quad),5.0
51619,Ommegang Adoration Ale,Belgian Strong Dark Ale,5.0
27604,Duet IPA,American IPA,5.0
1836,La Chouffe,Belgian Strong Pale Ale,5.0
3916,AleSmith IPA,American IPA,5.0


### Appearance

In [6]:
dF = reviews_means['review_appearance'].set_index(['beer_beerid'])
dF = dF.sort_values(by='review_appearance', ascending=False)
dF.head(10)

Unnamed: 0_level_0,beer_name,beer_style,review_appearance
beer_beerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1836,La Chouffe,Belgian Strong Pale Ale,5.0
3877,Dark Horse Reserve Special Black Bier Ale,American Strong Ale,5.0
73427,Blaecorn Unidragon,Russian Imperial Stout,5.0
44727,Portsmouth 5 C's IPA,American IPA,5.0
8848,Dark Horse Boffo Brown Ale,English Brown Ale,5.0
2196,Herold Bohemian Black Lager,Schwarzbier,5.0
7284,YuleSmith (Summer),American Double / Imperial IPA,5.0
44568,Loser Pale Ale,American Pale Ale (APA),5.0
63422,Either,American Black Ale,5.0
55382,Collaboration No. 1 - Imperial Pilsner,American Double / Imperial Pilsner,5.0


### Palate

In [7]:
dF = reviews_means['review_palate'].set_index(['beer_beerid'])
dF = dF.sort_values(by='review_palate', ascending=False)
dF.head(10)

Unnamed: 0_level_0,beer_name,beer_style,review_palate
beer_beerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
17112,Bell's Hopslam Ale,American Double / Imperial IPA,5.0
30,Trois Pistoles,Belgian Strong Dark Ale,5.0
42349,Vanilla Bean Aged Dark Lord,Russian Imperial Stout,5.0
129,Orval Trappist Ale,Belgian Pale Ale,5.0
26072,Noire De Chambly / Chambly Noire,Belgian Dark Ale,5.0
356,Imperial Stout,Russian Imperial Stout,5.0
46080,Apocalypse Cow,American Double / Imperial IPA,5.0
55564,Red Chair NWPA,American Pale Ale (APA),5.0
30184,Oak Aged Dark Lord Imperial Stout,Russian Imperial Stout,5.0
37294,Dark Horizon 1st Edition,Russian Imperial Stout,5.0


### Taste

In [8]:
dF = reviews_means['review_taste'].set_index(['beer_beerid'])
dF = dF.sort_values(by='review_taste', ascending=False)
dF.head(10)

Unnamed: 0_level_0,beer_name,beer_style,review_taste
beer_beerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
30184,Oak Aged Dark Lord Imperial Stout,Russian Imperial Stout,5.0
38094,Toronado 20th Anniversary Ale,American Wild Ale,5.0
1836,La Chouffe,Belgian Strong Pale Ale,5.0
27286,Arcadia HopMouth Double IPA,American Double / Imperial IPA,5.0
2803,Sol,American Adjunct Lager,5.0
3833,AleSmith Speedway Stout,American Double / Imperial Stout,5.0
33127,Darkness,Russian Imperial Stout,5.0
30,Trois Pistoles,Belgian Strong Dark Ale,5.0
20168,Hoptical Illusion,American IPA,5.0
48505,Victory At Sea Coffee Vanilla Imperial Porter,American Porter,5.0
