## Importing Dependencies

In [1]:
import pandas as pd
import numpy as np
import csv
from datetime import date
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
import sklearn

## Parsing the data

In [2]:
df=pd.read_csv('beer_reviews.csv')
df.columns

Index(['brewery_id', 'brewery_name', 'review_time', 'review_overall',
       'review_aroma', 'review_appearance', 'review_profilename', 'beer_style',
       'review_palate', 'review_taste', 'beer_name', 'beer_abv',
       'beer_beerid'],
      dtype='object')

In [3]:

df.drop(['brewery_id', 'review_time'], axis=1, inplace=True)
df.head()



Unnamed: 0,brewery_name,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,Vecchio Birraio,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,Vecchio Birraio,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,Vecchio Birraio,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,Vecchio Birraio,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,Caldera Brewing Company,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [4]:
reviews = dict()
for col in df.columns:
    reviews.update({
            col:df[col]
        })

reviews

{'beer_abv': 0          5.0
 1          6.2
 2          6.5
 3          5.0
 4          7.7
 5          4.7
 6          4.7
 7          4.7
 8          4.7
 9          4.7
 10         3.5
 11         4.7
 12         4.7
 13         4.7
 14         4.7
 15         7.2
 16         7.2
 17         5.6
 18         3.5
 19         7.4
 20         7.4
 21         7.4
 22         7.4
 23         7.4
 24         7.4
 25         7.4
 26         7.4
 27         7.4
 28         7.4
 29         7.4
           ... 
 1586584    5.8
 1586585    5.8
 1586586    5.8
 1586587    NaN
 1586588    8.0
 1586589    8.0
 1586590    8.0
 1586591    8.0
 1586592    5.2
 1586593    5.2
 1586594    6.0
 1586595    5.0
 1586596    NaN
 1586597    NaN
 1586598    NaN
 1586599    9.0
 1586600    5.2
 1586601    5.2
 1586602    5.2
 1586603    5.2
 1586604    5.2
 1586605    5.2
 1586606    5.2
 1586607    5.2
 1586608    5.2
 1586609    5.2
 1586610    5.2
 1586611    5.2
 1586612    5.2
 1586613    5.2
 Name: beer_

In [5]:
# Define Margin of Error and Z-score for 95% confidence interval
mError = 0.1
zScore = 1.96

def prep_data_frame(_list):
    """
    _list: list of column headers. Example: ["beer_beerid","beer_name",..]
    """
    _dict = {header: reviews[header] for header in _list}
    return pd.DataFrame.from_dict(_dict)
    
def calculate_stats(key, data_frame):
    _df = data_frame.groupby(level=0)
    samples = _df.count().rename(columns={key: 'count'})
    means = _df.mean().rename(columns={key: 'mean'})
    std = _df.std().rename(columns={key:'std'})
    return pd.concat([samples, means, std], axis=1)
    

beer_identifiers = df[['beer_beerid','beer_name', 'beer_style', 'review_profilename']]
reviews_means = dict()
for key in ['review_overall', 'review_aroma', 'review_taste', 'review_appearance', 'review_palate']:
    
    # create df for each review
    ids = prep_data_frame(beer_identifiers)
    review = prep_data_frame([key])
    data_frame = pd.concat([ids, review], axis=1).drop_duplicates(['beer_beerid','review_profilename'])
    
    # Filter rows and keep items that meet criteria
    stats = calculate_stats(key, data_frame.set_index(["beer_beerid","beer_name"]))
    stats = stats[stats['std'] != 0] # Remove rows with zero std dev
    stats['required'] = stats['std'].map(lambda x:(x *zScore/mError)**2) 
    # Add a new row with required number samples
    beer_ids = [idx for idx in stats.index if stats.loc[idx, 'count'] > stats.loc[idx, 'required']]
    mean_values = [stats.loc[idx, 'mean'] for idx in beer_ids]

    # Drop duplicates
    data_frame = data_frame.drop_duplicates(['beer_beerid']).drop('review_profilename', axis=1)

    # Only keep the beers meet requirements 
    review_data_frame = data_frame.set_index(['beer_beerid'])

    review_data_frame = review_data_frame.drop([Id for Id in review_data_frame.index if Id not in beer_ids])

    
    #Create df for reviews
    reviews_means.update({
        key : review_data_frame.reset_index()
    })



In [6]:
prep_data_frame

<function __main__.prep_data_frame>

## Question 3: Which of the factors (aroma, taste, appearance, palette) are most important in determining the overall quality of a beer?

## Question 4: If I typically enjoy a beer due to its aroma and appearance, which beer style should I try?

In [7]:
df_overall = reviews_means['review_overall'].set_index(['beer_beerid'])
df_overall = df_overall.sort_values(by='review_overall', ascending=False)
df_overall.head(10)

Unnamed: 0_level_0,beer_name,beer_style,review_overall
beer_beerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
34909,Stone Old Guardian Barley Wine Style Ale 2007,American Barleywine,5.0
4083,Stone Ruination IPA,American Double / Imperial IPA,5.0
2654,Left Hand Imperial Stout,Russian Imperial Stout,5.0
53433,Unplugged Cranbic Ale,American Wild Ale,5.0
32360,Stone 10th Anniversary IPA,American Double / Imperial IPA,5.0
637,Old Speckled Hen,English Pale Ale,5.0
411,Pranqster,Belgian Strong Pale Ale,5.0
64545,Double Sunshine IPA,American Double / Imperial IPA,5.0
41043,Double Dead Guy Ale,American Strong Ale,5.0
66036,Stone Old Guardian BELGO Barleywine,American Barleywine,5.0


In [8]:
df_aroma = reviews_means['review_aroma'].set_index(['beer_beerid'])
df_aroma =df_aroma.sort_values(by='review_aroma', ascending=False)
df_aroma.head(10)

Unnamed: 0_level_0,beer_name,beer_style,review_aroma
beer_beerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1440,Bailey's Blonde Ale,American Blonde Ale,5.0
705,J.W. Lees Vintage Harvest Ale,English Barleywine,5.0
46984,John Henry 3 Lick Spiker Ale,American Strong Ale,5.0
6305,Drie Fonteinen Oude Geuze,Gueuze,5.0
42349,Vanilla Bean Aged Dark Lord,Russian Imperial Stout,5.0
645,Trappistes Rochefort 10,Quadrupel (Quad),5.0
51619,Ommegang Adoration Ale,Belgian Strong Dark Ale,5.0
27604,Duet IPA,American IPA,5.0
1836,La Chouffe,Belgian Strong Pale Ale,5.0
3916,AleSmith IPA,American IPA,5.0


In [9]:
df_appear = reviews_means['review_appearance'].set_index(['beer_beerid'])
df_appear = df_appear.sort_values(by='review_appearance', ascending=False)
df_appear.head(10)

Unnamed: 0_level_0,beer_name,beer_style,review_appearance
beer_beerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1836,La Chouffe,Belgian Strong Pale Ale,5.0
3877,Dark Horse Reserve Special Black Bier Ale,American Strong Ale,5.0
73427,Blaecorn Unidragon,Russian Imperial Stout,5.0
44727,Portsmouth 5 C's IPA,American IPA,5.0
8848,Dark Horse Boffo Brown Ale,English Brown Ale,5.0
2196,Herold Bohemian Black Lager,Schwarzbier,5.0
7284,YuleSmith (Summer),American Double / Imperial IPA,5.0
44568,Loser Pale Ale,American Pale Ale (APA),5.0
63422,Either,American Black Ale,5.0
55382,Collaboration No. 1 - Imperial Pilsner,American Double / Imperial Pilsner,5.0


In [10]:
df_palate = reviews_means['review_palate'].set_index(['beer_beerid'])
df_palate = df_palate.sort_values(by='review_palate', ascending=False)
df_palate.head(10)

Unnamed: 0_level_0,beer_name,beer_style,review_palate
beer_beerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
17112,Bell's Hopslam Ale,American Double / Imperial IPA,5.0
30,Trois Pistoles,Belgian Strong Dark Ale,5.0
42349,Vanilla Bean Aged Dark Lord,Russian Imperial Stout,5.0
129,Orval Trappist Ale,Belgian Pale Ale,5.0
26072,Noire De Chambly / Chambly Noire,Belgian Dark Ale,5.0
356,Imperial Stout,Russian Imperial Stout,5.0
46080,Apocalypse Cow,American Double / Imperial IPA,5.0
55564,Red Chair NWPA,American Pale Ale (APA),5.0
30184,Oak Aged Dark Lord Imperial Stout,Russian Imperial Stout,5.0
37294,Dark Horizon 1st Edition,Russian Imperial Stout,5.0


In [11]:
df_taste = reviews_means['review_taste'].set_index(['beer_beerid'])
df_taste  = df_taste .sort_values(by='review_taste', ascending=False)
df_taste.head(10)

Unnamed: 0_level_0,beer_name,beer_style,review_taste
beer_beerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
30184,Oak Aged Dark Lord Imperial Stout,Russian Imperial Stout,5.0
38094,Toronado 20th Anniversary Ale,American Wild Ale,5.0
1836,La Chouffe,Belgian Strong Pale Ale,5.0
27286,Arcadia HopMouth Double IPA,American Double / Imperial IPA,5.0
2803,Sol,American Adjunct Lager,5.0
3833,AleSmith Speedway Stout,American Double / Imperial Stout,5.0
33127,Darkness,Russian Imperial Stout,5.0
30,Trois Pistoles,Belgian Strong Dark Ale,5.0
20168,Hoptical Illusion,American IPA,5.0
48505,Victory At Sea Coffee Vanilla Imperial Porter,American Porter,5.0
