In [1]:
import re

import pandas as pd

In [2]:
# Load the beer dataset from its relative path and preview the first rows.
beers = pd.read_csv(
    '../data/beers.csv',
)
beers.head(n=5)

Unnamed: 0,brewery_name,beer_name,beer_style,beer_abv,num_reviews,rating,city,state
0,21st Amendment Brewery,21 Rock,American Double / Imperial IPA,9.7,2,4.0,San Francisco,CA
1,21st Amendment Brewery,21st Amendment IPA,American IPA,7.0,650,3.89,San Francisco,CA
2,21st Amendment Brewery,21st Amendment Imperial Stout,Russian Imperial Stout,10.0,4,3.88,San Francisco,CA
3,21st Amendment Brewery,21st Amendment Pale Ale,American Pale Ale (APA),5.5,7,3.86,San Francisco,CA
4,21st Amendment Brewery,5-South,American Pale Ale (APA),6.0,3,4.0,San Francisco,CA


## EDA

In [3]:
# How many distinct beer styles are there? (missing values are not counted)
beers['beer_style'].nunique(dropna=True)

98

In [4]:
# Which styles are most frequent?
# value_counts() already returns the counts sorted in descending order,
# so no additional sort is needed before taking the top five.
beers['beer_style'].value_counts().head(5)

American IPA                      393
American Pale Ale (APA)           281
American Porter                   190
American Amber / Red Ale          184
American Double / Imperial IPA    173
Name: beer_style, dtype: int64

In [5]:
# Which beers have the highest ABV? Order strongest-first and show the top rows.
beers.sort_values('beer_abv', ascending=False).head()

Unnamed: 0,brewery_name,beer_name,beer_style,beer_abv,num_reviews,rating,city,state
293,Avery Brewing Company,The Beast Grand Cru 2004,Belgian Strong Dark Ale,18.1,40,3.04,Boulder,CO
190,Arctic Craft Brewery,Warning Sign Eisbock,Eisbock,18.0,4,4.0,Colorado Springs,CO
3612,Southampton Publick House,Double Ice Bock,Eisbock,18.0,22,4.14,Southampton,NY
292,Avery Brewing Company,The Beast Grand Cru,Belgian Strong Dark Ale,16.83,506,3.46,Boulder,CO
269,Avery Brewing Company,Mephistopheles' Stout,American Double / Imperial Stout,16.8,693,3.66,Boulder,CO


## Feature Engineering

In [6]:
# Style of the row labelled 0, fetched with a single explicit label lookup.
beers.loc[0, 'beer_style']

'American Double / Imperial IPA'

In [7]:
# Plain substring test on that same first style value.
'Double' in beers.loc[0, 'beer_style']

True

In [8]:
# Let's write a function and apply it!
def double_trouble(row):
    """Label a beer style by whether its name contains the word 'Double'.

    Despite the parameter name, this receives a single value (it is applied
    element-wise to the ``beer_style`` column), not a whole DataFrame row.

    Parameters
    ----------
    row : str or missing value
        One beer style name, e.g. ``'American Double / Imperial IPA'``.

    Returns
    -------
    str
        ``'trouble'`` if the case-sensitive substring ``'Double'`` occurs in
        the style, otherwise ``'no trouble'``. Non-string input (e.g. the
        float NaN pandas uses for missing text) yields ``'no trouble'``
        instead of raising ``TypeError``.
    """
    # Guard first: `'Double' in <float NaN>` would raise TypeError.
    if isinstance(row, str) and 'Double' in row:
        return 'trouble'
    return 'no trouble'

In [9]:
# Tag every beer by running double_trouble over each style value.
beers['double'] = beers['beer_style'].map(double_trouble)
beers.head(n=5)

Unnamed: 0,brewery_name,beer_name,beer_style,beer_abv,num_reviews,rating,city,state,double
0,21st Amendment Brewery,21 Rock,American Double / Imperial IPA,9.7,2,4.0,San Francisco,CA,trouble
1,21st Amendment Brewery,21st Amendment IPA,American IPA,7.0,650,3.89,San Francisco,CA,no trouble
2,21st Amendment Brewery,21st Amendment Imperial Stout,Russian Imperial Stout,10.0,4,3.88,San Francisco,CA,no trouble
3,21st Amendment Brewery,21st Amendment Pale Ale,American Pale Ale (APA),5.5,7,3.86,San Francisco,CA,no trouble
4,21st Amendment Brewery,5-South,American Pale Ale (APA),6.0,3,4.0,San Francisco,CA,no trouble


In [10]:
# Is it an Ale?
def ale_finder(row):
    """Report whether a beer style name contains the word 'ale'.

    Despite the parameter name, this receives a single value (it is applied
    element-wise to the ``beer_style`` column), not a whole DataFrame row.

    The match is case-insensitive and on whole words only. A plain substring
    test would also fire on the 'ale' inside 'Pale', wrongly classifying
    styles such as 'American Pale Lager' as ales.

    Parameters
    ----------
    row : str or missing value
        One beer style name, e.g. ``'American Pale Ale (APA)'``.

    Returns
    -------
    str
        ``'yes'`` if the style contains the standalone word 'ale' (or
        'ales'), otherwise ``'no'``. Non-string input (e.g. the float NaN
        pandas uses for missing text) yields ``'no'`` instead of raising.
    """
    # Missing values have no .lower()/regex semantics; treat them as "not an ale".
    if not isinstance(row, str):
        return 'no'
    # \b anchors the match to word boundaries so 'Pale' alone does not count.
    if re.search(r'\bales?\b', row, flags=re.IGNORECASE):
        return 'yes'
    return 'no'

In [11]:
# Flag each beer as ale / not ale by running ale_finder over each style value.
beers['its_ale'] = beers['beer_style'].map(ale_finder)
beers.head(n=5)

Unnamed: 0,brewery_name,beer_name,beer_style,beer_abv,num_reviews,rating,city,state,double,its_ale
0,21st Amendment Brewery,21 Rock,American Double / Imperial IPA,9.7,2,4.0,San Francisco,CA,trouble,no
1,21st Amendment Brewery,21st Amendment IPA,American IPA,7.0,650,3.89,San Francisco,CA,no trouble,no
2,21st Amendment Brewery,21st Amendment Imperial Stout,Russian Imperial Stout,10.0,4,3.88,San Francisco,CA,no trouble,no
3,21st Amendment Brewery,21st Amendment Pale Ale,American Pale Ale (APA),5.5,7,3.86,San Francisco,CA,no trouble,yes
4,21st Amendment Brewery,5-South,American Pale Ale (APA),6.0,3,4.0,San Francisco,CA,no trouble,yes


## Get some beer dummies!

In [12]:
# Let's shrink our dataset down to the style and ABV columns.
# .copy() makes the result an independent frame, so assigning new columns to
# it afterwards does not raise pandas' SettingWithCopyWarning.
beers = beers[['beer_style', 'beer_abv']].copy()
beers.head()

Unnamed: 0,beer_style,beer_abv
0,American Double / Imperial IPA,9.7
1,American IPA,7.0
2,Russian Imperial Stout,10.0
3,American Pale Ale (APA),5.5
4,American Pale Ale (APA),6.0


In [15]:
# Encode three hand-picked styles as numbers; any style that is not a key of
# the mapping comes out as NaN.
beers['newcol'] = beers['beer_style'].map(
    {
        'American IPA': 1,
        'Russian Imperial Stout': 0,
        'American Pale Ale (APA)': 3,
    }
)

In [16]:
# Preview the first five rows of the frame.
beers.head(n=5)

Unnamed: 0,beer_style,beer_abv,newcol
0,American Double / Imperial IPA,9.7,
1,American IPA,7.0,1.0
2,Russian Imperial Stout,10.0,0.0
3,American Pale Ale (APA),5.5,3.0
4,American Pale Ale (APA),6.0,3.0


In [18]:
# Convert beer styles to dummies.
# The two input columns are selected explicitly so that helper columns added
# to `beers` in other cells (such as the mostly-NaN `newcol`) do not end up in
# the encoded frame; the result is `beer_abv` followed by one column per style.
beers2 = pd.get_dummies(
    beers[['beer_abv', 'beer_style']],
    columns=['beer_style'],
    prefix='',
    prefix_sep='',
)
# A fixed seed makes the preview show the same ten rows on every run.
beers2.sample(10, random_state=42)

Unnamed: 0,beer_abv,Altbier,American Adjunct Lager,American Amber / Red Ale,American Amber / Red Lager,American Barleywine,American Black Ale,American Blonde Ale,American Brown Ale,American Dark Wheat Ale,...,Scotch Ale / Wee Heavy,Scottish Ale,Scottish Gruit / Ancient Herbed Ale,Smoked Beer,Tripel,Vienna Lager,Weizenbock,Wheatwine,Winter Warmer,Witbier
499,6.7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
821,10.75,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3064,5.4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3778,5.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3266,5.6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4385,7.5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2512,4.5,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1095,5.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1610,7.5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2484,5.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
