In [245]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
# Keep the active seaborn colour palette in `color` (not referenced in the cells shown here).
color = sns.color_palette()
# Switch seaborn to its 'darkgrid' style.
sns.set_style('darkgrid')

import warnings
def ignore_warn(*args, **kwargs):
    """Accept any arguments and do nothing; used as a stand-in for ``warnings.warn``."""
    return None


# Swap the no-op in for warnings.warn so that calls made through it emit nothing.
warnings.warn = ignore_warn


# Render floats with exactly three decimal places in pandas output.
pd.set_option('display.float_format', '{:.3f}'.format)

In [246]:
# Load the recipe table. latin-1 is passed explicitly
# (presumably the file is not valid UTF-8 -- TODO confirm).
recipe_csv = "input/recipeData.csv"
train = pd.read_csv(recipe_csv, encoding='latin-1')

In [247]:
# Keep only the style label and the numeric recipe measurements used below.
KEPT_COLUMNS = ['Style', 'OG', 'FG', 'ABV', 'IBU', 'Color', 'BoilTime', 'BoilGravity']
train = train[KEPT_COLUMNS]

In [248]:
# (rows, columns) of the narrowed frame.
train.shape

(73861, 8)

In [249]:
# Report, per column, how many rows are missing a value and what share that is.
total_rows = len(train)
for column_name in train.columns.values:
    missing_count = train[column_name].isnull().sum()
    missing_pct = round((missing_count / total_rows) * 100, 2)
    print('{} is null on {} rows out of {}, so {} % of the time'.format(
        column_name, missing_count, total_rows, missing_pct))

Style is null on 596 rows out of 73861, so 0.81 % of the time
OG is null on 0 rows out of 73861, so 0.0 % of the time
FG is null on 0 rows out of 73861, so 0.0 % of the time
ABV is null on 0 rows out of 73861, so 0.0 % of the time
IBU is null on 0 rows out of 73861, so 0.0 % of the time
Color is null on 0 rows out of 73861, so 0.0 % of the time
BoilTime is null on 0 rows out of 73861, so 0.0 % of the time
BoilGravity is null on 2990 rows out of 73861, so 4.05 % of the time


In [228]:
# Fill missing BoilGravity values with the column median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the train["BoilGravity"] selection: with pandas
# Copy-on-Write that in-place call only changes a temporary Series and
# leaves `train` untouched.
boil_gravity_median = train["BoilGravity"].median()
train["BoilGravity"] = train["BoilGravity"].fillna(boil_gravity_median)

In [230]:
# Discard every row that still has a missing value in any column.
train = train.dropna()

In [232]:
# List the distinct values of the Style column.
train["Style"].unique()


array(['Cream Ale', 'Holiday/Winter Special Spiced Beer', 'American IPA',
       'Belgian Blond Ale', 'American Pale Ale', 'Imperial IPA',
       'Robust Porter', 'Bohemian Pilsener', 'Saison',
       'Northern English Brown', 'English IPA', 'Traditional Bock',
       'Premium American Lager', 'Belgian Golden Strong Ale',
       'Double IPA', 'Blonde Ale', 'Light American Lager',
       'German Pilsner (Pils)', 'American Brown Ale', 'Oatmeal Stout',
       'Specialty Beer', 'American Amber Ale', 'Kölsch', 'Witbier',
       'Weizen/Weissbier', 'Trappist Single', 'Russian Imperial Stout',
       'Specialty IPA: Black IPA', 'Sweet Stout', 'Strong Scotch Ale',
       'Belgian Tripel', 'American Stout', 'Belgian Pale Ale',
       'Dark American Lager', 'Dry Stout', 'Belgian Dark Strong Ale',
       'American Wheat or Rye Beer', 'Vienna Lager',
       'Special/Best/Premium Bitter', 'Experimental Beer',
       'Irish Red Ale', 'Old Ale', 'Extra Special/Strong Bitter (ESB)',
       'Winter Sea

In [233]:
def _rows_with_style(keyword):
    """Return an independent copy of the ``train`` rows whose Style contains ``keyword``.

    The match is a case-sensitive literal substring test (``regex=False``).
    ``.copy()`` gives each subset its own data, so later edits to a subset
    are not chained assignments on a selection of ``train``.
    """
    matches = train["Style"].str.contains(keyword, regex=False)
    return train[matches].copy()


# One frame per broad beer family, selected by a keyword in the style name.
IPA = _rows_with_style("IPA")
Ale = _rows_with_style("Ale")
Lager = _rows_with_style("Lager")
Porter = _rows_with_style("Porter")
Saison = _rows_with_style("Saison")
Stout = _rows_with_style("Stout")
Witbier = _rows_with_style("Witbier")

In [234]:
# Collapse every IPA sub-style into the single label "IPA".
# The frame is rebuilt with assign() rather than calling
# replace(inplace=True) on IPA["Style"]: under pandas Copy-on-Write that
# in-place call only edits a temporary Series and leaves IPA unchanged.
ipa_substyles = ['Imperial IPA', 'American IPA', 'English IPA', 'Double IPA',
       'Specialty IPA: Black IPA', 'Specialty IPA: Red IPA',
       'Specialty IPA: White IPA', 'Specialty IPA: Belgian IPA',
       'Specialty IPA: Rye IPA', 'Specialty IPA: Brown IPA']
IPA = IPA.assign(Style=IPA["Style"].replace(ipa_substyles, "IPA"))

In [235]:
# Collapse every Ale sub-style into the single label "Ale".
# The frame is rebuilt with assign() rather than calling
# replace(inplace=True) on Ale["Style"]: under pandas Copy-on-Write that
# in-place call only edits a temporary Series and leaves Ale unchanged.
ale_substyles = ['Cream Ale', 'Belgian Blond Ale', 'American Pale Ale',
       'Belgian Golden Strong Ale', 'Blonde Ale', 'American Brown Ale',
       'American Amber Ale', 'Strong Scotch Ale', 'Belgian Pale Ale',
       'Belgian Dark Strong Ale', 'Old Ale', 'British Golden Ale',
       'Flanders Red Ale', 'Irish Red Ale', 'Belgian Specialty Ale',
       'British Strong Ale', 'British Brown Ale',
       'Australian Sparkling Ale', 'American Strong Ale',
       'Flanders Brown Ale/Oud Bruin', 'London Brown Ale']
Ale = Ale.assign(Style=Ale["Style"].replace(ale_substyles, "Ale"))

In [236]:
# Collapse every Lager sub-style into the single label "Lager".
# The frame is rebuilt with assign() rather than calling
# replace(inplace=True) on Lager["Style"]: under pandas Copy-on-Write that
# in-place call only edits a temporary Series and leaves Lager unchanged.
lager_substyles = ['Premium American Lager', 'Light American Lager', 'Vienna Lager',
       'American Light Lager', 'International Pale Lager',
       'American Lager', 'Dark American Lager',
       'Czech Premium Pale Lager', 'Czech Dark Lager',
       'Standard American Lager', 'Czech Pale Lager',
       'International Amber Lager', 'Pre-Prohibition Lager',
       'Czech Amber Lager', 'International Dark Lager']
Lager = Lager.assign(Style=Lager["Style"].replace(lager_substyles, "Lager"))

In [237]:
# Collapse every Porter sub-style into the single label "Porter".
# The frame is rebuilt with assign() rather than calling
# replace(inplace=True) on Porter["Style"]: under pandas Copy-on-Write that
# in-place call only edits a temporary Series and leaves Porter unchanged.
porter_substyles = ['Robust Porter', 'Brown Porter', 'Baltic Porter', 'English Porter',
       'American Porter', 'Pre-Prohibition Porter']
Porter = Porter.assign(Style=Porter["Style"].replace(porter_substyles, "Porter"))

In [238]:
# Distinct Style values inside the Saison subset.
Saison["Style"].unique()

array(['Saison'], dtype=object)

In [239]:
# Collapse every Stout sub-style into the single label "Stout".
# The frame is rebuilt with assign() rather than calling
# replace(inplace=True) on Stout["Style"]: under pandas Copy-on-Write that
# in-place call only edits a temporary Series and leaves Stout unchanged.
stout_substyles = ['Oatmeal Stout', 'Russian Imperial Stout', 'Sweet Stout',
       'Dry Stout', 'American Stout', 'Foreign Extra Stout',
       'Imperial Stout', 'Irish Stout', 'Tropical Stout',
       'Irish Extra Stout']
Stout = Stout.assign(Style=Stout["Style"].replace(stout_substyles, "Stout"))

In [240]:
# Distinct Style values inside the Witbier subset.
Witbier.Style.unique()

array(['Witbier'], dtype=object)

In [241]:
# Stack the per-family frames row-wise, in this fixed order, into one frame.
style_frames = [Witbier, Saison, Stout, Porter, Lager, Ale, IPA]
new_train_df = pd.concat(style_frames, axis="index")

In [168]:
# Check which Style labels are present in the combined frame.
new_train_df["Style"].unique()

array(['Witbier', 'Saison', 'Stout', 'Porter', 'Lager', 'Ale', 'IPA'],
      dtype=object)

In [243]:
# Persist the combined frame as CSV; the row index is written too (pandas default).
output_csv = "output/new_data.csv"
new_train_df.to_csv(output_csv)

In [244]:
# (rows, columns) of the combined frame.
new_train_df.shape

(52376, 8)