# Import Libraries

In [1]:
import pandas as pd 
pd.options.plotting.backend = "plotly"
import numpy as np

# Load Datasets

In [2]:
# Load the cleaned interim beer dataset into the working DataFrame.
data_raw = pd.read_parquet(
    path="../data/interim/beer_general_data_clean.parquet",
)

In [12]:
# Report, for every column, the percentage of rows whose value is missing.
row_total = len(data_raw)
missing_per_column = data_raw.isnull().sum()  # null count per column, computed once
for column, missing_count in missing_per_column.items():
    print(f'{column} = {(missing_count/row_total)*100}')

OG = 10.252025346184277
Efficiency = 10.361700720756465
SugarScale = 10.305152918482237
StyleID = 10.278475124667382
Size(L) = 10.228539767013935
BoilGravity = 13.765969623797506
BoilTime = 10.308801163790253
FG = 10.30788910246325
PrimaryTemp = 35.635604138022245
Style = 10.876559339850012
Color = 10.32088597637305
Name = 10.298996504524965
IBU = 10.279159170662636
BrewMethod = 10.241992671587237
BeerID = 10.228995797677436
ABV = 10.286911691942166


# Description and Data Cleaning

## Variable definitions
- Input/Output variables
- Variable Types (Categorical or Numeric)
- Data Types (int, float, string, factor, boolean, ...)

|Feature Name| Type | Description and values | % Missing
|--|--|--|--|
| BeerID | Number | The Record Id | 10.23%
| Name | Categorical (nominal) | The beer name | 10.30% 
| StyleID | Number | The Numeric ID for type of brew | 10.28% 
| Size(L)| Number | Amount brewed for recipe listed | 10.23% 
| OG | Number | Specific gravity of wort before fermentation | 10.25% 
| FG | Number | Specific gravity of wort after fermentation | 10.31% 
| ABV | Number | Alcohol By Volume | 10.29% 
| IBU | Number | International Bittering Units | 10.28% 
| Color | Number | Standard Reference Method (SRM) - light to dark, e.g. 40 = black | 10.32% 
| BoilTime | Number | Time wort is boiled | 10.31% 
| BoilGravity | Number | Specific gravity of wort before the boil | 13.77% 
| Efficiency | Number | Beer mash extraction efficiency - extracting sugars from the grain during mash | 10.36% 
| SugarScale | Categorical (nominal) | Scale to determine the concentration of dissolved solids in wort | 10.31%
| BrewMethod | Categorical (nominal) | Various techniques for brewing | 10.24% 
| PrimaryTemp | Number | Temperature at the fermenting stage | 35.64% 
| Style| Categorical (nominal) | The type of brew | 10.88% 



In [5]:
# Preview ten random rows. The fixed random_state keeps the preview
# identical on every Restart & Run All; an unseeded sample shows
# different rows on each run, so the saved output could never be reproduced.
data_raw.sample(10, random_state=42)

Unnamed: 0,OG,Efficiency,SugarScale,StyleID,Size(L),BoilGravity,BoilTime,FG,PrimaryTemp,Style,Color,Name,IBU,BrewMethod,BeerID,ABV
104009,1063.0,70.0,Specific Gravity,7,20.82,1053.0,60.0,1012.0,21.11,American IPA,6.66,BIAB IPA,73.8,BIAB,66796.0,6.65
30981,1063.0,75.0,Specific Gravity,7,37.85,1053.0,60.0,1013.0,21.11,American IPA,856842215413.0,856842215413,72.51,All Grain,46968.0,856842215413.0
409160,1043.0,70.0,Specific Gravity,10,10.0,1023.0,70.0,,20.0,American Pale Ale,6.87,Oatmeal Pale Ale,25.08,,72891.0,
29743,1048.0,70.0,,6,28.0,1044.0,60.0,,21.0,,21.59,On the Way Out,29.26,All Grain,59329.0,4.73
317286,1075.0,70.0,Specific Gravity,56,20.82,1064.0,60.0,1012.0,,Double IPA,4.39,Cashmere DIPA,62.63,All Grain,31257.0,8.31
149860,,65.0,,9,57.0,,60.0,1.01,,American Light Lager,3.65,Awesome Recipe,,All Grain,66367.0,
249239,1061.0,70.0,Specific Gravity,7,39.75,,60.0,1017.0,,American IPA,9.67,Olympic Tower Bridge IPA Pepper with New ...,180.42,All Grain,3895.0,5.8
309512,1059.0,70.0,Specific Gravity,10,20.82,1043.0,75.0,1012.0,20.0,American Pale Ale,7.69,Pittsburgh P.A.,29.77,All Grain,14167.0,6.09
25959,,35.0,Specific Gravity,169,,1109.0,60.0,1009.0,,Weissbier,4.49,Lemon Coriander Weiss,13.3,extract,32817.0,7.37
397483,1052.0,70.0,,165,,1048.0,60.0,1012.0,18.33,Trappist Single,3.31,Belgian Single,32.19,All Grain,,5.26


In [3]:
# Print the frame summary: index range, per-column non-null counts, dtypes
# and memory usage.
# NOTE(review): the recorded output lists every column as `object`, including
# the ones the variable table describes as numeric — presumably they need an
# explicit cast before any numeric analysis; confirm.
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438567 entries, 0 to 438566
Data columns (total 16 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   OG           393605 non-null  object
 1   Efficiency   393124 non-null  object
 2   SugarScale   393372 non-null  object
 3   StyleID      393489 non-null  object
 4   Size(L)      393708 non-null  object
 5   BoilGravity  378194 non-null  object
 6   BoilTime     393356 non-null  object
 7   FG           393360 non-null  object
 8   PrimaryTemp  282281 non-null  object
 9   Style        390866 non-null  object
 10  Color        393303 non-null  object
 11  Name         393399 non-null  object
 12  IBU          393486 non-null  object
 13  BrewMethod   393649 non-null  object
 14  BeerID       393706 non-null  object
 15  ABV          393452 non-null  object
dtypes: object(16)
memory usage: 53.5+ MB


## Null Values

In [4]:
# Absolute number of missing entries in each column.
data_raw.isna().sum()

OG              44962
Efficiency      45443
SugarScale      45195
StyleID         45078
Size(L)         44859
BoilGravity     60373
BoilTime        45211
FG              45207
PrimaryTemp    156286
Style           47701
Color           45264
Name            45168
IBU             45081
BrewMethod      44918
BeerID          44861
ABV             45115
dtype: int64