# EDA

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Lasso
import pickle

beer = pd.read_csv(r"C:\Users\kasci\Documents\scraped_data.csv")
beer.head()

Unnamed: 0,OG,FG,ABV,SMR,pH,IBU,type
0,1.062,1.013,6.5%,5.2,,59.26,Specialty IPA: New England IPA
1,1.055,1.013,5.58%,8.0,5.67,39.79,American Pale Ale
2,1.055,1.013,5.48%,4.83,,19.44,Cream Ale
3,1.061,1.016,5.94%,8.5,5.81,62.42,American IPA
4,1.072,1.018,7.09%,6.33,,232.89,Imperial IPA


In [10]:
beer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   OG      1115 non-null   object 
 1   FG      1115 non-null   object 
 2   ABV     1115 non-null   object 
 3   SMR     1115 non-null   float64
 4   pH      119 non-null    float64
 5   IBU     1115 non-null   float64
 6   type    1103 non-null   object 
dtypes: float64(3), object(4)
memory usage: 61.1+ KB


In [11]:
beer_clean = beer.drop(['pH'], axis="columns")
beer_clean = beer_clean.dropna()

beer_clean['ABV'] = beer_clean['ABV'].str.replace(r'%', '')
beer_clean['ABV'] = beer_clean['ABV'].astype(str).astype(float)

beer_clean['OG'] = beer_clean['OG'].str.replace(r'°P', '')
beer_clean['OG'] = beer_clean['OG'].astype(str).astype(float)

beer_clean['FG'] = beer_clean['FG'].str.replace(r'°P', '')
beer_clean['FG'] = beer_clean['FG'].astype(str).astype(float)

beer_clean.head()                                       

Unnamed: 0,OG,FG,ABV,SMR,IBU,type
0,1.062,1.013,6.5,5.2,59.26,Specialty IPA: New England IPA
1,1.055,1.013,5.58,8.0,39.79,American Pale Ale
2,1.055,1.013,5.48,4.83,19.44,Cream Ale
3,1.061,1.016,5.94,8.5,62.42,American IPA
4,1.072,1.018,7.09,6.33,232.89,Imperial IPA


In [12]:
beer_clean.describe()

Unnamed: 0,OG,FG,ABV,SMR,IBU
count,1103.0,1103.0,1103.0,1103.0,1103.0
mean,1.291477,1.050285,6.240526,14.031859,42.407135
std,1.72711,0.297946,1.871511,13.033292,28.549315
min,1.005,1.0,0.49,0.85,0.0
25%,1.05,1.011,5.035,5.22,23.65
50%,1.059,1.014,5.82,8.96,35.64
75%,1.07,1.017,7.01,17.07,55.795
max,19.8,5.7,22.92,85.25,232.89


In [13]:
beer_clean['type'].value_counts().to_frame().head()

Unnamed: 0,type
American IPA,199
American Pale Ale,104
American Amber Ale,33
Imperial IPA,28
Saison,27


In [14]:
types_count = beer_clean['type'].value_counts() # Frequency of each class
types = list(types_count.keys()) # Complete list of styles
popularity = beer_clean['type'].value_counts(normalize = True) # Proportion of each class
top_ten = popularity[:10] # Top ten most common styles
top_ten_types = list(top_ten.keys()) # Class names

# Print results
print('Total types: ',len(types))
print(" ")
print('Top ten type:')
print(top_ten)
beer_clean = beer_clean.drop(['type'], axis="columns")
beer_clean.head()

Total types:  127
 
Top ten type:
American IPA                      0.180417
American Pale Ale                 0.094288
American Amber Ale                0.029918
Imperial IPA                      0.025385
Saison                            0.024479
Russian Imperial Stout            0.023572
Weizen/Weissbier                  0.023572
Irish Red Ale                     0.021759
Robust Porter                     0.019039
Specialty IPA: New England IPA    0.016319
Name: type, dtype: float64


Unnamed: 0,OG,FG,ABV,SMR,IBU
0,1.062,1.013,6.5,5.2,59.26
1,1.055,1.013,5.58,8.0,39.79
2,1.055,1.013,5.48,4.83,19.44
3,1.061,1.016,5.94,8.5,62.42
4,1.072,1.018,7.09,6.33,232.89


# Encoding and training data

In [15]:
target = beer_clean['ABV']
scaler = StandardScaler()
scaled_numerical_data = scaler.fit_transform(beer_clean)
X_train, X_test, y_train, y_test = train_test_split(scaled_numerical_data, target, test_size=0.3)
reg = Lasso()
reg.fit(X_train, y_train)

Lasso()

# Evaluating the model

In [16]:
pred = reg.predict(X_test)
print('Model\'s R^2 score is:', reg.score(X_test, y_test))
print('Mean absolute error is:', mean_absolute_error(y_test, pred))

Model's R^2 score is: 0.7718010760880153
Mean absolute error is: 0.5760797994067067


In [17]:
pred[:5]

array([5.60523283, 5.41475513, 6.08142709, 6.42005412, 5.32480732])

# Exporting the models

In [18]:
with open("regression.pkl", "wb") as reg_file:
    pickle.dump(reg, reg_file)
    
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)