In [1]:
%load_ext autoreload
%autoreload 2

In [25]:
import pandas as pd
import numpy as np
import category_encoders as ce
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder


In [3]:
df = pd.read_csv('../data/raw/beer_reviews.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 13 columns):
brewery_id            1586614 non-null int64
brewery_name          1586599 non-null object
review_time           1586614 non-null int64
review_overall        1586614 non-null float64
review_aroma          1586614 non-null float64
review_appearance     1586614 non-null float64
review_profilename    1586266 non-null object
beer_style            1586614 non-null object
review_palate         1586614 non-null float64
review_taste          1586614 non-null float64
beer_name             1586614 non-null object
beer_abv              1518829 non-null float64
beer_beerid           1586614 non-null int64
dtypes: float64(6), int64(3), object(4)
memory usage: 157.4+ MB


In [5]:
df.describe()

Unnamed: 0,brewery_id,review_time,review_overall,review_aroma,review_appearance,review_palate,review_taste,beer_abv,beer_beerid
count,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1518829.0,1586614.0
mean,3130.099,1224089000.0,3.815581,3.735636,3.841642,3.743701,3.79286,7.042387,21712.79
std,5578.104,76544270.0,0.7206219,0.6976167,0.6160928,0.6822184,0.7319696,2.322526,21818.34
min,1.0,840672000.0,0.0,1.0,0.0,1.0,1.0,0.01,3.0
25%,143.0,1173224000.0,3.5,3.5,3.5,3.5,3.5,5.2,1717.0
50%,429.0,1239203000.0,4.0,4.0,4.0,4.0,4.0,6.5,13906.0
75%,2372.0,1288568000.0,4.5,4.0,4.0,4.0,4.5,8.5,39441.0
max,28003.0,1326285000.0,5.0,5.0,5.0,5.0,5.0,57.7,77317.0


## Data Preparation

In [6]:
#Drop unnecessary columns 
df_cleaned = df.copy()
drop_cols = ['review_time', 'beer_beerid', 'brewery_id', 'beer_abv', 'beer_name','review_profilename', 'review_overall']

In [7]:
df_cleaned.drop(drop_cols, axis=1, inplace=True)

In [8]:
#Apply standard scaler to number columns
sc = StandardScaler()
num_cols = ['review_aroma','review_appearance','review_palate','review_taste']

In [9]:
df_cleaned[num_cols] = sc.fit_transform(df_cleaned[num_cols])

In [10]:
df_cleaned.info(20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 6 columns):
brewery_name         1586599 non-null object
review_aroma         1586614 non-null float64
review_appearance    1586614 non-null float64
beer_style           1586614 non-null object
review_palate        1586614 non-null float64
review_taste         1586614 non-null float64
dtypes: float64(4), object(2)
memory usage: 72.6+ MB


In [11]:
#Check unique brewery names for encodign
brewery_names = df_cleaned.brewery_name.unique()

In [12]:
df_cleaned.groupby('brewery_name')['brewery_name'].count()\
                             .reset_index(name='count') \
                             .sort_values(['count'], ascending=False) \
                             .info

<bound method DataFrame.info of                             brewery_name  count
701   Boston Beer Company (Samuel Adams)  39444
1963                Dogfish Head Brewery  33839
4950                   Stone Brewing Co.  33066
4743           Sierra Nevada Brewing Co.  28751
413                 Bell's Brewery, Inc.  25191
...                                  ...    ...
4674                      Science Infuse      1
584                      Bir&#371; Alus      1
581                  Birrificio Valscura      1
1487       Calwer-Eck-Bräu GmbH & Co. KG      1
5741                Łódzkie Browary S.A.      1

[5742 rows x 2 columns]>

In [13]:
df_cleaned.groupby('beer_style')['beer_style'].count()\
                             .reset_index(name='count') \
                             .sort_values(['count'], ascending=False) \
                             .info()
                             

<class 'pandas.core.frame.DataFrame'>
Int64Index: 104 entries, 12 to 64
Data columns (total 2 columns):
beer_style    104 non-null object
count         104 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.4+ KB


In [14]:
#Trial target encoding
#from category_encoders import TargetEncoder
#encoder = TargetEncoder()
#df_cleaned['brewery_encoded'] = encoder.fit_transform(df_cleaned['brewery_name'], df_cleaned['beer_style'])

In [49]:
df_encoded = df_cleaned.copy()
#target_encode_multiclass(df_encoded,df_encoded['beer_style'])

#Clean NaN values to 0:
df_encoded['brewery_name'] = df_encoded['brewery_name'].fillna('None')

In [50]:
ce_basen = ce.BaseNEncoder(cols=['brewery_name'], return_df=True,base=8)

In [51]:
df_encoded = ce_basen.fit_transform(df_encoded)

In [52]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 11 columns):
brewery_name_0       1586614 non-null int64
brewery_name_1       1586614 non-null int64
brewery_name_2       1586614 non-null int64
brewery_name_3       1586614 non-null int64
brewery_name_4       1586614 non-null int64
brewery_name_5       1586614 non-null int64
review_aroma         1586614 non-null float64
review_appearance    1586614 non-null float64
beer_style           1586614 non-null object
review_palate        1586614 non-null float64
review_taste         1586614 non-null float64
dtypes: float64(4), int64(6), object(1)
memory usage: 133.2+ MB


In [53]:
#Apply Label encoding to Beer Style as Target
le = LabelEncoder()
df_encoded['beer_style'] = le.fit_transform(df_encoded['beer_style'])

In [54]:
df_encoded.describe()

Unnamed: 0,brewery_name_0,brewery_name_1,brewery_name_2,brewery_name_3,brewery_name_4,brewery_name_5,review_aroma,review_appearance,beer_style,review_palate,review_taste
count,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0
mean,0.0,0.2854551,2.625997,3.465662,3.301558,3.648495,1.431642e-16,-1.473918e-16,42.14243,1.214531e-16,-1.546288e-16
std,0.0,0.451631,2.209936,2.412487,2.15005,2.42991,1.0,1.0,33.02084,1.0,1.0
min,0.0,0.0,0.0,0.0,0.0,0.0,-3.921404,-6.235494,0.0,-4.021736,-3.815543
25%,0.0,0.0,1.0,1.0,2.0,2.0,-0.3377731,-0.5545298,12.0,-0.3572191,-0.4000993
50%,0.0,0.0,2.0,3.0,3.0,4.0,0.3789531,0.2570365,31.0,0.3756843,0.2829894
75%,0.0,1.0,4.0,6.0,5.0,6.0,0.3789531,0.2570365,74.0,0.3756843,0.966078
max,0.0,1.0,7.0,7.0,7.0,7.0,1.812405,1.880169,103.0,1.841491,1.649167


In [56]:
target = df_encoded.pop('beer_style')
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_encoded, target, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)
#Save test/train splits
np.save('../data/processed/X_train', X_train)
np.save('../data/processed/X_val',   X_val)
np.save('../data/processed/y_train', y_train)
np.save('../data/processed/y_val',   y_val)

In [57]:
X_train, y_train = np.array(X_train), np.array(y_train)
X_val, y_val = np.array(X_val), np.array(y_val)
X_test, y_test = np.array(X_test), np.array(y_test)

In [58]:
from src.models.pytorch import ClassifierDataset
train_dataset = ClassifierDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).long())
val_dataset = ClassifierDataset(torch.from_numpy(X_val).float(), torch.from_numpy(y_val).long())
test_dataset = ClassifierDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).long())

In [67]:
from src.models.pytorch import PytorchDataset

train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

In [132]:
from src.models.pytorch import PytorchMultiClass

model = PytorchMultiClass(X_train.shape[1])

In [133]:
from src.models.pytorch import get_device

device = get_device()
model.to(device)

PytorchMultiClass(
  (layer_1): Linear(in_features=10, out_features=128, bias=True)
  (layer_2): Linear(in_features=128, out_features=128, bias=True)
  (layer_out): Linear(in_features=128, out_features=104, bias=True)
  (softmax): Softmax(dim=1)
)

In [134]:
criterion = nn.CrossEntropyLoss()

In [135]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [136]:
N_EPOCHS = 10
BATCH_SIZE = 64

In [None]:
from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset, model=model, criterion=criterion, optimizer=optimizer, batch_size=BATCH_SIZE, device=device)
    valid_loss, valid_acc = test_classification(val_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.1f}%')

Epoch: 0
	(train)	|	Loss: 0.0720	|	Acc: 5.5%
	(valid)	|	Loss: 0.0717	|	Acc: 7.6%
Epoch: 1
	(train)	|	Loss: 0.0717	|	Acc: 7.3%
	(valid)	|	Loss: 0.0717	|	Acc: 7.5%
Epoch: 2
	(train)	|	Loss: 0.0717	|	Acc: 7.1%
	(valid)	|	Loss: 0.0717	|	Acc: 7.4%
Epoch: 3
	(train)	|	Loss: 0.0717	|	Acc: 7.4%
	(valid)	|	Loss: 0.0717	|	Acc: 7.4%
Epoch: 4
	(train)	|	Loss: 0.0717	|	Acc: 7.4%
	(valid)	|	Loss: 0.0717	|	Acc: 7.4%
Epoch: 5
	(train)	|	Loss: 0.0717	|	Acc: 7.4%
	(valid)	|	Loss: 0.0717	|	Acc: 7.4%
Epoch: 6
	(train)	|	Loss: 0.0717	|	Acc: 7.4%
	(valid)	|	Loss: 0.0717	|	Acc: 7.4%
Epoch: 7
	(train)	|	Loss: 0.0717	|	Acc: 7.4%
	(valid)	|	Loss: 0.0717	|	Acc: 7.4%


In [None]:
torch.save(model, "../models/pytorch_beer_selector.pt")

In [125]:
test_loss, test_acc = test_classification(test_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)
print(f'\tLoss: {test_loss:.4f}\t|\tAccuracy: {test_acc:.1f}')

	Loss: 0.0717	|	Accuracy: 0.1
