In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import joblib
from sklearn.model_selection import train_test_split
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder, StandardScaler
from src.models.data_process import DataReader

In [3]:
# Load the raw beer review table; the relative path is resolved against the notebook's working directory.
df = pd.read_csv('../data/raw/beer_reviews.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 13 columns):
brewery_id            1586614 non-null int64
brewery_name          1586599 non-null object
review_time           1586614 non-null int64
review_overall        1586614 non-null float64
review_aroma          1586614 non-null float64
review_appearance     1586614 non-null float64
review_profilename    1586266 non-null object
beer_style            1586614 non-null object
review_palate         1586614 non-null float64
review_taste          1586614 non-null float64
beer_name             1586614 non-null object
beer_abv              1518829 non-null float64
beer_beerid           1586614 non-null int64
dtypes: float64(6), int64(3), object(4)
memory usage: 157.4+ MB


In [5]:
df.describe()

Unnamed: 0,brewery_id,review_time,review_overall,review_aroma,review_appearance,review_palate,review_taste,beer_abv,beer_beerid
count,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1518829.0,1586614.0
mean,3130.099,1224089000.0,3.815581,3.735636,3.841642,3.743701,3.79286,7.042387,21712.79
std,5578.104,76544270.0,0.7206219,0.6976167,0.6160928,0.6822184,0.7319696,2.322526,21818.34
min,1.0,840672000.0,0.0,1.0,0.0,1.0,1.0,0.01,3.0
25%,143.0,1173224000.0,3.5,3.5,3.5,3.5,3.5,5.2,1717.0
50%,429.0,1239203000.0,4.0,4.0,4.0,4.0,4.0,6.5,13906.0
75%,2372.0,1288568000.0,4.5,4.0,4.0,4.0,4.5,8.5,39441.0
max,28003.0,1326285000.0,5.0,5.0,5.0,5.0,5.0,57.7,77317.0


## Data Preparation

In [6]:
# Columns that are excluded from the feature set.
drop_cols = [
    'review_time',
    'beer_beerid',
    'brewery_id',
    'beer_abv',
    'beer_name',
    'review_profilename',
    'review_overall',
]
# Work on a copy so the raw frame `df` stays untouched.
df_cleaned = df.copy()

In [7]:
# Derive the cleaned frame from the raw frame rather than dropping in place:
# the cell is then idempotent and can be re-run without a KeyError on
# columns that were already removed.
df_cleaned = df.drop(columns=drop_cols)

In [8]:
#Replace missing brewery names with the placeholder string 'None' (no rows are removed)
df_cleaned['brewery_name'] = df_cleaned['brewery_name'].fillna('None')

In [9]:
df_cleaned.describe()

Unnamed: 0,review_aroma,review_appearance,review_palate,review_taste
count,1586614.0,1586614.0,1586614.0,1586614.0
mean,3.735636,3.841642,3.743701,3.79286
std,0.6976167,0.6160928,0.6822184,0.7319696
min,1.0,0.0,1.0,1.0
25%,3.5,3.5,3.5,3.5
50%,4.0,4.0,4.0,4.0
75%,4.0,4.0,4.0,4.5
max,5.0,5.0,5.0,5.0


In [10]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 6 columns):
brewery_name         1586614 non-null object
review_aroma         1586614 non-null float64
review_appearance    1586614 non-null float64
beer_style           1586614 non-null object
review_palate        1586614 non-null float64
review_taste         1586614 non-null float64
dtypes: float64(4), object(2)
memory usage: 72.6+ MB


## Data Exploration

In [11]:
#Check unique brewery names for encoding
brewery_names = df_cleaned.brewery_name.unique()

In [12]:
# Number of reviews per brewery, most-reviewed first. The cell ends on the
# DataFrame itself so the notebook renders it as a (truncated) table instead
# of the repr of an uncalled method.
(
    df_cleaned.groupby('brewery_name')['brewery_name'].count()
    .reset_index(name='count')
    .sort_values(['count'], ascending=False)
)

<bound method DataFrame.info of                             brewery_name  count
701   Boston Beer Company (Samuel Adams)  39444
1963                Dogfish Head Brewery  33839
4951                   Stone Brewing Co.  33066
4744           Sierra Nevada Brewing Co.  28751
413                 Bell's Brewery, Inc.  25191
...                                  ...    ...
3010                     Karmeliter Bräu      1
3008               Karme AS, Karksi Õlle      1
3007                               Karma      1
508                 Big Tide Brewing Co.      1
5742                Łódzkie Browary S.A.      1

[5743 rows x 2 columns]>

There are 5,743 unique brewery names to encode. That cardinality is too high for one-hot encoding, so `LabelEncoder` is used instead.

In [43]:
# Number of reviews per beer style (the prediction target), most common first.
# The cell ends on the DataFrame itself so the notebook renders it as a table
# instead of the repr of an uncalled method.
(
    df_cleaned.groupby('beer_style')['beer_style'].count()
    .reset_index(name='count')
    .sort_values(['count'], ascending=False)
)

<bound method DataFrame.info of                           beer_style   count
12                      American IPA  117586
9     American Double / Imperial IPA   85977
14           American Pale Ale (APA)   63469
89            Russian Imperial Stout   54129
11  American Double / Imperial Stout   50705
..                               ...     ...
62                              Gose     686
56                              Faro     609
88                        Roggenbier     466
72                             Kvass     297
64                          Happoshu     241

[104 rows x 2 columns]>

The target column `beer_style` has 104 classes. `LabelEncoder` is also used here, to convert the style names into integer class labels for the model to train on.

### Data Transformation

In [14]:
#Use entry 223 for predictions after modelling
df_cleaned.iloc[223]

brewery_name         Caldera Brewing Company
review_aroma                               4
review_appearance                          4
beer_style           American Pale Ale (APA)
review_palate                              4
review_taste                               4
Name: 223, dtype: object

In [15]:
# Numeric review-score columns; these are the ones passed through standard scaling.
num_cols = [
    'review_aroma',
    'review_appearance',
    'review_palate',
    'review_taste',
]

In [16]:
# Project helper; used in the Prediction section to format a single observation.
data_reader = DataReader()

# Standardise the numeric review scores on a copy of the cleaned frame and
# persist the fitted scaler so the same transform can be applied at prediction time.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/validation/test split below - confirm this is intended.
sc = StandardScaler()
df_encoded = df_cleaned.copy()
scaled_scores = sc.fit_transform(df_encoded[num_cols])
df_encoded[num_cols] = scaled_scores
joblib.dump(sc, '../app/stdscale.joblib')

['../app/stdscale.joblib']

In [17]:
# Label-encode the two categorical columns, each with its own encoder, so that
# both fitted encoders stay available in memory and the persisted artefacts do
# not depend on the order in which a shared encoder is refitted and dumped.
brewery_encoder = LabelEncoder()
df_encoded['brewery_name'] = brewery_encoder.fit_transform(df_cleaned['brewery_name'])
joblib.dump(brewery_encoder, '../app/brewnames.joblib')

target_encoder = LabelEncoder()
df_encoded['beer_style'] = target_encoder.fit_transform(df_cleaned['beer_style'])
# Keep the original name bound to the beer_style encoder, as it was after this cell before.
le = target_encoder
joblib.dump(target_encoder, '../app/target.joblib')

['../app/target.joblib']

In [18]:
# Preview the frame after scaling and label encoding:
df_encoded.head()

Unnamed: 0,brewery_name,review_aroma,review_appearance,beer_style,review_palate,review_taste
0,5439,-2.487952,-2.177663,65,-3.288833,-3.132454
1,5439,-1.771225,-1.366096,51,-1.090123,-1.083188
2,5439,-1.771225,-1.366096,59,-1.090123,-1.083188
3,5439,-1.054499,-0.55453,61,-1.823026,-1.083188
4,1480,1.095679,0.257037,9,0.375684,0.966078


## Test / Train splits

In [19]:
# Split off the encoded target. pop() also removes 'beer_style' from df_encoded in place,
# so re-running this cell alone raises KeyError until df_encoded is rebuilt.
target = df_encoded.pop('beer_style')

In [20]:
# 60/20/20 train/validation/test split. Both splits are stratified on the
# target so every split keeps the class proportions of the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    df_encoded, target, test_size=0.2, stratify=target, random_state=1
)
# 25% of the remaining 80% gives a validation set that is 20% of all rows.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, stratify=y_train, random_state=1
)
#Save test/train splits (np.save appends the .npy extension)
np.save('../data/processed/X_train', X_train)
np.save('../data/processed/X_val',   X_val)
np.save('../data/processed/X_test',  X_test)
np.save('../data/processed/y_train', y_train)
np.save('../data/processed/y_val',   y_val)
np.save('../data/processed/y_test',  y_test)

In [21]:
# Convert every split from pandas objects to plain NumPy arrays.
X_train = np.array(X_train)
y_train = np.array(y_train)
X_val = np.array(X_val)
y_val = np.array(y_val)
X_test = np.array(X_test)
y_test = np.array(y_test)

In [22]:
from src.models.pytorch import ClassifierDataset

# Wrap each split as a dataset of float32 feature tensors and int64 label tensors.
train_dataset, val_dataset, test_dataset = (
    ClassifierDataset(torch.from_numpy(features).float(), torch.from_numpy(labels).long())
    for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [None]:
from src.models.pytorch import PytorchDataset
# NOTE(review): this cell rebinds train_dataset / val_dataset / test_dataset, which the
# preceding cell already builds with ClassifierDataset from tensors; here the raw NumPy
# arrays are passed instead. Only one of the two cells should remain - confirm which
# dataset class train_classification / test_classification expect.
train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

In [266]:
# np.unique(..., return_counts=True) returns a (labels, counts) pair, so class_count is a
# 2-tuple and class_weights ends up with two rows: row 0 is 5/label (not meaningful, and
# infinite where the label is 0) and row 1 is 5/count, i.e. inverse-frequency weights.
# The loss is therefore built from class_weights[1], not from class_weights as a whole.
class_count = np.unique(y_train, return_counts=True)
class_weights = 5./torch.tensor(class_count, dtype=torch.float)

In [267]:
from src.models.pytorch import PytorchMultiClass

# The network's input width is the number of feature columns in the training matrix.
n_features = X_train.shape[1]
model = PytorchMultiClass(n_features)

In [268]:
from src.models.pytorch import get_device
# Project helper; presumably picks the torch device to run on - confirm in src.models.pytorch.
device = get_device()
# Move the model's parameters to that device (the returned module is displayed as the cell output).
model.to(device)

PytorchMultiClass(
  (layer_1): Linear(in_features=5, out_features=5, bias=True)
  (layer_out): Linear(in_features=5, out_features=104, bias=True)
  (softmax): Softmax(dim=1)
)

In [269]:
# Class-weighted cross-entropy. Row 1 of class_weights holds the per-class
# 5/count weights. The weight tensor must be on the same device as the model
# outputs, otherwise the loss raises a device-mismatch error on GPU.
# NOTE(review): the model repr shows a Softmax layer, while CrossEntropyLoss
# applies log-softmax itself and expects raw logits - confirm what forward() returns.
criterion = nn.CrossEntropyLoss(weight=class_weights[1].to(device))

In [270]:
# Adam over all model parameters with a learning rate of 0.1.
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [271]:
# Training schedule: number of passes over the training set, and mini-batch size.
N_EPOCHS, BATCH_SIZE = 6, 512

In [None]:
from src.models.pytorch import train_classification, test_classification

# Arguments shared by the training and the validation pass of every epoch.
shared_args = dict(model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)

# One training pass followed by one validation pass per epoch, with both metrics reported.
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset, optimizer=optimizer, **shared_args)
    valid_loss, valid_acc = test_classification(val_dataset, **shared_args)

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.1f}%')

In [None]:
# Persist only the trained weights (state_dict); loading them requires re-instantiating the model class first.
torch.save(model.state_dict(), "../app/pytorch_beer_selector.pt")

In [None]:
# Final evaluation on the held-out test split. Accuracy is reported as a
# percentage, in the same format as the train/validation lines of the training loop.
test_loss, test_acc = test_classification(test_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)
print(f'\tLoss: {test_loss:.4f}\t|\tAccuracy: {test_acc * 100:.1f}%')

## Prediction

In [201]:
#Rebuild row 223 of the cleaned dataset (without its beer_style target) as a single observation to predict on
features = data_reader.format_features('Caldera Brewing Company',4,4,4,4)
obs = pd.DataFrame(features)
obs.iloc[0]

brewery_name         Caldera Brewing Company
review_aroma                               4
review_appearance                          4
review_palate                              4
review_taste                               4
Name: 0, dtype: object

In [202]:
#Encode brewery_name with the LabelEncoder that was fitted and saved during data transformation.
#Note: the column is overwritten in place, so re-running this cell would pass integer codes to transform().
brew_encode = joblib.load('../app/brewnames.joblib')
obs['brewery_name']=brew_encode.transform(obs['brewery_name'])

In [203]:
#Check if encoded:
obs.iloc[0]

brewery_name         1480
review_aroma            4
review_appearance       4
review_palate           4
review_taste            4
Name: 0, dtype: int64

In [204]:
# Apply the saved StandardScaler to the numeric review scores of a copy of the
# observation, then make sure the encoded brewery id is stored as an integer.
scale = joblib.load('../app/stdscale.joblib')
obs_clean = obs.copy()
scaled_obs_scores = scale.transform(obs_clean[num_cols])
obs_clean[num_cols] = scaled_obs_scores
obs_clean['brewery_name'] = obs_clean['brewery_name'].astype(int)

In [205]:
# Convert a copy of the prepared observation: DataFrame -> NumPy array -> torch tensor.
obs_tensor = torch.from_numpy(obs_clean.copy().to_numpy())

In [206]:
print(obs_tensor)

tensor([[1.4800e+03, 3.7895e-01, 2.5704e-01, 3.7568e-01, 2.8299e-01]],
       dtype=torch.float64)


In [261]:
#Make prediction:
device = get_device()
beer_select = PytorchMultiClass(obs_tensor.shape[1])
beer_select.load_state_dict(torch.load('../app/pytorch_beer_selector.pt'))

<All keys matched successfully>

In [262]:
# Inference only: switch the model to eval mode and skip autograd bookkeeping.
beer_select.eval()
obs_tensor = obs_tensor.float()  # the tensor was built as float64; cast to float32 before the forward pass
with torch.no_grad():
    # Index of the highest-scoring class for each row of the input.
    output = beer_select(obs_tensor).argmax(dim=1)

In [263]:
target_encode = joblib.load('../app/target.joblib')

In [264]:
# Map the predicted class index back to its beer_style name using the saved target encoder.
pred = target_encode.inverse_transform(output)

In [265]:
print(pred)

['American Porter']
