# Build the pipeline

In [1]:
import numpy as np
import pandas as pd

In [2]:
# import load_data function from 
%load_ext autoreload
%autoreload 2

# fix system path
import sys
sys.path.append("/home/jovyan/work")

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer

In [5]:
# load in raw data
# read in dataset and unzip
df = pd.read_csv('../data/raw/beer_review.zip',compression='zip')

In [6]:
# drop unused columns
df.drop(['brewery_id','review_time','beer_name','review_profilename','review_overall','beer_abv','beer_beerid'],axis=1,inplace=True)

In [7]:
df.isnull().sum()

brewery_name         15
review_aroma          0
review_appearance     0
beer_style            0
review_palate         0
review_taste          0
dtype: int64

In [8]:
df.dropna(inplace=True)

In [9]:
df.reset_index(drop=True,inplace=True)

In [10]:
from src.data.sets import split_sets_random

In [11]:
# split dataset without transformation
X_train, y_train, X_val, y_val, X_test, y_test = split_sets_random(df, target_col='beer_style', test_ratio=0.2, to_numpy=False)

In [12]:
X_train.head()

Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste
996543,De Struise Brouwers,4.0,4.5,5.0,4.5
1044589,Steel Brewing Company,1.5,2.5,3.5,3.5
143716,Cigar City Brewing,3.5,4.0,3.5,3.5
1162000,Dogfish Head Brewery,3.5,3.5,4.0,3.5
763621,Sprecher Brewing Company,4.0,4.0,4.0,4.0


In [13]:
y_train.head()

996543     American Double / Imperial Stout
1044589                American Malt Liquor
143716               Saison / Farmhouse Ale
1162000                      American Stout
763621                          Schwarzbier
Name: beer_style, dtype: object

# build pipeline objects

In [None]:
num_cols = ['review_aroma','review_appearance','review_palate','review_taste']
cat_cols = ['brewery_name','beer_style']

In [14]:
# Create a Pipeline called num_transformer with one step that contains StandardScaler

num_transformer = Pipeline(
    steps=[
        ('scaler', StandardScaler())
    ]
)

In [113]:
# create a function for cat cols in pipepline 
#https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer.html#sphx-glr-auto-examples-compose-plot-column-transformer-py

from sklearn.preprocessing import FunctionTransformer

def cat_label_encoder(data):
    cat_cols = ['brewery_name','beer_style']
    from sklearn.preprocessing import LabelEncoder
    LE = LabelEncoder()
    # create an empty dict to keep label encoder for each cat col
    label_encoders = {}
    # loop over cat_cols and encode
    for cat_col in cat_cols:
        label_encoders[cat_col] = LE
        data[cat_col] = label_encoders[cat_col].fit_transform(data[cat_col])
        
    return data

cat_label_transformer = FunctionTransformer(cat_label_encoder)

In [15]:
from joblib import load

def apply_scaler(obs):
    '''Applies scaler to num_cols'''
    
    num_cols = ['review_aroma','review_appearance','review_palate','review_taste']
    scaler = load('../models/scaler.joblib')
    obs[num_cols] = scaler.fit_transform(obs[num_cols])
    
    return obs

In [17]:
def format_features(brewery_name: int, review_aroma: int, review_appearance: int, review_palate: int, review_taste: int):
      
        return {
        'brewery_name': [brewery_name],
        'review_aroma': [review_aroma],
        'review_appearance': [review_appearance],
        'review_palate': [review_palate],
        'review_taste': [review_taste]
    }

In [18]:
features = format_features(5,2.2,3.1,4.2,1.1)

In [19]:
obs = pd.DataFrame(features)

In [20]:
obs

Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste
0,5,2.2,3.1,4.2,1.1


In [21]:
apply_scaler(obs)

Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste
0,5,0.0,0.0,0.0,0.0


In [30]:
import torch


model = torch.load('../models/model.pt')

In [28]:
from src.models.pytorch import ClassificationEmbdNN

In [31]:
model

ClassificationEmbdNN(
  (emb_layers): ModuleList(
    (0): Embedding(5742, 252)
  )
  (emb_dropout): Dropout(p=0.2, inplace=False)
  (bn_cont): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=256, out_features=208, bias=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (bn1): BatchNorm1d(208, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU()
  (fc2): Linear(in_features=208, out_features=208, bias=True)
  (dropout2): Dropout(p=0.2, inplace=False)
  (bn2): BatchNorm1d(208, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act2): ReLU()
  (fc3): Linear(in_features=208, out_features=104, bias=True)
  (act3): Softmax(dim=None)
)

In [None]:
model.load_state_dict(torch.load('../models/embed_3layers.pt'))