# Data Exploration and cleaning

In [1]:
import numpy as np
import pandas as pd

In [2]:
# import load_data function from 
%load_ext autoreload
%autoreload 2

# fix system path
import sys
sys.path.append("/home/jovyan/work")

In [3]:
# read in dataset and unzip
df = pd.read_csv('../data/raw/beer_review.zip',compression='zip')

In [4]:
df.head()

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [5]:
df.shape

(1586614, 13)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 13 columns):
brewery_id            1586614 non-null int64
brewery_name          1586599 non-null object
review_time           1586614 non-null int64
review_overall        1586614 non-null float64
review_aroma          1586614 non-null float64
review_appearance     1586614 non-null float64
review_profilename    1586266 non-null object
beer_style            1586614 non-null object
review_palate         1586614 non-null float64
review_taste          1586614 non-null float64
beer_name             1586614 non-null object
beer_abv              1518829 non-null float64
beer_beerid           1586614 non-null int64
dtypes: float64(6), int64(3), object(4)
memory usage: 157.4+ MB


In [7]:
df.isnull().sum()

brewery_id                0
brewery_name             15
review_time               0
review_overall            0
review_aroma              0
review_appearance         0
review_profilename      348
beer_style                0
review_palate             0
review_taste              0
beer_name                 0
beer_abv              67785
beer_beerid               0
dtype: int64

In [8]:
67785/1586614

0.042723056773733246

Only 4% of the data is missing, since there are already 1.5 mill observations then we can just delete the nulls

In [9]:
# count the number of targets
df.beer_style.value_counts()

American IPA                        117586
American Double / Imperial IPA       85977
American Pale Ale (APA)              63469
Russian Imperial Stout               54129
American Double / Imperial Stout     50705
                                     ...  
Gose                                   686
Faro                                   609
Roggenbier                             466
Kvass                                  297
Happoshu                               241
Name: beer_style, Length: 104, dtype: int64

There are 104 different beer types we need to predict, so our output layer will be of size 104 and the last function will be softmax

In [10]:
df.beer_style.nunique()

104

In [11]:
df.beer_name.value_counts()

90 Minute IPA                          3290
India Pale Ale                         3130
Old Rasputin Russian Imperial Stout    3111
Sierra Nevada Celebration Ale          3000
Two Hearted Ale                        2728
                                       ... 
Viven Bruin                               1
American Hero                             1
Bourbon Barrel-Aged Adoration Ale         1
Hullabaloo                                1
BAB 401                                   1
Name: beer_name, Length: 56857, dtype: int64

In [12]:
df.brewery_name.value_counts()

Boston Beer Company (Samuel Adams)    39444
Dogfish Head Brewery                  33839
Stone Brewing Co.                     33066
Sierra Nevada Brewing Co.             28751
Bell's Brewery, Inc.                  25191
                                      ...  
Panimoravintola Koulu                     1
MonteKristo Beverages                     1
Doshisha Corporation                      1
Karmeliter Bräu                           1
Malmö Brygghus                            1
Name: brewery_name, Length: 5742, dtype: int64

In [13]:
df.review_profilename.value_counts()

northyorksammy    5817
BuckeyeNation     4661
mikesgroove       4617
Thorpe429         3518
womencantsail     3497
                  ... 
hughps               1
andascia             1
spacewolf11          1
dkboileau            1
erosannin            1
Name: review_profilename, Length: 33387, dtype: int64

In [20]:
df.describe()

Unnamed: 0,brewery_id,review_time,review_overall,review_aroma,review_appearance,review_palate,review_taste,beer_abv,beer_beerid
count,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1586614.0,1518829.0,1586614.0
mean,3130.099,1224089000.0,3.815581,3.735636,3.841642,3.743701,3.79286,7.042387,21712.79
std,5578.104,76544270.0,0.7206219,0.6976167,0.6160928,0.6822184,0.7319696,2.322526,21818.34
min,1.0,840672000.0,0.0,1.0,0.0,1.0,1.0,0.01,3.0
25%,143.0,1173224000.0,3.5,3.5,3.5,3.5,3.5,5.2,1717.0
50%,429.0,1239203000.0,4.0,4.0,4.0,4.0,4.0,6.5,13906.0
75%,2372.0,1288568000.0,4.5,4.0,4.0,4.0,4.5,8.5,39441.0
max,28003.0,1326285000.0,5.0,5.0,5.0,5.0,5.0,57.7,77317.0


In [32]:
df.describe(include='all')

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
count,1586614.0,1586599,1586614.0,1586614.0,1586614.0,1586614.0,1586266,1586614,1586614.0,1586614.0,1586614,1518829.0,1586614.0
unique,,5742,,,,,33387,104,,,56857,,
top,,Boston Beer Company (Samuel Adams),,,,,northyorksammy,American IPA,,,90 Minute IPA,,
freq,,39444,,,,,5817,117586,,,3290,,
mean,3130.099,,1224089000.0,3.815581,3.735636,3.841642,,,3.743701,3.79286,,7.042387,21712.79
std,5578.104,,76544270.0,0.7206219,0.6976167,0.6160928,,,0.6822184,0.7319696,,2.322526,21818.34
min,1.0,,840672000.0,0.0,1.0,0.0,,,1.0,1.0,,0.01,3.0
25%,143.0,,1173224000.0,3.5,3.5,3.5,,,3.5,3.5,,5.2,1717.0
50%,429.0,,1239203000.0,4.0,4.0,4.0,,,4.0,4.0,,6.5,13906.0
75%,2372.0,,1288568000.0,4.5,4.0,4.0,,,4.0,4.5,,8.5,39441.0


In [29]:
df.columns

Index(['brewery_id', 'brewery_name', 'review_time', 'review_overall',
       'review_aroma', 'review_appearance', 'review_profilename', 'beer_style',
       'review_palate', 'review_taste', 'beer_name', 'beer_abv',
       'beer_beerid'],
      dtype='object')

Columns to remove:
- Brewery_id: Use this as the label encoder?
- review_time: not relevant
- beer_name: very indicative of beer style, not to be included
- review_profilename: not relevant
- review_overall: not sure yet
- beer_abv: very indicative of beer style
- beer_beerid: not needed

Have to include
- brewery_name
- review_aroma
- review_appearance
- review_palate
- review_taste

Target
- beer_style


# Create datasets

In [40]:
df_cleaned = df.copy()

In [41]:
df_cleaned.drop(['brewery_id','review_time','beer_name','review_profilename','review_overall','beer_abv','beer_beerid'],axis=1,inplace=True)

In [42]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 6 columns):
brewery_name         1586599 non-null object
review_aroma         1586614 non-null float64
review_appearance    1586614 non-null float64
beer_style           1586614 non-null object
review_palate        1586614 non-null float64
review_taste         1586614 non-null float64
dtypes: float64(4), object(2)
memory usage: 72.6+ MB


In [43]:
df_cleaned.isnull().sum()

brewery_name         15
review_aroma          0
review_appearance     0
beer_style            0
review_palate         0
review_taste          0
dtype: int64

In [44]:
df_cleaned.dropna(inplace=True)

In [45]:
df_cleaned.isnull().sum()

brewery_name         0
review_aroma         0
review_appearance    0
beer_style           0
review_palate        0
review_taste         0
dtype: int64

In [46]:
# need to use drop=True to drop the index column created
df_cleaned.reset_index(drop=True,inplace=True)

In [47]:
df_cleaned.head()

Unnamed: 0,brewery_name,review_aroma,review_appearance,beer_style,review_palate,review_taste
0,Vecchio Birraio,2.0,2.5,Hefeweizen,1.5,1.5
1,Vecchio Birraio,2.5,3.0,English Strong Ale,3.0,3.0
2,Vecchio Birraio,2.5,3.0,Foreign / Export Stout,3.0,3.0
3,Vecchio Birraio,3.0,3.5,German Pilsener,2.5,3.0
4,Caldera Brewing Company,4.5,4.0,American Double / Imperial IPA,4.0,4.5


In [48]:
df_cleaned.describe(include='all')

Unnamed: 0,brewery_name,review_aroma,review_appearance,beer_style,review_palate,review_taste
count,1586599,1586599.0,1586599.0,1586599,1586599.0,1586599.0
unique,5742,,,104,,
top,Boston Beer Company (Samuel Adams),,,American IPA,,
freq,39444,,,117584,,
mean,,3.735638,3.841647,,3.743705,3.792864
std,,0.6976142,0.6160899,,0.6822131,0.7319658
min,,1.0,0.0,,1.0,1.0
25%,,3.5,3.5,,3.5,3.5
50%,,4.0,4.0,,4.0,4.0
75%,,4.0,4.0,,4.0,4.5


In [49]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586599 entries, 0 to 1586598
Data columns (total 6 columns):
brewery_name         1586599 non-null object
review_aroma         1586599 non-null float64
review_appearance    1586599 non-null float64
beer_style           1586599 non-null object
review_palate        1586599 non-null float64
review_taste         1586599 non-null float64
dtypes: float64(4), object(2)
memory usage: 72.6+ MB


# Encode categorical variables brewery_name and beer_style (target)

https://yashuseth.blog/2018/07/22/pytorch-neural-network-for-tabular-data-with-categorical-embeddings/

https://www.fast.ai/2018/04/29/categorical-embeddings/

https://www.kaggle.com/vadbeg/pytorch-nn-with-embeddings-and-catboost/notebook#Features-encoding

https://stackabuse.com/introduction-to-pytorch-for-classification/

In [50]:
from sklearn.preprocessing import LabelEncoder

In [53]:
# instantiate label encoder
LE = LabelEncoder()

In [54]:
cat_cols = ['brewery_name','beer_style']

In [415]:
# fit transform cat variables - 

label_encoders = {}
for cat_col in cat_cols:
        label_encoders[cat_col] = LE
        df_cleaned[cat_col] = label_encoders[cat_col].fit_transform(df_cleaned[cat_col])

In [57]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586599 entries, 0 to 1586598
Data columns (total 6 columns):
brewery_name         1586599 non-null int64
review_aroma         1586599 non-null float64
review_appearance    1586599 non-null float64
beer_style           1586599 non-null int64
review_palate        1586599 non-null float64
review_taste         1586599 non-null float64
dtypes: float64(4), int64(2)
memory usage: 72.6 MB


In [59]:
df_cleaned[['brewery_name','beer_style']].describe()

Unnamed: 0,brewery_name,beer_style
count,1586599.0,1586599.0
mean,2905.176,42.14239
std,1729.725,33.02083
min,0.0,0.0
25%,1325.0,12.0
50%,2880.0,31.0
75%,4581.0,74.0
max,5741.0,103.0


In [60]:
label_encoders

{'brewery_name': LabelEncoder(), 'beer_style': LabelEncoder()}

In [91]:
# save encoder
from joblib import dump

dump(label_encoders, '../models/label_encoders.joblib')

['../models/label_encoders.joblib']

In [61]:
# Need to StandardScaler the numeric columns
num_cols = ['review_aroma','review_appearance','review_palate','review_taste']

In [62]:
# instantiate scaler
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

In [92]:
dump(sc, '../models/scaler.joblib')

['../models/scaler.joblib']

In [63]:
df_cleaned[num_cols] = sc.fit_transform(df_cleaned[num_cols])

In [64]:
df_cleaned.head()

Unnamed: 0,brewery_name,review_aroma,review_appearance,beer_style,review_palate,review_taste
0,5438,-2.487964,-2.177682,65,-3.288863,-3.132476
1,5438,-1.771235,-1.366111,51,-1.090136,-1.083199
2,5438,-1.771235,-1.366111,59,-1.090136,-1.083199
3,5438,-1.054506,-0.554541,61,-1.823045,-1.083199
4,1480,1.09568,0.257029,9,0.375682,0.966078


In [65]:
from src.data.sets import split_sets_random

In [66]:
X_train, y_train, X_val, y_val, X_test, y_test = split_sets_random(df_cleaned, target_col='beer_style', test_ratio=0.2, to_numpy=True)

In [68]:
print(X_train.shape)
print(y_train.shape)

(951959, 5)
(951959,)


In [69]:
print(X_val.shape)
print(X_test.shape)

(317320, 5)
(317320, 5)


In [75]:
from src.data.sets import save_sets

save_sets(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, X_test=X_test, y_test=y_test)

In [71]:
X_val

array([[ 4.88400000e+03,  3.78951241e-01,  2.57028931e-01,
         3.75681950e-01,  2.82985337e-01],
       [ 1.63000000e+02, -1.05450621e+00, -5.54541255e-01,
        -1.09013615e+00, -1.08319920e+00],
       [ 2.17000000e+02, -3.37777485e-01,  2.57028931e-01,
        -3.57227099e-01,  2.82985337e-01],
       ...,
       [ 1.26600000e+03,  3.78951241e-01,  1.06859912e+00,
         3.75681950e-01,  2.82985337e-01],
       [ 3.58000000e+03,  3.78951241e-01,  2.57028931e-01,
         1.10859100e+00,  1.64916987e+00],
       [ 4.70900000e+03, -1.05450621e+00,  1.06859912e+00,
        -3.57227099e-01, -4.00106932e-01]])

In [73]:
pwd()

'/home/jovyan/work/notebooks'

In [76]:
# need to know which are cat cols and which are num cols
categorical = df_cleaned.drop(['beer_style'] + num_cols,
                          axis=1).columns


In [77]:
categorical

Index(['brewery_name'], dtype='object')

In [80]:
cat_cols_idx, cont_cols_idx = list(), list()

for idx, column in enumerate(df_cleaned.drop('beer_style',
                                         axis=1).columns):
    if column in categorical:
        cat_cols_idx.append(idx)
    elif column in num_cols:
        cont_cols_idx.append(idx)

In [81]:
print(cat_cols_idx)
print(cont_cols_idx)

[0]
[1, 2, 3, 4]


In [87]:
cat_dim = [int(df_cleaned[col].nunique()) for col in cat_cols]
cat_dim = [[x, min(200, (x + 1) // 2)] for x in cat_dim]

for el in cat_dim:
    if el[0] < 10:
        el[1] = el[0]

cat_dim


[[5742, 200], [104, 52]]

In [88]:
no_of_embs = sum([y for x, y in cat_dim])

In [89]:
no_of_embs

252

# Create a pipeline

In [93]:
from sklearn.pipeline import Pipeline

In [94]:
# Create a Pipeline called num_transformer with one step that contains StandardScaler

num_transformer = Pipeline(
    steps=[
        ('scaler', StandardScaler())
    ]
)

In [113]:
# create a function for cat cols in pipepline 
#https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer.html#sphx-glr-auto-examples-compose-plot-column-transformer-py

from sklearn.preprocessing import FunctionTransformer

def cat_label_encoder(data):
    cat_cols = ['brewery_name','beer_style']
    from sklearn.preprocessing import LabelEncoder
    LE = LabelEncoder()
    # create an empty dict to keep label encoder for each cat col
    label_encoders = {}
    # loop over cat_cols and encode
    for cat_col in cat_cols:
        label_encoders[cat_col] = LE
        data[cat_col] = label_encoders[cat_col].fit_transform(data[cat_col])
        
    return data

cat_label_transformer = FunctionTransformer(cat_label_encoder)

In [156]:
def cat_label_encoder(data):
    cat_cols = ['brewery_name','beer_style']
    from sklearn.preprocessing import LabelEncoder
    from joblib import load

    #label_encoders = load('../models/label_encoders.joblib')
    # create an empty dict to keep label encoder for each cat col
    data[cat_cols[0]] = label_encoders['brewery_name'].fit_transform(data[cat_cols[0]])
    data[cat_cols[1]] = label_encoders['beer_style'].fit_transform(data[cat_cols[1]])
        
    return data

cat_label_transformer = FunctionTransformer(cat_label_encoder)

In [152]:
def style_label_encoder(data):
    name_col = ['beer_style']
    from sklearn.preprocessing import LabelEncoder
    from joblib import load

    #label_encoders = load('../models/label_encoders.joblib')
    # create an empty dict to keep label encoder for each cat col
    data[name_col] = label_encoders['beer_style'].fit_transform(data[name_col])
        
    return data

style_label_transformer = FunctionTransformer(style_label_encoder)

In [154]:
name_label_transformer = FunctionTransformer(label_encoders['brewery_name'])
style_label_transformer = FunctionTransformer(label_encoders['beer_style'])

In [134]:
brewery_name = ['brewery_name']
beer_style = ['beer_style']

In [160]:
style_label_transformer.fit_transform(df_sample)

TypeError: 'LabelEncoder' object is not callable

In [96]:
from sklearn.compose import ColumnTransformer

In [177]:
# Solution:
preprocessor = ColumnTransformer(
    transformers=[
        ('num_cols', sc, num_cols)
    ], remainder = 'passthrough'
)

In [261]:
# try on small dataset
df_sample = df.iloc[0:10,:]

In [262]:
df_sample.drop(['brewery_id','review_time','beer_name','review_profilename','review_overall','beer_abv','beer_beerid','beer_style'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [263]:
df_sample

Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste
0,Vecchio Birraio,2.0,2.5,1.5,1.5
1,Vecchio Birraio,2.5,3.0,3.0,3.0
2,Vecchio Birraio,2.5,3.0,3.0,3.0
3,Vecchio Birraio,3.0,3.5,2.5,3.0
4,Caldera Brewing Company,4.5,4.0,4.0,4.5
5,Caldera Brewing Company,3.5,3.5,3.0,3.5
6,Caldera Brewing Company,3.5,3.5,4.0,4.0
7,Caldera Brewing Company,2.5,3.5,2.0,3.5
8,Caldera Brewing Company,3.0,3.5,3.5,4.0
9,Caldera Brewing Company,3.5,5.0,4.0,4.0


In [196]:
df_sample.to_csv('../data/interim/df_sample.csv')

In [187]:
preprocess_pipe = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('cat_cols', rev_encode_transformer)])

In [188]:
df_new = preprocess_pipe.fit_transform(df_sample)

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [186]:
def rev_encode(data):
    data.iloc[:,0] = label_encoders['brewery_name'].fit_transform(data.iloc[:,0])
    data.iloc[:,3] = label_encoders['beer_style'].fit_transform(data.iloc[:,3])
    
    return data
rev_encode_transformer = FunctionTransformer(rev_encode)

In [121]:
df_new

array([[-1.52752523, -1.58113883, -1.89010336, -2.375     ,  1.        ,
         4.        ],
       [-0.80013226, -0.79056942, -0.06097108, -0.5       ,  1.        ,
         1.        ],
       [-0.80013226, -0.79056942, -0.06097108, -0.5       ,  1.        ,
         2.        ],
       [-0.0727393 ,  0.        , -0.67068184, -0.5       ,  1.        ,
         3.        ],
       [ 2.10943961,  0.79056942,  1.15845045,  1.375     ,  0.        ,
         0.        ],
       [ 0.65465367,  0.        , -0.06097108,  0.125     ,  0.        ,
         5.        ],
       [ 0.65465367,  0.        ,  1.15845045,  0.75      ,  0.        ,
         5.        ],
       [-0.80013226,  0.        , -1.2803926 ,  0.125     ,  0.        ,
         5.        ],
       [-0.0727393 ,  0.        ,  0.54873968,  0.75      ,  0.        ,
         5.        ],
       [ 0.65465367,  2.37170825,  1.15845045,  0.75      ,  0.        ,
         5.        ]])

In [146]:
df_num = num_transformer.fit_transform(df_sample[num_cols])

In [147]:
df_num

array([[-1.52752523, -1.58113883, -1.89010336, -2.375     ],
       [-0.80013226, -0.79056942, -0.06097108, -0.5       ],
       [-0.80013226, -0.79056942, -0.06097108, -0.5       ],
       [-0.0727393 ,  0.        , -0.67068184, -0.5       ],
       [ 2.10943961,  0.79056942,  1.15845045,  1.375     ],
       [ 0.65465367,  0.        , -0.06097108,  0.125     ],
       [ 0.65465367,  0.        ,  1.15845045,  0.75      ],
       [-0.80013226,  0.        , -1.2803926 ,  0.125     ],
       [-0.0727393 ,  0.        ,  0.54873968,  0.75      ],
       [ 0.65465367,  2.37170825,  1.15845045,  0.75      ]])

In [185]:
label_encoders['brewery_name'].fit_transform(df_sample.iloc[:,0])

array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0])

In [172]:
label_encoders['brewery_name']

LabelEncoder()

In [175]:
rev_encode(df_sample)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,brewery_name,review_aroma,review_appearance,beer_style,review_palate,review_taste
0,1,2.0,2.5,4,1.5,1.5
1,1,2.5,3.0,1,3.0,3.0
2,1,2.5,3.0,2,3.0,3.0
3,1,3.0,3.5,3,2.5,3.0
4,0,4.5,4.0,0,4.0,4.5
5,0,3.5,3.5,5,3.0,3.5
6,0,3.5,3.5,5,4.0,4.0
7,0,2.5,3.5,5,2.0,3.5
8,0,3.0,3.5,5,3.5,4.0
9,0,3.5,5.0,5,4.0,4.0


In [251]:
def format_features(brewery_name: int, review_aroma: int, review_appearance: int, review_palate: int, review_taste: int):
      
        return {
        'brewery_name': [brewery_name],
        'review_aroma': [review_aroma],
        'review_appearance': [review_appearance],
        'review_palate': [review_palate],
        'review_taste': [review_taste]
    }

In [252]:
features = format_features(5,2.2,3.1,4.2,1.1)

In [253]:
obs = pd.DataFrame(features)

In [254]:
obs

Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste
0,5,2.2,3.1,4.2,1.1


In [255]:
obs[num_cols] = sc.fit_transform(obs[num_cols])

In [302]:
obs['brewery_name']

0    5
Name: brewery_name, dtype: int64

In [246]:
obs_np = obs.to_numpy()

In [305]:
def single_tensor(obs):
    """Converts single row to tensor """
    data_cat = []
    data_cont = []
    num_cols = ['review_aroma','review_appearance','review_palate','review_taste']
    data_cat = torch.tensor(obs['brewery_name'].to_numpy())
    data_cont = torch.tensor(obs[num_cols].to_numpy())
                
    data = [data_cat, data_cont]
    result = {'data': data}
    
    return result

In [307]:
obs_tensor = single_tensor(obs)

In [264]:
df_sample['brewery_name'] = label_encoders['brewery_name'].fit_transform(df_sample['brewery_name'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [265]:
df_sample

Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste
0,1,2.0,2.5,1.5,1.5
1,1,2.5,3.0,3.0,3.0
2,1,2.5,3.0,3.0,3.0
3,1,3.0,3.5,2.5,3.0
4,0,4.5,4.0,4.0,4.5
5,0,3.5,3.5,3.0,3.5
6,0,3.5,3.5,4.0,4.0
7,0,2.5,3.5,2.0,3.5
8,0,3.0,3.5,3.5,4.0
9,0,3.5,5.0,4.0,4.0


In [268]:
df_sample[num_cols] = sc.fit_transform(df_sample[num_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [269]:
df_sample

Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste
0,1,-1.527525,-1.581139,-1.890103,-2.375
1,1,-0.800132,-0.790569,-0.060971,-0.5
2,1,-0.800132,-0.790569,-0.060971,-0.5
3,1,-0.072739,0.0,-0.670682,-0.5
4,0,2.10944,0.790569,1.15845,1.375
5,0,0.654654,0.0,-0.060971,0.125
6,0,0.654654,0.0,1.15845,0.75
7,0,-0.800132,0.0,-1.280393,0.125
8,0,-0.072739,0.0,0.54874,0.75
9,0,0.654654,2.371708,1.15845,0.75


In [336]:
class EmbeddingDataset(Dataset):
    def __init__(self, data, targets=None,
                 is_train=True, cat_cols_idx=None,
                 cont_cols_idx=None):
        self.data = data
        self.targets = targets
        self.is_train = is_train
        self.cat_cols_idx = cat_cols_idx
        self.cont_cols_idx = cont_cols_idx
    
    def __getitem__(self, idx):
        row = self.data[idx].astype('float32')
        
        data_cat = []
        data_cont = []
        
        result = None
        
        if self.cat_cols_idx:
            data_cat = torch.tensor(row[self.cat_cols_idx])
            
        if self.cont_cols_idx:
            data_cont = torch.tensor(row[self.cont_cols_idx])
                
        data = [data_cat, data_cont]
                
        if self.is_train:
            result = {'data': data,
                      'target': torch.tensor(self.targets[idx])}
        else:
            result = {'data': data}
            
        return result
            
    
    def __len__(self):
        return(len(self.data))

In [337]:
from torch.utils.data import Dataset
sample_embed = EmbeddingDataset(df_sample.to_numpy(),
                             cat_cols_idx=[0],
                             cont_cols_idx=[1,2,3,4],
                             is_train=False)

In [338]:
sample_embed

<__main__.EmbeddingDataset at 0x7f8a1c87b410>

In [228]:
# embedding example
class ClassificationEmbdNN(torch.nn.Module):
    
    def __init__(self, emb_dims, no_of_cont=None):
        super(ClassificationEmbdNN, self).__init__()
        
        self.emb_layers = torch.nn.ModuleList([torch.nn.Embedding(x, y)
                                               for x, y in emb_dims])
        
        no_of_embs = sum([y for x, y in emb_dims])
        self.no_of_embs = no_of_embs
        self.emb_dropout = torch.nn.Dropout(0.2)
        
        self.no_of_cont = 0
        if no_of_cont:
            self.no_of_cont = no_of_cont
            self.bn_cont = torch.nn.BatchNorm1d(no_of_cont)
        
        self.fc1 = torch.nn.Linear(in_features=self.no_of_embs + self.no_of_cont, 
                                   out_features=208)
        self.dropout1 = torch.nn.Dropout(0.2)
        self.bn1 = torch.nn.BatchNorm1d(208)
        self.act1 = torch.nn.ReLU()
        
        self.fc2 = torch.nn.Linear(in_features=208, 
                                   out_features=208)
        self.dropout2 = torch.nn.Dropout(0.2)
        self.bn2 = torch.nn.BatchNorm1d(208)
        self.act2 = torch.nn.ReLU()
        
#         self.fc3 = torch.nn.Linear(in_features=256, 
#                                    out_features=64)
#         self.dropout3 = torch.nn.Dropout(0.2)
#         self.bn3 = torch.nn.BatchNorm1d(64)
#         self.act3 = torch.nn.ReLU()
        
        self.fc3 = torch.nn.Linear(in_features=208, 
                                   out_features=104)
        self.act3 = torch.nn.Softmax()
        
    def forward(self, x_cat, x_cont=None):
        if self.no_of_embs != 0:
            x = [emb_layer(x_cat[:, i])
                 for i, emb_layer in enumerate(self.emb_layers)]
        
            x = torch.cat(x, 1)
            x = self.emb_dropout(x)
            
        if self.no_of_cont != 0:
            x_cont = self.bn_cont(x_cont)
            
            if self.no_of_embs != 0:
                x = torch.cat([x, x_cont], 1)
            else:
                x = x_cont
        
        x = self.fc1(x)
        x = self.dropout1(x)
        x = self.bn1(x)
        x = self.act1(x)
        
        x = self.fc2(x)
        x = self.dropout2(x)
        x = self.bn2(x)
        x = self.act2(x)
        
#         x = self.fc3(x)
#         x = self.dropout3(x)
#         x = self.bn3(x)
#         x = self.act3(x)
        
        x = self.fc3(x)
        x = self.act3(x)
        
        return x

In [229]:
model = ClassificationEmbdNN(emb_dims=[[5742, 252]], 
                             no_of_cont=4)

In [231]:
model.load_state_dict(torch.load('../models/embed_3layers.pt'))

<All keys matched successfully>

In [308]:
obs_tensor

{'data': [tensor([5]), tensor([[0., 0., 0., 0.]], dtype=torch.float64)]}

In [395]:
# get model and load
def get_model():
    
    # load model obj
    model = torch.load('../models/model.pt')
    #set to trained dict of weights
    model.load_state_dict(torch.load('../models/embed_3layers.pt'))
#     model = ClassificationEmbdNN(emb_dims=[[5742, 252]], 
#                              no_of_cont=4)
    return model

In [396]:
model = get_model()

In [397]:
model

ClassificationEmbdNN(
  (emb_layers): ModuleList(
    (0): Embedding(5742, 252)
  )
  (emb_dropout): Dropout(p=0.2, inplace=False)
  (bn_cont): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=256, out_features=208, bias=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (bn1): BatchNorm1d(208, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU()
  (fc2): Linear(in_features=208, out_features=208, bias=True)
  (dropout2): Dropout(p=0.2, inplace=False)
  (bn2): BatchNorm1d(208, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act2): ReLU()
  (fc3): Linear(in_features=208, out_features=104, bias=True)
  (act3): Softmax(dim=None)
)

In [398]:
def predict(obs, model, single=False):
    """obs = dataset as tensor embed obj
       model = model state_dict loaded
       single = set to true if single input"""
    model.eval()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    
    model.to(device)
    
    with torch.no_grad():
        predictions = None
        
        if single:
            output = model(obs['data'][0].unsqueeze(dim=0).to(device, 
                                     dtype=torch.long), 
                       obs['data'][1].to(device, 
                                     dtype=torch.float)).cpu().numpy()
        else:
            for i, batch in enumerate(obs):   
            
                output = model(batch['data'][0].to(device, 
                                               dtype=torch.long), 
                               batch['data'][1].to(device, 
                                               dtype=torch.float)).cpu().numpy()
            
                if i == 0:
                    predictions = output
                
                else: 
                
                    predictions = np.vstack((predictions, output))   

        predictions = output
          
    return predictions

In [399]:
from torch.utils.data import DataLoader

In [410]:
sample_loader = DataLoader(sample_embed, 
                         batch_size=10)
sample_predictions = predict(sample_loader, model)



In [411]:
sample_predictions

array([[0.00944351, 0.00480571, 0.00745835, ..., 0.00855652, 0.00689196,
        0.01454395],
       [0.01037449, 0.00466179, 0.00807008, ..., 0.00766635, 0.00694726,
        0.0130783 ],
       [0.01037449, 0.00466179, 0.00807008, ..., 0.00766635, 0.00694726,
        0.0130783 ],
       ...,
       [0.01142815, 0.00981393, 0.00902319, ..., 0.0109056 , 0.00726931,
        0.01235791],
       [0.01153892, 0.0096069 , 0.00856433, ..., 0.01108403, 0.00692934,
        0.01186901],
       [0.01166372, 0.00970882, 0.00780067, ..., 0.01027665, 0.00704606,
        0.01258745]], dtype=float32)

In [412]:
sample_predictions.argmax(1)

array([32, 32, 32, 32, 78, 78, 78, 78, 78, 78])

In [427]:
from joblib import load
label_encoders = load('../models/label_encoders.joblib')

In [428]:
label_encoders['beer_style'].inverse_transform(sample_predictions.argmax(1))

array(['Braggot', 'Braggot', 'Braggot', 'Braggot',
       'Maibock / Helles Bock', 'Maibock / Helles Bock',
       'Maibock / Helles Bock', 'Maibock / Helles Bock',
       'Maibock / Helles Bock', 'Maibock / Helles Bock'], dtype=object)

In [402]:
obs_predictions = predict(obs_tensor, model, single=True)



In [403]:
obs_predictions

array([[0.01825028, 0.00950472, 0.01115052, 0.00797018, 0.01037026,
        0.00943619, 0.00833104, 0.0055263 , 0.01117526, 0.0066663 ,
        0.00951379, 0.00991232, 0.00961155, 0.00955466, 0.00845375,
        0.00533909, 0.0058498 , 0.00765817, 0.00932209, 0.00436145,
        0.00715782, 0.0083602 , 0.00963618, 0.01734886, 0.0070847 ,
        0.0102426 , 0.01105385, 0.01369664, 0.0061642 , 0.00584857,
        0.01318426, 0.00566045, 0.01037427, 0.0123582 , 0.00841531,
        0.00554768, 0.00840526, 0.00772558, 0.00874808, 0.00979059,
        0.00655429, 0.01040185, 0.01088449, 0.01109528, 0.0105235 ,
        0.00660817, 0.01024053, 0.01050217, 0.00978048, 0.00653927,
        0.01143888, 0.00684217, 0.01329905, 0.01126397, 0.01470143,
        0.00862656, 0.00925498, 0.01051404, 0.01000731, 0.00677265,
        0.00903034, 0.00774935, 0.01434666, 0.01191202, 0.00737336,
        0.01146365, 0.00970856, 0.01095648, 0.00860585, 0.00821672,
        0.00654066, 0.0072486 , 0.00736873, 0.02

In [406]:
obs_predictions.argmax(1).astype(int)

array([73])

In [430]:
label_encoders['beer_style'].inverse_transform(obs_predictions.argmax(1))

array(['Kölsch'], dtype=object)

In [421]:
label_encoders['beer_style'].classes_

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103])

In [424]:
label_encoders['beer_style'].inverse_transform([28,84])

array([28, 84])

In [431]:
# function to predict from obs
def predict(obs, model, single=False):
    """obs = dataset as tensor embed obj
       model = model state_dict loaded
       single = set to true if single input"""
    
    # set to eval
    model.eval()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    
    model.to(device)
    
    with torch.no_grad():
        predictions = None
        
        if single:
            output = model(obs['data'][0].unsqueeze(dim=0).to(device, 
                                     dtype=torch.long), 
                       obs['data'][1].to(device, 
                                     dtype=torch.float)).cpu().numpy()
        else:
            for i, batch in enumerate(obs):   
            
                output = model(batch['data'][0].to(device, 
                                               dtype=torch.long), 
                               batch['data'][1].to(device, 
                                               dtype=torch.float)).cpu().numpy()
            
                if i == 0:
                    predictions = output
                
                else: 
                
                    predictions = np.vstack((predictions, output))   

        predictions = output
     
    from joblib import load
    label_encoders = load('../models/label_encoders.joblib')
    label = label_encoders['beer_style'].inverse_transform(predictions.argmax(1))
    
          
    return label

In [434]:
print(predict(obs_tensor, model, single=True))

['Kölsch']


