## Preprocessing

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [23]:
# Getting Packages
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from joblib import load
from sklearn.preprocessing import StandardScaler
from joblib import dump


In [24]:
# Read the raw beer-review CSV into a DataFrame and preview its first five rows
df_raw = pd.read_csv(filepath_or_buffer="../data/raw/beer_reviews.csv")
df_raw.head(n=5)


Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [25]:
# Print a per-column summary of the raw frame (non-null counts and dtypes).
# The non-null counts are what reveal which columns contain missing values.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   brewery_id          1586614 non-null  int64  
 1   brewery_name        1586599 non-null  object 
 2   review_time         1586614 non-null  int64  
 3   review_overall      1586614 non-null  float64
 4   review_aroma        1586614 non-null  float64
 5   review_appearance   1586614 non-null  float64
 6   review_profilename  1586266 non-null  object 
 7   beer_style          1586614 non-null  object 
 8   review_palate       1586614 non-null  float64
 9   review_taste        1586614 non-null  float64
 10  beer_name           1586614 non-null  object 
 11  beer_abv            1518829 non-null  float64
 12  beer_beerid         1586614 non-null  int64  
dtypes: float64(6), int64(3), object(4)
memory usage: 157.4+ MB


In [26]:
# Remove the columns that are not carried forward into the modelling frame,
# then show the resulting (rows, columns) shape as a quick sanity check.
features_drop = [
    'brewery_id',
    'review_time',
    'review_overall',
    'review_profilename',
    'beer_name',
    'beer_beerid',
]
df_prep = df_raw.drop(columns=features_drop)
df_prep.shape

(1586614, 7)

In [27]:
# Drop every row that has a missing value in any retained column
# (such rows are not useful for training or validation).
#
# Filtering on 'beer_style' alone removes nothing: df_raw.info() reports no
# nulls in that column, while 'beer_abv' (67,785 rows) and 'brewery_name'
# (15 rows) do contain NaN. StandardScaler keeps NaN in its output, so those
# values would otherwise reach the saved train/val/test sets.
#
# Reassigning instead of using inplace=True keeps the cell idempotent.
df_prep = df_prep.dropna(axis=0, how='any').reset_index(drop=True)

In [28]:
# Hash-encode 'brewery_name' into a fixed number of buckets and then discard
# the original text column.
# Judging by the frame printed further down, the helper adds a
# 'hashed_brewery_name' column -- confirm against src/data/sets.py.
from src.data.sets import apply_hashing_trick

categorical_column = 'brewery_name'
num_buckets = 10

df_prep = (
    apply_hashing_trick(df_prep, categorical_column, num_buckets)
    .drop(columns='brewery_name')
)


In [29]:
# Encode the target: learn the set of beer_style labels, then replace each
# label in the column with its integer class id and preview the result.
label_encoder = LabelEncoder().fit(df_prep['beer_style'])
df_prep['beer_style'] = label_encoder.transform(df_prep['beer_style'])
print(df_prep.head())


   review_aroma  review_appearance  beer_style  review_palate  review_taste  \
0           2.0                2.5          65            1.5           1.5   
1           2.5                3.0          51            3.0           3.0   
2           2.5                3.0          59            3.0           3.0   
3           3.0                3.5          61            2.5           3.0   
4           4.5                4.0           9            4.0           4.5   

   beer_abv  hashed_brewery_name  
0       5.0                    6  
1       6.2                    6  
2       6.5                    6  
3       5.0                    6  
4       7.7                    5  


In [30]:
# Separate the encoded 'beer_style' target from the remaining columns.
# Presumably returns (frame without the target column, target values) --
# confirm against src/data/sets.py.
from src.data.sets import pop_target

df_prep, target = pop_target(
    df=df_prep,
    target_col='beer_style',
)


In [31]:
# Standardise every feature column and keep the original column names.
#
# An earlier revision of this cell loaded a pre-fitted scaler from
# '../models/scaler.joblib' instead of fitting a new one here.
# NOTE(review): the scaler is fitted on the full frame before the split below,
# so validation/test rows contribute to the scaling statistics -- consider
# fitting on the training portion only.
# NOTE(review): the fitted scaler is not written to disk in this cell; confirm
# it is persisted elsewhere if it is needed at inference time.
scaler = StandardScaler().fit(df_prep)
features = pd.DataFrame(scaler.transform(df_prep), columns=df_prep.columns)


In [32]:
# Splitting the data set into train / validation / test parts.
# NOTE(review): no seed is passed here; whether split_sets_random fixes one
# internally is not visible from this notebook -- confirm for reproducibility.
from src.data.sets import split_sets_random

(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
) = split_sets_random(
    features,
    target=target,
    test_ratio=0.2,
    to_numpy=True,
)

In [33]:
# Persist the six split arrays under the processed-data directory
from src.data.sets import save_sets

save_sets(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    path='../data/processed/',
)