In [38]:
# Reload edited project modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [52]:
from ds.data.sets import save_sets_v2

import re
import joblib
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

In [40]:
# Seed passed to both train_test_split calls below so the splits are reproducible.
random_state = 55

# Read Data

In [41]:
# Load the raw beer reviews; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/raw/beer_reviews.csv")

- brewery_name: Union[str, None] = Field(default=None, description="Name of brewery", max_length=100)
- review_aroma: Union[float, None] = Field(default=None, description="How good does the beer smell", ge=1, le=5)
- review_appearance: Union[float, None] = Field(default=None, description="How good does the beer look", ge=1, le=5)
- review_palate: Union[float, None] = Field(default=None, description="Something else to do with taste?", ge=1, le=5)
- review_taste: Union[float, None] = Field(default=None, description="How good does the beer taste?", ge=1, le=5)
- beer_abv: Union[float, None] = Field(default=None, description="Alcohol by volume", ge=1, le=5)

# Drop Unneeded Cols

In [42]:
# Columns excluded from the working frame before any cleaning happens.
drops = ["brewery_id", "review_time", "review_profilename", "beer_beerid"]
df = df.drop(labels=drops, axis="columns")
df

Unnamed: 0,brewery_name,review_overall,review_aroma,review_appearance,beer_style,review_palate,review_taste,beer_name,beer_abv
0,Vecchio Birraio,1.5,2.0,2.5,Hefeweizen,1.5,1.5,Sausa Weizen,5.0
1,Vecchio Birraio,3.0,2.5,3.0,English Strong Ale,3.0,3.0,Red Moon,6.2
2,Vecchio Birraio,3.0,2.5,3.0,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5
3,Vecchio Birraio,3.0,3.0,3.5,German Pilsener,2.5,3.0,Sausa Pils,5.0
4,Caldera Brewing Company,4.0,4.5,4.0,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7
...,...,...,...,...,...,...,...,...,...
1586609,The Defiant Brewing Company,5.0,4.0,3.5,Pumpkin Ale,4.0,4.0,The Horseman's Ale,5.2
1586610,The Defiant Brewing Company,4.0,5.0,2.5,Pumpkin Ale,2.0,4.0,The Horseman's Ale,5.2
1586611,The Defiant Brewing Company,4.5,3.5,3.0,Pumpkin Ale,3.5,4.0,The Horseman's Ale,5.2
1586612,The Defiant Brewing Company,4.0,4.5,4.5,Pumpkin Ale,4.5,4.5,The Horseman's Ale,5.2


# Drop NA breweries

In [43]:
# Keep only rows that have a brewery name. The explicit .copy() makes the
# result an independent frame, so the in-place .loc assignments in the
# cleaning cells below do not raise SettingWithCopyWarning.
df = df.loc[df["brewery_name"].notna()].copy()

# Drop values outside of API data validation from all splits (applied before splitting)

In [44]:
# Null out review scores below 1 or above 5 (the bounds this notebook lists
# for the API's review fields). Rows are kept; only the offending value
# becomes NaN.
num_cols = [
    "review_overall",
    "review_aroma",
    "review_appearance",
    "review_palate",
    "review_taste",
]
for col in num_cols:
    out_of_range = (df[col] < 1) | (df[col] > 5)
    df.loc[out_of_range, col] = np.nan

In [45]:
# Normalise the free-text columns: lower-case, fold accented characters to
# ASCII, remove punctuation other than "/", and collapse whitespace runs.
string_cols = [
    "brewery_name",
    "beer_style",
    "beer_name"
]
for col in string_cols:
    df.loc[:, col] = (
        df.loc[:, col]
        .str.lower()
        # NFKD splits accented characters into base + combining mark; the
        # ASCII round trip then discards everything that is not ASCII.
        .str.normalize('NFKD')
        .str.encode('ascii', errors='ignore')
        .str.decode('utf-8')
        # "/" is kept because multi-brewery names are split on " / " later.
        .str.replace(r"[^\w\s\d/]", "", regex=True)
        .str.replace(r"\s+", " ", regex=True)
        # Strip last: removing characters above can leave a leading or
        # trailing space (e.g. "foo !" -> "foo "), which an earlier strip
        # would miss.
        .str.strip()
    )


# Split multi-brewery entries and append the per-brewery rows (the original combined entries are kept)

In [46]:
# Build one extra row per brewery for entries that name several breweries
# joined by " / " (e.g. "a / b" yields a row for "a" and a row for "b").
brewery_parts = df["brewery_name"].str.split(" / ")
is_multi = brewery_parts.str.len() > 1
# assign() returns a new frame, so nothing is written into a slice of df
# (no SettingWithCopyWarning) and the split is computed only once.
multi_brewery = (
    df.loc[is_multi]
    .assign(brewery_name=brewery_parts[is_multi])
    .explode("brewery_name")
)

In [47]:
# Append the per-brewery rows to the full frame. The original combined
# "a / b" rows stay in df, and the appended rows keep the index labels of the
# rows they were split from, so index labels repeat after this step.
# NOTE(review): re-running this cell appends the exploded rows again.
df = pd.concat([df, multi_brewery])

# Extract Outcome

In [48]:
# pop() removes "beer_style" from df in place and returns it as the target,
# so df holds only feature columns afterwards.
target = df.pop("beer_style")

# Create Splits

In [49]:
# First carve off 20 % of the rows, then halve that hold-out into test and
# validation. Both steps stratify on the target and share the same seed.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    df,
    target,
    test_size=0.2,
    stratify=target,
    random_state=random_state,
)
x_test, x_validation, y_test, y_validation = train_test_split(
    x_holdout,
    y_holdout,
    test_size=0.5,
    stratify=y_holdout,
    random_state=random_state,
)

# Drop bad values from train and test

In [50]:
# Treat ABV readings above 19 as missing in the train and test features.
# Values are set to NaN (rows are kept); x_validation is not modified.
for features in (x_train, x_test):
    features.loc[features["beer_abv"] > 19, "beer_abv"] = np.nan

# Save processed datasets

In [53]:
# Persist the six splits as one list, ordered as feature/target pairs for
# train, test and validation.
datasets = [x_train, y_train, x_test, y_test, x_validation, y_validation]
output_path = "../data/processed/datasetsjmone"
joblib.dump(datasets, output_path)

['../data/processed/datasetsjmone']