# ADV_DSI_AT2 - Prepare data for training pytorch neural network - baseline model

In [181]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

from scipy.stats import mode
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

## Explore Data

In [182]:
df = pd.read_csv('../data/processed/beer_train_set.csv')

In [183]:
df.head()

Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste,beer_abv,beer_style
0,Avery Brewing Company,4.0,3.5,3.5,4.0,5.5,American Amber / Red Ale
1,Avery Brewing Company,3.5,4.5,3.5,4.0,5.5,American Amber / Red Ale
2,Avery Brewing Company,3.5,3.0,3.0,3.0,5.5,American Amber / Red Ale
3,Avery Brewing Company,3.5,3.5,3.0,4.0,5.5,American Amber / Red Ale
4,Avery Brewing Company,4.0,4.5,4.0,4.5,5.5,American Amber / Red Ale


In [184]:
df.shape

(7500, 7)

## Prepare Data

In [226]:
df_cleaned = df.copy()

In [227]:
# Target variable must be converted to ordinal. Order ategories from most to least frequent, thus beer_style = 0 = American IPA = the mode of the data.
# convert to int

cats = [['American IPA', 'American Double / Imperial IPA', 'Russian Imperial Stout', 'American Pale Ale (APA)', 'American Double / Imperial Stout', 'American Strong Ale', 'American Porter', 'American Amber / Red Ale', 'Belgian Strong Dark Ale', 'Fruit / Vegetable Beer']]

col_encoder = OrdinalEncoder(categories = cats)
df_cleaned['beer_style'] = col_encoder.fit_transform(df_cleaned[['beer_style']])
df_cleaned['beer_style'] = df_cleaned['beer_style'].astype(int)


In [228]:
# remove the target variable

# target = df_cleaned.pop('beer_style')
target_col = 'beer_style'

In [229]:
# list categorical variables

cat_cols = ['brewery_name']

In [230]:
# List numerical variables

num_cols = list(set(df_cleaned.columns) - (set(cat_cols) | set([target_col])))
num_cols

['beer_abv',
 'review_taste',
 'review_appearance',
 'review_aroma',
 'review_palate']

In [231]:
# Use default standard scaler to scale numeric cols.

sc = StandardScaler()
df_cleaned[num_cols] = sc.fit_transform(df_cleaned[num_cols])

In [232]:
# OneHotEncode brewery_name

ohe = OneHotEncoder(sparse=False)
X_cat = pd.DataFrame(ohe.fit_transform(df_cleaned[cat_cols]))
X_cat.columns = ohe.get_feature_names(cat_cols)
df_cleaned.drop(cat_cols, axis=1, inplace=True)
X = pd.concat([df_cleaned, X_cat], axis=1)
X



Unnamed: 0,review_aroma,review_appearance,review_palate,review_taste,beer_abv,beer_style,brewery_name_Avery Brewing Company,"brewery_name_Bell's Brewery, Inc.",brewery_name_Boston Beer Company (Samuel Adams),brewery_name_Dogfish Head Brewery,brewery_name_Founders Brewing Company,brewery_name_Lagunitas Brewing Company,brewery_name_Rogue Ales,brewery_name_Sierra Nevada Brewing Co.,brewery_name_Stone Brewing Co.,brewery_name_Victory Brewing Company
0,0.155525,-0.998028,-0.674427,0.073424,-0.890882,7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.675865,0.943912,-0.674427,0.073424,-0.890882,7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.675865,-1.968997,-1.508700,-1.443602,-0.890882,7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.675865,-0.998028,-1.508700,0.073424,-0.890882,7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.155525,0.943912,0.159847,0.831937,-0.890882,7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,-1.507255,-1.968997,-1.508700,-0.685089,0.257065,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7496,0.155525,0.943912,0.159847,0.073424,0.257065,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7497,0.155525,0.943912,0.159847,-0.685089,0.257065,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7498,-1.507255,-0.998028,-0.674427,-2.202115,0.257065,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [239]:
def pop_target(df, target_col, to_numpy=False):
    """Extract target variable from dataframe and convert to nympy arrays if required

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe
    target_col : str
        Name of the target variable
    to_numpy : bool
        Flag stating to convert to numpy array or not

    Returns
    -------
    pd.DataFrame/Numpy array
        Subsetted Pandas dataframe containing all features
    pd.DataFrame/Numpy array
        Subsetted Pandas dataframe containing the target
    """

    df_copy = df.copy()
    target = df_copy.pop(target_col)
    
    if to_numpy:
        df_copy = df_copy.to_numpy()
        target = target.to_numpy()
    
    return df_copy, target

In [240]:
# Solution
def split_sets_random(df, target_col, test_ratio=0.2, to_numpy=False):
    """Split sets randomly

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe
    target_col : str
        Name of the target column
    test_ratio : float
        Ratio used for the validation and testing sets (default: 0.2)

    Returns
    -------
    Numpy Array
        Features for the training set
    Numpy Array
        Target for the training set
    Numpy Array
        Features for the validation set
    Numpy Array
        Target for the validation set
    Numpy Array
        Features for the testing set
    Numpy Array
        Target for the testing set
    """
    
    from sklearn.model_selection import train_test_split
    
    features, target = pop_target(df=df, target_col=target_col, to_numpy=to_numpy)
    
    X_data, X_test, y_data, y_test = train_test_split(features, target, test_size=test_ratio, random_state=8)
    
    val_ratio = test_ratio / (1 - test_ratio)
    X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, test_size=val_ratio, random_state=8)

    return X_train, y_train, X_val, y_val, X_test, y_test

In [242]:
def save_sets(X_train=None, y_train=None, X_val=None, y_val=None, X_test=None, y_test=None, path='../data/processed/'):
    """Save the different sets locally

    Parameters
    ----------
    X_train: Numpy Array
        Features for the training set
    y_train: Numpy Array
        Target for the training set
    X_val: Numpy Array
        Features for the validation set
    y_val: Numpy Array
        Target for the validation set
    X_test: Numpy Array
        Features for the testing set
    y_test: Numpy Array
        Target for the testing set
    path : str
        Path to the folder where the sets will be saved (default: '../data/processed/')

    Returns
    -------
    """
    import numpy as np

    if X_train is not None:
      np.save(f'{path}X_train', X_train)
    if X_val is not None:
      np.save(f'{path}X_val',   X_val)
    if X_test is not None:
      np.save(f'{path}X_test',  X_test)
    if y_train is not None:
      np.save(f'{path}y_train', y_train)
    if y_val is not None:
      np.save(f'{path}y_val',   y_val)
    if y_test is not None:
      np.save(f'{path}y_test',  y_test)


In [241]:
X_train, y_train, X_val, y_val, X_test, y_test = split_sets_random(X, target_col='beer_style', test_ratio=0.2, to_numpy=True)

In [245]:
save_sets(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, X_test=X_test, y_test=y_test, path='../data/processed/')

In [246]:
class PytorchDataset(Dataset):
    """
    Pytorch dataset
    ...

    Attributes
    ----------
    X_tensor : Pytorch tensor
        Features tensor
    y_tensor : Pytorch tensor
        Target tensor

    Methods
    -------
    __getitem__(index)
        Return features and target for a given index
    __len__
        Return the number of observations
    to_tensor(data)
        Convert Pandas Series to Pytorch tensor
    """
        
    def __init__(self, X, y):
        self.X_tensor = self.to_tensor(X)
        self.y_tensor = self.to_tensor(y)
    
    def __getitem__(self, index):
        return self.X_tensor[index], self.y_tensor[index]
        
    def __len__ (self):
        return len(self.X_tensor)
    
    def to_tensor(self, data):
        return torch.Tensor(np.array(data))

In [247]:
train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

In [248]:
y_train

array([2, 2, 4, ..., 0, 3, 9])

## Baseline Model

In [249]:
# For a baseline we will predict every record as the mode of the training data.
# the beer_style categories were passed to ordinal enoder i order of most to least frequent, thus 

mode(y_data)

  mode(y_data)


ModeResult(mode=array([1]), count=array([809]))

In [250]:
y_base = np.full((len(y_data), 1), 1)

In [251]:
accuracy_score(y_data, y_base)

0.13483333333333333

In [252]:
f1_score(y_data, y_base, average = 'weighted')

0.032039996083614825

## Define model architecture

In [282]:
class PytorchMultiClass(nn.Module):
    def __init__(self, num_features):
        super(PytorchMultiClass, self).__init__()
        
        self.layer_1 = nn.Linear(num_features, 32)
        self.layer_2 = nn.Linear(32, 32)
        self.layer_out = nn.Linear(32, 10)

    def forward(self, x):
        x = F.dropout(F.relu(self.layer_1(x)), training=self.training)
        return self.layer_out(x)

In [283]:
model = PytorchMultiClass(X_data.shape[1])

In [284]:
model

PytorchMultiClass(
  (layer_1): Linear(in_features=15, out_features=32, bias=True)
  (layer_2): Linear(in_features=32, out_features=32, bias=True)
  (layer_out): Linear(in_features=32, out_features=10, bias=True)
)

In [285]:
X_data.shape[1]

15

In [274]:
def get_device():
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu') # don't have GPU 
    return device

In [286]:
device = get_device()
model.to(device)

PytorchMultiClass(
  (layer_1): Linear(in_features=15, out_features=32, bias=True)
  (layer_2): Linear(in_features=32, out_features=32, bias=True)
  (layer_out): Linear(in_features=32, out_features=10, bias=True)
)

In [287]:
print(model)

PytorchMultiClass(
  (layer_1): Linear(in_features=15, out_features=32, bias=True)
  (layer_2): Linear(in_features=32, out_features=32, bias=True)
  (layer_out): Linear(in_features=32, out_features=10, bias=True)
)


## Train Model

In [288]:
criterion = nn.CrossEntropyLoss()

In [289]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [290]:
def train_classification(train_data, model, criterion, optimizer, batch_size, device, scheduler=None, generate_batch=None):
    """Train a Pytorch multi-class classification model

    Parameters
    ----------
    train_data : torch.utils.data.Dataset
        Pytorch dataset
    model: torch.nn.Module
        Pytorch Model
    criterion: function
        Loss function
    optimizer: torch.optim
        Optimizer
    bacth_size : int
        Number of observations per batch
    device : str
        Name of the device used for the model
    scheduler : torch.optim.lr_scheduler
        Pytorch Scheduler used for updating learning rate
    collate_fn : function
        Function defining required pre-processing steps

    Returns
    -------
    Float
        Loss score
    Float:
        Accuracy Score
    """
    
    # Set model to training mode
    model.train()
    train_loss = 0
    train_acc = 0
    
    # Create data loader
    data = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)
    
    # Iterate through data by batch of observations
    for feature, target_class in data:

        # Reset gradients
        optimizer.zero_grad()
        
        # Load data to specified device
        feature, target_class = feature.to(device), target_class.to(device)
        
        # Make predictions
        output = model(feature)
        
        # Calculate loss for given batch
        loss = criterion(output, target_class.long())

        # Calculate global loss
        train_loss += loss.item()
        
        # Calculate gradients
        loss.backward()

        # Update Weights
        optimizer.step()
        
        # Calculate global accuracy
        train_acc += (output.argmax(1) == target_class).sum().item()

    # Adjust the learning rate
    if scheduler:
        scheduler.step()

    return train_loss / len(train_data), train_acc / len(train_data)

In [291]:
def test_classification(test_data, model, criterion, batch_size, device, generate_batch=None):
    """Calculate performance of a Pytorch multi-class classification model

    Parameters
    ----------
    test_data : torch.utils.data.Dataset
        Pytorch dataset
    model: torch.nn.Module
        Pytorch Model
    criterion: function
        Loss function
    bacth_size : int
        Number of observations per batch
    device : str
        Name of the device used for the model
    collate_fn : function
        Function defining required pre-processing steps

    Returns
    -------
    Float
        Loss score
    Float:
        Accuracy Score
    """    
    
    # Set model to evaluation mode
    model.eval()
    test_loss = 0
    test_acc = 0
    
    # Create data loader
    data = DataLoader(test_data, batch_size=batch_size, collate_fn=generate_batch)
    
    # Iterate through data by batch of observations
    for feature, target_class in data:
        
        # Load data to specified device
        feature, target_class = feature.to(device), target_class.to(device)
        
        # Set no update to gradients
        with torch.no_grad():
            
            # Make predictions
            output = model(feature)
            
            # Calculate loss for given batch
            loss = criterion(output, target_class.long())

            # Calculate global loss
            test_loss += loss.item()
            
            # Calculate global accuracy
            test_acc += (output.argmax(1) == target_class).sum().item()

    return test_loss / len(test_data), test_acc / len(test_data)

In [292]:
N_EPOCHS = 50
BATCH_SIZE = 32

In [293]:
# Solution:
# from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset, model=model, criterion=criterion, optimizer=optimizer, batch_size=BATCH_SIZE, device=device)
    valid_loss, valid_acc = test_classification(test_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.1f}%')

Epoch: 0
	(train)	|	Loss: 0.0607	|	Acc: 26.6%
	(valid)	|	Loss: 0.0493	|	Acc: 40.3%
Epoch: 1
	(train)	|	Loss: 0.0560	|	Acc: 30.5%
	(valid)	|	Loss: 0.0494	|	Acc: 33.7%
Epoch: 2
	(train)	|	Loss: 0.0568	|	Acc: 29.5%
	(valid)	|	Loss: 0.0507	|	Acc: 35.5%
Epoch: 3
	(train)	|	Loss: 0.0577	|	Acc: 29.0%
	(valid)	|	Loss: 0.0524	|	Acc: 31.7%
Epoch: 4
	(train)	|	Loss: 0.0578	|	Acc: 28.3%
	(valid)	|	Loss: 0.0506	|	Acc: 38.9%
Epoch: 5
	(train)	|	Loss: 0.0568	|	Acc: 28.8%
	(valid)	|	Loss: 0.0503	|	Acc: 33.7%
Epoch: 6
	(train)	|	Loss: 0.0571	|	Acc: 29.4%
	(valid)	|	Loss: 0.0516	|	Acc: 33.1%
Epoch: 7
	(train)	|	Loss: 0.0574	|	Acc: 29.9%
	(valid)	|	Loss: 0.0492	|	Acc: 37.7%
Epoch: 8
	(train)	|	Loss: 0.0573	|	Acc: 29.0%
	(valid)	|	Loss: 0.0508	|	Acc: 31.9%
Epoch: 9
	(train)	|	Loss: 0.0566	|	Acc: 30.0%
	(valid)	|	Loss: 0.0512	|	Acc: 34.0%
Epoch: 10
	(train)	|	Loss: 0.0588	|	Acc: 27.7%
	(valid)	|	Loss: 0.0508	|	Acc: 34.2%
Epoch: 11
	(train)	|	Loss: 0.0578	|	Acc: 28.4%
	(valid)	|	Loss: 0.0518	|	Acc: 32.3%
Ep

In [294]:
torch.save(model, "../models/pytorch_beer_classifier_2_layer.pt")

In [295]:
test_loss, test_acc = test_classification(test_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)
print(f'\tLoss: {test_loss:.4f}\t|\tAccuracy: {test_acc:.1f}')

	Loss: 0.0506	|	Accuracy: 0.4
