# Model building

https://www.kaggle.com/vadbeg/pytorch-nn-with-embeddings-and-catboost/notebook#PyTorch

mostly based off this example, plus parts of code form tutorial 5 lab 3

In [1]:
# import load_data function from 
%load_ext autoreload
%autoreload 2

# fix system path
import sys
sys.path.append("/home/jovyan/work")

In [2]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

In [3]:
import random

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministick = True
    torch.backends.cudnn.benchmark = False 
    
set_seed(27)

In [4]:
from src.data.sets import load_sets

X_train, y_train, X_val, y_val, X_test, y_test = load_sets()

In [5]:
X_test.shape

(317320, 5)

In [6]:
X_train.shape

(951959, 5)

In [7]:
X_val.shape

(317320, 5)

In [8]:
# need to convert to tensors
from src.models.pytorch import EmbeddingDataset

In [9]:
train_dataset = EmbeddingDataset(X_train, 
                                      targets=y_train,
                                      cat_cols_idx=[0],
                                      cont_cols_idx=[1,2,3,4])

val_dataset = EmbeddingDataset(X_val, 
                                      targets=y_val,
                                      cat_cols_idx=[0],
                                      cont_cols_idx=[1,2,3,4])


test_dataset = EmbeddingDataset(X_test,
                                     cat_cols_idx=[0],
                                     cont_cols_idx=[1,2,3,4],
                                     is_train=False)

In [10]:
print(f'First element of train_dataset: {train_dataset[1]}',
      f'First element of val_dataset: {val_dataset[1]}',
      f'First element of test_dataset: {test_dataset[1]}',sep='\n')

First element of train_dataset: {'data': [tensor([4918.]), tensor([-3.2047, -2.1777, -0.3572, -0.4001])], 'target': tensor(13)}
First element of val_dataset: {'data': [tensor([163.]), tensor([-1.0545, -0.5545, -1.0901, -1.0832])], 'target': tensor(31)}
First element of test_dataset: {'data': [tensor([701.]), tensor([ 0.3790,  0.2570,  0.3757, -0.4001])]}


In [11]:
# embedding example
class ClassificationEmbdNN(torch.nn.Module):
    
    def __init__(self, emb_dims, no_of_cont=None):
        super(ClassificationEmbdNN, self).__init__()
        
        self.emb_layers = torch.nn.ModuleList([torch.nn.Embedding(x, y)
                                               for x, y in emb_dims])
        
        no_of_embs = sum([y for x, y in emb_dims])
        self.no_of_embs = no_of_embs
        self.emb_dropout = torch.nn.Dropout(0.2)
        
        self.no_of_cont = 0
        if no_of_cont:
            self.no_of_cont = no_of_cont
            self.bn_cont = torch.nn.BatchNorm1d(no_of_cont)
        
        self.fc1 = torch.nn.Linear(in_features=self.no_of_embs + self.no_of_cont, 
                                   out_features=208)
        self.dropout1 = torch.nn.Dropout(0.2)
        self.bn1 = torch.nn.BatchNorm1d(208)
        self.act1 = torch.nn.ReLU()
        
        self.fc2 = torch.nn.Linear(in_features=208, 
                                   out_features=208)
        self.dropout2 = torch.nn.Dropout(0.2)
        self.bn2 = torch.nn.BatchNorm1d(208)
        self.act2 = torch.nn.ReLU()
        
#         self.fc3 = torch.nn.Linear(in_features=256, 
#                                    out_features=64)
#         self.dropout3 = torch.nn.Dropout(0.2)
#         self.bn3 = torch.nn.BatchNorm1d(64)
#         self.act3 = torch.nn.ReLU()
        
        self.fc3 = torch.nn.Linear(in_features=208, 
                                   out_features=104)
        self.act3 = torch.nn.Softmax()
        
    def forward(self, x_cat, x_cont=None):
        if self.no_of_embs != 0:
            x = [emb_layer(x_cat[:, i])
                 for i, emb_layer in enumerate(self.emb_layers)]
        
            x = torch.cat(x, 1)
            x = self.emb_dropout(x)
            
        if self.no_of_cont != 0:
            x_cont = self.bn_cont(x_cont)
            
            if self.no_of_embs != 0:
                x = torch.cat([x, x_cont], 1)
            else:
                x = x_cont
        
        x = self.fc1(x)
        x = self.dropout1(x)
        x = self.bn1(x)
        x = self.act1(x)
        
        x = self.fc2(x)
        x = self.dropout2(x)
        x = self.bn2(x)
        x = self.act2(x)
        
#         x = self.fc3(x)
#         x = self.dropout3(x)
#         x = self.bn3(x)
#         x = self.act3(x)
        
        x = self.fc3(x)
        x = self.act3(x)
        
        return x

In [12]:
model = ClassificationEmbdNN(emb_dims=[[5742, 252]], 
                             no_of_cont=4)

In [13]:
from src.models.pytorch import get_device

device = get_device()
model.to(device)

ClassificationEmbdNN(
  (emb_layers): ModuleList(
    (0): Embedding(5742, 252)
  )
  (emb_dropout): Dropout(p=0.2, inplace=False)
  (bn_cont): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=256, out_features=208, bias=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (bn1): BatchNorm1d(208, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU()
  (fc2): Linear(in_features=208, out_features=208, bias=True)
  (dropout2): Dropout(p=0.2, inplace=False)
  (bn2): BatchNorm1d(208, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act2): ReLU()
  (fc3): Linear(in_features=208, out_features=104, bias=True)
  (act3): Softmax(dim=None)
)

In [18]:
model

ClassificationEmbdNN(
  (emb_layers): ModuleList(
    (0): Embedding(5742, 252)
  )
  (emb_dropout): Dropout(p=0.2, inplace=False)
  (bn_cont): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=256, out_features=208, bias=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (bn1): BatchNorm1d(208, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU()
  (fc2): Linear(in_features=208, out_features=208, bias=True)
  (dropout2): Dropout(p=0.2, inplace=False)
  (bn2): BatchNorm1d(208, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act2): ReLU()
  (fc3): Linear(in_features=208, out_features=104, bias=True)
  (act3): Softmax(dim=None)
)

In [19]:
criterion = torch.nn.CrossEntropyLoss()

In [20]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [21]:
BATCH_SIZE = 300
N_EPOCHS = 10

In [22]:
train_loader = DataLoader(train_dataset,batch_size=BATCH_SIZE)

valid_loader = DataLoader(val_dataset,batch_size=BATCH_SIZE)

In [85]:
next(iter(train_loader))

{'data': [tensor([[1897.],
          [4918.],
          [1701.],
          [1963.],
          [4863.],
          [5448.],
          [ 836.],
          [ 413.],
          [ 370.],
          [ 701.],
          [1963.],
          [2421.],
          [2262.],
          [3805.],
          [5497.],
          [ 726.],
          [1198.],
          [2368.],
          [4729.],
          [2235.],
          [2859.],
          [4598.],
          [1963.],
          [ 177.],
          [2642.],
          [4843.],
          [2885.],
          [ 688.],
          [4074.],
          [3431.],
          [3906.],
          [4576.],
          [4794.],
          [2880.],
          [3315.],
          [1335.],
          [4794.],
          [2019.],
          [ 809.],
          [3189.],
          [4872.],
          [3804.],
          [5208.],
          [5693.],
          [3556.],
          [3431.],
          [ 154.],
          [ 151.],
          [  45.],
          [4598.],
          [2464.],
          [4598.],
    

In [86]:
next(iter(valid_loader))

{'data': [tensor([[4884.],
          [ 163.],
          [ 217.],
          [ 163.],
          [2262.],
          [5378.],
          [4280.],
          [ 413.],
          [5003.],
          [2461.],
          [2461.],
          [2316.],
          [3841.],
          [5066.],
          [4453.],
          [2461.],
          [3341.],
          [ 151.],
          [4830.],
          [2239.],
          [5014.],
          [3237.],
          [2514.],
          [5378.],
          [5014.],
          [5448.],
          [3189.],
          [5242.],
          [2523.],
          [1605.],
          [2601.],
          [5364.],
          [ 163.],
          [5078.],
          [ 224.],
          [2689.],
          [4132.],
          [ 375.],
          [ 360.],
          [3888.],
          [2239.],
          [5392.],
          [1334.],
          [4598.],
          [1951.],
          [4680.],
          [1306.],
          [5330.],
          [ 154.],
          [3804.],
          [4950.],
          [ 386.],
    

In [23]:
from tqdm import tqdm_notebook as tqdm

In [24]:
def train_network(model, train_loader, valid_loader,
                  loss_func, optimizer, n_epochs=20,
                  saved_model='model.pt'):
    
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    train_losses = list()
    valid_losses = list()
    
    valid_loss_min = np.Inf
    
    for epoch in range(n_epochs):
        train_loss = 0.0
        valid_loss = 0.0
        
#         train_auc = 0.0
#         valid_auc = 0.0
        
        train_acc = 0.0
        valid_acc = 0.0
        
        model.train()
        for batch in tqdm(train_loader):
            optimizer.zero_grad()
            
            output = model(batch['data'][0].to(device, 
                                               dtype=torch.long),
                           batch['data'][1].to(device, 
                                               dtype=torch.float))
            
            
            loss = loss_func(output, batch['target'].to(device, 
                                                        dtype=torch.long))
            
            loss.backward()
            optimizer.step()
            # Calculate global accuracy
            train_acc += (output.argmax(1) == batch['target']).sum().item()
#             train_auc += roc_auc_score(batch['target'].cpu().numpy(),
#                                                output.detach().cpu().numpy(),
#                                                multi_class = "ovo")

            train_loss += loss.item() * batch['data'][0].size(0)  #!!!
    

        model.eval()
        for batch in tqdm(valid_loader):
            output = model(batch['data'][0].to(device, 
                                               dtype=torch.long),
                           batch['data'][1].to(device, 
                                               dtype=torch.float))
            
            
            loss = loss_func(output, batch['target'].to(device, 
                                                        dtype=torch.long))
            
#             valid_auc += roc_auc_score(batch['target'].cpu().numpy(),
#                                                output.detach().cpu().numpy(),
#                                                multi_class = "ovo")
            valid_loss += loss.item() * batch['data'][0].size(0)  #!!!
            # Calculate global accuracy
            valid_acc += (output.argmax(1) == batch['target']).sum().item()
        
#         train_loss = np.sqrt(train_loss / len(train_loader.sampler.indices))
#         valid_loss = np.sqrt(valid_loss / len(valid_loader.sampler.indices))

#         train_auc = train_auc / len(train_loader)
#         valid_auc = valid_auc / len(valid_loader)
        
#         train_losses.append(train_loss)
#         valid_losses.append(valid_loss)

        print('Epoch: {}. Training loss: {:.6f}. Validation loss: {:.6f}'
              .format(epoch, train_loss, valid_loss))
        print('Training AUC: {:.6f}. Validation AUC: {:.6f}'
              .format(train_acc, valid_acc))
        
        if valid_loss < valid_loss_min:  # let's save the best weights to use them in prediction
            print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model...'
                  .format(valid_loss_min, valid_loss))
            
            torch.save(model.state_dict(), saved_model)
            valid_loss_min = valid_loss
            
    
    return train_losses, valid_losses

In [102]:
train_losses, valid_losses = train_network(model=model, 
                                           train_loader=train_loader, 
                                           valid_loader=valid_loader, 
                                           loss_func=criterion, 
                                           optimizer=optimizer,
                                           n_epochs=N_EPOCHS, 
                                           saved_model='../models/embed_3layers.pt')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=3174.0), HTML(value='')))






Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=1058.0), HTML(value='')))


Epoch: 0. Training loss: 4421724.604657. Validation loss: 1473862.318449
Training AUC: 5908.000000. Validation AUC: 2346.000000
Validation loss decreased (inf --> 1473862.318449). Saving model...


HBox(children=(FloatProgress(value=0.0, max=3174.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1058.0), HTML(value='')))


Epoch: 1. Training loss: 4421724.043646. Validation loss: 1473859.110718
Training AUC: 5853.000000. Validation AUC: 2390.000000
Validation loss decreased (1473862.318449 --> 1473859.110718). Saving model...


HBox(children=(FloatProgress(value=0.0, max=3174.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1058.0), HTML(value='')))


Epoch: 2. Training loss: 4421721.100763. Validation loss: 1473863.261547
Training AUC: 5896.000000. Validation AUC: 2349.000000


HBox(children=(FloatProgress(value=0.0, max=3174.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1058.0), HTML(value='')))


Epoch: 3. Training loss: 4421726.304363. Validation loss: 1473860.931282
Training AUC: 5861.000000. Validation AUC: 2359.000000


HBox(children=(FloatProgress(value=0.0, max=3174.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1058.0), HTML(value='')))


Epoch: 4. Training loss: 4421723.953760. Validation loss: 1473859.757814
Training AUC: 5849.000000. Validation AUC: 2359.000000


HBox(children=(FloatProgress(value=0.0, max=3174.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1058.0), HTML(value='')))


Epoch: 5. Training loss: 4421724.728816. Validation loss: 1473861.321592
Training AUC: 5849.000000. Validation AUC: 2317.000000


HBox(children=(FloatProgress(value=0.0, max=3174.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1058.0), HTML(value='')))


Epoch: 6. Training loss: 4421720.038199. Validation loss: 1473861.021938
Training AUC: 5697.000000. Validation AUC: 2417.000000


HBox(children=(FloatProgress(value=0.0, max=3174.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1058.0), HTML(value='')))


Epoch: 7. Training loss: 4421728.686065. Validation loss: 1473860.797329
Training AUC: 5810.000000. Validation AUC: 2387.000000


HBox(children=(FloatProgress(value=0.0, max=3174.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1058.0), HTML(value='')))


Epoch: 8. Training loss: 4421722.932936. Validation loss: 1473860.643997
Training AUC: 5801.000000. Validation AUC: 2354.000000


HBox(children=(FloatProgress(value=0.0, max=3174.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1058.0), HTML(value='')))


Epoch: 9. Training loss: 4421724.683504. Validation loss: 1473858.867426
Training AUC: 5926.000000. Validation AUC: 2361.000000
Validation loss decreased (1473859.110718 --> 1473858.867426). Saving model...


#### forgot to divide the loss and accuracy by length of data set

In [107]:
print('Training Accuracy: {:.2f}%'.format(5926.0/300.0))
print('Validation Accuracy: {:.2f}%'.format(2361.0/300.0))

Training Accuracy: 19.75%
Validation Accuracy: 7.87%


# Predict with test set

In [25]:
def predict(data_loader, model):
    model.eval()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    
    model.to(device)
    
    with torch.no_grad():
        predictions = None
        
        for i, batch in enumerate(tqdm(data_loader)):   
            
            output = model(batch['data'][0].to(device, 
                                               dtype=torch.long), 
                           batch['data'][1].to(device, 
                                               dtype=torch.float)).cpu().numpy()
            
            if i == 0:
                predictions = output
                
            else: 
                
                predictions = np.vstack((predictions, output))
                
    return predictions

In [26]:
model.load_state_dict(torch.load('../models/embed_3layers.pt'))

test_loader = DataLoader(test_dataset, 
                         batch_size=BATCH_SIZE)

nn_predictions = predict(test_loader, model)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=1058.0), HTML(value='')))






In [27]:
nn_predictions

array([[0.01260628, 0.00742896, 0.00644201, ..., 0.00936545, 0.01223338,
        0.00800614],
       [0.00870188, 0.00563232, 0.01582166, ..., 0.0086471 , 0.01082051,
        0.0085842 ],
       [0.00938711, 0.0065556 , 0.01092372, ..., 0.00528657, 0.00847999,
        0.01576285],
       ...,
       [0.01357877, 0.00672801, 0.0100839 , ..., 0.00834409, 0.00656633,
        0.01293461],
       [0.00949264, 0.00695431, 0.01194292, ..., 0.01046819, 0.00798348,
        0.00741773],
       [0.01299331, 0.00624016, 0.01440495, ..., 0.00893319, 0.00758309,
        0.01268085]], dtype=float32)

In [28]:
test_acc = (nn_predictions.argmax(1) == y_test).sum().item()

In [29]:
test_acc/300

7.533333333333333

In [30]:
from sklearn.metrics import roc_auc_score, classification_report

In [31]:
# compute other metrics
roc_auc_score(y_test,nn_predictions, multi_class='ovr', average='macro')

0.4919978611119817

In [32]:
print(y_test)
print(nn_predictions.argmax(1))

[ 83 102  11 ...  43   9  14]
[73 50 80 ... 12 32 99]


In [36]:
def convert_cr_to_dataframe(report_dict:dict) -> pd.DataFrame:
    """
    Converts the dictionary format of the Classification Report (CR) to a
    dataframe for easy of sorting
    :param report_dict: The dictionary returned by 
    sklearn.metrics.classification_report.
    :return: Returns a dataframe of the same information.
    """
    beer_style = list(report_dict.keys())
    beer_style.remove('accuracy')
    beer_style.remove('macro avg')
    beer_style.remove('weighted avg')
    precision = []
    recall = []
    f1 = []
    support = []
    for key, value in report_dict.items():
        if key not in ['accuracy', 'macro avg', 'weighted avg']:
            precision.append(value['precision'])
            recall.append(value['recall'])
            f1.append(value['f1-score'])
            support.append(value['support'])
    result = pd.DataFrame({'beer_style': beer_style,
                           'precision': precision,
                           'recall': recall,
                           'f1': f1,
                           'support': support})
    return result

In [35]:
from joblib import load

label_encoders = load('../models/label_encoders.joblib')

In [40]:
pd.options.display.max_rows = 150
report_dict = classification_report(label_encoders['beer_style'].inverse_transform(y_test),
                                    label_encoders['beer_style'].inverse_transform(nn_predictions.argmax(1)),
                                    output_dict=True)
report_df = convert_cr_to_dataframe(report_dict)
print(report_df)
#classification_report(y_test, nn_predictions.argmax(1))

  _warn_prf(average, modifier, msg_start, len(result))


                              beer_style  precision    recall        f1  \
0                                Altbier   0.004481  0.013342  0.006709   
1                 American Adjunct Lager   0.000000  0.000000  0.000000   
2               American Amber / Red Ale   0.031808  0.013229  0.018686   
3             American Amber / Red Lager   0.141541  0.091302  0.111002   
4                    American Barleywine   0.001211  0.000184  0.000319   
5                     American Black Ale   0.005499  0.025257  0.009032   
6                    American Blonde Ale   0.008511  0.000775  0.001421   
7                     American Brown Ale   0.000000  0.000000  0.000000   
8                American Dark Wheat Ale   0.000000  0.000000  0.000000   
9         American Double / Imperial IPA   0.000000  0.000000  0.000000   
10    American Double / Imperial Pilsner   0.018263  0.117391  0.031609   
11      American Double / Imperial Stout   0.135552  0.027775  0.046103   
12                       

In [44]:
report_df.to_csv('../data/processed/class_report.csv')

In [127]:
torch.save(model, "../models/model.pt")

In [39]:
label_encoders['beer_style'].inverse_transform(nn_predictions.argmax(1)).tolist()

['Kölsch',
 'English Stout',
 'Munich Dunkel Lager',
 'Munich Helles Lager',
 'Kölsch',
 'Witbier',
 'English Stout',
 'Sahti',
 'Braggot',
 'Braggot',
 'Braggot',
 'Kölsch',
 'Kölsch',
 'Kölsch',
 'Faro',
 'Kristalweizen',
 'Low Alcohol Beer',
 'American Black Ale',
 'Low Alcohol Beer',
 'Kölsch',
 'American Amber / Red Ale',
 'Dortmunder / Export Lager',
 'American Black Ale',
 'Kölsch',
 'Schwarzbier',
 'Braggot',
 'Maibock / Helles Bock',
 'Braggot',
 'Scottish Gruit / Ancient Herbed Ale',
 'Belgian Strong Dark Ale',
 'Witbier',
 'Munich Helles Lager',
 'Japanese Rice Lager',
 'Kölsch',
 'Kölsch',
 'Milk / Sweet Stout',
 'Braggot',
 'Kölsch',
 'Kölsch',
 'American Black Ale',
 'Witbier',
 'Kölsch',
 'Vienna Lager',
 'Kölsch',
 'Kölsch',
 'Maibock / Helles Bock',
 'Kölsch',
 'Braggot',
 'Braggot',
 'Braggot',
 'Kölsch',
 'Witbier',
 'Belgian IPA',
 'Braggot',
 'Maibock / Helles Bock',
 'Scottish Ale',
 'Japanese Rice Lager',
 'Kölsch',
 'Kölsch',
 'Braggot',
 'Kölsch',
 'English Pal