# Imports

In [1]:
import os
import random

from tqdm.notebook import tqdm
import numpy as np
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

from data_utils.dataset import prepare_data
from data_utils.utils import read_json
from models.rec_ace import RecACEWrapModel

## Seeds

In [2]:
# Single seed shared by every random number generator used in this notebook
SEED = 42

# Python's built-in RNG (per the original note, NLTK draws from this generator as well)
random.seed(SEED)

# NumPy's global RNG (per the original note, pandas takes its randomness from NumPy)
np.random.seed(SEED)

# PyTorch's default generator; kept as the last expression, so the returned
# torch Generator object is what the cell displays
torch.manual_seed(SEED)

<torch._C.Generator at 0x110fb2e70>

# Data Paths

In [3]:
# Maps a human-readable dataset name ("<Domain> <Split> <Subset>") to its JSON path
# ("data/<domain>/<split>_<subset>.json"). Insertion order is domain -> split -> subset:
# all 'Default' entries first, then all 'Video' entries.
datasets_dict = {
    f'{domain.capitalize()} {split.capitalize()} {subset.capitalize()}':
        f'data/{domain}/{split}_{subset}.json'
    for domain in ('default', 'video')
    for split in ('train', 'dev', 'test')
    for subset in ('clean', 'other')
}

# Main

In [4]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [5]:
# Bare last expression: the notebook displays which device was selected
DEVICE

'cpu'

## Load essentials

In [6]:
# Debug switch; it is passed to prepare_data when the training loader is built
debug = True

# Base architecture (T5 checkpoint name)
t5_type = 't5-small'

# What do we train: 'original' or 'rec_ace'
model_type = 'rec_ace'

# Controls how the confidence vectors are quantized [only required for rec_ace]
bin_size = 10

### Tokenizer

In [7]:
# Load the tokenizer for the configured base architecture instead of a hard-coded
# checkpoint name, so that changing `t5_type` keeps the tokenizer and the model
# (which is also built from `t5_type`) in sync.
tokenizer = T5Tokenizer.from_pretrained(t5_type)

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


## Read Data

In [8]:
# Load the train / dev / test splits of the 'Default ... Clean' datasets.
# The generator is consumed in order, so the files are read train -> dev -> test.
train_set, dev_set, test_set = (
    read_json(json_path=datasets_dict[f'Default {split} Clean'])
    for split in ('Train', 'Dev', 'Test')
)

## Prepare as DataLoader

In [9]:
# Number of examples per batch, shared by all three loaders
batch_size = 8

# Training loader: shuffled, and the only one that receives the `debug` flag
train_loader = prepare_data(
    data=train_set,
    tokenizer=tokenizer,
    batch_size=batch_size,
    shuffle=True,
    debug=debug,
)

# Dev and test loaders keep the original example order
dev_loader = prepare_data(
    data=dev_set,
    tokenizer=tokenizer,
    batch_size=batch_size,
    shuffle=False,
)
test_loader = prepare_data(
    data=test_set,
    tokenizer=tokenizer,
    batch_size=batch_size,
    shuffle=False,
)

Debug Mode - using only 10390 out of 103895, training datapoints
- Converting the input sentences into tokens
- Converting the GT sentences into tokens
- Converting the input sentences into tokens
- Converting the GT sentences into tokens
- Converting the input sentences into tokens
- Converting the GT sentences into tokens


## Model

In [10]:
# Build the wrapper model from the settings chosen in the config cell
model = RecACEWrapModel(
    t5_type=t5_type,
    model_type=model_type,
    bin_size=bin_size,
)

# Move the model to the selected device; as the last expression, the returned
# module's repr becomes the cell output
model.to(DEVICE)

RecACEWrapModel(
  (model): T5ForConditionalGeneration(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=512, bias=False)
                (k): Linear(in_features=512, out_features=512, bias=False)
                (v): Linear(in_features=512, out_features=512, bias=False)
                (o): Linear(in_features=512, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 8)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=512, out_features=2048, bias=False)
                (wo): Linear(in_feat

## Optimizer

In [11]:
# Adam over every model parameter, with a learning rate of 1e-4
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=1e-4,
)

## Training

In [12]:
def training(model, n_epochs, train_data, dev_data, optimizer, criterion):
    """ Training loop for the model

        Each epoch runs one optimisation pass over `train_data` followed by a
        gradient-free evaluation pass over `dev_data`, and records the average
        loss of both passes.

        The loss is read from the `.loss` attribute of the model's forward
        output (called with `input_ids`, `labels` and `scores_ids`), so
        `criterion` is accepted for interface compatibility only and is never used.

        Args:
            model (nn.Module): Model to train; must expose a `model_type` attribute
            n_epochs (int): Number of epochs to train
            train_data (DataLoader): DataLoader with train data; every batch is a
                mapping with 'sentences', 'scores' and 'labels' tensors
            dev_data (DataLoader): DataLoader with dev data (same batch layout)
            optimizer (torch.optim): Optimizer for the model
            criterion (torch.nn): Loss function (currently unused, see above)

        Returns:
            tuple: `(model, metrics)`. `model` is the trained model, left in eval
                mode after the last dev pass. `metrics` is a dictionary of the form
                `{'loss': {'train': [...], 'dev': [...]}, 'acc': {'train': [], 'dev': []}}`
                holding one average loss per epoch; the 'acc' lists are kept for
                compatibility but are not populated by this loop.
    """

    # metrics placeholder for recording training stats
    metrics = {
        'loss': {
            'train': [],
            'dev':   []
        },
        'acc': {
            'train': [],
            'dev':   []
        }
    }

    # Send batches to the device the model actually lives on instead of relying
    # on a notebook-level global that may be stale after out-of-order execution.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    def _batch_loss(batch):
        """Run one forward pass on `batch` and return the loss computed by the model."""
        return model(
            input_ids=batch['sentences'].to(device),
            labels=batch['labels'].to(device),
            scores_ids=batch['scores'].to(device),
        ).loss

    def _mean_or_nan(values):
        """Average of `values`; NaN (without a NumPy warning) when the loader was empty."""
        return np.mean(values) if values else float('nan')

    print(f'Training model of type: {model.model_type}')
    print('- Scores vector will be ' + ('ignored' if model.model_type == 'original' else 'used'))
    pbar = tqdm(range(n_epochs), position=0)
    for epoch in pbar:
        pbar.set_description(f"Epoch: {epoch+1}/{n_epochs}")

        train_losses, dev_losses = [], []

        ### TRAIN
        model.train()

        # Iterating over batches in train data
        for batch in tqdm(train_data, desc="Train"):
            optimizer.zero_grad()

            loss = _batch_loss(batch)
            train_losses.append(loss.item())

            loss.backward()
            optimizer.step()

        ### Evaluate DEV set
        model.eval()

        # No need for gradients when evaluating
        with torch.no_grad():
            for batch in tqdm(dev_data, desc="Dev"):
                dev_losses.append(_batch_loss(batch).item())

        # Collect epoch's avg scores
        metrics['loss']['train'].append(_mean_or_nan(train_losses))
        metrics['loss']['dev'].append(_mean_or_nan(dev_losses))

        print(f"\tDone Epoch: {epoch+1}/{n_epochs}, \t Train Loss AVG: {metrics['loss']['train'][-1]:.04}, Dev Loss AVG: {metrics['loss']['dev'][-1]:.04}")

    return model, metrics

In [13]:
# `training` reads the loss from the model's own forward output and never calls
# the criterion; it is still created because the function signature requires it.
criterion = torch.nn.CrossEntropyLoss()

# Train for 20 epochs; rebinds `model` to the returned model and keeps the
# per-epoch loss history in `metrics`
model, metrics = training(
    model=model,
    n_epochs=20,
    train_data=train_loader,
    dev_data=dev_loader,
    optimizer=optimizer,
    criterion=criterion,
)

Training model of type: rec_ace
- Scores vector will be used


  0%|          | 0/20 [00:00<?, ?it/s]

Train:   0%|          | 0/1299 [00:00<?, ?it/s]

KeyboardInterrupt: 