# Imports

In [44]:
import os
import random

from tqdm import tqdm, tqdm_note
import numpy as np
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

from data_utils.dataset import prepare_data
from data_utils.utils import read_json

## Seeds

In [45]:
SEED = 42

# Set the random seed for Python
random.seed(SEED)

# Set the random seed for numpy
np.random.seed(SEED)

# Set the random seed for pandas
# pandas gets its random seed from numpy, so using numpy's seed will affect pandas

# Set the random seed for NLTK
# NLTK gets its random seed from the Python random number generator (using random.seed())

torch.manual_seed(SEED)

<torch._C.Generator at 0x105f961f0>

# Data Paths

In [46]:
datasets_dict = {
    # Default
    'Default Train Clean': 'data/default/train_clean.json',
    'Default Train Other': 'data/default/train_other.json',
    'Default Dev Clean': 'data/default/dev_clean.json',
    'Default Dev Other': 'data/default/dev_other.json',
    'Default Test Clean': 'data/default/test_clean.json',
    'Default Test Other': 'data/default/test_other.json',
    # Video
    'Video Train Clean': 'data/video/train_clean.json',
    'Video Train Other': 'data/video/train_other.json',
    'Video Dev Clean': 'data/video/dev_clean.json',
    'Video Dev Other': 'data/video/dev_other.json',
    'Video Test Clean': 'data/video/test_clean.json',
    'Video Test Other': 'data/video/test_other.json', 
}

# Main

In [47]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

## Load essentials

In [9]:
model_type = 't5-small'

### Tokenizer

In [10]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


### Model

In [11]:
model = T5ForConditionalGeneration.from_pretrained("t5-small")

## Read Data

In [8]:
train_set = read_json(json_path=datasets_dict['Default Train Clean'])
dev_set = read_json(json_path=datasets_dict['Default Dev Clean'])
test_set = read_json(json_path=datasets_dict['Default Test Clean'])

## Prepare as DataLoader

In [29]:
batch_size = 8
train_loader = prepare_data(data=train_set, tokenizer=tokenizer, batch_size=batch_size, shuffle=True)
dev_loader = prepare_data(data=dev_set, tokenizer=tokenizer, batch_size=batch_size, shuffle=False)
test_loader = prepare_data(data=test_set , tokenizer=tokenizer, batch_size=batch_size, shuffle=False)

100%|██████████████████████████████████████| 103895/103895 [00:09<00:00, 10425.14it/s]
100%|██████████████████████████████████████████| 2697/2697 [00:00<00:00, 15090.89it/s]
100%|██████████████████████████████████████████| 2615/2615 [00:00<00:00, 15048.03it/s]


## Optimizer

In [18]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

## Training

In [42]:
def training(model, n_epochs, train_data, dev_data, optimizer, criterion):
    """ Training loop for the model

        Args:
            model (nn.Module): Model to train
            n_epochs (int): Number of epochs to train
            train_data (DataLoader): DataLoader with train data
            dev_data (DataLoader): DataLoader with dev data
            optimizer (torch.optim): Optimizer for the model
            criterion (torch.nn): Loss function

        Returns:
            dict: Dictionary with train and dev losses and accuracies
    """

    # metrics placeholder for recording training stats
    metrics = {
        'loss': {
            'train': [],
            'dev':   []
        },
        'acc': {
            'train': [],
            'dev':   []
        }
    }
    pbar = tqdm(range(n_epochs), position=0, desc=f"\tEpoch: {1}/{n_epochs}")
    for epoch in pbar:

        train_losses, train_acc = [], []
        dev_losses, dev_acc = [], []

        ### TRAIN
        model.train()

        # Iterating over batches in train data
        pbar_train = tqdm_notebook(train_data, position=1)
        for i_batch, batch in enumerate(pbar_train):
            pbar_train.set_description(f"Training on batch: {i_batch+1}/{len(train_data)}")

            optimizer.zero_grad()

            X = batch['sentences'].to(DEVICE)
            y = batch['labels'].to(DEVICE)

            loss = model(input_ids=X, labels=y).loss

            train_losses.append(loss.item())
            loss.backward()
            optimizer.step()

        ### Evaluate DEV set
        model.eval()

        # No need for gradients when evaluating
        with torch.no_grad():
            pbar_dev = tqdm_notebook(dev_data, position=2)
            for i_batch, batch in enumerate(pbar_dev):
                pbar_dev.set_description(f"DEV Iteration: {i_batch+1}/{len(dev_data)}")


                X = batch['sentences'].to(DEVICE)
                y = batch['labels'].to(DEVICE)

                loss = model(input_ids=X, labels=y).loss
       
                # Calculate DEV loss
                loss = criterion(preds, y_hot)
                dev_losses.append(loss.item())


        # Collect epoch's avg scores
        metrics['loss']['train'].append(np.mean(train_losses))
        metrics['loss']['dev'].append(np.mean(dev_losses))

        pbar.set_description(f"\tEpoch: {epoch+1}/{n_epochs}, \t Train Loss AVG: {metrics['loss']['train'][-1]:.04}, Dev Loss AVG: {metrics['loss']['dev'][-1]:.04}")

    return model, metrics

In [49]:
criterion = torch.nn.CrossEntropyLoss()
model, metrics = training(model=model,
                          n_epochs=20,
                          train_data=train_loader,
                          dev_data=dev_loader,
                          optimizer=optimizer,
                          criterion=criterion)

	Epoch: 1/20:   0%|                                            | 0/20 [00:00<?, ?it/s]


NameError: name 'tqdm_notebook' is not defined