# IMPORTS

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader, Sampler
from tqdm import tqdm

import re
import os
import copy
import requests
import zipfile
import pickle
import gensim
import random
import gensim.downloader as gloader
import numpy as np
import pandas as pd

In [None]:
# In order to use key_to_index attribute from the embedding model
! pip install gensim==4.1.2
import gensim
import gensim.downloader as gloader



In [None]:
EMBEDDING_DIMENSION = 100
BATCH_SIZE = 32
NUM_CLASSES = 2
EPOCHS = 3

# device = torch.device('cpu')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def fix_random(seed: int) -> None:
    """Fix all the possible sources of randomness.

    Args:
        seed: the seed to use. 
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True        
    
fix_random(42)
!nvidia-smi

Wed Dec 15 09:13:43 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8    28W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# PRE-PROCESSING

In [None]:
def save_response_content(response, destination):
    CHUNK_SIZE = 32768

    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)


def download_data(data_path):
    toy_data_path = os.path.join(data_path, 'fever_data.zip')
    toy_data_url_id = "1wArZhF9_SHW17WKNGeLmX-QTYw9Zscl1"
    toy_url = "https://docs.google.com/uc?export=download"

    if not os.path.exists(data_path):
        os.makedirs(data_path)

    if not os.path.exists(toy_data_path):
        print("Downloading FEVER data splits...")
        with requests.Session() as current_session:
            response = current_session.get(toy_url,
                                           params={'id': toy_data_url_id},
                                           stream=True)
        save_response_content(response, toy_data_path)
        print("Download completed!")

        print("Extracting dataset...")
        with zipfile.ZipFile(toy_data_path) as loaded_zip:
            loaded_zip.extractall(data_path)
        print("Extraction completed!")


def pre_process(dataset, filename):  # clean the dataset
    dataset.drop(dataset.columns[0], axis=1, inplace=True)  # remove first column of dataframe containing numbers
    dataset.drop(['ID'], axis=1, inplace=True)
    # remove numbers before each evidence
    dataset['Evidence'] = dataset['Evidence'].apply(lambda x: re.sub(r'^\d+\t', '', x))
    # remove everything after the period
    dataset['Evidence'] = dataset['Evidence'].apply(lambda x: re.sub(r' \..*', ' .', x))
    # remove round brackets and what they contain
    dataset['Evidence'] = dataset['Evidence'].apply(lambda x: re.sub(r'-LRB-.*-RRB-', '', x))
    # remove square brackets and what they contain
    dataset['Evidence'] = dataset['Evidence'].apply(lambda x: re.sub(r'-LSB-.*-RSB-', '', x))

    n_before = dataset.shape[0]
    # removes instances longer than a threshold on evidence
    # TODO: only on train
    dataset = dataset[dataset['Evidence'].str.split().str.len() <= 100]
    # remove all rows where there are single brackets in the evidence
    dataset = dataset[~dataset['Evidence'].str.contains('|'.join(['-LRB-', '-LSB-', '-RRB-', '-RSB-']))]
    n_after = dataset.shape[0]

    # removes punctuation and excessive spaces
    dataset = dataset.applymap(lambda x: re.sub(r'[^\w\s]', '', x))
    dataset = dataset.applymap(lambda x: re.sub(r' +', ' ', x))
    dataset = dataset.applymap(lambda x: re.sub(r'^ +', '', x))
    dataset = dataset.applymap(lambda x: x.lower())

    labels = {'supports': 1, 'refutes': 0}
    dataset = dataset.replace({'Label': labels})
    # removes rows with empty elements
    dataset = dataset[dataset['Evidence'] != '']
    dataset = dataset[dataset['Claim'] != '']
    dataset = dataset[dataset['Label'] != '']



    rem_elements = n_before - n_after
    print(f"Removed {rem_elements}\t ({100 * rem_elements / n_before:.2F}%)"
          f" elements because of inconsistency on {filename}")
    return dataset


#########################################

try:
    from google.colab import drive
    IN_COLAB=True
except:
    IN_COLAB=False

if IN_COLAB:
    print("We're running Colab")
    # Mount the Google Drive at mount
    mount='/content/gdrive'
    print("Colab: mounting Google drive on ", mount)
    drive.mount(mount)

    # Switch to the directory on the Google Drive that you want to use
    drive_root = mount + "/My Drive/NLP/Assignment2"
    
    # Create drive_root if it doesn't exist
    create_drive_root = True
    if create_drive_root:
        print("\nColab: making sure ", drive_root, " exists.")
        os.makedirs(drive_root, exist_ok=True)
    
    # Change to the directory
    print("\nColab: Changing directory to ", drive_root)
    %cd $drive_root
    print("Checking working directory:")
    %pwd

# download_data('dataset')

if not len(os.listdir("dataset_cleaned")):
    for file in os.listdir("dataset"):
        dataset_cleaned = pre_process(pd.read_csv("dataset/" + file, sep=','), file)
        dataset_cleaned.to_csv(os.path.join("dataset_cleaned", file))

We're running Colab
Colab: mounting Google drive on  /content/gdrive
Mounted at /content/gdrive

Colab: making sure  /content/gdrive/My Drive/NLP/Assignment2  exists.

Colab: Changing directory to  /content/gdrive/My Drive/NLP/Assignment2
/content/gdrive/My Drive/NLP/Assignment2
Checking working directory:


# CLASSES

## FactDataset

In [None]:
class FactDataset(Dataset):
    def __init__(self, emb_dim, glove_dict, glove_matrix, split='train', tokenizer=None):
        pairs_claim_evid = pd.read_csv(f"dataset_cleaned/{split}_pairs.csv")
        # tokenization & embeddings
        text = pairs_claim_evid['Claim'] + ' ' + pairs_claim_evid['Evidence']
        text = text.astype(str).to_list()

        # IMPORTANT: if a tokenizer is already computed for example on the train set, for val set we have to extend
        if tokenizer:
            self.tokenizer = tokenizer
            self.tokenizer.dataset_sentences = text
        else:
            self.tokenizer = Tokenizer(text, emb_dim, glove_dict, glove_matrix)

        self.tokenizer.tokenize()
        self.val_to_key = self.tokenizer.get_val_to_key()
        self.emb_matrix = self.tokenizer.build_embedding_matrix()
        print(len(self.emb_matrix))

        lengths = pairs_claim_evid['Evidence'].str.split().str.len().to_numpy()

        # print(len([i for i, el in enumerate(lengths) if el > 100]))

        self.max_claim_len = int(pairs_claim_evid['Claim'].str.split().str.len().max())
        self.max_evid_len = int(pairs_claim_evid['Evidence'].str.split().str.len().max())
        self.max_seq_len = int(max(self.max_evid_len, self.max_claim_len))

        # tester function
        if not self._check_tokenizer():
            raise ValueError

        # create x as list of sentences
        # sentences are splitted in claim evidence
        self.x = []
        print(f"Creating FactDataset: \n")
        for i, (claim_sen, evid_sen) in tqdm(enumerate(zip(pairs_claim_evid['Claim'], pairs_claim_evid['Evidence']))):
            sentence = [[], []]
            for word in claim_sen.split():
                sentence[0].append(self.val_to_key[word])
            for word in evid_sen.split():
                sentence[1].append(self.val_to_key[word])

            # Padding deprecated here
            # sentence[0] = sentence[0] + [0] * (self.max_seq_len - len(claim_sen.split()))
            # sentence[1] = sentence[1] + [0] * (self.max_seq_len - len(evid_sen.split()))
            self.x.append(sentence)

         
        # self.x = torch.tensor(self.x)
        # self.y = torch.tensor(pairs_claim_evid['Label'])

        # one-hot: [0,1] => supports, [1,0] => refutes
        self.y = [[0, 1] if l==1 else [1, 0] for l in pairs_claim_evid['Label'].to_list()]
        self.n_samples = pairs_claim_evid.shape[0]
        print(f"Max sequence length: {self.max_seq_len}")

    def __getitem__(self, index):
        # xi = [[claim], [evid]], yi = [label]
        return [self.x[index], self.y[index]]

    def __len__(self):
        return self.n_samples

    def _check_tokenizer(self):
        word = 'the'
        for i, el in enumerate(self.tokenizer.glove_matrix[self.tokenizer.glove_dict[word]]):
            if el != self.emb_matrix[self.val_to_key[word]][i]:
                print("Check Tokenizer, possible bugs")
                return False
        return True
    def get_tokenizer(self):
        return self.tokenizer

## Tokenizer

In [None]:
class Tokenizer(object):
    def __init__(self, dataset_sentences, embedding_dim, glove_dict, glove_matrix):
        self.embedding_matrix = None
        self.value_to_key = {}
        self.value_to_key_new = {}
        self.key_to_value = {}
        self.num_unique_words = 0
        self.dataset_sentences = dataset_sentences
        self.embedding_dim = embedding_dim
        self.glove_dict = glove_dict
        self.glove_matrix = glove_matrix
        self.unique_words = set()

    def get_val_to_key(self):
        return copy.deepcopy(self.value_to_key)

    def tokenize(self):
        self.value_to_key_new = {}
        unique_words = set()
        for sen in self.dataset_sentences:
            for w in sen.split():
                unique_words.add(w)  # get set of unique words
        new_unique = unique_words - self.unique_words
        for i, word in enumerate(new_unique):
            if self.embedding_matrix is not None:
                self.key_to_value[i + len(self.embedding_matrix)] = word  # build two dictionaries for key value correspondence
                self.value_to_key[word] = i + len(self.embedding_matrix)
            else:
                self.key_to_value[i] = word  # build two dictionaries for key value correspondence
                self.value_to_key[word] = i
            self.value_to_key_new[word] = i

        self.num_unique_words = len(new_unique)
        self.unique_words = self.unique_words | new_unique  # union of unique words and new unique words


    def __build_embedding_matrix_glove(self):
        oov_words = []
        tmp_embedding_matrix = np.zeros((self.num_unique_words, self.embedding_dim), dtype=np.float32)
        len_old_emb_matrix = len(self.embedding_matrix) if self.embedding_matrix is not None else 0
        print(f"Finding OOVs: ")
        for word, idx in tqdm(self.value_to_key_new.items()):
            try:
                embedding_vector = self.glove_matrix[self.glove_dict[word]]
                tmp_embedding_matrix[idx] = embedding_vector
            except (KeyError, TypeError):
                oov_words.append((word, idx + len_old_emb_matrix))
        if self.embedding_matrix is not None:
            self.embedding_matrix = np.vstack((self.embedding_matrix, tmp_embedding_matrix))
        else:
            self.embedding_matrix = tmp_embedding_matrix
        return oov_words

    def build_embedding_matrix(self):
        oov_words = self.__build_embedding_matrix_glove()
        print(f"Solving OOVs: ")
        for word, idx in tqdm(oov_words):
            embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=self.embedding_dim)
            self.embedding_matrix[idx] = embedding_vector

        # PADDING (feat. David Guetta)
        first = self.embedding_matrix[0]
        if np.count_nonzero(first) != 0:
            first = self.embedding_matrix[0]
            self.embedding_matrix[0] = np.zeros(self.embedding_dim)
            self.embedding_matrix = np.vstack((self.embedding_matrix, first))

            word_to_change = min(self.value_to_key.items(), key=lambda x: x[1])[0]
            self.key_to_value[0] = '<PAD>'
            self.value_to_key['<PAD>'] = 0
            self.value_to_key[word_to_change] = len(self.embedding_matrix) - 1
            self.key_to_value[len(self.embedding_matrix) - 1] = word_to_change

        return copy.deepcopy(self.embedding_matrix)


## Model


In [56]:
class Model(nn.Module):
    def __init__(self, embedding_dim, output_dim, pre_trained_emb,
                 merging_type='concatenation', sentence_type='last_state'):
        super().__init__()
        self.merging_type = merging_type
        self.sentence_type = sentence_type
        self.embedding = nn.Embedding.from_pretrained(embeddings=pre_trained_emb, freeze=True, padding_idx=0)
        self.rnn_claim = nn.RNN(input_size=embedding_dim, hidden_size=embedding_dim, batch_first=True)
        self.rnn_evid = nn.RNN(input_size=embedding_dim, hidden_size=embedding_dim, batch_first=True)
        self.fc = nn.Linear(embedding_dim * (2 if merging_type == 'concatenation' else 1), 2)

        self.softmax = nn.Softmax(dim=1)
        self.sigmoid = nn.Sigmoid()
        self.i = 0
        # Find a good way to initialize
        # torch.nn.init.uniform_(self.fc.weight, a=0.49, b=0.51)

    def forward(self, claim, evid):
        # claim = [max_tok, emb_dim]
        # evid = [max_tok, emb_dim]
        # self.i+=1

        seq_lengths_claim = torch.tensor(list(map(len, claim)))
        seq_lengths_evid = torch.tensor(list(map(len, evid)))

        seq_lengths_claim, perm_idx_claim = seq_lengths_claim.sort(0, descending=True)
        seq_lengths_evid, perm_idx_evid = seq_lengths_evid.sort(0, descending=True)

        seq_claim = claim[perm_idx_claim]
        seq_evid = evid[perm_idx_evid]

        emb_claim = self.embedding(seq_claim.long())# [batch_size, max_len_in_batch, 50]
        emb_evid = self.embedding(seq_evid.long())# [batch_size, max_len_in_batch, 50]

        packed_claim = pack_padded_sequence(emb_claim, seq_lengths_claim, batch_first=True)
        packed_evid = pack_padded_sequence(emb_evid, seq_lengths_evid, batch_first=True)

        output_claim, hidden_claim = self.rnn_claim(packed_claim) 
        output_evid, hidden_evid = self.rnn_evid(packed_evid) 

        output_claim_unpack, claim_lens_unpacked = pad_packed_sequence(output_claim, batch_first=True)
        output_evid_unpack, evid_lens_unpacked = pad_packed_sequence(output_evid, batch_first=True)

        output_claim_idxs_first_zero = torch.argmin(torch.abs(torch.sum(output_claim_unpack, dim=2)), dim=1, keepdim=True)

        print()
        [print(output_claim_unpack[i, -1]) for i in range(32)]
        print(output_evid_unpack.size())
        print(torch.sum(output_claim_unpack, dim=2).size())

        print(torch.sum(output_claim_unpack, dim=2))
        print(output_claim_idxs_first_zero)

        raise

        if self.sentence_type == 'last_state':
            merge_input1 = output_claim_unpack[:, -1, :] # [batch_size, EMB_DIM]
            merge_input2 = output_evid_unpack[:, -1, :] # [batch_size, EMB_DIM]
            
            # print("\n CLAIM RNN", merge_input1)
            # print(merge_input1.size())
            # print("\n EVID RNN", merge_input2)

        elif self.sentence_type == 'average_outputs':
            # recheck shapes on this, not yet tested
            merge_input1 = torch.mean(output_claim)
            merge_input2 = torch.mean(output_evid)
        elif self.sentence_type == 'simple_mlp':
            raise NotImplementedError
        else: # mean of tokens
            raise NotImplementedError

        if self.merging_type == 'concatenation':
            merge = torch.cat([merge_input1, merge_input2], 1) # [batch_size, EMB_DIM*2]
            # print("\n MERGE", merge)
            # if self.i == 30:
            #     raise
        elif self.merging_type == 'sum':
            merge = torch.sum([merge_input1, merge_input2], dim=0) # [batch_size, EMB_DIM]
        else:
            merge = torch.mean([merge_input1, merge_input2], dim=0)  # [batch_size, EMB_DIM]

        dense = self.fc(merge)
        # print("SOFTMAX:", self.softmax(dense))
        # print("############################### NEXT BATCH ###########################")
        return self.softmax(dense)
    
    def __str__(self):
        return f"Model with '{self.sentence_type}' "\
               f"sentence embeddings and '{self.merging_type}' merging type."

### Model functions for training

In [None]:
##### General functions for Models #####

def progress_bar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    credits to: https://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total: 
        print()

def train_epoch(model:nn.Module,
                iterator_train: torch.utils.data.DataLoader,
                device: torch.device,
                optimizer: torch.optim,
                loss,
                epoch,
                epochs):

    accuracy_epoch = 0
    loss_epoch = 0
    num_batches = len(iterator_train)

    progress_bar(0, num_batches, prefix='', suffix='Initializing', length=20, printEnd='')

    
    model.train()
    for i, batch in enumerate(iterator_train):

        # print(len(batch[0]))
        # raise

        labels = batch[1].to(torch.float32) 
        claims = batch[0][0].to(torch.float32)
        evids = batch[0][1].to(torch.float32)

        # if i==0:
        #     print()
        #     print(' '.join([test.tokenizer.key_to_value[el.item()] for el in claims[0]])) # claim
        #     print(' '.join([test.tokenizer.key_to_value[el.item()] for el in evids[0]])) # evid
        #     print('supports' if labels[0].item() == 1 else 'refutes') # label
        #     raise

        predictions = model(claims, evids) # claim, evid as inputs
        
        # REMEMBER: round to nearest integer for dense and softmax
        accuracy_step = binary_accuracy(torch.round(predictions), labels)
        accuracy_epoch += accuracy_step

        # if i <= 30:
        #     print()
        #     print("PREDS: ", predictions)
        #     print("TRUES: ", labels)

        # if i == 30:
        #     raise

        loss_step = loss(predictions, labels)
        loss_epoch += loss_step.item()

        loss_step.backward()
        optimizer.step()

        if (i + 1) % (num_batches // 20) == 0:
            train_info_str = f"-- Epoch: {epoch}/{epochs} "\
                                f"-- Step: {i + 1}/{num_batches} "\
                                f"-- Acc: {accuracy_step:.2F} "\
                                f"-- Loss: {loss_step.item():.2F}"

            progress_bar(i + 1, num_batches, prefix='', suffix=train_info_str, length=20, printEnd='')
            
    loss_epoch /= num_batches
    accuracy_epoch /= num_batches
    return loss_step.item(), accuracy_step

def validate_epoch(model:nn.Module,
                    iterator_val: torch.utils.data.DataLoader,
                    device: torch.device,
                    optimizer: torch.optim,
                    loss,
                    epoch):

    accuracy_epoch = 0
    loss_epoch = 0
    num_batches = len(iterator_val)
    
    model.eval()
    with torch.no_grad():
        for i, batch in enumerate(iterator_val):
            labels = batch[1].to(torch.float32) 
            claims = batch[0][0].to(torch.float32)
            evids = batch[0][1].to(torch.float32)

            predictions = model(claims, evids) # claim, evid as inputs        
            
            # REMEMBER: round to nearest integer for dense and argmax for softmax
            accuracy_step = binary_accuracy(torch.round(predictions), labels)
            accuracy_epoch += accuracy_step

            loss_step = loss(predictions, labels)
            loss_epoch += loss_step.item()

    loss_epoch /= num_batches
    accuracy_epoch /= num_batches

    return loss_step.item(), accuracy_step
    

def train_model(model, 
                epochs, 
                batch_size, 
                iterator_train, 
                iterator_validation, 
                optimizer, 
                loss, 
                num_train_samples, 
                device):
    model.to(device)
    num_iters = num_train_samples // batch_size

    train_losses = []
    train_accs = []
    val_losses = []
    val_accs = []

    for epoch in range(1, epochs + 1):
        train_loss, train_acc = train_epoch(model, iterator_train, device, optimizer, loss, epoch, epochs)
        train_losses.append(train_loss)
        train_accs.append(train_acc)

        val_loss, val_acc = validate_epoch(model, iterator_validation, device, optimizer, loss, epoch)
        val_losses.append(val_loss)
        val_accs.append(val_acc)

        epoch_info_str = f"-- Epoch: {epoch}/{epochs} "\
                         f"-- Step: {num_iters}/{num_iters} "\
                         f"-- Acc: {train_acc:.2F} "\
                         f"-- Loss: {train_loss:.2F} "\
                         f"-- Acc_Val: {val_acc:.2F} "\
                         f"-- Loss_Val: {val_loss:.2F}"
        progress_bar(num_iters, num_iters, prefix='', suffix=epoch_info_str, length=20, printEnd='')
        
    return train_losses, train_accs, val_losses, val_accs



def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    correct = [torch.equal(pred, target) for pred, target in zip(preds, y)]
    acc = sum(correct) / len(y)
    return acc


# LOADING DATASET

In [None]:
def load_dataset():
    # LOAD GLOVE
    try:
        with open(f"glove-{EMBEDDING_DIMENSION}.pkl", 'rb') as f:
            emb_model = pickle.load(f)
    except Exception:
        emb_model = gloader.load(f"glove-wiki-gigaword-{EMBEDDING_DIMENSION}")
        with open(f"glove-{EMBEDDING_DIMENSION}.pkl", 'wb') as f:
            pickle.dump(emb_model, f)

    glove_dict = emb_model.key_to_index
    glove_matrix = emb_model.vectors

    # LOAD CLEANED DATA IN TORCH DATASET OBJECTS
    splits = {}
    for split in ['train', 'val', 'test']:
        try:
            with open(f"{os.path.join('dataset_torched', split)}.pkl", 'rb') as f:
                splits[split] = pickle.load(f)
        except Exception:
            if split == 'train':
                tokenizer = None
            elif split == 'val':
                tokenizer = splits['train'].tokenizer
            else:
                tokenizer = splits['val'].tokenizer 

            splits[split] = FactDataset(EMBEDDING_DIMENSION, glove_dict, glove_matrix, split, tokenizer=tokenizer)
            with open(f"{os.path.join('dataset_torched', split)}.pkl", 'wb') as f:
                pickle.dump(splits[split], f)

    return splits


splits = load_dataset()
train, val, test = splits['train'], splits['val'], splits['test']

In [None]:
print(val[:2])

[[[[12083, 136, 4457, 8074, 29785, 18095, 12083, 12049, 20449, 26075, 9109], [2773, 21537, 28304, 32163, 10086, 7129, 8074, 6845, 6809, 29972, 18095, 12083, 9245, 24980, 20449, 30765, 9109]], [[28084, 19819, 21582], [20209, 24556, 13183, 10237, 25852, 29234, 27881, 19938, 22097, 26254, 32833, 7129, 21582, 21115, 12083, 1538]]], [[0, 1], [0, 1]]]


In [None]:
# https://www.scottcondron.com/jupyter/visualisation/audio/2020/12/02/dataloaders-samplers-collate.html#Input-to-your-collate-function

def chunk(indices, chunk_size):
    return torch.split(torch.tensor(indices), chunk_size)


class SequentialSampler(Sampler[int]):
    def __init__(self, dataset, batch_size):
        self.batch_size = batch_size
        self.dataset = dataset 
        self.indices = random.sample(list(range(len(dataset))), len(dataset))
        self.batch_size = batch_size        

    def __iter__(self):
        batches = chunk(self.indices, self.batch_size)
        return iter(batches)

    def __len__(self):
        return len(self.dataset)//self.batch_size
    
def collate_fn(batch):
    label_list, claim_evid, = [], [[], []]
    batch = [list(sample) for sample in batch]

    first_el = batch[0]
    # print(' '.join([test.tokenizer.key_to_value[el] for el in first_el[0][0]])) # claim
    # print(' '.join([test.tokenizer.key_to_value[el] for el in first_el[0][1]])) # evid
    # print('supports' if first_el[1] == 1 else 'refutes') # label

    for _text, _label in batch:  # [  [[claim, evid], label] ... ]
        label_list.append(_label)
        claim_evid[0].append(torch.tensor(_text[0]))
        claim_evid[1].append(torch.tensor(_text[1]))    

    label_list = torch.tensor(label_list, dtype=torch.int64)
    claim_evid[0] = pad_sequence(claim_evid[0], batch_first=True, padding_value=0)
    claim_evid[1] = pad_sequence(claim_evid[1], batch_first=True, padding_value=0)

    # print(' '.join([test.tokenizer.key_to_value[el.item()] for el in claim_evid[0][0]]))
    # print(' '.join([test.tokenizer.key_to_value[el.item()] for el in claim_evid[1][0]]))

    
    return (claim_evid[0].to(device), claim_evid[1].to(device)), label_list.to(device)



sequential_sampler_train = SequentialSampler(train, BATCH_SIZE)
sequential_sampler_val = SequentialSampler(val, BATCH_SIZE)
sequential_sampler_test = SequentialSampler(test, BATCH_SIZE)


dataloader_train = DataLoader(dataset=train,
                              pin_memory=False, shuffle=False, collate_fn=collate_fn,
                               batch_sampler=sequential_sampler_train)

dataloader_val = DataLoader(dataset=val,
                            pin_memory=False, shuffle=False, collate_fn=collate_fn,
                             batch_sampler=sequential_sampler_val)

dataloader_test = DataLoader(dataset=test, 
                             pin_memory=False, shuffle=False, collate_fn=collate_fn,
                              batch_sampler=sequential_sampler_test)

print(len(dataloader_train))



3784


In [50]:
# TESTING DATALOADERS
for i, batch in enumerate(dataloader_test):
    # dataloader return batch by batch

    # print(batch[0][1])

    labels = batch[1]
    claims = batch[0][0]
    evids = batch[0][1]

    print(' '.join([test.tokenizer.key_to_value[el.item()] for el in claims[0]]))
    print(' '.join([test.tokenizer.key_to_value[el.item()] for el in evids[0]]))
    print('supports' if labels[0] == [0, 1] else 'refutes')


    if i == 0:
        break

# CHECK PADDING TOKEN 
# print(f"PADDING STRING: {val.tokenizer.key_to_value[0]}")
# print(val.tokenizer.key_to_value)


mutiny on the bounty was adapted exactly once into a movie <PAD> <PAD> <PAD>
it is the second american film to be made from the novel the first being mutiny on the bounty <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
refutes


In [None]:
print(train.tokenizer.value_to_key['<PAD>'])

0


# TRAIN VARIOUS MODELS

In [57]:
sentence_emb_types = [
                    'last_state',
                    'average_outputs',
                    'simple_mlp',
                    'mean_of_tokens'
]

merging_types = [
                'concatenation',
                'sum',
                'mean'
]


for sentence_type in sentence_emb_types:
    for merging_type in merging_types:
        model_params = {
            'embedding_dim': EMBEDDING_DIMENSION,
            'output_dim': NUM_CLASSES,
            'pre_trained_emb': torch.tensor(test.emb_matrix).to(device),
            'merging_type': merging_type,
            'sentence_type': sentence_type
        }

        model = Model(**model_params)
        optimizer = optim.Adam(model.parameters(), lr=1e-4)
        loss = nn.CrossEntropyLoss()

        model = model.to(device)
        loss = loss.to(device)

        # summary(model, (2, 90)
        # quit()

        training_info = {
            'model': model,
            'epochs': EPOCHS,
            'batch_size': BATCH_SIZE,
            'iterator_train': dataloader_train,
            'iterator_validation': dataloader_val,
            'optimizer': optimizer,
            'loss': loss,
            'num_train_samples': train.n_samples,
            'device': device
        }

        train_losses, train_accs, val_losses, val_accs = train_model(**training_info)

        # REMEMBER: delete the breaks after debugging
        break
    break

 |--------------------| 0.0% Initializing
tensor([-0.0470,  0.0032,  0.1078, -0.0395, -0.0184,  0.0178,  0.0557, -0.1810,
         0.0641, -0.0687,  0.1372, -0.0466,  0.1223,  0.0143, -0.0725,  0.0722,
         0.0261,  0.1779,  0.1304, -0.0624, -0.1249, -0.1180, -0.0241, -0.1356,
        -0.1568, -0.1545,  0.1989,  0.0261,  0.1779,  0.0542, -0.1414,  0.0446,
        -0.0186,  0.0028,  0.1013,  0.1417, -0.0049, -0.0346, -0.0957, -0.0776,
         0.0722, -0.0544,  0.0199, -0.0408,  0.0316,  0.2567,  0.0554,  0.1549,
         0.1056,  0.0853,  0.0329, -0.0699, -0.0289,  0.1002,  0.1541, -0.1160,
         0.0493, -0.0251, -0.1584, -0.1016, -0.0798,  0.0485,  0.0751,  0.0006,
         0.1644,  0.1493, -0.1324, -0.0561, -0.0903,  0.1500, -0.1353,  0.1137,
         0.0938,  0.0575,  0.0669,  0.0179, -0.1150, -0.1089, -0.1543, -0.1785,
         0.0772,  0.1376, -0.0864, -0.1078,  0.1567,  0.0087, -0.0778, -0.0518,
        -0.0369,  0.0205, -0.1191,  0.0645, -0.0156, -0.0793,  0.1172,  0.108

RuntimeError: ignored

In [None]:
# problemi con le label e target, sempre piu allineati

In [None]:

# saving model
torch.save(model, 'model/model.pkl')

In [None]:
# load model
model_params = {
            'embedding_dim': EMBEDDING_DIMENSION,
            'output_dim': NUM_CLASSES,
            'pre_trained_emb': torch.tensor(test.emb_matrix).to(device),
            'merging_type': 'last_state',
            'sentence_type': 'concatenation'
        }


model = Model(**model_params)
model = torch.load('model/model.pkl')
model.to(device)
model.eval()

In [None]:
def predict(model:nn.Module,
            claims,
            evids):
    
    model.eval()
    with torch.no_grad():
        claims = claims.to(torch.float32)
        evids = evids.to(torch.float32)

        prediction = torch.round(model(claims, evids)).cpu().tolist()
        print(f"PREDICTED LABEL: {'supports' if prediction == [.0, 1.0] else 'refutes'}")

for i, batch in enumerate(dataloader_test):
    for j in range(BATCH_SIZE):

        labels = batch[1].cpu().tolist()
        claims = batch[0][0]
        evids = batch[0][1]

        # print(' '.join([test.tokenizer.key_to_value[el.item()] for el in claims[j]]))
        # print(' '.join([test.tokenizer.key_to_value[el.item()] for el in evids[j]]))
        print(f"TRUE LABEL: {'supports' if labels[j] == [0, 1] else 'refutes'}")

        predict(model, torch.unsqueeze(claims[j], 0), torch.unsqueeze(evids[j], 0))

    if i == 20:
        break


TRUE LABEL: supports
PREDICTED LABEL: refutes
TRUE LABEL: supports
PREDICTED LABEL: refutes
TRUE LABEL: supports
PREDICTED LABEL: refutes
TRUE LABEL: supports
PREDICTED LABEL: refutes
TRUE LABEL: refutes
PREDICTED LABEL: refutes
TRUE LABEL: refutes
PREDICTED LABEL: refutes
TRUE LABEL: supports
PREDICTED LABEL: refutes
TRUE LABEL: refutes
PREDICTED LABEL: refutes
TRUE LABEL: supports
PREDICTED LABEL: refutes
TRUE LABEL: refutes
PREDICTED LABEL: refutes
TRUE LABEL: supports
PREDICTED LABEL: refutes
TRUE LABEL: supports
PREDICTED LABEL: refutes
TRUE LABEL: refutes
PREDICTED LABEL: refutes
TRUE LABEL: supports
PREDICTED LABEL: refutes
TRUE LABEL: refutes
PREDICTED LABEL: refutes
TRUE LABEL: supports
PREDICTED LABEL: refutes
TRUE LABEL: supports
PREDICTED LABEL: refutes
TRUE LABEL: supports
PREDICTED LABEL: refutes
TRUE LABEL: supports
PREDICTED LABEL: refutes
TRUE LABEL: supports
PREDICTED LABEL: refutes
TRUE LABEL: refutes
PREDICTED LABEL: refutes


In [None]:
#1 SISTEMARE LOSS
#2 NON STAMPARE SEMPRE LO STESSO
#3 