# IMPORTS

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

import re
import os
import copy
import requests
import zipfile
import pickle
import gensim
import random
import gensim.downloader as gloader
import numpy as np
import pandas as pd

In [35]:
EMBEDDING_DIMENSION = 50
BATCH_SIZE = 64
NUM_CLASSES = 2
EPOCHS = 1

device = torch.device('cpu')
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
def fix_random(seed: int) -> None:
    """Fix all the possible sources of randomness.

    Args:
        seed: the seed to use. 
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True        
    
fix_random(42)
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



# PRE-PROCESSING

In [4]:
def save_response_content(response, destination):
    CHUNK_SIZE = 32768

    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)


def download_data(data_path):
    toy_data_path = os.path.join(data_path, 'fever_data.zip')
    toy_data_url_id = "1wArZhF9_SHW17WKNGeLmX-QTYw9Zscl1"
    toy_url = "https://docs.google.com/uc?export=download"

    if not os.path.exists(data_path):
        os.makedirs(data_path)

    if not os.path.exists(toy_data_path):
        print("Downloading FEVER data splits...")
        with requests.Session() as current_session:
            response = current_session.get(toy_url,
                                           params={'id': toy_data_url_id},
                                           stream=True)
        save_response_content(response, toy_data_path)
        print("Download completed!")

        print("Extracting dataset...")
        with zipfile.ZipFile(toy_data_path) as loaded_zip:
            loaded_zip.extractall(data_path)
        print("Extraction completed!")


def pre_process(dataset, filename):  # clean the dataset
    dataset.drop(dataset.columns[0], axis=1, inplace=True)  # remove first column of dataframe containing numbers
    dataset.drop(['ID'], axis=1, inplace=True)
    # remove numbers before each evidence
    dataset['Evidence'] = dataset['Evidence'].apply(lambda x: re.sub(r'^\d+\t', '', x))
    # remove everything after the period
    dataset['Evidence'] = dataset['Evidence'].apply(lambda x: re.sub(r' \..*', ' .', x))
    # remove round brackets and what they contain
    dataset['Evidence'] = dataset['Evidence'].apply(lambda x: re.sub(r'-LRB-.*-RRB-', '', x))
    # remove square brackets and what they contain
    dataset['Evidence'] = dataset['Evidence'].apply(lambda x: re.sub(r'-LSB-.*-RSB-', '', x))

    n_before = dataset.shape[0]
    # removes instances longer than a threshold on evidence
    dataset = dataset[dataset['Evidence'].str.split().str.len() <= 100]
    # remove all rows where there are single brackets in the evidence
    dataset = dataset[~dataset['Evidence'].str.contains('|'.join(['-LRB-', '-LSB-', '-RRB-', '-RSB-']))]
    n_after = dataset.shape[0]

    # removes punctuation and excessive spaces
    dataset = dataset.applymap(lambda x: re.sub(r'[^\w\s]', '', x))
    dataset = dataset.applymap(lambda x: re.sub(r' +', ' ', x))
    dataset = dataset.applymap(lambda x: re.sub(r'^ +', '', x))
    dataset = dataset.applymap(lambda x: x.lower())

    labels = {'supports': 1, 'refutes': 0}
    dataset = dataset.replace({'Label': labels})
    # removes rows with empty elements
    dataset = dataset[dataset['Evidence'] != '']
    dataset = dataset[dataset['Claim'] != '']
    dataset = dataset[dataset['Label'] != '']



    rem_elements = n_before - n_after
    print(f"Removed {rem_elements}\t ({100 * rem_elements / n_before:.2F}%)"
          f" elements because of inconsistency on {filename}")
    return dataset


#########################################

try:
    from google.colab import drive
    IN_COLAB=True
except:
    IN_COLAB=False

if IN_COLAB:
    print("We're running Colab")
    # Mount the Google Drive at mount
    mount='/content/gdrive'
    print("Colab: mounting Google drive on ", mount)
    drive.mount(mount)

    # Switch to the directory on the Google Drive that you want to use
    drive_root = mount + "/My Drive/NLP/Assignment2"
    
    # Create drive_root if it doesn't exist
    create_drive_root = True
    if create_drive_root:
        print("\nColab: making sure ", drive_root, " exists.")
        os.makedirs(drive_root, exist_ok=True)
    
    # Change to the directory
    print("\nColab: Changing directory to ", drive_root)
    %cd $drive_root
    print("Checking working directory:")
    %pwd

# download_data('dataset')

if not len(os.listdir("dataset_cleaned")):
    for file in os.listdir("dataset"):
        dataset_cleaned = pre_process(pd.read_csv("dataset/" + file, sep=','), file)
        dataset_cleaned.to_csv(os.path.join("dataset_cleaned", file))

We're running Colab
Colab: mounting Google drive on  /content/gdrive
Mounted at /content/gdrive

Colab: making sure  /content/gdrive/My Drive/NLP/Assignment2  exists.

Colab: Changing directory to  /content/gdrive/My Drive/NLP/Assignment2
/content/gdrive/My Drive/NLP/Assignment2
Checking working directory:


# CLASSES

## FactDataset

In [31]:
class FactDataset(Dataset):
    def __init__(self, emb_dim, glove_dict, glove_matrix, split='train', tokenizer=None):
        pairs_claim_evid = pd.read_csv(f"dataset_cleaned/{split}_pairs.csv")
        # tokenization & embeddings
        text = pairs_claim_evid['Claim'] + ' ' + pairs_claim_evid['Evidence']
        text = text.astype(str).to_list()

        # IMPORTANT: if a tokenizer is already computed for example on the train set, for val set we have to extend
        if tokenizer:
            self.tokenizer = tokenizer
            self.tokenizer.dataset_sentences = text
        else:
            self.tokenizer = Tokenizer(text, emb_dim, glove_dict, glove_matrix)

        self.tokenizer.tokenize()
        self.val_to_key = self.tokenizer.get_val_to_key()
        self.emb_matrix = self.tokenizer.build_embedding_matrix()
        print(len(self.emb_matrix))

        lengths = pairs_claim_evid['Evidence'].str.split().str.len().to_numpy()

        # print(len([i for i, el in enumerate(lengths) if el > 100]))

        self.max_claim_len = int(pairs_claim_evid['Claim'].str.split().str.len().max())
        self.max_evid_len = int(pairs_claim_evid['Evidence'].str.split().str.len().max())
        self.max_seq_len = int(max(self.max_evid_len, self.max_claim_len))

        # tester function
        if not self._check_tokenizer():
            raise ValueError

        # create x as list of sentences
        # sentences are splitted in claim evidence
        self.x = []
        print(f"Creating FactDataset: \n")
        for i, (claim_sen, evid_sen) in tqdm(enumerate(zip(pairs_claim_evid['Claim'], pairs_claim_evid['Evidence']))):
            sentence = [[], []]
            for word in claim_sen.split():
                sentence[0].append(self.val_to_key[word])
            for word in evid_sen.split():
                sentence[1].append(self.val_to_key[word])

            sentence[0] = sentence[0] + [0] * (self.max_seq_len - len(claim_sen.split()))
            sentence[1] = sentence[1] + [0] * (self.max_seq_len - len(evid_sen.split()))
            self.x.append(sentence)

        self.x = torch.tensor(self.x)
        self.y = torch.tensor(pairs_claim_evid['Label'])
        self.n_samples = pairs_claim_evid.shape[0]
        print(f"Max sequence length: {self.max_seq_len}")

    def __getitem__(self, index):
        # xi = [[claim], [evid]], yi = [label]
        return self.x[index], self.y[index]

    def __len__(self):
        return self.n_samples

    def _check_tokenizer(self):
        word = 'the'
        for i, el in enumerate(self.tokenizer.glove_matrix[self.tokenizer.glove_dict[word]]):
            if el != self.emb_matrix[self.val_to_key[word]][i]:
                print("Check Tokenizer, possible bugs")
                return False
        return True

## Tokenizer

In [32]:
class Tokenizer(object):
    def __init__(self, dataset_sentences, embedding_dim, glove_dict, glove_matrix):
        self.embedding_matrix = None
        self.value_to_key = {}
        self.value_to_key_new = {}
        self.key_to_value = {}
        self.num_unique_words = 0
        self.dataset_sentences = dataset_sentences
        self.embedding_dim = embedding_dim
        self.glove_dict = glove_dict
        self.glove_matrix = glove_matrix
        self.unique_words = set()

    def get_val_to_key(self):
        return copy.deepcopy(self.value_to_key)

    def tokenize(self):
        self.value_to_key_new = {}
        unique_words = set()
        for sen in self.dataset_sentences:
            for w in sen.split():
                unique_words.add(w)  # get set of unique words
        new_unique = unique_words - self.unique_words
        for i, word in enumerate(new_unique):
            if self.embedding_matrix is not None:
                self.key_to_value[i + len(self.embedding_matrix)] = word  # build two dictionaries for key value correspondence
                self.value_to_key[word] = i + len(self.embedding_matrix)
            else:
                self.key_to_value[i] = word  # build two dictionaries for key value correspondence
                self.value_to_key[word] = i
            self.value_to_key_new[word] = i

        self.num_unique_words = len(new_unique)
        self.unique_words = self.unique_words | new_unique  # union of unique words and new unique words


    def __build_embedding_matrix_glove(self):
        oov_words = []
        tmp_embedding_matrix = np.zeros((self.num_unique_words, self.embedding_dim), dtype=np.float32)
        len_old_emb_matrix = len(self.embedding_matrix) if self.embedding_matrix is not None else 0
        print(f"Finding OOVs: ")
        for word, idx in tqdm(self.value_to_key_new.items()):
            try:
                embedding_vector = self.glove_matrix[self.glove_dict[word]]
                tmp_embedding_matrix[idx] = embedding_vector
            except (KeyError, TypeError):
                oov_words.append((word, idx + len_old_emb_matrix))
        if self.embedding_matrix is not None:
            self.embedding_matrix = np.vstack((self.embedding_matrix, tmp_embedding_matrix))
        else:
            self.embedding_matrix = tmp_embedding_matrix
        return oov_words

    def build_embedding_matrix(self):
        oov_words = self.__build_embedding_matrix_glove()
        print(f"Solving OOVs: ")
        for word, idx in tqdm(oov_words):
            embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=self.embedding_dim)
            self.embedding_matrix[idx] = embedding_vector

        # PADDING (feat. David Guetta)
        first = self.embedding_matrix[0]
        if np.count_nonzero(first) == 0:
            first = self.embedding_matrix[0]
            self.embedding_matrix[0] = np.zeros(self.embedding_dim)
            self.embedding_matrix = np.vstack((self.embedding_matrix, first))

            word_to_change = min(self.value_to_key.items(), key=lambda x: x[1])[0]
            self.value_to_key[word_to_change] = len(self.embedding_matrix) - 1
            self.key_to_value[len(self.embedding_matrix) - 1] = word_to_change

        return copy.deepcopy(self.embedding_matrix)


## Model


In [64]:
class Model(nn.Module):
    def __init__(self, sentence_len, embedding_dim, output_dim, pre_trained_emb,
                 merging_type='concatenation', sentence_type='last_state'):
        super().__init__()
        self.merging_type = merging_type
        self.sentence_type = sentence_type
        self.embedding = nn.Embedding.from_pretrained(embeddings=pre_trained_emb, freeze=True, padding_idx=0)
        self.rnn_claim = nn.RNN(input_size=embedding_dim, hidden_size=embedding_dim, batch_first=True)
        self.rnn_evid = nn.RNN(input_size=embedding_dim, hidden_size=embedding_dim, batch_first=True)
        self.fc = nn.Linear(embedding_dim * (2 if merging_type == 'concatenation' else 1), 1)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        # claim = [max_tok, emb_dim]
        # evid = [max_tok, emb_dim]

        emb_claim = self.embedding(x[:, 0].long())# [batch_size, 90, 50]
        emb_evid = self.embedding(x[:, 1].long())# [batch_size, 90, 50]

        '''
        print()
        print('emb_claim')
        print(emb_claim.shape)
        print(emb_claim)
        '''

        #!!! PROBLEM !!!: rnn return a vector [batch_size, 50] where 50 vectors are identical
        output_claim, hidden_claim = self.rnn_claim(emb_claim) 
        output_evid, hidden_evid = self.rnn_evid(emb_evid) 

        '''
        print(hidden_claim.shape)
        print(hidden_claim)
        '''

        if self.sentence_type == 'last_state':
            merge_input1 = hidden_claim[0] # [batch_size, 50]
            merge_input2 = hidden_evid[0] # [batch_size, 50]
        elif self.sentence_type == 'average_outputs':
            # recheck shapes on this, not yet tested
            merge_input1 = torch.mean(output_claim)
            merge_input2 = torch.mean(output_evid)
        elif self.sentence_type == 'simple_mlp':
            raise NotImplementedError
        else: # mean of tokens
            raise NotImplementedError

        if self.merging_type == 'concatenation':
            merge = torch.cat([merge_input1, merge_input2], 1) # [batch_size, 100]
        elif self.merging_type == 'sum':
            merge = torch.sum([merge_input1, merge_input2], dim=0) # [batch_size, 50]
        else:
            merge = torch.mean([merge_input1, merge_input2], dim=0)  # [batch_size, 50]

        dense = self.fc(merge)
        return dense
    
    def __str__(self):
        return f"Model with '{self.sentence_type}' "\
               f"sentence embeddings and '{self.merging_type}' merging type."

### Model functions for training

In [65]:
##### General functions for Models #####

def progress_bar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    credits to: https://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total: 
        print()

def train_epoch(model:nn.Module,
              iterator_train: torch.utils.data.DataLoader,
              device: torch.device,
              optimizer: torch.optim,
              loss,
              epoch,
              epochs):

    accuracy_epoch = 0
    loss_epoch = 0
    num_batches = len(iterator_train)

    progress_bar(0, num_batches, prefix='', suffix='Initializing', length=20, printEnd='')

    
    model.train()
    for i, (inputs, labels) in enumerate(iterator_train):
        optimizer.zero_grad()
        inputs, labels = inputs.to(device).to(torch.float32), labels.to(device).to(torch.float32)
        
        predictions = model(inputs).squeeze(1)
        
        # REMEMBER: round to nearest integer for dense and softmax
        accuracy_step = binary_accuracy(torch.round(predictions), labels)
        accuracy_epoch += accuracy_step.item()
        
        # print(predictions)
        # print(labels
        # raise

        loss_step = loss(predictions, labels)
        loss_epoch += loss_step.item()

        loss_step.backward()
        optimizer.step()

        if (i + 1) % (num_batches // 20) == 0:

            train_info_str = f"-- Epoch: {epoch}/{epochs} "\
                                f"-- Step: {i + 1}/{num_batches} "\
                                f"-- Acc: {accuracy_epoch/(i+1):.2F} "\
                                f"-- Loss: {loss_epoch/(i+1):.2F}"

            progress_bar(i + 1, num_batches, prefix='', suffix=train_info_str, length=20, printEnd='')

            
    loss_epoch /= num_batches
    accuracy_epoch /= num_batches
    return loss_epoch, accuracy_epoch

def validate_epoch(model:nn.Module,
                iterator_val: torch.utils.data.DataLoader,
                device: torch.device,
                optimizer: torch.optim,
                loss,
                epoch):

    accuracy_epoch = 0
    loss_epoch = 0
    num_batches = len(iterator_val)
    
    model.eval()
    with torch.no_grad():
        for i, (inputs, labels) in enumerate(iterator_val):
                inputs, labels = inputs.to(device).to(torch.float32), labels.to(device).to(torch.float32)  
                predictions = model(inputs).squeeze(1)
                
                # REMEMBER: round to nearest integer for dense and argmax for softmax
                accuracy_step = binary_accuracy(torch.round(predictions), labels)
                accuracy_epoch += accuracy_step.item()

                loss_step = loss(predictions, labels)
                loss_epoch += loss_step.item()

    loss_epoch /= num_batches
    accuracy_epoch /= num_batches

    return loss_epoch, accuracy_epoch
    

def train_model(model, epochs, batch_size, iterator_train, iterator_validation, optimizer, loss, num_train_samples, device):
    model.to(device)
    num_iters = num_train_samples // batch_size

    train_losses = []
    train_accs = []
    val_losses = []
    val_accs = []

    for epoch in range(1, epochs + 1):
        train_loss, train_acc = train_epoch(model, iterator_train, device, optimizer, loss, epoch, epochs)
        train_losses.append(train_loss)
        train_accs.append(train_acc)

        val_loss, val_acc = validate_epoch(model, iterator_validation, device, optimizer, loss, epoch)
        val_losses.append(val_loss)
        val_accs.append(val_acc)

        epoch_info_str = f"-- Epoch: {epoch}/{epochs} "\
                         f"-- Step: {num_iters}/{num_iters} "\
                         f"-- Acc: {train_acc:.2F} "\
                         f"-- Loss: {train_loss:.2F} "\
                         f"-- Acc_Val: {val_acc:.2F} "\
                         f"-- Loss_Val: {val_loss:.2F}"
        progress_bar(num_iters, num_iters, prefix='', suffix=epoch_info_str, length=20, printEnd='')
        
    return train_losses, train_accs, val_losses, val_accs



def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    correct = (preds == y).float()
    acc = correct.sum() / len(correct)
    return acc


# MAIN

In [66]:
def load_dataset():
    # LOAD GLOVE
    try:
        with open(f"glove-{EMBEDDING_DIMENSION}.pkl", 'rb') as f:
            emb_model = pickle.load(f)
    except Exception:
        emb_model = gloader.load(f"glove-wiki-gigaword-{EMBEDDING_DIMENSION}")
        with open(f"glove-{EMBEDDING_DIMENSION}.pkl", 'wb') as f:
            pickle.dump(emb_model, f)

    glove_dict = emb_model.key_to_index
    glove_matrix = emb_model.vectors

    # LOAD CLEANED DATA IN TORCH DATASET OBJECTS
    splits = {}
    for split in ['train', 'val', 'test']:
        try:
            with open(f"{os.path.join('dataset_torched', split)}.pkl", 'rb') as f:
                splits[split] = pickle.load(f)
        except Exception:
            if split == 'train':
                tokenizer = None
            elif split == 'val':
                tokenizer = splits['train'].tokenizer
            else:
                tokenizer = splits['val'].tokenizer 

            splits[split] = FactDataset(EMBEDDING_DIMENSION, glove_dict, glove_matrix, split, tokenizer=tokenizer)
            with open(f"{os.path.join('dataset_torched', split)}.pkl", 'wb') as f:
                pickle.dump(splits[split], f)

    return splits


splits = load_dataset()
train, val, test = splits['train'], splits['val'], splits['test']
dataloader_train = DataLoader(dataset=train, batch_size=BATCH_SIZE, num_workers=2,
                              pin_memory=True, shuffle=True)
dataloader_val = DataLoader(dataset=val, batch_size=BATCH_SIZE, num_workers=2,
                            pin_memory=True, shuffle=True)
dataloader_test = DataLoader(dataset=test, batch_size=BATCH_SIZE, num_workers=2, 
                             pin_memory=True, shuffle=True)




# TRAIN VARIOUS MODELS

In [67]:
sentence_emb_types = [
                    'last_state',
                    'average_outputs',
                    'simple_mlp',
                    'mean_of_tokens'
]

merging_types = [
                'concatenation',
                'sum',
                'mean'
]
 

for sentence_type in sentence_emb_types:
    for merging_type in merging_types:
        model_params = {
            'sentence_len': train.max_seq_len,
            'embedding_dim': EMBEDDING_DIMENSION,
            'output_dim': NUM_CLASSES,
            'pre_trained_emb': torch.tensor(val.emb_matrix).to(device),
            'merging_type': merging_type,
            'sentence_type': sentence_type
        }

        model = Model(**model_params)
        optimizer = optim.Adam(model.parameters(), lr=1e-3)
        loss = nn.BCEWithLogitsLoss()

        model = model.to(device)
        loss = loss.to(device)

        # summary(model, (2, 90), batch_size=32)
        # quit()

        training_info = {
            'model': model,
            'epochs': EPOCHS,
            'batch_size': BATCH_SIZE,
            'iterator_train': dataloader_train,
            'iterator_validation': dataloader_val,
            'optimizer': optimizer,
            'loss': loss,
            'num_train_samples': train.n_samples,
            'device': device
        }

        train_losses, train_accs, val_losses, val_accs = train_model(**training_info)

        # REMEMBER: delete the breaks after debugging
        break
    break

 |████████████████████| 100.0% -- Epoch: 1/1 -- Step: 1892/1892 -- Acc: 0.73 -- Loss: 0.58 -- Acc_Val: 0.50 -- Loss_Val: 0.80
