# IMPORTS

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

import re
import os
import copy
import requests
import zipfile
import pickle
import gensim
import gensim.downloader as gloader
import numpy as np
import pandas as pd

In [None]:
# Size of each word vector; also selects which pre-trained
# "glove-wiki-gigaword-<dim>" model load_dataset() fetches.
EMBEDDING_DIMENSION = 50
# Number of (claim, evidence) pairs per mini-batch handed to the DataLoaders.
BATCH_SIZE = 32
# Number of target labels (supports / refutes). Passed to the model as
# `output_dim`, which Model_RNN1 accepts but does not read.
NUM_CLASSES = 2
# Number of full passes over the training data.
EPOCHS = 2

# PRE-PROCESSING

In [None]:
def save_response_content(response, destination):
    """Stream the body of ``response`` into the binary file ``destination``.

    ``response`` must expose ``iter_content(chunk_size)`` yielding ``bytes``
    (as a ``requests`` response does). The destination file is created or
    truncated.
    """
    chunk_size = 32768

    with open(destination, "wb") as out_file:
        # Empty chunks are keep-alive placeholders and carry no payload.
        out_file.writelines(
            piece for piece in response.iter_content(chunk_size) if piece
        )


def download_data(data_path):
    """Download and unpack the FEVER splits into ``data_path``.

    Nothing is downloaded when ``<data_path>/fever_data.zip`` already exists.

    Args:
        data_path: directory that receives the archive and its contents;
            created when missing.

    Raises:
        requests.HTTPError: the server answered with an error status.
        zipfile.BadZipFile: the downloaded file is not a valid archive (the
            broken file is deleted so that the next call downloads it again).
    """
    toy_data_path = os.path.join(data_path, 'fever_data.zip')
    toy_data_url_id = "1wArZhF9_SHW17WKNGeLmX-QTYw9Zscl1"
    toy_url = "https://docs.google.com/uc?export=download"

    os.makedirs(data_path, exist_ok=True)

    if os.path.exists(toy_data_path):
        return

    print("Downloading FEVER data splits...")
    # Download to a temporary name first: an interrupted transfer must not
    # leave a file that the existence check above would accept.
    partial_path = toy_data_path + '.part'
    with requests.Session() as current_session:
        response = current_session.get(toy_url,
                                       params={'id': toy_data_url_id},
                                       stream=True)
        response.raise_for_status()
        # With stream=True the body is fetched lazily, so it has to be
        # consumed while the session (and its connection) is still open.
        save_response_content(response, partial_path)
    os.replace(partial_path, toy_data_path)
    print("Download completed!")

    print("Extracting dataset...")
    try:
        with zipfile.ZipFile(toy_data_path) as loaded_zip:
            loaded_zip.extractall(data_path)
    except zipfile.BadZipFile:
        # e.g. an HTML page was served instead of the archive
        os.remove(toy_data_path)
        raise
    print("Extraction completed!")


def pre_process(dataset, filename):  # clean the dataset
    """Return a cleaned copy of a raw FEVER pairs frame.

    The input frame is left untouched. Steps, in order:

    1. drop the first column (row numbers) and the ``ID`` column;
    2. in ``Evidence``: strip the leading ``<number><TAB>``, cut everything
       after the first `` .``, and remove each ``-LRB- ... -RRB-`` /
       ``-LSB- ... -RSB-`` group (non-greedy, so text lying between two
       separate groups is kept);
    3. drop rows whose evidence has more than 100 tokens or still contains
       an unmatched bracket marker;
    4. in every remaining column: remove punctuation, collapse runs of
       spaces, strip leading spaces and lower-case;
    5. map ``Label`` ``supports``/``refutes`` to ``1``/``0``;
    6. drop rows where ``Evidence``, ``Claim`` or ``Label`` is empty.

    Args:
        dataset: frame whose cells are all strings once the first column and
            ``ID`` are removed; must contain ``Claim``, ``Evidence``,
            ``Label`` and ``ID``.
        filename: name used in the printed report only.

    Returns:
        The cleaned frame. The printed count covers step 3 only.
    """
    labels = {'supports': 1, 'refutes': 0}

    def _clean_evidence(text):
        text = re.sub(r'^\d+\t', '', text)           # leading sentence number
        text = re.sub(r' \..*', ' .', text)          # keep the first sentence only
        text = re.sub(r'-LRB-.*?-RRB-', '', text)    # round brackets + content
        text = re.sub(r'-LSB-.*?-RSB-', '', text)    # square brackets + content
        return text

    def _normalise(text):
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r' +', ' ', text)
        text = re.sub(r'^ +', '', text)
        return text.lower()

    # drop() returns a new frame, so the caller's data is never modified
    dataset = dataset.drop(columns=dataset.columns[0]).drop(columns='ID')
    dataset['Evidence'] = dataset['Evidence'].map(_clean_evidence)

    n_before = dataset.shape[0]
    # removes instances longer than a threshold on evidence
    dataset = dataset[dataset['Evidence'].str.split().str.len() <= 100]
    # remove all rows where there are single brackets in the evidence
    stray_bracket = dataset['Evidence'].str.contains('|'.join(['-LRB-', '-LSB-', '-RRB-', '-RSB-']))
    dataset = dataset[~stray_bracket].copy()
    n_after = dataset.shape[0]

    # Series.map per column replaces the deprecated DataFrame.applymap
    for column in dataset.columns:
        dataset[column] = dataset[column].map(_normalise)

    # exact-match replacement; unknown labels are kept as they are
    dataset['Label'] = dataset['Label'].map(lambda value: labels.get(value, value))

    # removes rows with empty elements
    for column in ('Evidence', 'Claim', 'Label'):
        dataset = dataset[dataset[column] != '']

    rem_elements = n_before - n_after
    # an empty input file must not crash the report
    rem_percent = 100 * rem_elements / n_before if n_before else 0.0
    print(f"Removed {rem_elements}\t ({rem_percent:.2F}%)"
          f" elements because of inconsistency on {filename}")
    return dataset


#########################################

try:
    from google.colab import drive
    IN_COLAB=True
except:
    IN_COLAB=False

if IN_COLAB:
    print("We're running Colab")
    # Mount the Google Drive at mount
    mount='/content/gdrive'
    print("Colab: mounting Google drive on ", mount)
    drive.mount(mount)

    # Switch to the directory on the Google Drive that you want to use
    drive_root = mount + "/My Drive/NLP/Assignment2"
    
    # Create drive_root if it doesn't exist
    create_drive_root = True
    if create_drive_root:
        print("\nColab: making sure ", drive_root, " exists.")
        os.makedirs(drive_root, exist_ok=True)
    
    # Change to the directory
    print("\nColab: Changing directory to ", drive_root)
    %cd $drive_root
    print("Checking working directory:")
    %pwd

# download_data('dataset')

for file in os.listdir("dataset"):
    dataset_cleaned = pre_process(pd.read_csv("dataset/" + file, sep=','), file)
    dataset_cleaned.to_csv(os.path.join("dataset_cleaned", file))

We're running Colab
Colab: mounting Google drive on  /content/gdrive
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).

Colab: making sure  /content/gdrive/My Drive/NLP/Assignment2  exists.

Colab: Changing directory to  /content/gdrive/My Drive/NLP/Assignment2
/content/gdrive/My Drive/NLP/Assignment2
Checking working directory:
Removed 492	 (0.40%) elements because of inconsistency on train_pairs.csv
Removed 41	 (0.57%) elements because of inconsistency on val_pairs.csv
Removed 4	 (0.06%) elements because of inconsistency on test_pairs.csv


# CLASSES

## FactDataset

In [None]:
class FactDataset(Dataset):
    """Torch dataset of padded (claim, evidence) index sequences for one split.

    Reads ``dataset_cleaned/<split>_pairs.csv``, builds a vocabulary and an
    embedding matrix for that file with ``Tokenizer`` and encodes every pair
    as word indices.

    Attributes set by ``__init__``:
        tokenizer: the ``Tokenizer`` built from this split's text.
        val_to_key: word -> index mapping.
        emb_matrix: embedding matrix whose rows are addressed by those indices.
        max_claim_len / max_evid_len / max_seq_len: longest claim, longest
            evidence and the larger of the two (in tokens).
        x: integer tensor of shape ``(n_samples, 2, max_seq_len)``; ``x[i, 0]``
            is the claim, ``x[i, 1]`` the evidence, both right-padded with 0.
        y: tensor with the ``Label`` column.
        n_samples: number of rows in the CSV.

    NOTE(review): every instance builds its own vocabulary, so the indices in
    ``x`` are only meaningful together with this instance's ``emb_matrix``.
    """

    def __init__(self, emb_dim, glove_dict, glove_matrix, split='train'):
        """Load ``split`` and encode it.

        Args:
            emb_dim: dimensionality of the embedding vectors.
            glove_dict: word -> row index into ``glove_matrix``.
            glove_matrix: pre-trained embedding vectors.
            split: file prefix, e.g. ``'train'``, ``'val'`` or ``'test'``.

        Raises:
            ValueError: the embedding matrix disagrees with the pre-trained
                vectors for the probe word (see ``_check_tokenizer``).
        """
        pairs_claim_evid = pd.read_csv(f"dataset_cleaned/{split}_pairs.csv")
        # tokenization & embeddings
        text = pairs_claim_evid['Claim'] + ' ' + pairs_claim_evid['Evidence']
        text = text.astype(str).to_list()

        self.tokenizer = Tokenizer(text, emb_dim, glove_dict, glove_matrix)
        self.tokenizer.tokenize()
        # Build the matrix first: building it may still change word indices,
        # and the mapping copied here must reflect the final assignment.
        self.emb_matrix = self.tokenizer.build_embedding_matrix()
        self.val_to_key = self.tokenizer.get_val_to_key()

        self.max_claim_len = int(pairs_claim_evid['Claim'].str.split().str.len().max())
        self.max_evid_len = int(pairs_claim_evid['Evidence'].str.split().str.len().max())
        self.max_seq_len = int(max(self.max_evid_len, self.max_claim_len))

        if not self._check_tokenizer():
            raise ValueError("embedding matrix does not match the pre-trained vectors")

        # create x as list of sentences
        # sentences are split in claim / evidence
        self.x = []
        print(f"Creating FactDataset: \n")
        sentence_pairs = zip(pairs_claim_evid['Claim'], pairs_claim_evid['Evidence'])
        for claim_sen, evid_sen in tqdm(sentence_pairs, total=len(pairs_claim_evid)):
            self.x.append([self._encode(claim_sen), self._encode(evid_sen)])

        self.x = torch.tensor(self.x)
        self.y = torch.tensor(pairs_claim_evid['Label'].to_numpy())
        self.n_samples = pairs_claim_evid.shape[0]
        print(f"Max sequence length: {self.max_seq_len}")

    def _encode(self, sentence):
        """Return the word indices of ``sentence`` right-padded with 0 to ``max_seq_len``."""
        ids = [self.val_to_key[word] for word in sentence.split()]
        return ids + [0] * (self.max_seq_len - len(ids))

    def __getitem__(self, index):
        # xi = [[claim][evid]], yi = [label]
        return self.x[index], self.y[index]

    def __len__(self):
        return self.n_samples

    def _check_tokenizer(self):
        """Sanity-check that 'the' carries the same vector in both matrices.

        Returns:
            ``False`` (after printing a warning) when the vectors differ;
            ``True`` when they match or when 'the' is missing from either
            vocabulary, in which case there is nothing to compare.
        """
        word = 'the'
        try:
            expected = self.tokenizer.glove_matrix[self.tokenizer.glove_dict[word]]
            actual = self.emb_matrix[self.val_to_key[word]]
        except KeyError:
            return True
        if not np.array_equal(expected, actual):
            print("Check Tokenizer, possible bugs")
            return False
        return True

## Tokenizer

In [None]:
class Tokenizer(object):
    """Builds a word <-> index vocabulary and the matching embedding matrix.

    Index 0 is reserved for padding: row 0 of the embedding matrix is all
    zeros and no word is ever mapped to it. Words found in the pre-trained
    vocabulary get their pre-trained vector; every other word (OOV) gets a
    random vector drawn uniformly from [-0.05, 0.05).

    Typical use: ``tokenize()`` followed by ``build_embedding_matrix()``.
    Calling the pair again after replacing ``dataset_sentences`` appends the
    previously unseen words to the existing vocabulary and matrix.

    Attributes:
        value_to_key: word -> index over everything tokenized so far.
        key_to_value: index -> word (inverse of ``value_to_key``).
        value_to_key_new: word -> row offset inside the batch of words added
            by the most recent ``tokenize()`` call.
        num_unique_words: number of words added by the most recent
            ``tokenize()`` call.
        unique_words: set of all words seen so far.
        embedding_matrix: ``float32`` array, ``None`` until the first build.
    """

    def __init__(self, dataset_sentences, embedding_dim, glove_dict, glove_matrix):
        """Store the inputs; no work is done until ``tokenize()`` is called.

        Args:
            dataset_sentences: iterable of whitespace-separated sentences.
            embedding_dim: length of each embedding vector.
            glove_dict: word -> row index into ``glove_matrix``.
            glove_matrix: pre-trained vectors, one row per word.
        """
        self.embedding_matrix = None
        self.value_to_key = {}
        self.value_to_key_new = {}
        self.key_to_value = {}
        self.num_unique_words = 0
        self.dataset_sentences = dataset_sentences
        self.embedding_dim = embedding_dim
        self.glove_dict = glove_dict
        self.glove_matrix = glove_matrix
        self.unique_words = set()

    def get_val_to_key(self):
        """Return an independent copy of the word -> index mapping."""
        return copy.deepcopy(self.value_to_key)

    def _next_free_index(self):
        """Return the index the next new word receives (0 is kept for padding)."""
        if self.embedding_matrix is not None:
            return len(self.embedding_matrix)
        return 1

    def tokenize(self):
        """Assign an index to every word of ``dataset_sentences`` not seen before."""
        self.value_to_key_new = {}
        unique_words = set()
        for sen in self.dataset_sentences:
            unique_words.update(sen.split())  # get set of unique words
        new_unique = unique_words - self.unique_words

        offset = self._next_free_index()
        # sorted(): set order changes between interpreter runs, which would
        # make the index assignment irreproducible
        for i, word in enumerate(sorted(new_unique)):
            # build two dictionaries for key value correspondence
            self.key_to_value[offset + i] = word
            self.value_to_key[word] = offset + i
            self.value_to_key_new[word] = i

        self.num_unique_words = len(new_unique)
        self.unique_words = self.unique_words | new_unique  # union of unique words and new unique words

    def __build_embedding_matrix_glove(self):
        """Append one row per new word, copying pre-trained vectors where available.

        Returns:
            List of ``(word, index)`` for the new words without a pre-trained
            vector; their rows are left at zero for the caller to fill.
        """
        oov_words = []
        tmp_embedding_matrix = np.zeros((self.num_unique_words, self.embedding_dim), dtype=np.float32)
        offset = self._next_free_index()
        print(f"Finding OOVs: ")
        for word, idx in tqdm(self.value_to_key_new.items()):
            try:
                embedding_vector = self.glove_matrix[self.glove_dict[word]]
                tmp_embedding_matrix[idx] = embedding_vector
            except (KeyError, TypeError):
                oov_words.append((word, idx + offset))
        if self.embedding_matrix is not None:
            previous_rows = self.embedding_matrix
        else:
            # first build: start with the all-zero padding row at index 0
            previous_rows = np.zeros((1, self.embedding_dim), dtype=np.float32)
        self.embedding_matrix = np.vstack((previous_rows, tmp_embedding_matrix))
        return oov_words

    def build_embedding_matrix(self):
        """Extend the embedding matrix with the words from the last ``tokenize()``.

        Returns:
            An independent copy of the full embedding matrix; row 0 is the
            zero padding vector.
        """
        oov_words = self.__build_embedding_matrix_glove()
        print(f"Solving OOVs: ")
        for word, idx in tqdm(oov_words):
            embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=self.embedding_dim)
            self.embedding_matrix[idx] = embedding_vector

        # No relocation is needed for padding: tokenize() never hands out
        # index 0 and the matrix is created with a zero row at that position.
        return copy.deepcopy(self.embedding_matrix)


## Model


In [None]:

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# Shared by train_model() and the model/loss set-up below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


##### General functions for Models #####
def train_model(model, epochs, batch_size, iterator_train, iterator_validation, optimizer, loss, num_train_samples):
    """Train ``model`` and report training / validation metrics per epoch.

    Args:
        model: module mapping a batch of inputs to one logit per sample,
            shaped ``(batch, 1)``.
        epochs: number of passes over ``iterator_train``.
        batch_size: batch size, used only to compute the step count that is
            printed.
        iterator_train: batches of ``(inputs, labels)``. Must support ``len()``
            and must be re-iterable (e.g. a ``DataLoader``): a one-shot
            iterator such as ``iter(dataloader)`` is exhausted after the
            first epoch, so later epochs would see no data.
        iterator_validation: same contract, for the validation batches.
        optimizer: optimizer over ``model``'s parameters.
        loss: callable ``loss(predictions, labels)`` returning a scalar tensor.
        num_train_samples: number of training samples (for the step count).

    Returns:
        ``(accuracy, loss)`` averaged over the training batches of the last
        epoch; ``(0.0, 0.0)`` when ``epochs`` is 0.
    """
    model.to(device)
    num_iters = num_train_samples // batch_size
    # log roughly five times per epoch; never 0, which would break the modulo
    log_every = max(1, num_iters // 5)
    # guard the averages against empty iterators
    n_train_batches = max(1, len(iterator_train))
    n_val_batches = max(1, len(iterator_validation))

    if epochs > 1 and hasattr(iterator_train, '__next__'):
        print("Warning: iterator_train is a one-shot iterator; "
              "epochs after the first will receive no batches")

    # defined up-front so the return statement is valid when epochs == 0
    epoch_loss = 0
    epoch_acc = 0

    for epoch in range(epochs):
        epoch_loss = 0
        epoch_acc = 0
        epoch_loss_val = 0
        epoch_acc_val = 0

        model.train()
        for i, (inputs, labels) in enumerate(iterator_train):
            optimizer.zero_grad()
            inputs, labels = inputs.to(device).to(torch.float32), labels.to(device).to(torch.float32)
            predictions = model(inputs).squeeze(1)
            loss_iter = loss(predictions, labels)
            acc_iter = binary_accuracy(predictions, labels)
            loss_iter.backward()
            optimizer.step()
            epoch_loss += loss_iter.item()
            epoch_acc += acc_iter.item()
            if (i + 1) % log_every == 0:
                print(f"Epoch: {epoch + 1}/{epochs} -- Step: {i + 1}/{num_iters} "
                      f"-- Acc: {acc_iter.item():.2F} -- Loss: {loss_iter.item():.2F}")

        # validation: evaluation mode, no gradient tracking
        model.eval()
        with torch.no_grad():
            for inputs, labels in iterator_validation:
                inputs, labels = inputs.to(device).to(torch.float32), labels.to(device).to(torch.float32)
                predictions = model(inputs).squeeze(1)
                loss_iter = loss(predictions, labels)
                acc_iter = binary_accuracy(predictions, labels)
                epoch_loss_val += loss_iter.item()
                epoch_acc_val += acc_iter.item()

        print(f"Epoch: {epoch + 1}/{epochs} -- Step: {num_iters}/{num_iters} "
              f"-- Acc: {epoch_acc / n_train_batches:.2F} "
              f"-- Loss: {epoch_loss / n_train_batches:.2F} "
              f"-- Acc_Val: {epoch_acc_val / n_val_batches:.2F} "
              f"-- Loss_Val: {epoch_loss_val / n_val_batches:.2F}")

    return epoch_acc / n_train_batches, epoch_loss / n_train_batches


def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in the batch (8 right out of 10 -> 0.8).

    `preds` are raw logits: they go through a sigmoid and are rounded to
    0/1 before being compared element-wise with the targets `y`.
    """
    probabilities = torch.sigmoid(preds)
    hits = torch.eq(torch.round(probabilities), y).float()
    return hits.sum() / hits.shape[0]


#########################################
class Model_RNN1(nn.Module):
    """Claim/evidence classifier: shared RNN encoder + linear head.

    Claim and evidence are embedded with frozen pre-trained vectors, encoded
    by the same single-layer RNN, and the two final hidden states are
    concatenated and mapped to one logit.

    Args:
        sentence_len: accepted for interface compatibility; not used.
        embedding_dim: size of the embedding vectors, also used as the RNN
            hidden size.
        output_dim: accepted for interface compatibility; not used (the head
            always emits a single logit).
        pre_trained_emb: float tensor ``(vocab_size, embedding_dim)``; row 0
            is treated as the padding index.
    """

    def __init__(self, sentence_len, embedding_dim, output_dim, pre_trained_emb):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embeddings=pre_trained_emb, freeze=True, padding_idx=0)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=embedding_dim, batch_first=True)
        # The head consumes two concatenated hidden states, so its input size
        # follows the hidden size instead of being fixed at 100.
        self.fc = nn.Linear(2 * embedding_dim, 1)

    def forward(self, x):
        """Return one logit per sample.

        Args:
            x: tensor ``(batch, 2, seq_len)`` of word indices; ``x[:, 0]`` is
                the claim and ``x[:, 1]`` the evidence. Any numeric dtype is
                accepted, values are cast to ``long`` for the lookup.

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        emb_claim = self.embedding(x[:, 0].long())  # (batch, seq_len, emb_dim)
        emb_evid = self.embedding(x[:, 1].long())

        # NOTE(review): the padded sequences are fed unpacked, so the final
        # hidden state is taken after the padding positions.
        _, hidden_claim = self.rnn(emb_claim)  # hidden: (1, batch, emb_dim)
        _, hidden_evid = self.rnn(emb_evid)

        concat = torch.cat((hidden_claim[0], hidden_evid[0]), 1)  # (batch, 2 * emb_dim)
        return self.fc(concat)

# MAIN

In [None]:

def _load_or_build(cache_path, build):
    """Return the object pickled at ``cache_path``; on any failure build and cache it.

    NOTE: ``pickle.load`` can execute arbitrary code. Only point this at cache
    files written by this notebook.
    """
    try:
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    except Exception:
        # missing, truncated or incompatible cache: rebuild from scratch
        obj = build()
        cache_dir = os.path.dirname(cache_path)
        if cache_dir:
            # open(..., 'wb') does not create missing directories
            os.makedirs(cache_dir, exist_ok=True)
        with open(cache_path, 'wb') as f:
            pickle.dump(obj, f)
        return obj


def load_dataset():
    """Load GloVe and the three dataset splits, using on-disk pickle caches.

    The embedding model is cached in ``glove-<dim>.pkl`` and each split in
    ``dataset_torched/<split>.pkl``. A cache that cannot be read is rebuilt
    and rewritten. Cached splits are NOT refreshed when the cleaned CSV files
    or the dataset classes change; delete the pickles to force a rebuild.

    Returns:
        dict mapping ``'train'``, ``'val'`` and ``'test'`` to ``FactDataset``
        objects.
    """
    # LOAD GLOVE
    emb_model = _load_or_build(
        f"glove-{EMBEDDING_DIMENSION}.pkl",
        lambda: gloader.load(f"glove-wiki-gigaword-{EMBEDDING_DIMENSION}"),
    )

    glove_dict = emb_model.key_to_index
    glove_matrix = emb_model.vectors

    # LOAD CLEANED DATA IN TORCH DATASET OBJECTS
    splits = {}
    for split in ['train', 'val', 'test']:
        splits[split] = _load_or_build(
            f"{os.path.join('dataset_torched', split)}.pkl",
            lambda split=split: FactDataset(EMBEDDING_DIMENSION, glove_dict, glove_matrix, split),
        )

    return splits


splits = load_dataset()
train, val, test = splits['train'], splits['val'], splits['test']
dataloader_train = DataLoader(dataset=train, batch_size=BATCH_SIZE, num_workers=2, shuffle=True)
dataloader_val = DataLoader(dataset=val, batch_size=BATCH_SIZE, num_workers=2, shuffle=True)
dataloader_test = DataLoader(dataset=test, batch_size=BATCH_SIZE, num_workers=2, shuffle=True)

# One-shot iterators, kept for pulling individual batches by hand with
# next(...). They must not be used for training: once consumed they yield
# nothing, which is why the second epoch used to report 0.00 everywhere.
dataiter_train = iter(dataloader_train)
dataiter_val = iter(dataloader_val)
dataiter_test = iter(dataloader_test)

# The model looks words up in the embedding matrix built for the training split.
model_params = {
    'sentence_len': train.max_seq_len,
    'embedding_dim': EMBEDDING_DIMENSION,
    'output_dim': NUM_CLASSES,
    'pre_trained_emb': torch.tensor(train.emb_matrix).to(device)
}
model = Model_RNN1(**model_params)
optimizer = optim.SGD(model.parameters(), lr=1e-3)
loss = nn.BCEWithLogitsLoss()

model = model.to(device)
loss = loss.to(device)

training_info = {
    'model': model,
    'epochs': EPOCHS,
    'batch_size': BATCH_SIZE,
    # Pass the DataLoaders themselves: each epoch's `for` loop then starts a
    # fresh pass over the data, and len() still gives the number of batches.
    'iterator_train': dataloader_train,
    'iterator_validation': dataloader_val,
    'optimizer': optimizer,
    'loss': loss,
    'num_train_samples': train.n_samples
}

train_model(**training_info)

Epoch: 1/2 -- Step: 378/3784 -- Acc: 0.75 -- Loss: 0.57
Epoch: 1/2 -- Step: 756/3784 -- Acc: 0.78 -- Loss: 0.53
Epoch: 1/2 -- Step: 1134/3784 -- Acc: 0.81 -- Loss: 0.50
Epoch: 1/2 -- Step: 1512/3784 -- Acc: 0.69 -- Loss: 0.63
Epoch: 1/2 -- Step: 1890/3784 -- Acc: 0.62 -- Loss: 0.69
Epoch: 1/2 -- Step: 2268/3784 -- Acc: 0.66 -- Loss: 0.66
Epoch: 1/2 -- Step: 2646/3784 -- Acc: 0.72 -- Loss: 0.59
Epoch: 1/2 -- Step: 3024/3784 -- Acc: 0.81 -- Loss: 0.50
Epoch: 1/2 -- Step: 3402/3784 -- Acc: 0.69 -- Loss: 0.63
Epoch: 1/2 -- Step: 3780/3784 -- Acc: 0.81 -- Loss: 0.50
Epoch: 1/2 -- Step: 3784/3784 -- Acc: 0.73 -- Loss: 0.58 -- Acc_Val: 0.50 -- Loss_Val: 0.82
Epoch: 2/2 -- Step: 3784/3784 -- Acc: 0.00 -- Loss: 0.00 -- Acc_Val: 0.00 -- Loss_Val: 0.00


(0.0, 0.0)