In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.regularizers import l2

import re
import os
import copy
import math
import requests
import tarfile

import zipfile
import pickle
import gensim
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from urllib import request

import collections
import gensim.downloader as gloader
import sklearn.metrics as sk_metrics
import matplotlib.pyplot as plt
import pickle

In [2]:
# In order to use key_to_index attribute from the embedding model
! pip install gensim==4.1.2
import gensim
import gensim.downloader as gloader



In [3]:
EMBEDDING_SIZE = 100
BATCH_SIZE = 32
NUM_CLASSES = 2
EPOCHS = 3

In [4]:
def fix_random(seed: int) -> None:
    """Fix all the possible sources of randomness.

    Args:
        seed: the seed to use. 
    """
    np.random.seed(seed)
    random.seed(seed)
        
fix_random(42)
!nvidia-smi


Thu Dec 16 15:23:58 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P8    26W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# PRE-PROCESSING

In [5]:
def save_response_content(response, destination):
    CHUNK_SIZE = 32768

    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)


def download_data(data_path):
    toy_data_path = os.path.join(data_path, 'fever_data.zip')
    toy_data_url_id = "1wArZhF9_SHW17WKNGeLmX-QTYw9Zscl1"
    toy_url = "https://docs.google.com/uc?export=download"

    if not os.path.exists(data_path):
        os.makedirs(data_path)

    if not os.path.exists(toy_data_path):
        print("Downloading FEVER data splits...")
        with requests.Session() as current_session:
            response = current_session.get(toy_url,
                                           params={'id': toy_data_url_id},
                                           stream=True)
        save_response_content(response, toy_data_path)
        print("Download completed!")

        print("Extracting dataset...")
        with zipfile.ZipFile(toy_data_path) as loaded_zip:
            loaded_zip.extractall(data_path)
        print("Extraction completed!")


def pre_process(dataset, filename):  # clean the dataset
    dataset.drop(dataset.columns[0], axis=1, inplace=True)  # remove first column of dataframe containing numbers
    dataset.drop(['ID'], axis=1, inplace=True)
    # remove numbers before each evidence
    dataset['Evidence'] = dataset['Evidence'].apply(lambda x: re.sub(r'^\d+\t', '', x))
    # remove everything after the period
    dataset['Evidence'] = dataset['Evidence'].apply(lambda x: re.sub(r' \..*', ' .', x))
    # remove round brackets and what they contain
    dataset['Evidence'] = dataset['Evidence'].apply(lambda x: re.sub(r'-LRB-.*-RRB-', '', x))
    # remove square brackets and what they contain
    dataset['Evidence'] = dataset['Evidence'].apply(lambda x: re.sub(r'-LSB-.*-RSB-', '', x))

    n_before = dataset.shape[0]
    # removes instances longer than a threshold on evidence
    # TODO: only on train
    dataset = dataset[dataset['Evidence'].str.split().str.len() <= 100]
    # remove all rows where there are single brackets in the evidence
    dataset = dataset[~dataset['Evidence'].str.contains('|'.join(['-LRB-', '-LSB-', '-RRB-', '-RSB-']))]
    n_after = dataset.shape[0]

    # removes punctuation and excessive spaces
    dataset = dataset.applymap(lambda x: re.sub(r'[^\w\s]', '', x))
    dataset = dataset.applymap(lambda x: re.sub(r' +', ' ', x))
    dataset = dataset.applymap(lambda x: re.sub(r'^ +', '', x))
    dataset = dataset.applymap(lambda x: x.lower())

    labels = {'supports': 1, 'refutes': 0}
    dataset = dataset.replace({'Label': labels})
    # removes rows with empty elements
    dataset = dataset[dataset['Evidence'] != '']
    dataset = dataset[dataset['Claim'] != '']
    dataset = dataset[dataset['Label'] != '']



    rem_elements = n_before - n_after
    print(f"Removed {rem_elements}\t ({100 * rem_elements / n_before:.2F}%)"
          f" elements because of inconsistency on {filename}")
    return dataset


#########################################

try:
    from google.colab import drive
    IN_COLAB=True
except:
    IN_COLAB=False

if IN_COLAB:
    print("We're running Colab")
    # Mount the Google Drive at mount
    mount='/content/gdrive'
    print("Colab: mounting Google drive on ", mount)
    drive.mount(mount)

    # Switch to the directory on the Google Drive that you want to use
    drive_root = mount + "/My Drive/NLP/Assignment2"
    
    # Create drive_root if it doesn't exist
    create_drive_root = True
    if create_drive_root:
        print("\nColab: making sure ", drive_root, " exists.")
        os.makedirs(drive_root, exist_ok=True)
    
    # Change to the directory
    print("\nColab: Changing directory to ", drive_root)
    %cd $drive_root
    print("Checking working directory:")
    %pwd

# download_data('dataset')

if not len(os.listdir("dataset_cleaned")):
    for file in os.listdir("dataset"):
        dataset_cleaned = pre_process(pd.read_csv("dataset/" + file, sep=','), file)
        dataset_cleaned.to_csv(os.path.join("dataset_cleaned", file))


We're running Colab
Colab: mounting Google drive on  /content/gdrive
Mounted at /content/gdrive

Colab: making sure  /content/gdrive/My Drive/NLP/Assignment2  exists.

Colab: Changing directory to  /content/gdrive/My Drive/NLP/Assignment2
/content/gdrive/My Drive/NLP/Assignment2
Checking working directory:


# Tokenizer

In [6]:
class Tokenizer(object):
    def __init__(self, dataset_sentences, embedding_dim, glove_dict, glove_matrix):
        self.embedding_matrix = None
        self.value_to_key = {}
        self.value_to_key_new = {}
        self.key_to_value = {}
        self.num_unique_words = 0
        self.dataset_sentences = dataset_sentences
        self.embedding_dim = embedding_dim
        self.glove_dict = glove_dict
        self.glove_matrix = glove_matrix
        self.unique_words = set()

    def get_val_to_key(self):
        return copy.deepcopy(self.value_to_key)

    def tokenize(self):
        self.value_to_key_new = {}
        unique_words = set()
        for sen in self.dataset_sentences:
            for w in sen.split():
                unique_words.add(w)  # get se of unique words
        new_unique = unique_words - self.unique_words
        for i, word in enumerate(new_unique):
            if self.embedding_matrix is not None:
                self.key_to_value[i + len(self.embedding_matrix)] = word  # build two dictionaries for key value correspondence
                self.value_to_key[word] = i + len(self.embedding_matrix)
            else:
                self.key_to_value[i] = word  # build two dictionaries for key value correspondence
                self.value_to_key[word] = i
            self.value_to_key_new[word] = i

        self.num_unique_words = len(new_unique)
        self.unique_words = self.unique_words | new_unique  # union of unique words and new unique words

    def __build_embedding_matrix_glove(self):
        oov_words = []
        tmp_embedding_matrix = np.zeros((self.num_unique_words, self.embedding_dim)) #dtype=np.float32
        len_old_emb_matrix = len(self.embedding_matrix) if self.embedding_matrix is not None else 0
        for word, idx in tqdm(self.value_to_key_new.items()):
            try:
                embedding_vector = self.glove_matrix[self.glove_dict[word]]
                tmp_embedding_matrix[idx] = embedding_vector
            except (KeyError, TypeError):
                oov_words.append((word, idx + len_old_emb_matrix))
        
        if self.embedding_matrix is not None:
            self.embedding_matrix = np.vstack((self.embedding_matrix, tmp_embedding_matrix))

        else:
            self.embedding_matrix = copy.deepcopy(tmp_embedding_matrix)
        return oov_words

    def build_embedding_matrix(self):
        oov_words = self.__build_embedding_matrix_glove()
        for word, idx in oov_words:
            embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=self.embedding_dim)
            self.embedding_matrix[idx] = embedding_vector
        return copy.deepcopy(self.embedding_matrix)

# LOAD DATA

In [71]:
# LOAD GLOVE
try:
    with open(f"glove-{EMBEDDING_SIZE}.pkl", 'rb') as f:
        emb_model = pickle.load(f)
except Exception:
    emb_model = gloader.load(f"glove-wiki-gigaword-{EMBEDDING_SIZE}")
    with open(f"glove-{EMBEDDING_SIZE}.pkl", 'wb') as f:
        pickle.dump(emb_model, f)

glove_dict = emb_model.key_to_index
glove_matrix = emb_model.vectors

train = pd.read_csv("dataset_cleaned/train_pairs.csv")
val = pd.read_csv("dataset_cleaned/val_pairs.csv")
test = pd.read_csv("dataset_cleaned/test_pairs.csv")

try:
    with open("emb_mat.pkl", 'rb') as f:
        v4_matrix = pickle.load(f)
    with open("val_to_key.pkl", 'rb') as f:
        v4_val_to_key = pickle.load(f)
    with open(f"tokenizer.pkl", 'rb') as f:
        tokenizer = pickle.load(f)
except Exception:
    tokenizer = Tokenizer(train["Claim"] + ' ' + train["Evidence"], EMBEDDING_SIZE, glove_dict, glove_matrix)
    tokenizer.tokenize()
    v2_matrix = tokenizer.build_embedding_matrix()
    tokenizer.dataset_sentences = val["Claim"] + ' ' + val["Evidence"]
    tokenizer.tokenize()
    v3_matrix = tokenizer.build_embedding_matrix()
    tokenizer.dataset_sentences = test["Claim"] + ' ' + test["Evidence"]
    tokenizer.tokenize()
    v4_matrix = tokenizer.build_embedding_matrix()
    v4_val_to_key = tokenizer.get_val_to_key()
    with open(f"emb_mat.pkl", 'wb') as f:
        pickle.dump(v4_matrix, f)
    with open(f"val_to_key.pkl", 'wb') as f:
        pickle.dump(v4_val_to_key, f)
    with open(f"tokenizer.pkl", 'wb') as f:
        pickle.dump(tokenizer, f)

v4_val_to_key.update((x, y+1) for x, y in v4_val_to_key.items())


translate_tokens = {}
key_val_list_items = list(tokenizer.key_to_value.items())
for i, (token, value) in enumerate(key_val_list_items):
    if i > 0:
        translate_tokens[token] = key_val_list_items[i-1][1]
    else:
        translate_tokens[i] = '<PAD>'


# GENERATOR

In [70]:
def generator(dataset, value_to_key):
    dataset_size = dataset.shape[0]
    dataset = dataset.to_numpy()[:, 1:]

    refutes = dataset[dataset[:,2] == 0]
    supports = dataset[dataset[:,2] == 1]
    
    while True:
        X_claim = []
        X_evid = []
        y = []

        rnd_choices_refutes = np.random.choice(np.arange(len(refutes)),replace=False, size=BATCH_SIZE//2)
        rnd_choices_supports = np.random.choice(np.arange(len(supports)),replace=False, size=BATCH_SIZE//2)

        batch = []
        for i in range(BATCH_SIZE//2):
            batch.append(list(refutes[rnd_choices_refutes[i]]))
            batch.append(list(supports[rnd_choices_supports[i]]))
        
        random.shuffle(batch)

        max_seq_claim = max([len(el[0].split()) for el in batch])
        max_seq_evid = max([len(el[1].split()) for el in batch])

        for sample in batch:
            tokenized_claim = [value_to_key[word] for word in sample[0].split()]
            tokenized_evid = [value_to_key[word] for word in sample[1].split()]

            tmp_claim = [0] * (max_seq_claim - len(tokenized_claim)) + tokenized_claim
            tmp_evid = [0] * (max_seq_evid - len(tokenized_evid)) + tokenized_evid

            X_claim.append(tmp_claim)
            X_evid.append(tmp_evid)
            y.append(sample[2])

        yield [np.array(X_claim), np.array(X_evid)], np.array(y)


In [9]:
x_claims, x_evids, labels = next(generator(train, v4_val_to_key))
print(x_claims[0])
print(x_evids[0])
print(labels[0])

print(' '.join([translate_tokens[w] for w in x_claims[0]]))
print(' '.join([translate_tokens[w] for w in x_evids[0]]))

[    0     0     0 15825  4831 26594 31326 29150 11887 12273  2494 13842
 18357 20933]
[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0  2671  7472 29150   258 22095 20508 18357 23840  4831 15893
 18357 24246 26228 11770 13848 18357 29150 21243 19299 29150 20892 29150
 20696  7472 20334]
0
<PAD> <PAD> <PAD> tom cruise turned down the role he was offered in cocktail
<PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> one of the biggest movie stars in hollywood cruise starred in several more successful films in the 1980s including the dramas the color of money


# MODEL

In [62]:
from keras.layers import Concatenate

class Model(object):
    def __init__(self, compile_info, value_to_key, embedding_dim, 
                 embedding_matrix, l2_reg):

        self.compile_info = compile_info
        self.value_to_key = value_to_key
        self.embedding_dim = embedding_dim
        self.embedding_matrix = embedding_matrix
        self.l2_reg = l2_reg

# SERVONO DUE EMBEDDING IN ENTRATE, DUE LSTM UNA PER CLAIM E UNA PER EVIDENCE, CONCATENATION E DENSI 
# INPUT_LENGTH NON È NECESSARIO IN EMBEDDING PERCHÈ CAMBIA SEMPRE
# usare la funtional api con x, y = claim, evid per poter fare concatenazione, reshape ecc
        model = keras.models.Sequential()
        input_claim = keras.layers.Input(shape=(None,))
        input_evid = keras.layers.Input(shape=(None,))
        emb_claim = layers.Embedding(input_dim=len(v4_val_to_key.keys()), #qui ci sarebbe un +1 ma da errore
                                            output_dim=EMBEDDING_SIZE,
                                            mask_zero=True,
                                            weights=[v4_matrix],
                                            trainable=False
                                            )(input_claim)
        emb_evid = layers.Embedding(input_dim=len(v4_val_to_key.keys()),
                                            output_dim=EMBEDDING_SIZE,
                                            mask_zero=True,
                                            weights=[v4_matrix],
                                            trainable=False
                                            )(input_evid)
        print(input_claim.shape)
        lstm_claim, forward_h_claim, forward_c_claim = layers.LSTM(EMBEDDING_SIZE, return_sequences=True, kernel_regularizer=l2(self.l2_reg), return_state = True)(emb_claim) #mettere max_seq_len come num recurrents?
        #state_h_claim = Concatenate()([forward_h_claim, backward_h_claim]) 
        print(forward_h_claim.shape)
        # print(state_h.shape)
        print(lstm_claim.shape)
        #print(backward_h_claim.shape)
        #print(backward_c_claim.shape)
        print(forward_c_claim.shape)
        lstm_evid, forward_h_evid, forward_c_evid = layers.LSTM(EMBEDDING_SIZE, return_sequences=True, kernel_regularizer=l2(self.l2_reg), return_state = True)(emb_evid)
        #state_h_evid = Concatenate()([forward_h_evid, backward_h_evid])
        merge = Concatenate()([forward_h_claim, forward_h_evid])
        outputs = layers.Dense(1, activation="sigmoid")(merge)

        model = keras.Model(inputs=[input_claim, input_evid], outputs=outputs)
        model.compile(**self.compile_info)
        model.summary()

        self.model = model

    def show_history(self, history: keras.callbacks.History):

        history_data = history.history
        print("Displaying the following history keys: ", history_data.keys())

        for key, value in history_data.items():
            if not key.startswith('val'):
                fig, ax = plt.subplots(1, 1)
                ax.set_title(key)
                ax.plot(value)
                if 'val_{}'.format(key) in history_data:
                    ax.plot(history_data['val_{}'.format(key)])
                else:
                    print("Couldn't find validation values for metric: ", key)

                ax.set_ylabel(key)
                ax.set_xlabel('epoch')
                ax.legend(['train', 'val'], loc='best')

        plt.show()

    def train_model(self,
                  train,
                  x_val: np.ndarray,
                  y_val: np.ndarray,
                  training_info: dict):
        print("Start training! \nParameters: {}".format(training_info))
        history = self.model.fit(x=train[0], y=train[1],
                                  validation_data=(x_val, y_val),
                                  shuffle=True,
                                  **training_info)
        print("Training completed! Showing history...")

        self.show_history(history)

model = Model(**model_params)


(None, None)
(None, 100)
(None, None, 100)
(None, 100)
Model: "model_22"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_74 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 input_75 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 embedding_61 (Embedding)       (None, None, 100)    3529400     ['input_74[0][0]']               
                                                                                                  
 embedding_62 (Embedding)       (None, None, 100)    3529400     ['input_75[0][0]']               
                                    

In [72]:
x_train, y_train = next(generator(train, v4_val_to_key))
print(x_train[0][0])
print(x_train[1][0])
print(y_train)
x_val, y_val = next(generator(val, v4_val_to_key))
x_test, y_test = next(generator(test, v4_val_to_key))

train_gen = generator(train, v4_val_to_key)

[    0     0     0     0     0     0     0     0 13432  7710 31126 14572
 29042 18357 16772  7472 28798 24741]
[    0     0     0     0     0     0     0     0     0     0     0     0
 12273 16983  3303  6688  3448 14679 18357  2579 22700  8621 26481 18357
 16772  7472 28798 24741   162 15127 30070 24246 14869 23020 12934 28031
 22700 20202 22700 29494 10372 18357 27398 31006 22772  2976 28589 18357
 31727]
[0 1 1 1 0 0 0 0 0 0 1 0 1 0 0 1 0 1 1 0 0 1 1 0 1 1 1 1 1 0 1 0]


In [73]:
compile_info = {
    'optimizer': keras.optimizers.Nadam(learning_rate=1e-2),
    'loss': 'binary_crossentropy',
    'metrics': ['acc']
}

training_info = {
    'verbose': 1,
    'epochs': 10,
    'batch_size': BATCH_SIZE,
    'callbacks': [keras.callbacks.EarlyStopping(monitor='val_acc', 
                                                patience=4,
                                                restore_best_weights=True)]
}

model_params = {
    'compile_info': compile_info,
    'value_to_key': v4_val_to_key,
    'embedding_dim': EMBEDDING_SIZE,
    'embedding_matrix': np.vstack((np.zeros((1, EMBEDDING_SIZE)),
                       v4_matrix)),
    'l2_reg' : 1e-5
}

prediction_info = {
    'batch_size': BATCH_SIZE,
    'verbose': 1
}

TRAINING=True
step_len = train.shape[0] // BATCH_SIZE
if TRAINING:
        model = Model(**model_params)


        history = model.model.fit(train_gen, steps_per_epoch=step_len,
                          validation_data=(x_val, y_val),
                          **training_info)
        print("Training completed! Showing history...")

        self.show_history(history)




        #history = model.train_model(generator(train, v4_val_to_key),
                                    #generator(val, v4_val_to_key), training_info=training_info)
       # model.model.save(m_name)
      #  models.append(model.model)
      #  histories.append(history)
else:
  #  models = [keras.models.load_model(m) for m in model_names]
   pass

(None, None)
(None, 100)
(None, None, 100)
(None, 100)
Model: "model_27"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_84 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 input_85 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 embedding_71 (Embedding)       (None, None, 100)    3529400     ['input_84[0][0]']               
                                                                                                  
 embedding_72 (Embedding)       (None, None, 100)    3529400     ['input_85[0][0]']               
                                    

KeyboardInterrupt: ignored