### This file was only used to test my work during the creation of the project

# Imports

In [1]:
from datasets import load_dataset
import nltk
import numpy as np
from utils import preprocess_data, build_vocab, create_embedding_dict, encode_example, create_model_data_rep, create_batch
import torch

  from .autonotebook import tqdm as notebook_tqdm


# Get the data ready

In [2]:
# Fetch the NLTK resources 'punkt' and 'punkt_tab' into the local nltk_data directory.
# Presumably preprocess_data (from utils) relies on them for tokenization - TODO confirm.
nltk.download('punkt') # Punkt is NLTK's pretrained tokenizer model; NLTK's tokenizer needs it to tokenize properly
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/david/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/david/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# Load every split of the SNLI dataset through the `datasets` library.
dataset = load_dataset("stanfordnlp/snli")

In [4]:
# Show the available splits together with their features and row counts.
print(dataset)

DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 550152
    })
})


In [5]:
# Run the project's preprocessing (utils.preprocess_data) over all splits.
# Judging by the cell outputs it filters rows and yields lowercased token lists
# for 'premise' and 'hypothesis' - see utils for the exact steps.
preprocessed_data = preprocess_data(dataset)

Filter: 100%|██████████| 10000/10000 [00:00<00:00, 35044.88 examples/s]
Filter: 100%|██████████| 10000/10000 [00:00<00:00, 43555.64 examples/s]
Filter: 100%|██████████| 550152/550152 [00:11<00:00, 46919.48 examples/s]


In [6]:
def build_vocab(preprocessed_data):
    """Collect the distinct tokens used anywhere in the dataset.

    Args:
        preprocessed_data: Mapping from split name to an iterable of examples,
            where each example has "premise" and "hypothesis" token sequences.

    Returns:
        A set with every token found in any premise or hypothesis of any split.
    """
    return {
        token
        for split_name in preprocessed_data
        for row in preprocessed_data[split_name]
        for field in ("premise", "hypothesis")
        for token in row[field]
    }

In [7]:
# Set of all tokens in the preprocessed splits. This calls the build_vocab
# defined in this notebook, which shadows the one imported from utils.
unique_vocab = build_vocab(preprocessed_data)

# Load the GloVe embeddings

In [8]:
# Build a word -> vector lookup restricted to the SNLI vocabulary.
embeddings_dict = {}
with open("glove.840B.300d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        # Skip blank lines, and only take into account the words that appear
        # in the SNLI dataset so the lookup table stays small.
        if not values or values[0] not in unique_vocab:
            continue
        word = values[0]  # A word such as dog
        try:
            # The embedding vector of the word
            vector = np.asarray(values[1:], "float32")
        except ValueError:
            # Some GloVe entries contain spaces in the token itself, such as
            # ". . ." or "at name@domain.com", so the remainder of the line is
            # not purely numeric. Those entries are skipped. Only ValueError is
            # caught so that Ctrl-C and genuine bugs are no longer swallowed.
            continue
        embeddings_dict[word] = vector  # A dict key-value of word/vector

# Combine the tokenized dataset with the GloVe embeddings to get the GloVe embedding of each sentence

In [9]:
def sentence_to_glove(tokens, glove_dict, dim=300):
    """Stack the embedding vectors of ``tokens`` into one 2-D array.

    Args:
        tokens: Iterable of tokens that make up one sentence.
        glove_dict: Mapping from token to its embedding vector; every vector
            is expected to have length ``dim``.
        dim: Embedding size. Used for the zero vector that stands in for
            unknown tokens and for the shape of the empty result.

    Returns:
        A NumPy array of shape ``(number_of_tokens, dim)``. Tokens missing
        from ``glove_dict`` become all-zero rows. When ``tokens`` is empty the
        result has shape ``(0, dim)`` so it can still be stacked or padded
        like any other sentence.
    """
    # One shared zero vector instead of a fresh allocation per token;
    # np.array below copies the rows, so sharing it is safe.
    # NOTE: np.zeros defaults to float64, so a sentence containing unknown
    # tokens is upcast to float64 even if the stored vectors are float32.
    unknown_vector = np.zeros(dim)
    vectors = [glove_dict.get(token, unknown_vector) for token in tokens]
    if not vectors:
        # np.array([]) would have shape (0,), losing the embedding axis.
        return np.zeros((0, dim))
    return np.array(vectors)

In [10]:
def encode_example(example):
    """Turn one tokenized example into its embedded representation.

    Args:
        example: Mapping with "premise" and "hypothesis" token sequences and
            a "label" entry.

    Returns:
        A dict with "premise_glove" and "hypothesis_glove" (the results of
        sentence_to_glove using the module-level embeddings_dict) and the
        unchanged "label".
    """
    premise_matrix = sentence_to_glove(example["premise"], embeddings_dict)
    hypothesis_matrix = sentence_to_glove(example["hypothesis"], embeddings_dict)
    encoded = {
        "premise_glove": premise_matrix,
        "hypothesis_glove": hypothesis_matrix,
    }
    encoded["label"] = example["label"]
    return encoded

In [11]:
# Sanity check: embed the first training example with the encode_example
# defined in this notebook. train_data is rebound in a later cell.
train_data = preprocessed_data["train"]
train_example = encode_example(train_data[0])

In [12]:
# Number of SNLI vocabulary words for which a GloVe vector was loaded.
len(embeddings_dict)

32797

In [13]:
# Inspect the first training row: its premise via column access, then the full row.
print(train_data['premise'][0])
print(train_data[0])

['a', 'person', 'on', 'a', 'horse', 'jumps', 'over', 'a', 'broken', 'down', 'airplane', '.']
{'premise': ['a', 'person', 'on', 'a', 'horse', 'jumps', 'over', 'a', 'broken', 'down', 'airplane', '.'], 'hypothesis': ['a', 'person', 'is', 'training', 'his', 'horse', 'for', 'a', 'competition', '.'], 'label': 1}


In [14]:
# Check that Dataset.select yields a 1000-row subset that starts with the same
# first row as the full train split.
print(len(preprocessed_data["train"]))
print(len(preprocessed_data["train"].select(range(1000))))
print(type(preprocessed_data["train"]))
print(preprocessed_data["train"].select(range(1000))[0])

549367
1000
<class 'datasets.arrow_dataset.Dataset'>
{'premise': ['a', 'person', 'on', 'a', 'horse', 'jumps', 'over', 'a', 'broken', 'down', 'airplane', '.'], 'hypothesis': ['a', 'person', 'is', 'training', 'his', 'horse', 'for', 'a', 'competition', '.'], 'label': 1}


In [15]:
# Rebind train_data to the representation built by utils.create_model_data_rep.
# It is later indexed by 'premise' / 'hypothesis' / 'label' with an index array,
# so presumably it maps each column name to a NumPy array - TODO confirm in utils.
train_data = create_model_data_rep(preprocessed_data["train"])

In [None]:
# Number of training examples, taken from the premise column.
train_rows = len(train_data['premise'])

# Shuffle the data and get the premises, hypothesis and target_labels (shuffled) for the entire training set
# The same permutation is applied to all three columns so the rows stay aligned.
# NOTE(review): no random seed is set in the visible cells, so the order differs between runs.
random_indices = np.random.permutation(train_rows)
premises = train_data['premise'][random_indices]
hypothesis = train_data['hypothesis'][random_indices]
target_labels = train_data['label'][random_indices]
# Quick check of the label range after shuffling.
print("Min target label:", target_labels.min())
print("Max target label:", target_labels.max())


[490443 339932 258008 ... 439308 502453 228306]
Min target label: 0
Max target label: 2


In [45]:
# Show the first shuffled example to verify premise, hypothesis and label still line up.
print(premises[0])
print(hypothesis[0])
print(target_labels[0])


['a', 'boy', 'is', 'standing', 'next', 'to', 'a', 'car', 'in', 'front', 'of', 'a', 'clothesline', '.']
['the', 'boy', 'is', 'by', 'a', 'clothesline', '.']
0


In [43]:
# Build one batch from the first 64 shuffled examples.
# create_batch comes from utils; presumably it embeds (and pads) the token
# sequences using embeddings_dict - TODO confirm in utils.
premise_batch = create_batch(premises[0:64], embeddings_dict)
hypothesis_batch = create_batch(hypothesis[0:64], embeddings_dict)
# Labels are converted straight to an integer tensor instead of going through create_batch.
target_batch = torch.LongTensor(target_labels[0:64].astype(int))