Imports

In [1]:
import torch
from transformers import BertModel
from transformers import AutoTokenizer
from typing import Dict, List

Prepping dataset

In [2]:
file = open("arguments-training.tsv", 'r', encoding='utf8')
arguments = [line.strip().split('\t') for line in file.readlines()[1:]]
print(arguments[0])
file = open("labels-training.tsv", 'r', encoding='utf8')
labels = [line.strip().split('\t') for line in file.readlines()[1:]]
print(labels[0])

['A01001', 'Entrapment should be legalized', 'in favor of', "if entrapment can serve to more easily capture wanted criminals, then why shouldn't it be legal?"]
['A01001', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']


In [3]:
#Tokenize, conjoin strings, and add special tokens, remove item ids from labels
import spacy

def tokenize(text, labels):
    nlp = spacy.load("en_core_web_sm")

    args = []
    labs = []
    for arg, lab in zip(text, labels):
        if arg[3] == 'in favor of':
            sep = ['<PRO>']
        else:
            sep = ['<CON>']
        item = ['<SOS>'] + list(nlp(arg[1])) + sep + list(nlp(arg[3])) + ['<EOS>']
        args.append(item)
        labs.append(lab[1:20])

    return args, labs

arguments_tok, labels = tokenize(arguments, labels)
print(arguments_tok[0])
print(labels[0])

['<SOS>', Entrapment, should, be, legalized, '<CON>', if, entrapment, can, serve, to, more, easily, capture, wanted, criminals, ,, then, why, should, n't, it, be, legal, ?, '<EOS>']
['0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0']


In [5]:
#Create vocabulary and get word embeddings

#Since we have access to the Twitter embeddings already, I just used these. If there are better ones to use, let me know. 
# - Maddy

SPECIAL_TOKENS = ['<UNK>', '<PAD>', '<SOS>', '<EOS>', '<PRO>', '<CON>']
vocab = sorted(set([str(w) for ws in list(arguments_tok) + [SPECIAL_TOKENS] for w in ws]))
embeddings_path = 'glove.twitter.27B.200d.txt'

from typing import Dict, Tuple
import torch
import numpy as np

def read_pretrained_embeddings(
    embeddings_path: str,
    vocab
) -> Tuple[Dict[str, int], torch.FloatTensor]:
    """Read the embeddings matrix and make a dict hashing each word.

    Args:
        embeddings_path (str): _description_
        vocab_path (str): _description_

    Returns:
        Tuple[Dict[str, int], torch.FloatTensor]: _description_
    """
    word2i = {}
    vectors = []
    
    print(f"Reading embeddings from {embeddings_path}...")
    with open(embeddings_path, "r", encoding = "utf-8") as f:
        i = 0
        for line in f:
            word, *weights = line.rstrip().split(" ")
            
            if word in vocab:
                word2i[word] = i
                i += 1
                w_weights = [float(i) for i in weights]
                vectors.append(w_weights)

        vectors = torch.FloatTensor(vectors)

    return word2i, vectors

def get_oovs(vocab, word2i: Dict[str, int]) -> List[str]:
    """Find the vocab items that do not exist in the glove embeddings (in word2i).
    Return the List of such (unique) words.

    Args:
        vocab_path: List of batches of sentences.
        word2i (Dict[str, int]): _description_

    Returns:
        List[str]: _description_
    """
    glove_and_vocab = set(word2i.keys())
    vocab_and_not_glove = set(vocab) - glove_and_vocab
    return list(vocab_and_not_glove)

def initialize_new_embedding_weights(num_embeddings: int, dim: int) -> torch.FloatTensor:
    """xavier initialization for the embeddings of words in train, but not in gLove.

    Args:
        num_embeddings (int): _description_
        dim (int): _description_

    Returns:
        torch.FloatTensor: _description_
    """
    #Initialize a num_embeddings x dim matrix with xiavier initiialization
    return torch.FloatTensor(np.random.normal(0, dim**-0.5, size=(num_embeddings, dim)))
    

def update_embeddings(
    glove_word2i: Dict[str, int],
    glove_embeddings: torch.FloatTensor,
    oovs: List[str]
) -> Tuple[Dict[str, int], torch.FloatTensor]:
    #Add the oov words to the dict, assigning a new index to each
        i = len(glove_embeddings)
        for w in oovs:
            glove_word2i[w] = i
            i +=1
    #Concatenate a new row to embeddings for each oov, initialize those new rows with `intialize_new_embedding_weights`
        new_emb = initialize_new_embedding_weights(len(oovs), len(glove_embeddings[0]))
        cat_emb = torch.cat((glove_embeddings, new_emb), 0)
        return (glove_word2i, cat_emb)

glove_word2i, glove_embeddings = read_pretrained_embeddings(
    embeddings_path,
    vocab
)
oovs = get_oovs(vocab, glove_word2i)

# Add the oovs from training data to the word2i encoding, and as new rows
# to the embeddings matrix
word2i, embeddings = update_embeddings(glove_word2i, glove_embeddings, oovs)

Reading embeddings from glove.twitter.27B.200d.txt...


NameError: name 'np' is not defined

In [13]:
#Split into Train and Dev
import random as rd
import pandas as pd

def split_dataset(texts, labels, percent_train):
    
    num_texts = len(texts)
    df = pd.DataFrame(
    {'text': texts,
     'labels': labels
    })
    train = df.sample(frac=0.8)
    for txt in train:
        dev = df[df.text != txt]
    train_texts = list(train.loc[:,"text"])
    train_labels = list(train.loc[:,"labels"])
    dev_texts = list(dev.loc[:,"text"])
    dev_labels = list(dev.loc[:,"labels"])
    
    return train_texts, train_labels, dev_texts, dev_labels

train_arguments, train_labels, dev_arguments, dev_labels = split_dataset(arguments_tok, labels, 80)

In [14]:
assert len(train_arguments) == len(train_labels)
assert len(dev_arguments) == len(dev_labels)

Batch Training arguments and pad

In [6]:
#IN PROGRESS

import math

def make_batches(sequences: List[List[str]], labels: List[List[int]], batch_size: int) -> (List[ListList[[str]]], List[List[List[int]]]):
    """Yield batch_size chunks from sequences."""
    
    num_batch = math.floor(len(sequences)/batch_size)
    batched_sents = []
    batched_labs = []
    
    df = pd.DataFrame(data = {"seq": sequences, "lab": labels})
    for i in range(num_batch):
        batch = df.sample(n=batch_size, ignore_index = True)
        this_batch_sents = []
        this_batch_labs = []
        for i in range(0, len(batch)-1):
            sent = batch._get_value(i, "seq")
            label = batch._get_value(i, "lab")
            df = df[df.seq != sent]
            this_batch_sents.append(sent)
            this_batch_labs.append(label)
        batched_sents.append(this_batch_sents)
        batched_labs.append(this_batch_labs)
        
    return batched_sents, batched_labs

def pad()


# TODO: Set your preferred batch size
batch_size = 8
tokenizer = Tokenizer()

# We make batches now and use those.
batch_tokenized = []
# Note: Labels need to be batched in the same way to ensure
# We have train sentence and label batches lining up.
batched_sents, batched_labs = make_batches(train_arguments, train_labels, batch_size)
for batch in batched_sents:
    pad_batch = pad(batch)
    batch_tokenized.append(tok_batch)