# SharedTask Touche23 Human Value Detection

## Written by Madeleine Wallace and John Ortiz

In [15]:
import torch
from transformers import BertModel
from transformers import AutoTokenizer
from typing import Dict, List
import numpy as np
import pandas as pd
from util import precision, recall, f1_score
import spacy
import math

## Importing dataset

In [5]:
file = open("data/arguments-training.tsv", 'r', encoding='utf8')
x_train = [line.strip().split('\t') for line in file.readlines()[1:]]
file.close()
print(x_train[0])

file = open("data/labels-training.tsv", 'r', encoding='utf8')
y_train = [line.strip().split('\t') for line in file.readlines()[1:]]
file.close()

file = open("data/arguments-validation.tsv", 'r', encoding='utf8')
x_valid = [line.strip().split('\t') for line in file.readlines()[1:]]
file.close()
print(x_valid[0])
file = open("data/labels-validation.tsv", 'r', encoding='utf8')
y_valid = [line.strip().split('\t') for line in file.readlines()[1:]]
file.close()
print(y_valid[0])
file = open("data/arguments-test.tsv", 'r', encoding='utf8')
x_test = [line.strip().split('\t') for line in file.readlines()[1:]]
file.close()
print(x_test[0])

['A01002', 'We should ban human cloning', 'in favor of', 'we should ban human cloning as it will only cause huge issues when you have a bunch of the same humans running around all acting the same.']
['A01001', 'Entrapment should be legalized', 'in favor of', "if entrapment can serve to more easily capture wanted criminals, then why shouldn't it be legal?"]
['A01001', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']
['A26004', 'We should end affirmative action', 'against', 'affirmative action helps with employment equity.']


## Tokenizing all Data

In [6]:
#Tokenize, conjoin strings, and add special tokens, remove item ids from labels
def tokenize(text, labels=None):
    nlp = spacy.load("en_core_web_sm")

    args = []
    labs = []
    if(labels != None):
        for arg, lab in zip(text, labels):
            if arg[3] == 'in favor of':
                sep = ['<PRO>']
            else:
                sep = ['<CON>']
            item = ['<SOS>'] + list(nlp(arg[1])) + sep + list(nlp(arg[3])) + ['<EOS>']
            args.append(item)
            labs.append(lab[1:20])
    else:
        for arg in text:
            if arg[3] == 'in favor of':
                sep = ['<PRO>']
            else:
                sep = ['<CON>']
            item = ['<SOS>'] + list(nlp(arg[1])) + sep + list(nlp(arg[3])) + ['<EOS>']
            args.append(item)

    return args, labs
    
def tokenize_allData(x_train,y_train,x_valid,y_valid,x_test):
    x_train, y_train = tokenize(x_train, y_train)
    x_valid, y_valid = tokenize(x_valid,y_valid)
    x_test, _ = tokenize(x_test)
    print(x_train[0], y_train[0])
    print("x_train size: ",len(x_train)," - x_train size: ",len(y_train))
    print("___________________")
    print(x_valid[0], y_valid[0])
    print("x_valid size: ",len(x_valid)," - y_valid size: ",len(y_valid))
    print("_______________")
    print(x_test[0])
    print("xTest size: ", len(x_test))
    return x_train,y_train,x_valid,y_valid,x_test
x_train,y_train,x_valid,y_valid,x_test = tokenize_allData(x_train,y_train,x_valid,y_valid,x_test)

['<SOS>', We, should, ban, human, cloning, '<CON>', we, should, ban, human, cloning, as, it, will, only, cause, huge, issues, when, you, have, a, bunch, of, the, same, humans, running, around, all, acting, the, same, ., '<EOS>'] ['0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0']
x_train size:  5393  - x_train size:  5393
___________________
['<SOS>', Entrapment, should, be, legalized, '<CON>', if, entrapment, can, serve, to, more, easily, capture, wanted, criminals, ,, then, why, should, n't, it, be, legal, ?, '<EOS>'] ['0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0']
x_valid size:  1896  - y_valid size:  1896
_______________
['<SOS>', We, should, end, affirmative, action, '<CON>', affirmative, action, helps, with, employment, equity, ., '<EOS>']
xTest size:  1576


## Label Selection (Which labels to use in models )

In [7]:
def removeLabels_NotWanted(labels, labels_wanted):
    newLabels=[]
    for row in labels:
        newRow = []
        for i in labels_wanted:
            newRow.append(row[i])
        assert len(newRow) == len(labels_wanted)
        newLabels.append(newRow)
    return newLabels
def removeAllEmptyLabelRows(text,labels):
    newText=[]
    newLabels=[]
    for i in range(len(text)):
        intList = [eval(j) for j in labels[i] ]
        if(np.sum(intList)!=0):
            newText.append(text[i])
            newLabels.append(labels[i])
      #else:
            #print(labels[i])
    return newText,newLabels


### Creating different labels for training on     
    labels_starters recommended by Eval:   Self-direction: action, Achievement, Security: personal, Security: societal, Benevolence: caring, Universalism: concern.
 

In [8]:
reference_labels = {
    "Self-direction: thought": 0,
    "Self-direction: action": 1,	
    "Stimulation": 2,
    "Hedonism": 3,
    "Achievement": 4,
    "Power: dominance": 5,
    "Power: resources": 6,
    "Face": 7,	
    "Security: personal": 8,
    "Security: societal": 9,
    "Tradition": 10,
    "Conformity: rules": 11,
    "Conformity: interpersonal": 12,
    "Humility": 13,	
    "Benevolence: caring": 14,
    "Benevolence: dependability": 15,	
    "Universalism: concern": 16,	
    "Universalism: nature": 17,	
    "Universalism: tolerance": 18,
    "Universalism: objectivity": 19
}

#________________________Define other label category selections here________________________#
recommended_categories = [1, 4, 8, 9, 14, 16]
starters_dict = {
    "Self-direction: action":0,
    "Achievement": 1,
    "Security: personal": 2,
    "Security: societal": 3,
    "Benevolence: caring": 4,
    "Universalism: concern": 16
}
security = [8, 9] 
security_dict = {
    "Security: personal": 0,
    "Security: societal": 1
}


"""
print("Number of original items:",len(x_train), len(y_train))
labels_starters = removeLabels_NotWanted(y_train, recommended_categories)
x_train_trimmed, labels_starters_train = removeAllEmptyLabelRows(x_train, labels_starters)
print("xTest - yTest", len(x_train_trimmed),"-",len(labels_starters_train))
labels_starters_valid = removeLabels_NotWanted(y_valid, recommended_categories)
x_valid_trimmed, labels_starters_valid = removeAllEmptyLabelRows(x_valid, labels_starters_valid)
print("xValid - yValid: ", len(x_valid_trimmed),"-",len(labels_starters_valid))

"""

'\nprint("Number of original items:",len(x_train), len(y_train))\nlabels_starters = removeLabels_NotWanted(y_train, recommended_categories)\nx_train_trimmed, labels_starters_train = removeAllEmptyLabelRows(x_train, labels_starters)\nprint("xTest - yTest", len(x_train_trimmed),"-",len(labels_starters_train))\nlabels_starters_valid = removeLabels_NotWanted(y_valid, recommended_categories)\nx_valid_trimmed, labels_starters_valid = removeAllEmptyLabelRows(x_valid, labels_starters_valid)\nprint("xValid - yValid: ", len(x_valid_trimmed),"-",len(labels_starters_valid))\n\n'

In [9]:
"""Requires two inputs the labels index you want to focus on and dictionary reassigning labels"""
def reduceDF_basedOnLabels(focusName,labelFocus,focudDict, x_train,y_train,x_valid,y_valid):
  print(f"({focusName}) - Original items (xTrain:{len(x_train)} - yTrain:{len(y_train)}) , (xValid:{len(x_valid)} - yValid:{len(y_valid)}):")
  labels = removeLabels_NotWanted(y_train, labelFocus)
  x_train_trimmed, labels_train = removeAllEmptyLabelRows(x_train, labels)
  labels_valid = removeLabels_NotWanted(y_valid, labelFocus)
  x_valid_trimmed, labels__valid = removeAllEmptyLabelRows(x_valid, labels_valid)
  print(f"({focusName}) - Item size based on desired labels (xTrain:{len(x_train_trimmed)} - yTrain:{len(labels_train)}) , (xValid:{len(x_valid_trimmed)} - yValid:{len(labels__valid)}):")
  return x_train_trimmed,labels_train,x_valid_trimmed,labels__valid

In [13]:
x_train_recommended,labels_train_recommended,x_valid_recommended,labels_valid_recommended  = reduceDF_basedOnLabels("Recommended",recommended_categories,starters_dict, x_train,y_train,x_valid,y_valid)
x_train_security,labels_train_security,x_valid_security,labels_valid_security  = reduceDF_basedOnLabels("Security",security,security_dict, x_train,y_train,x_valid,y_valid)


(Recommended) - Original items (xTrain:5393 - yTrain:5393) , (xValid:1896 - yValid:1896):
(Recommended) - Item size based on desired labels (xTrain:4985 - yTrain:4985) , (xValid:1768 - yValid:1768):
(Security) - Original items (xTrain:5393 - yTrain:5393) , (xValid:1896 - yValid:1896):
(Security) - Item size based on desired labels (xTrain:3164 - yTrain:3164) , (xValid:1109 - yValid:1109):


## Create PT3 Label set

In [11]:
def makePT3Labels(labels_train,labels_valid):
    PT3LabelsDict = {}
    PT3Labels_train = []
    #PT3Labels_vects = []
    label=0
    for row in labels_train:
        row_lab = [key for key, value in PT3LabelsDict.items() if value == row]
        if row_lab:
            PT3Labels_train.append(row_lab[0])
        else:
            PT3LabelsDict[label] = row
            PT3Labels_train.append(label)
            label += 1
    # for row in PT3Labels_idx:
    #     label_vect = [0]*len(PT3LabelsDict)
    #     label_vect[row] = 1
    #     PT3Labels_vects.append(label_vect)
    PT3LabelsValid = []
    label=0
    for row in labels_valid:
        row_lab = [key for key, value in PT3LabelsDict.items() if value == row]
        if row_lab:
            PT3LabelsValid.append(row_lab[0])
        else:
            PT3LabelsDict[label] = row
            PT3LabelsValid.append(label)
            label += 1

    print("PT3 ySize- ",len(PT3Labels_train))
    print("PT3 ySizeValid- ",len(PT3LabelsValid))
    print("Number of new combination labels:", len(PT3LabelsDict))
    return PT3Labels_train, PT3LabelsValid,PT3LabelsDict

In [14]:
"""To grab valid pt3 data one need both labels_train and labels_valid of whatever focus we are trying 
One instance could be  labels_security_train, labels_security_valid

"""
PT3LabelsTrain, PT3LabelsValid, PT3Dict = makePT3Labels(labels_train_recommended,labels_valid_recommended)

PT3 ySize-  4985
PT3 ySizeValid-  1768
Number of new combination labels: 63


In [None]:
PT3LabelsTrain, PT3Dict = makePT3Labels(labels_starters_train)


#PT3Labels_idx = []
PT3LabelsValid = []
label=0
for row in labels_starters_valid:
    row_lab = [key for key, value in PT3Dict.items() if value == row]
    if row_lab:
        PT3LabelsValid.append(row_lab[0])
    else:
        PT3Dict[label] = row
        PT3LabelsValid.append(label)
        label += 1
# for row in PT3Labels_idx:
#     label_vect = [0]*len(PT3Dict)
#     label_vect[row] = 1
#     PT3LabelsValid.append(label_vect)
print("PT3 ySize- ",len(PT3LabelsTrain))
print("PT3 ySizeValid- ",len(PT3LabelsValid))
print("Number of new combination labels:", len(PT3Dict))


In [None]:
#Uncommment to run PT3 for labels = security
"""PT3LabelsTrain, PT3Dict = makePT3Labels(labels_security_train)


#PT3Labels_idx = []
PT3LabelsValid = []
label=0
for row in labels_security_valid:
    row_lab = [key for key, value in PT3Dict.items() if value == row]
    if row_lab:
        PT3LabelsValid.append(row_lab[0])
    else:
        PT3Dict[label] = row
        PT3LabelsValid.append(label)
        label += 1
# for row in PT3Labels_idx:
#     label_vect = [0]*len(PT3Dict)
#     label_vect[row] = 1
#     PT3LabelsValid.append(label_vect)

print("Number of new combination labels:", len(PT3Dict))
"""

## Get PT4 Labels

In [None]:
#creates two dictionaries, one with positive instances for each class
#and one for negative instances for each class
def makePT4Labels(allClassLabels):
    PT4LabelsPos = {}
    PT4LabelsNeg = {}
    
    for i in range(len(allClassLabels)):
        PT4LabelsPos[i] = []
        PT4LabelsNeg[i] = []

    for row in allClassLabels:
        label_idx = 0
        for label in row:
            if label == '0':
                PT4LabelsPos[label_idx].append(0)
                PT4LabelsNeg[label_idx].append(1)
            if label == '1':
                PT4LabelsPos[label_idx].append(1)
                PT4LabelsNeg[label_idx].append(0)
            label_idx += 1
    assert len(PT4LabelsPos[0]) == len(PT4LabelsNeg[0]) == len(allClassLabels)

    return PT4LabelsPos, PT4LabelsNeg


In [None]:
PT4LabelsPos, PT4LabelsNeg = makePT4Labels(labels_starters)
print(len(PT4LabelsPos), len(PT4LabelsNeg))

In [None]:
#len(PT4LabelsPos.update(PT4LabelsNeg))
#PT4LabelsPos.update(PT4LabelsNeg)
#PT4LabelsTrain = PT4LabelsPos.copy()
#(len(PT4LabelsTrain))

## Tokenizign Dataset

  - Based on train, dev , test combination of all words

In [23]:
#TODO: combine xtrain,x-valid, x-test
#  originall: x_train_trimmed - > x_train_recommended
# goal x_train_recommended + x_valid_recommended + x_test_recommended 

In [24]:
SPECIAL_TOKENS = ['<UNK>', '<PAD>', '<SOS>', '<EOS>', '<PRO>', '<CON>']
vocab = sorted(set([str(w) for ws in list(x_train_recommended) + [SPECIAL_TOKENS] for w in ws]))
embeddings_path = '../glove.twitter.27B.200d.txt'

from typing import Dict, Tuple
import torch
import numpy as np

def read_pretrained_embeddings(
    embeddings_path: str,
    vocab
) -> Tuple[Dict[str, int], torch.FloatTensor]:
    """Read the embeddings matrix and make a dict hashing each word.

    Args:
        embeddings_path (str): _description_
        vocab_path (str): _description_

    Returns:
        Tuple[Dict[str, int], torch.FloatTensor]: _description_
    """
    word2i = {}
    vectors = []
    
    print(f"Reading embeddings from {embeddings_path}...")
    with open(embeddings_path, "r", encoding = "utf-8") as f:
        i = 0
        for line in f:
            word, *weights = line.rstrip().split(" ")
            
            if word in vocab:
                word2i[word] = i
                i += 1
                w_weights = [float(i) for i in weights]
                vectors.append(w_weights)

        vectors = torch.FloatTensor(vectors)

    return word2i, vectors

def get_oovs(vocab, word2i: Dict[str, int]) -> List[str]:
    """Find the vocab items that do not exist in the glove embeddings (in word2i).
    Return the List of such (unique) words.

    Args:
        vocab_path: List of batches of sentences.
        word2i (Dict[str, int]): _description_

    Returns:
        List[str]: _description_
    """
    glove_and_vocab = set(word2i.keys())
    vocab_and_not_glove = set(vocab) - glove_and_vocab
    return list(vocab_and_not_glove)

def initialize_new_embedding_weights(num_embeddings: int, dim: int) -> torch.FloatTensor:
    """xavier initialization for the embeddings of words in train, but not in gLove.

    Args:
        num_embeddings (int): _description_
        dim (int): _description_

    Returns:
        torch.FloatTensor: _description_
    """
    #Initialize a num_embeddings x dim matrix with xiavier initiialization
    return torch.FloatTensor(np.random.normal(0, dim**-0.5, size=(num_embeddings, dim)))
    

def update_embeddings(
    glove_word2i: Dict[str, int],
    glove_embeddings: torch.FloatTensor,
    oovs: List[str]
) -> Tuple[Dict[str, int], torch.FloatTensor]:
    #Add the oov words to the dict, assigning a new index to each
        i = len(glove_embeddings)
        for w in oovs:
            glove_word2i[w] = i
            i +=1
    #Concatenate a new row to embeddings for each oov, initialize those new rows with `intialize_new_embedding_weights`
        new_emb = initialize_new_embedding_weights(len(oovs), len(glove_embeddings[0]))
        cat_emb = torch.cat((glove_embeddings, new_emb), 0)
        return (glove_word2i, cat_emb)

In [25]:
glove_word2i, glove_embeddings = read_pretrained_embeddings(embeddings_path,vocab)
oovs = get_oovs(vocab, glove_word2i)

# Add the oovs from training data to the word2i encoding, and as new rows
# to the embeddings matrix
word2i, embeddings = update_embeddings(glove_word2i, glove_embeddings, oovs)

Reading embeddings from ../glove.twitter.27B.200d.txt...


### Make batches for each different dataframe 

In [16]:
#defining batches here

def make_batches(sequences: List[List[str]], labels: List[List[int]], batch_size: int) -> (List[List[List[str]]], List[List[List[int]]]):
    """Yield batch_size chunks from sequences."""
    
    num_batch = math.floor(len(sequences)/batch_size)
    batched_sents = []
    batched_labs = []
    
    df = pd.DataFrame(data = {"seq": sequences, "lab": labels})
    for i in range(num_batch):
        batch = df.sample(n=batch_size)
        #print("Batch size: ",batch.shape[0])
        this_batch_sents = []
        this_batch_labs = []
        for index, row in batch.iterrows():
            sent = row['seq']
            label = row['lab']
            #df = df[df.seq != sent]
            this_batch_sents.append(sent)
            this_batch_labs.append(label)
        df = df.drop(batch.index)
        batched_sents.append(this_batch_sents)
        batched_labs.append(this_batch_labs)
        
    return batched_sents, batched_labs


def pad(sents, labs):
    lengths = []
    for sent in sents:
        lengths.append(len(sent))
            
    max_length = max(lengths)
        
    for sent in sents:
        n = max_length - len(sent)
        for i in range(n):
            sent.append("")
        
    return sents

In [17]:
# Set your preferred batch size
batch_size = 8

#_________PT3________#
# We make batches now and use those.
PT3_batches_train = []
# Note: Labels need to be batched in the same way to ensure
# We have train sentence and label batches lining up.
print(len(x_train_recommended),len(PT3LabelsTrain))
PT3_batched_sents, PT3_batched_labs = make_batches(x_train_recommended, PT3LabelsTrain, batch_size)
for batch in PT3_batched_sents:
    pad_batch = pad(batch, PT3_batched_labs)
    PT3_batches_train.append(pad_batch)
    
print(len(x_valid_recommended),len(PT3LabelsValid))
PT3_batches_valid = []
PT3_batched_sents_valid, PT3_batched_labs_valid = make_batches(x_valid_recommended, PT3LabelsValid, batch_size)
for batch in PT3_batched_sents_valid:
    pad_batch = pad(batch, PT3_batched_labs_valid)
    PT3_batches_valid.append(pad_batch)
   
#________PT4_________#
"""
#batching train
PT4_batches_train = []
PT4_batched_sents, PT4_batched_labs = make_batches(x_train_trimmed, PT4LabelsTrain, batch_size)
for batch in PT4_batched_sents:
    pad_batch = pad(batch, PT4_batched_labs_valid)
    PT4_batches_train.append(pad_batch)
#batching valid
PT4_batches_valid = []
PT4_batched_sents_valid, PT4_batched_labs_valid = make_batches(x_valid_trimmed, PT4LabelsValid, batch_size)
for batch in PT4_batched_sents_valid:
    pad_batch = pad(batch, PT4_batched_labs_valid)
    PT4_batches_valid.append(pad_batch)
    """

4985 4985
1768 1768


'\n#batching train\nPT4_batches_train = []\nPT4_batched_sents, PT4_batched_labs = make_batches(x_train_trimmed, PT4LabelsTrain, batch_size)\nfor batch in PT4_batched_sents:\n    pad_batch = pad(batch, PT4_batched_labs_valid)\n    PT4_batches_train.append(pad_batch)\n#batching valid\nPT4_batches_valid = []\nPT4_batched_sents_valid, PT4_batched_labs_valid = make_batches(x_valid_trimmed, PT4LabelsValid, batch_size)\nfor batch in PT4_batched_sents_valid:\n    pad_batch = pad(batch, PT4_batched_labs_valid)\n    PT4_batches_valid.append(pad_batch)\n    '

In [18]:
import torch

class ValuesClassifier(torch.nn.Module):
    def __init__(self, 
    output_size: int, 
    hidden_size: int,
    embeddings_tensor: torch.FloatTensor,
    pad_idx: int,
    dropout_val: float = 0.3,
    input_dim: int = 200,
    ):
        super().__init__()
        self.output_size = output_size
        self.hidden_size = hidden_size
        # Initialize BERT, which we use instead of a single embedding layer.
        self.bert = BertModel.from_pretrained("prajjwal1/bert-small")
        self.bert_hidden_dimension = self.bert.config.hidden_size
        self.hidden_layer = torch.nn.Linear(self.bert_hidden_dimension, self.hidden_size)
        self.relu = torch.nn.ReLU()
        self.classifier = torch.nn.Linear(self.hidden_size, self.output_size)
        self.log_softmax = torch.nn.LogSoftmax(dim=2)
        self.embeddings = torch.nn.Embedding.from_pretrained(embeddings_tensor, freeze = False, padding_idx = pad_idx)
        self.dropout_val = dropout_val
        self.dropout_layer = torch.nn.Dropout(p=self.dropout_val, inplace=False)
        self.pad_idx = pad_idx
        self.input_dim = input_dim
        self.lstm = torch.nn.LSTM(
            self.input_dim,
            self.hidden_size,
            num_layers=5,
            dropout=dropout_val,
            batch_first=True,
            bidirectional=True,
        )
        


    def encode_text(
        self,
        symbols: torch.Tensor
    ) -> torch.Tensor:
        """Encode the (batch of) sequence(s) of token symbols with an LSTM.
            Then, get the last (non-padded) hidden state for each symbol and return that.

        Args:
            symbols (torch.Tensor): The batch size x sequence length tensor of input tokens

        Returns:
            torch.Tensor: The final hiddens tate of the LSTM, which represents an encoding of
                the entire sentence
        """
        # First we get the embedding for each input symbol
        embedded = self.embeddings(symbols)
        embedded = self.dropout_layer(embedded)
        # Packs embedded source symbols into a PackedSequence.
        # This is an optimization when using padded sequences with an LSTM
        lens = (symbols != self.pad_idx).sum(dim=1).to("cpu")
        packed = torch.nn.utils.rnn.pack_padded_sequence(
            embedded, lens, batch_first=True, enforce_sorted=False
        )
        # -> batch_size x seq_len x encoder_dim, (h0, c0).
        packed_outs, (H, C) = self.lstm(packed)
        encoded, _ = torch.nn.utils.rnn.pad_packed_sequence(
            packed_outs,
            batch_first=True,
            padding_value=self.pad_idx,
            total_length=None,
        )
        # Now we have the representation of eahc token encoded by the LSTM.
        encoded, (H, C) = self.lstm(embedded)
        
        # This part looks tricky. All we are doing is getting a tensor
        # That indexes the last non-PAD position in each tensor in the batch.
        last_enc_out_idxs = lens - 1
        # -> B x 1 x 1.
        last_enc_out_idxs = last_enc_out_idxs.view([encoded.size(0)] + [1, 1])
        # -> 1 x 1 x encoder_dim. This indexes the last non-padded dimension.
        last_enc_out_idxs = last_enc_out_idxs.expand(
            [-1, -1, encoded.size(-1)]
        )
        # Get the final hidden state in the LSTM
        last_hidden = torch.gather(encoded, 1, last_enc_out_idxs)
        return last_hidden

    def forward(
        self,
        symbols: Dict,
    ) -> torch.Tensor:
        """_summary_

        Args:
            symbols (Dict): The Dict of token specifications provided by the HuggingFace tokenizer

        Returns:
            torch.Tensor: _description_
        """
        encoded_sents = self.encode_text(symbols)
        output = self.hidden_layer(encoded_sents)
        output = self.relu(output)
        output = self.classifier(output)
        return self.log_softmax(output)
        
# For making predictions at test time
def predict(model: torch.nn.Module, sents: torch.Tensor) -> List:
    logits = model(sents)
    return list(torch.argmax(logits, axis=2).squeeze().numpy())

In [19]:
# Use these functions to encode your batches before you call the train loop.

def encode_sentences(batch: List[List[str]], word2i: Dict[str, int]) -> torch.LongTensor:
    """Encode the tokens in each sentence in the batch with a dictionary

    Args:
        batch (List[List[str]]): The padded and tokenized batch of sentences.
        word2i (Dict[str, int]): The encoding dictionary.

    Returns:
        torch.LongTensor: The tensor of encoded sentences.
    """
    UNK_IDX = word2i["<UNK>"]
    tensors = []
    for sent in batch:
        tensors.append(torch.LongTensor([word2i.get(w, UNK_IDX) for w in sent]))
        
    return torch.stack(tensors)


def encode_labels(labels: List[int]) -> torch.FloatTensor:
    """Turns the batch of labels into a tensor

    Args:
        labels (List[int]): List of all labels in the batch

    Returns:
        torch.FloatTensor: Tensor of all labels in the batch
    """
    return torch.LongTensor([int(l) for l in labels])

In [20]:
import numpy as np
from numpy import logical_and, sum as t_sum


def precision(predicted_labels, true_labels, which_label=1):
    """
    Precision is True Positives / All Positives Predictions
    """
    pred_which = np.array([pred == which_label for pred in predicted_labels])
    true_which = np.array([lab == which_label for lab in true_labels])
    denominator = t_sum(pred_which)
    if denominator:
        return t_sum(logical_and(pred_which, true_which))/denominator
    else:
        return 0.


def recall(predicted_labels, true_labels, which_label=1):
    """
    Recall is True Positives / All Positive Labels
    """
    pred_which = np.array([pred == which_label for pred in predicted_labels])
    true_which = np.array([lab == which_label for lab in true_labels])
    denominator = t_sum(true_which)
    if denominator:
        return t_sum(logical_and(pred_which, true_which))/denominator
    else:
        return 0.


def f1_score(
    predicted_labels: List[int],
    true_labels: List[int],
    which_label: int
):
    """
    F1 score is the harmonic mean of precision and recall
    """
    P = precision(predicted_labels, true_labels, which_label=which_label)
    R = recall(predicted_labels, true_labels, which_label=which_label)
    if P and R:
        return 2*P*R/(P+R)
    else:
        return 0.


def macro_f1(
    predicted_labels: List[int],
    true_labels: List[int],
    possible_labels: List[int]
):
    scores = [f1_score(predicted_labels, true_labels, l) for l in possible_labels]
    # Macro, so we take the uniform avg.
    return sum(scores) / len(scores)

In [21]:
import random
import tqdm

def training_loop(
    num_epochs,
    train_features,
    train_labels,
    dev_sents,
    dev_labels,
    optimizer,
    model,
    possible_labels,
):
    print("Training...")
    loss_func = torch.nn.NLLLoss()
    batches = list(zip(train_features, train_labels))
    random.shuffle(batches)
    for i in range(num_epochs):
        losses = []
        print("Working on epoch", i)
        for features, labels in tqdm.tqdm(batches):
            # Empty the dynamic computation graph
            optimizer.zero_grad()
            preds = model(features).squeeze(1)
            loss = loss_func(preds, labels)
            # Backpropogate the loss through our model
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        
        print(f"epoch {i}, loss: {sum(losses)/len(losses)}")
        # Estimate the f1 score for the development set
        print("Evaluating dev...")
        all_preds = []
        all_labels = []
        for sents, labels in tqdm.tqdm(zip(dev_sents, dev_labels), total=len(dev_sents)):
            pred = predict(model, sents)
            all_preds.extend(pred)
            all_labels.extend(list(labels.numpy()))

        dev_f1 = macro_f1(all_preds, all_labels, possible_labels)
        print(f"Dev F1 {dev_f1}")
        
    # Return the trained model
    return model

In [26]:
# You can increase epochs if need be
epochs = 10
# TODO: Find a good learning rate
LR = 0.0001
hidden_size = 256
batch_size = 8

#encode
train_input_batches = [encode_sentences(batch, word2i) for batch in PT3_batched_sents]
train_label_batches = [encode_labels(batch) for batch in PT3_batched_labs]

validation_input_sents = [encode_sentences(batch, word2i) for batch in PT3_batched_sents_valid]
validation_encoded_labels = [encode_labels(batch) for batch in PT3_batched_labs_valid]

num_possible_labels = len(PT3Dict)
model = ValuesClassifier(num_possible_labels, hidden_size, embeddings, word2i['<PAD>'])
optimizer = torch.optim.AdamW(model.parameters(), LR)

possible_labels = PT3Dict.keys()

output_model = training_loop(
    epochs,
    train_input_batches,
    train_label_batches,
    validation_input_sents,
    validation_encoded_labels,
    optimizer,
    model,
    possible_labels
)

Some weights of the model checkpoint at prajjwal1/bert-small were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training...
Working on epoch 0


  3%|▎         | 17/623 [00:43<26:01,  2.58s/it]


KeyboardInterrupt: 

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# knn base have not finished yet
"""ML-kNN (Zhang & Zhou, 2005) is an adaptation of the kNN lazy learning algorithm for multi-label
data. Actually this method follows the paradigm of PT4. In essence, ML-kNN uses the kNN algorithm
independently for each label l: It finds the k nearest examples to the test instance and considers those
that are labelled at least with l as positive and the rest as negative. What mainly differentiates this
method from the application of the original kNN algorithm to the transformed problem using PT4 is
the use of prior probabilities. ML-kNN has also the capability of producing a ranking of the labels as
an output. 


Luo and Zincir-Heywood (2005) present two systems for multi-label document classification, which
are also based on the kNN classifier. The main contribution of their work is on the pre-processing
stage for the effective representation of documents. For the classification of a new instance, the
systems initially find the k nearest examples. Then for every appearance of each label in each of these
examples, they increase a corresponding counter for that label. Finally they output the N labels with
the largest counts. N is chosen based on the number of labels of the instance. This is an inappropriate
strategy for real-world use, where the number of labels of a new instance is unknown. """

class NearestNeighbor(object):
    def __init__(self):
        pass

    def train(self, X, y):
        """ X is N x D where each row is an example. Y is 1-dimension of size N """
        # the nearest neighbor classifier simply remembers all the training data
        self.Xtr = X
        self.ytr = y

    def predict(self, X, distance='L1'):
        """ X is N x D where each row is an example we wish to predict label for """
        num_test = X.shape[0]
        # lets make sure that the output type matches the input type
        Ypred = np.zeros(num_test, dtype=self.ytr.dtype)

        # loop over all test rows
        for i in range(num_test):
            # find the nearest training image to the i'th test image
            # using the L1 distance (sum of absolute value differences)
            if distance == 'L1':
                distances = np.sum(np.abs(self.Xtr - X[i,:]), axis=1)
            # using the L2 distance (sum of absolute value differences)
            if distance == 'L2':
                distances = np.sqrt(np.sum(np.square(self.Xtr - X[i,:]), axis=1))
            min_index = np.argmin(distances) # get the index with smallest distance
            Ypred[i] = self.ytr[min_index] # predict the label of the nearest example

        return Ypred

In [None]:
#https://github.com/AnupamMicrosoft/PyTorch-Classification/blob/master/Linear%20Support%20Vector%20Machines.py
from torch import nn
import random
class SVM_Loss(nn.modules.Module):    
    def __init__(self):
        super(SVM_Loss,self).__init__()
    def forward(self, outputs, labels):
         return torch.sum(torch.clamp(1 - outputs.t()*labels, min=0))/batch_size

        
        
def runSVM(epochs,input_size,num_classes,train_input_batches, train_label_batches,validation_input_sents,
    validation_encoded_labels):      
    #SVM regression model and Loss
    svm_model = nn.Linear(input_size,num_classes)
    #model = LogisticRegression(input_size,num_classes)

    ## Loss criteria and SGD optimizer
    svm_loss_criteria = SVM_Loss()
    #loss_criteria = nn.CrossEntropyLoss()  

    #svm_optimizer = torch.optim.SGD(svm_model.parameters(), lr=learning_rate, momentum=momentum)
    optimizer = torch.optim.AdamW(svm_model.parameters(), LR)
    
    batches = list(zip(train_input_batches, train_label_batches))
    random.shuffle(batches)
    
    
    #total_step = len(batches)
    for epoch in range(epochs):
        avg_loss_epoch = 0
        batch_loss = 0
        total_batches = 0
        for features, labels in tqdm.tqdm(batches):
            # Reshape images to (batch_size, input_size)
            #images = images.reshape(-1, 28*28)                      
            #labels = Variable(2*(labels.float()-0.5))

            # Forward pass        
            outputs = svm_model(features)           
            loss_svm = svm_loss_criteria(outputs, labels)    


            # Backward and optimize
            optimizer.zero_grad()
            loss_svm.backward()
            optimizer.step()    

            #print("Model's parameter after the update:")
            #for param2 in svm_model.parameters():
             #   print(param2)
            total_batches += 1     
            batch_loss += loss_svm.item()

        avg_loss_epoch = batch_loss/total_batches
        print ('Epoch [{}/{}], Averge Loss:for epoch[{}, {:.4f}]' 
                       .format(epoch+1, num_epochs, epoch+1, avg_loss_epoch ))
    return svm_model
        


In [None]:
#__________________pt4 on svm  ______________#
# You can increase epochs if need be
epochs = 10
# TODO: Find a good learning rate
LR = 0.00001
hidden_size = 256
batch_size = 8

#encode
train_input_batches = [encode_sentences(batch, word2i) for batch in PT4_batched_sents]
train_label_batches = [encode_labels(batch) for batch in PT4_batched_labs]

validation_input_sents = [encode_sentences(batch, word2i) for batch in PT4_batched_sents_valid]
validation_encoded_labels = [encode_labels(batch) for batch in PT4_batched_labs_valid]

num_possible_labels = len(PT4Dict)
#model = ValuesClassifier(num_possible_labels, hidden_size, embeddings, word2i['<PAD>'])
optimizer = torch.optim.AdamW(model.parameters(), LR)

possible_labels = PT4Dict.keys()
input_size, _ = getSizeOfPT_Batched(train_input_batches,train_label_batches)

runSVM(epochs,input_size,len(possible_labels),train_input_batches, train_label_batches,validation_input_sents,
    validation_encoded_labels)

In [None]:
import tqdm

#_____________PT3 on SVM_________#
epochs = 10
# TODO: Find a good learning rate
LR = 0.00001
hidden_size = 256
batch_size = 8

#encode
train_input_batches = [encode_sentences(batch, word2i) for batch in PT3_batched_sents]
train_label_batches = [encode_labels(batch) for batch in PT3_batched_labs]

validation_input_sents = [encode_sentences(batch, word2i) for batch in PT3_batched_sents_valid]
validation_encoded_labels = [encode_labels(batch) for batch in PT3_batched_labs_valid]

num_possible_labels = len(PT3Dict)
#model = ValuesClassifier(num_possible_labels, hidden_size, embeddings, word2i['<PAD>'])
#optimizer = torch.optim.AdamW(model.parameters(), LR)

possible_labels = PT3Dict.keys()
runSVM(epochs,len(train_input_batches),len(possible_labels),train_input_batches, train_label_batches,validation_input_sents,
    validation_encoded_labels)

### Bert Multi-Classification Problem

In [None]:
https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb

In [2]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

Re-Importing

In [7]:
file = open("data/arguments-training.tsv", 'r', encoding='utf8')
x_train = [line.strip().split('\t') for line in file.readlines()[1:]]
file.close()
print(x_train[0])

file = open("data/labels-training.tsv", 'r', encoding='utf8')
y_train = [line.strip().split('\t') for line in file.readlines()[1:]]
file.close()

file = open("data/arguments-validation.tsv", 'r', encoding='utf8')
x_valid = [line.strip().split('\t') for line in file.readlines()[1:]]
file.close()
print(x_valid[0])
file = open("data/labels-validation.tsv", 'r', encoding='utf8')
y_valid = [line.strip().split('\t') for line in file.readlines()[1:]]
file.close()
print(y_valid[0])
file = open("data/arguments-test.tsv", 'r', encoding='utf8')
x_test = [line.strip().split('\t') for line in file.readlines()[1:]]
file.close()
print(x_test[0])

['A01002', 'We should ban human cloning', 'in favor of', 'we should ban human cloning as it will only cause huge issues when you have a bunch of the same humans running around all acting the same.']
['A01001', 'Entrapment should be legalized', 'in favor of', "if entrapment can serve to more easily capture wanted criminals, then why shouldn't it be legal?"]
['A01001', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']
['A26004', 'We should end affirmative action', 'against', 'affirmative action helps with employment equity.']


Retokenize for BERT optimization

In [88]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.args = dataframe.args
        self.targets = self.data.labs

        special_tokens = ["<PRO>", "<CON>"]
        self.tokenizer.add_tokens(special_tokens, special_tokens = True)


    def __len__(self):
        return len(self.args)

    def __getitem__(self, index):
        args = str(self.args[index])
        args = " ".join(args.split())

        inputs = self.tokenizer.encode_plus(
            args,
            None,
            add_special_tokens=True,
            padding='max_length',
            max_length = len(max(self.args)),
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        self.targets[index] = [int(l) for l in self.targets[index]]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.long)
        }

In [63]:
from typing import Dict, List
def combine(text, labels=None):
    
    args = []
    labs = []
    if(labels != None):
        for arg, lab in zip(text, labels):
            if arg[3] == 'in favor of':
                sep = '<PRO>'
            else:
                sep = '<CON>'
            item = '<SOS>' + arg[1] + sep + arg[3] + '<EOS>'
            args.append(item)
            labs.append(lab[1:20])
    else:
        print("ValueError: labels can not be noneType Object")
    
    combined = pd.DataFrame(data = {'args': args, 'labs': labs})
    return combined




currently ['A01002', 'We should ban human cloning', 'in favor of', 'we should ban human cloning as it will only cause huge issues when you have a bunch of the same humans running around all acting the same.']
goal  A01002 We should ban human cloning in favor of we should ban human cloning as it will only cause huge issues when you have a bunch of the same humans running around all acting the same.


In [64]:
#-------------------------------------------------my guuess where you change to tensor objects: thought process - is any reference call after would be reference its og form 
train_comb = combine(x_train, y_train)
valid_comb = combine(x_valid, y_valid)


for lab in train_comb["labs"]:
    lab = [int(l) for l in lab]

for lab in valid_comb["labs"]:
    lab = [int(l) for l in lab]


In [89]:
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

training_set = CustomDataset(train_comb, tokenizer)
testing_set = CustomDataset(valid_comb, tokenizer)

train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)


In [96]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 20)
    
    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

model = BERTClass()
model.to(device)

MemoryError: 

In [94]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0): #------------------------------------TRAIN X CALLED HERE
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [95]:
for epoch in range(EPOCHS):
    train(epoch)

RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:81] data. DefaultCPUAllocator: not enough memory: you tried to allocate 4005888 bytes.