In [150]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [151]:
# data imports
import glob
import random
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer

# model imports
# similar to https://github.com/huggingface/transformers/blob/14e9d2954c3a7256a49a3e581ae25364c76f521e/src/transformers/models/bert/modeling_bert.py
import logging

from dataclasses import dataclass

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss, HingeEmbeddingLoss

from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.bert.modeling_bert import BertEmbeddings, BertModel, BertPreTrainedModel
from transformers.utils import logging

from transformers.file_utils import ModelOutput
from typing import Optional

# logger = logging.get_logger(__name__)

# Trainer imports
from transformers import Trainer, TrainingArguments

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /Users/tariq/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /Users

# Dataset

In [86]:
# each training instance consists of a paragraph, edu splits and edu labels

class ArgumentDataset(Dataset):
    
    def __init__(self, tokenizer, paragraph_files, edus_files, labels_files, max_len=128, max_edu_seq=50):
        
        self.max_len, self.max_edu_seq = max_len, max_edu_seq
        self.tokenizer = tokenizer
        tokenizer.add_special_tokens({'additional_special_tokens':['[EDU_SEP]']})
        
        self.paragraphs = [''.join(open(file).readlines()) for file in glob.glob(paragraph_files)]
        self.edus = [open(file).readlines() for file in glob.glob(edus_files)]
        self.labels = [open(file).readlines() for file in glob.glob(labels_files)]
        self.label2id = {'B-claim': 1, 'I-claim': 2, 'B-premise': 3, 'I-premise': 4, 'O' : 0}
        
        ######
        # filterout = [7, 24, 89, 231, 298, 348, 370, 373, 421, 473, 481, 485, 496, 508, 599, 680] # linux file order
        filterout = [27, 99, 163, 183, 191, 194, 226, 239, 259, 271, 289, 377, 410, 582, 626, 656] # mac file order
        for i in filterout[::-1]:
            self.paragraphs.pop(i); self.edus.pop(i); self.labels.pop(i)
        # print([[(j, i, len(line.rstrip().split('\t'))) for i, line in enumerate(para_labels) if len(line.rstrip().split('\t')) != 2]
        #       for j, para_labels in enumerate(self.labels)])
        # print(len(self.paragraphs), len(self.edus), len(self.labels))
        ######
        
        self.labels = [
            [{'edu': line.rstrip().split('\t')[0], 'tokens': line.rstrip().split('\t')[1]} for line in para_labels]
                      for para_labels in self.labels
        ]
        # self.edus_tokenized = [self.tokenizer(para_edus, truncation=True, padding='max_length', max_length=self.max_len) for para_edus in self.edus]       
        # self.edus_tokenized2 = self.tokenizer.batch_encode_plus(self.edus[0], padding='max_length', max_length=self.max_len)
        
        # self.edu_seq_input_ids = torch.full((len(self.edus), self.max_edu_seq, self.max_len), 0, dtype=int)
        # self.edu_seq_attention_mask = torch.full((len(self.edus), self.max_edu_seq, self.max_len), 0, dtype=int)
        # self.edu_seq_token_type_ids = torch.full((len(self.edus), self.max_edu_seq, self.max_len), 0, dtype=int)
        self.label_edus = [[0 for _ in range(self.max_edu_seq)] for _ in self.labels]
        self.label_tokens = [[[0 for _ in range(self.max_len)] for _ in range(self.max_edu_seq)] for _ in self.labels]
        
        self.para_edu_splits = [' [EDU_SEP] '.join([line.rstrip() for line in para_edus]) for para_edus in self.edus]
        self.para_edu_splits_tok = self.tokenizer(self.para_edu_splits, truncation=True, padding='max_length', max_length=512)
        
#         for i, para_edus in enumerate(self.edus_tokenized):
#             for j in range(min(self.max_edu_seq, len(para_edus['input_ids']))):
#                 self.edu_seq_input_ids[i][j] = torch.tensor(para_edus['input_ids'][j])
#                 self.edu_seq_attention_mask[i][j] = torch.tensor(para_edus['attention_mask'][j])
#                 self.edu_seq_token_type_ids[i][j] = torch.tensor(para_edus['token_type_ids'][j])
                
        for i, para_edus in enumerate(self.labels):
            for j in range(min(self.max_edu_seq, len(self.labels[i]))):
                self.label_edus[i][j] = self.label2id[self.labels[i][j]['edu']]
                for k in range(min(self.max_len, len(self.labels[i][j]['tokens'].split()))):
                    self.label_tokens[i][j][k] = self.label2id[self.labels[i][j]['tokens'].split()[k]]
        
        # self.paragraphs_tokenized = self.tokenizer(self.paragraphs, truncation=True, padding='max_length', max_length=512)
        # self.paragraphs_tokenized = [self.tokenizer.tokenize(p, truncation=True, padding='max_length', max_length=128) for p in self.paragraphs]
        # assert len(self.paragraphs) == len(self.edus) == len(self.labels)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, i):
        return {'input_ids': self.para_edu_splits_tok['input_ids'][i],
                'attention_mask': self.para_edu_splits_tok['attention_mask'][i],
                'token_type_ids': self.para_edu_splits_tok['token_type_ids'][i],
                'edu_labels' : self.label_edus[i],
                'token_labels' : self.label_tokens[i],
                'edu_seperator_id' : self.tokenizer.convert_tokens_to_ids('[EDU_SEP]')
               }
    
#         return { 'edu_seq_input_ids' : self.edu_seq_input_ids[i],
#                 'edu_seq_attention_mask': self.edu_seq_attention_mask[i],
#                 'edu_seq_token_type_ids': self.edu_seq_token_type_ids[i],
#                 'edu_labels' : self.label_edus[i],
#                 'token_labels' : self.label_tokens[i]
            
#         }
        # return {'paragraph': self.paragraphs_tokenized[i], 'edus': self.edus_tokenized[i], 'labels': self.labels[i]}
        

# Model

In [87]:
''' this model should use BertModel to extract the embeddings of the paragraph
    then do the following:
        1. use EDU split to get the embeddings of each tokens of an EDU
        2. represent the EDU as the average embedding of its member tokens
        3. pass the EDU embedding to the classifier layer to make predictions
        4. calculate the loss based on the predicted and gold EDU labels
'''

class BertForPhraseClassification(BertPreTrainedModel):

    # _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config, edu_sequence_length=50):
        super().__init__(config)
        self.config = config
        self.num_labels = config.num_labels
        self.edu_sequence_length = edu_sequence_length

        self.bert = BertModel(config)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(self.edu_sequence_length, config.num_labels)

        self.init_weights()


    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        edu_labels=None,
        token_labels=None,
        edu_seperator_id=None
    ):
#         # edu_outputs of size: batch_size(=16), msx edus in one paragraph (=50), and bert hidden layer size (=768)
#         edu_outputs = torch.zeros(edu_seq_input_ids.shape[0], self.edu_sequence_length, self.config.hidden_size)
#         for i in range(self.edu_sequence_length):
#             outputs = self.bert(edu_seq_input_ids[:, i, :], attention_mask=edu_seq_attention_mask[:, i, :], token_type_ids=edu_seq_token_type_ids[:, i, :])
#             print(outputs[1].shape, edu_outputs[i].shape, edu_outputs.shape)
#             edu_outputs[:, i, :] = outputs[1]

        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        outputs = outputs[1]
        edu_embeddings = get_edu_emb(input_ids, outputs, edu_seperator_id)
        
        outputs = self.dropout(outputs)
        logits = self.classifier(outputs)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            
        output = (logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output

    def get_edu_emb(input_ids, outputs, edu_seperator_id):
        seperators = (input_ids == edu_seperator_id).nonzero(as_tuple=True)
        

In [149]:
outputs, edu_seperator_id = torch.rand(16, 512, 768), 30522
seperators = (torch.tensor(argdata.para_edu_splits_tok['input_ids'][:16]) == edu_seperator_id).nonzero(as_tuple=True)
seperators[0].shape, seperators[1].shape
seperators[0], seperators[1]
argdata.para_edu_splits_tok['input_ids'][13]

[101,
 2323,
 1996,
 2142,
 2163,
 2404,
 2062,
 9259,
 2006,
 3282,
 6095,
 1029,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,

In [93]:
bert = BertModel.from_pretrained('bert-base-uncased')

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /Users/tariq/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-uncased/resolve/main/pytorch_model.bin from

# Trainer

In [111]:
paragraph_files, edus_files, labels_files = '../data/ets/para_text/*', '../data/ets/para_edu/*', '../data/ets/para_edu_label/*'
argdata = ArgumentDataset(tokenizer, paragraph_files, edus_files, labels_files)

Assigning ['[EDU_SEP]'] to the additional_special_tokens key of the tokenizer


In [90]:
print(argdata[0])
argdata.para_edu_splits[0]

{'input_ids': [101, 1996, 2034, 3114, 30522, 2162, 2003, 2734, 30522, 2003, 2111, 2108, 5845, 8053, 1012, 30522, 4916, 1011, 2137, 2162, 30522, 1006, 9244, 1011, 7993, 1007, 1011, 2023, 2162, 2001, 4061, 30522, 2206, 1996, 18985, 1997, 3146, 1010, 30522, 2007, 3290, 30522, 2145, 6815, 1996, 2455, 2004, 2037, 2219, 1012, 30522, 3492, 2172, 1996, 4916, 2015, 4061, 1037, 2162, 1010, 30522, 2005, 3146, 1996, 1057, 1012, 1055, 1012, 2041, 14876, 18533, 1996, 4916, 2015, 1010, 30522, 12823, 3146, 30522, 1998, 13543, 2009, 2004, 1037, 2110, 1012, 30522, 2137, 4329, 30522, 1006, 14276, 1011, 15331, 1007, 1011, 30522, 1996, 2137, 4329, 2435, 1996, 2410, 2167, 2137, 8355, 4336, 2013, 2329, 3627, 30522, 1998, 2511, 1996, 2142, 2163, 1997, 2637, 4274, 12204, 1997, 4695, 1012, 30522, 1996, 4841, 4061, 1037, 2162, 30522, 2000, 4366, 4336, 2013, 2563, 1012, 30522, 2122, 2020, 2048, 1997, 2116, 4973, 30522, 2008, 2162, 19616, 2041, 2035, 1997, 1996, 3471, 1012, 30522, 1999, 2119, 1997, 2216, 8146, 202

"The first reason [EDU_SEP] war is needed [EDU_SEP] is people being treated equally . [EDU_SEP] Mexican - American War [EDU_SEP] ( 1846 - 1848 ) - This war was fought [EDU_SEP] following the annexation of Texas , [EDU_SEP] with Mexico [EDU_SEP] still claiming the land as their own . [EDU_SEP] Pretty much the Mexicans fought a war , [EDU_SEP] for Texas The U.S. outfought the Mexicans , [EDU_SEP] retaining Texas [EDU_SEP] and incorporating it as a state . [EDU_SEP] American Revolution [EDU_SEP] ( 1775 - 1783 ) - [EDU_SEP] The American Revolution gave the 13 North American colonies independence from British rule [EDU_SEP] and established the United States of America Internet Encyclopedia of Philosophy . [EDU_SEP] The Americans fought a war [EDU_SEP] to claim independence from England . [EDU_SEP] These were two of many examples [EDU_SEP] that war sorted out all of the problems . [EDU_SEP] In both of those situations they tried to talk [EDU_SEP] and it did n't work , [EDU_SEP] so they went 

In [52]:
# paragraph_files, edus_files, labels_files = '../data/ets/para_text/*', '../data/ets/para_edu/*', '../data/ets/para_edu_label_all/*'
# argdata = ArgumentDataset(tokenizer, paragraph_files, edus_files, labels_files)
edu_tag_model = BertForPhraseClassification.from_pretrained('bert-base-uncased')

training_args = TrainingArguments(
    output_dir='./',      
    num_train_epochs=3,
    per_device_train_batch_size=16,  
    save_steps=0, 
    do_train=True,
    dataloader_drop_last=True
)

trainer = Trainer(
    model=edu_tag_model,        
    args=training_args,                
    train_dataset=argdata,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForPhraseClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForPhraseClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForPhraseClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForPhraseClassification were not initialized from the model checkpoint at bert-ba

In [53]:
trainer.train()
# trainer.evaluate(test_data)

***** Running training *****
  Num examples = 783
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 144


RuntimeError: size mismatch, m1: [16 x 768], m2: [50 x 2] at ../aten/src/TH/generic/THTensorMath.cpp:41

# Sketching

In [315]:
argdata[0]['edu_seq_input_ids'].shape, argdata[0]['edu_seq_attention_mask'].shape, argdata[0]['edu_seq_token_type_ids'].shape, \
len(argdata[0]['edu_labels']), len(argdata[0]['token_labels']), \
argdata[0]['edu_labels'][0], len(argdata[0]['token_labels'][0])

(torch.Size([50, 128]),
 torch.Size([50, 128]),
 torch.Size([50, 128]),
 50,
 50,
 1,
 128)

In [58]:
argdata.paragraphs_tokenized.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [133]:
argdata.edus_tokenized[0].keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [51]:
[[tok_id for tok_id in edu if tok_id not in [0,101,102]] for edu in argdata.edus_tokenized[0]['input_ids']]

[[6195, 2313, 3930, 2311, 2047, 3558, 4547, 6502, 2015, 2003, 6827],
 [1998, 1037, 2843, 1997, 3361, 4219, 2024, 2036, 3223, 1012],
 [3784, 2495, 2003, 1037, 5576],
 [2000, 2191, 2495, 9694, 2625, 3558, 5918, 1012],
 [2144,
  6624,
  1997,
  3784,
  2495,
  2064,
  2191,
  2495,
  6502,
  2015,
  5478,
  2625,
  10605,
  3361,
  2490],
 [2009, 2097, 2031, 1996, 3754],
 [2000, 2022, 3024, 2625, 6450, 2005, 2216],
 [2040, 2024, 2036, 13732, 1999, 5157, 1012]]

In [49]:
argdata.paragraphs_tokenized['input_ids'][0][:70]

[101,
 6195,
 2313,
 3930,
 2311,
 2047,
 3558,
 4547,
 6502,
 2015,
 2003,
 6827,
 1998,
 1037,
 2843,
 1997,
 3361,
 4219,
 2024,
 2036,
 3223,
 1012,
 3784,
 2495,
 2003,
 1037,
 5576,
 2000,
 2191,
 2495,
 9694,
 2625,
 3558,
 5918,
 1012,
 2144,
 6624,
 1997,
 3784,
 2495,
 2064,
 2191,
 2495,
 6502,
 2015,
 5478,
 2625,
 10605,
 3361,
 2490,
 2009,
 2097,
 2031,
 1996,
 3754,
 2000,
 2022,
 3024,
 2625,
 6450,
 2005,
 2216,
 2040,
 2024,
 2036,
 13732,
 1999,
 5157,
 1012,
 102]

In [66]:
L = [[(i, j+1, len(line.rstrip().split('\t'))) for j, line in enumerate(para_labels) if len(line.rstrip().split('\t')) != 2] for i, para_labels in enumerate(labels)]
[l[0][0] for l in L if len(l)>0]

[293, 342, 363, 365, 412, 463, 470, 473, 483, 494, 583, 662]

In [19]:
edus = open('../data/ets/para_edu/1-AbxwVc5Fvl-bX7-RBPA8fLHOghrgNifbzu0hYLtSRY_1.txt').readlines()
text = open('../data/ets/para_text/1-AbxwVc5Fvl-bX7-RBPA8fLHOghrgNifbzu0hYLtSRY_1.txt').readlines()
labels = open('../data/ets/para_edu_label_all/1-AbxwVc5Fvl-bX7-RBPA8fLHOghrgNifbzu0hYLtSRY_1.txt').readlines()
para_labels = [{'edu': line.rstrip().split('\t')[0], 'tokens': line.rstrip().split('\t')[1]} for line in labels]

In [30]:
len(''.join(text).split()), len(''.join(edus).split()), sum([len(line['tokens'].split()) for line in para_labels])

(81, 81, 81)

In [139]:
seq = torch.zeros(10, 5)
seq[0] = torch.Tensor([1, 2, 3, 4, 5])
seq

tensor([[1., 2., 3., 4., 5.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])

In [103]:
from transformers import BertModel, BertConfig
config = BertConfig()
bert = BertModel(config, add_pooling_layer=False)
# bert = bert.from_pretrained('bert-based-uncased')

string_tok = tokenizer('I love movies')
ids = torch.tensor(string_tok['input_ids']).unsqueeze(0)
attn_mask = torch.tensor(string_tok['attention_mask']).unsqueeze(0)
seg_ids = torch.tensor(string_tok['token_type_ids']).unsqueeze(0)

res = bert(ids, attention_mask=attn_mask, token_type_ids=seg_ids)