In [1]:
from google.colab import drive
import os
# Mount Google Drive at /content/drive; every path used below lives under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Root folders on the mounted Drive: the dataset folder and the project folder.
data_path = '/content/drive/My Drive/NLP_HW2/Dataset'
general_path = '/content/drive/My Drive/NLP_HW2'

# Dataset splits (JSON) and the pre-trained GloVe vectors
# (presumably the 6B-token, 50-dimensional release, going by the file name).
train_path = os.path.join(data_path, "train.json")
test_path = os.path.join(data_path, "test.json")
dev_path = os.path.join(data_path, "dev.json")
glove_path = os.path.join(general_path, "glove.6B.50d.txt")

In [3]:
import json
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

# Use the GPU when one is visible, otherwise fall back to the CPU so the
# notebook can still be executed on CPU-only runtimes.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Special vocabulary entries: sequence padding and out-of-vocabulary items.
PAD_TOKEN = '<pad>'
UNK_TOKEN = '<unk>'

# Seed NumPy and PyTorch and force deterministic cuDNN kernels for reproducibility.
np.random.seed(0)
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# torch.autograd.set_detect_anomaly(True)

### Load Data

In [4]:
def read_dataset(path: str):
  """Load a JSON dataset and split it into model inputs and gold labels.

  Args:
    path: Path of a JSON file mapping sentence ids to sentence records.

  Returns:
    A ``(sentences, labels)`` pair of dicts keyed by the integer sentence id.
    ``sentences[id]`` holds the input fields; ``labels[id]`` holds
    ``'predicates'`` and ``'roles'`` (predicate index -> list of roles).
    Records without a ``'roles'`` entry get an empty roles dict.
  """
  # JSON text is UTF-8; do not depend on the platform's default encoding.
  with open(path, encoding='utf-8') as f:
    dataset = json.load(f)

  sentences, labels = {}, {}
  for sentence_id, sentence in dataset.items():
    sentence_id = int(sentence_id)
    sentences[sentence_id] = {
        'words': sentence['words'],
        'lemmas': sentence['lemmas'],
        'pos_tags': sentence['pos_tags'],
        'dependency_heads': [int(head) for head in sentence['dependency_heads']],
        'dependency_relations': sentence['dependency_relations'],
        'predicates': sentence['predicates'],
    }

    labels[sentence_id] = {
        'predicates': sentence['predicates'],
        # JSON object keys are strings; predicate positions are used as list indexes.
        'roles': {int(p): r for p, r in sentence.get('roles', {}).items()}
    }

  return sentences, labels

In [5]:
# Load each split as a (sentences, labels) pair of dicts keyed by integer sentence id.
train_sentences, train_labels = read_dataset(train_path)
dev_sentences, dev_labels = read_dataset(dev_path)
test_sentences, test_labels = read_dataset(test_path)

### Single Predicate Converter

In [6]:
# This function is for the Train and Dev datasets.
# It takes sentences and labels in the standard format and
# returns all sentences in single-predicate format with a new structure.
def single_predicate_converter(sentences, labels):
  """Expand labelled sentences into one sample per (sentence, predicate).

  Args:
    sentences: Dict of sentence id -> input fields ('lemmas', 'pos_tags',
      'predicates', ...).
    labels: Dict of sentence id -> {'predicates': [...], 'roles': {index: [...]}}.

  Returns:
    A list of dicts, one per predicate, holding the sentence features, the
    single-predicate mask, the indicator features and the gold roles.
  """
  new_sentences = []
  for sent_id, sent in sentences.items():
    gold = labels[sent_id]
    gold_predicates = gold['predicates']
    n_gold = len(gold_predicates)

    for pred_indx, roles in gold['roles'].items():

      # Mask out every predicate but the current one,
      # e.g. - , EAT/BITE , - , DRINK --> - , - , - , DRINK
      new_pred = ['_' for _ in range(n_gold)]
      new_pred[pred_indx] = gold_predicates[pred_indx]

      # Binary argument-identification target: '_' stays '_', any role becomes '!_'
      new_role = ['_' if role == '_' else '!_' for role in roles]

      # One-hot flag of the predicate position,
      # e.g. the cat ate the fish --> 0, 0, 1, 0, 0
      predicate_indicator = [0 for _ in range(n_gold)]
      predicate_indicator[pred_indx] = 1

      # Signed distance of each token from the predicate,
      # e.g. the cat ate the fish --> -2, -1, 0, 1, 2
      lemmas_indicator = [0] * len(sent['predicates'])
      for position in range(n_gold):
        lemmas_indicator[position] = position - pred_indx

      sample = {
        'sentence_id': sent_id,
        'position_predicate': pred_indx,
        'lemmas': sent['lemmas'],
        'pos_tags': sent['pos_tags'],
        'predicate': new_pred,
        'predicate_indicator': predicate_indicator,
        'lemmas_indicator': lemmas_indicator,
        'roles': roles,
        'bi_roles': new_role
      }
      new_sentences.append(sample)

  return new_sentences

In [7]:
# This converter is for the Test dataset.
# In the test dataset we don't have access to the labels.
# It takes test sentences in the standard format (different from the TA code).
# It takes all test sentences at once.
# Returns the test sentences in the new format with more features.
def test_dataset_single_predicate_converter(sentences):

  new_sentences = []

  for k, v in sentences.items():

    # Extract the predicates index for each sentence
    predicate_indexes = []
    for indx, item in enumerate(sentences[k]['predicates']):
      if item != '_':
        predicate_indexes.append(indx)
    
    for pred_indx in predicate_indexes:

      # Create a predicate mask for multi-predicate sentences
      # for example: - , EAT/BITE , - , DRINK --> - , - , - , DRINK 
      new_pred = ['_']*len(sentences[k]['predicates'])
      new_pred[pred_indx] = sentences[k]['predicates'][pred_indx]

      # create predicate indicator feature
      # for example: the cat ate the fish --> 0, 0, 1, 0, 0
      predicate_indicator = [0]*len(sentences[k]['predicates'])
      predicate_indicator[pred_indx] = 1 

      # create word indicator feature in respect to predicate position
      # for example: the cat ate the fish --> -2, -1, 0, 1, 2
      lemmas_indicator = [0]*len(sentences[k]['predicates'])
      for i, x in enumerate(predicate_indicator):
        lemmas_indicator[i] = i - pred_indx


      new_sentences.append({
        'sentence_id': k,
        'position_predicate': pred_indx,
        'lemmas': sentences[k]['lemmas'],
        'pos_tags': sentences[k]['pos_tags'],
        'predicate': new_pred,
        'predicate_indicator': predicate_indicator,
        'lemmas_indicator' : lemmas_indicator
      })

  return new_sentences

In [8]:
# Expand every split to one sample per (sentence, predicate).
# Train/dev use the gold labels; the test split goes through the label-free converter.
train_single_pred_sentences = single_predicate_converter(train_sentences, train_labels)
dev_single_pred_sentences = single_predicate_converter(dev_sentences, dev_labels)
test_single_pred_sentences = test_dataset_single_predicate_converter(test_sentences)

### Create Dictionaries

In [9]:
# One value -> id table per feature, all starting out empty.
vocab2ids = {
    feature_name: {}
    for feature_name in (
        'lemmas',
        'pos_tags',
        'predicate',
        'roles',
        'predicate_indicator',
        'lemmas_indicator',
        'bi_roles',
    )
}

In [10]:
# Create Vocab to Ids for all features
# Given the feature, it extracts all unique values and assigns an id to each one.
def vocab2id_builder(dataset, k):
  """Build a value -> id mapping for feature ``k``.

  Ids follow first-occurrence order in ``dataset``, so the mapping is identical
  on every run. (Iterating a ``set`` of strings, as done previously, can yield
  a different order in every interpreter session, which silently invalidates
  saved vocabularies and model checkpoints.)

  Args:
    dataset: Iterable of sample dicts; ``sample[k]`` is a sequence of values.
    k: Name of the feature to index.

  Returns:
    Dict mapping each distinct value to an integer id. '_' (when present)
    always gets id 0; UNK_TOKEN and PAD_TOKEN receive the last two ids.
  """
  # dict.fromkeys de-duplicates while preserving first-seen order.
  items = list(dict.fromkeys(item for sentence in dataset for item in sentence[k]))

  # Set '_' id to 0 for all Dict
  if '_' in items:
    items.remove('_')
    items.insert(0, '_')

  items.append(UNK_TOKEN)
  items.append(PAD_TOKEN)

  vocab2id = {v: i for i, v in enumerate(items)}

  return vocab2id

In [11]:
# Fit every feature vocabulary on the training samples only.
for feature_name in ('lemmas', 'pos_tags', 'predicate', 'roles',
                     'predicate_indicator', 'lemmas_indicator', 'bi_roles'):
  vocab2ids[feature_name] = vocab2id_builder(train_single_pred_sentences, feature_name)

In [12]:
# Save the Dictionary
import pickle

# with open('/content/drive/My Drive/NLP_HW2/vocab2ids_final_v2', 'wb') as fp:
#   pickle.dump(vocab2ids, fp)

In [57]:
# Load the Dictionary
# NOTE(review): this rebinds `vocab2ids`, discarding the mapping built from the
# training data in the cells above. The file loaded here ('vocab2ids_v1') is not
# the one named in the commented-out save cell ('vocab2ids_final_v2') — confirm
# which vocabulary the saved model checkpoint was trained with.
# pickle.load can execute arbitrary code; only load pickles you created yourself.
with open ('/content/drive/My Drive/NLP_HW2/vocab2ids_v1', 'rb') as fp:
  vocab2ids = pickle.load(fp)

In [58]:
# Inverse of the 'roles' vocabulary (role id -> role label),
# so predicted ids can be decoded with a direct lookup.
id2class = {role_id: role_label for role_label, role_id in vocab2ids['roles'].items()}

##### Read / Create Glove

In [14]:
# Parse the GloVe text file: every line is "<word> <v1> <v2> ... <vN>".
# `words[i]` and `vectors[i]` describe the same entry and `word2id[word]` is
# that row index. The index is derived from len(words), so no counter named
# `id` (which would shadow the builtin) is needed.
words = []
word2id = {}
vectors = []

with open(glove_path, 'rb') as f:
    for l in f:
        line = l.decode().split()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            continue
        word = line[0]
        word2id[word] = len(words)
        words.append(word)
        # `np.float` was only an alias of the builtin float (float64) and has
        # been removed from NumPy; name the dtype explicitly.
        vect = np.array(line[1:]).astype(np.float64)
        vectors.append(vect)

In [15]:
# word -> vector lookup; `words` and `vectors` are index-aligned.
glove = dict(zip(words, vectors))

### Create Windows / Batches

In [59]:
# Check whether the item is in the dictionary.
# If it is not, fall back to the id of the unknown token.
def vocab_checker(vocab2id_k, item):
  """Return the id of ``item`` in ``vocab2id_k``, or the id of UNK_TOKEN when it is absent."""
  lookup_key = item if item in vocab2id_k else UNK_TOKEN
  return vocab2id_k[lookup_key]

In [60]:
"""
We don't use the Torch DataLoader.
This function creates batches, adds padding and returns the maximum sequence length over all batches.
Input: for each sentence we have different features (depends on Test / Train-Dev)
Instead of 
          sentence1 = {'feature1', 'feature2',...}
          sentence2 = {'feature1', 'feature2',...}

We create: 
          feature1 = {'sentence1', 'sentence2',...} 
          feature2 = {'sentence1', 'sentence2',...}

"""

def create_batches(dataset, batch_size):
  """Group samples into padded, id-encoded batches.

  Each batch is a dict keyed by feature name. 'sentence_id' and
  'position_predicate' are kept as plain Python lists; every other feature is
  padded with PAD_TOKEN to the longest sequence of the batch, mapped to ids
  through ``vocab2ids`` (unknown values become the UNK id) and stored as a
  tensor on ``device``.

  Args:
    dataset: List of sample dicts that all share the keys of the first sample.
    batch_size: Maximum number of samples per batch.

  Returns:
    ``(batches, max_len)`` where ``max_len`` is the longest padded sequence
    over all batches. An empty dataset yields ``([], 0)``.
  """
  batched_dataset = []
  # Nothing to batch: avoid indexing dataset[0] and taking max() of nothing.
  if not dataset:
    return batched_dataset, 0

  passthrough_keys = ('sentence_id', 'position_predicate')
  feature_keys = list(dataset[0].keys())
  longest = 0

  for start in range(0, len(dataset), batch_size):
    batch = dataset[start:start + batch_size]
    keyed_batch = {}
    for k in feature_keys:
      column = [sentence[k] for sentence in batch]
      if k in passthrough_keys:
        keyed_batch[k] = column
        continue
      max_len = max(len(seq) for seq in column)
      longest = max(longest, max_len)
      padded = [seq + [PAD_TOKEN] * (max_len - len(seq)) for seq in column]
      keyed_batch[k] = torch.tensor(
          [[vocab_checker(vocab2ids[k], token) for token in seq] for seq in padded]).to(device)
    batched_dataset.append(keyed_batch)

  return batched_dataset, longest

In [61]:
# Batch every split (64 samples per batch) and record each split's longest padded sequence.
train_batched, max_sent_len_train = create_batches(train_single_pred_sentences, 64)
dev_batched, max_sent_len_dev = create_batches(dev_single_pred_sentences, 64)
test_batched, max_sent_len_test = create_batches(test_single_pred_sentences, 64)

# We need max sentence length to build the positional encoder
# NOTE(review): `max_len_all` is not referenced by any model code visible in this
# notebook — confirm the positional encoder's max_len is at least this value.
max_len_all = max([max_sent_len_train, max_sent_len_dev, max_sent_len_test])

### Create Dictionary / Weights

In [63]:
# One embedding table per model input, all starting out empty.
embedding_weights = {
    table_name: {}
    for table_name in (
        'lemmas',
        'pos_tags',
        'predicates',
        'predicate_indicator',
        'lemmas_indicator',
    )
}

In [64]:
# Create weights for different features
# Used Glove for 'lemmas'
# For other features, randomly generate weights
def embedding_weights_creator(glove, vocab2ids, feature):
  """Create the initial 50-dimensional embedding table of one feature.

  Rows of the 'lemmas' table are copied from ``glove`` when the lemma is
  available there; every other row (and every row of the other features) is
  drawn from a normal distribution with standard deviation 0.6.

  Args:
    glove: Mapping of word -> 50-d vector.
    vocab2ids: Mapping of feature name -> (value -> row id).
    feature: Name of the feature whose table is created.

  Returns:
    A float32 tensor of shape (vocabulary size, 50).
  """
  vocab = vocab2ids[feature]
  pretrained_allowed = feature == 'lemmas'
  weights_matrix = np.zeros((len(vocab), 50))

  for token, row in vocab.items():
    if pretrained_allowed and token in glove:
      weights_matrix[row] = glove[token]
    else:
      weights_matrix[row] = np.random.normal(scale=0.6, size=(50, ))

  return torch.tensor(weights_matrix, dtype=torch.float32)

In [21]:
# (key in embedding_weights, feature in vocab2ids). Note that the 'predicates'
# table is built from the 'predicate' vocabulary.
for table_name, vocab_feature in (
    ('lemmas', 'lemmas'),
    ('pos_tags', 'pos_tags'),
    ('predicates', 'predicate'),
    ('predicate_indicator', 'predicate_indicator'),
    ('lemmas_indicator', 'lemmas_indicator'),
):
  embedding_weights[table_name] = embedding_weights_creator(glove, vocab2ids, vocab_feature)

In [65]:
# Load the previously built dictionary of embedding weights.
# NOTE(review): this rebinds `embedding_weights`, replacing the tables computed in
# the cell above with the pickled copy — confirm that is intended when re-running.
# pickle.load can execute arbitrary code; only load pickles you created yourself.
with open ('/content/drive/My Drive/NLP_HW2/embedding_weights_v2', 'rb') as fp:
  embedding_weights = pickle.load(fp)

#### Saving Dictionaries

In [22]:
# Save built dictionary
# with open('/content/drive/My Drive/NLP_HW2/embedding_weights_final_v2', 'wb') as fp:
#   pickle.dump(embedding_weights, fp)



### Transformer Model

In [24]:
import math

# The Transformer processes all positions in parallel, so it has no built-in notion of word order.
# The positional encoder lets the model benefit from each word's position in the sentence.
# Based on the original paper "Attention Is All You Need" and the PyTorch documentation.
# Uses sine and cosine functions for the different positions.
# max_len is the length of the longest sentence in the datasets.
# d_model is the size of the (concatenated) embedding vector.
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a sequence-first input.

    Even embedding columns hold sines and odd columns hold cosines of the
    position, following "Attention Is All You Need".

    Args:
        d_model: Size of the last (embedding) dimension of the input.
        max_len: Number of positions that are precomputed; inputs may not be
            longer than this.
    """

    def __init__(self, d_model=250, max_len=143):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one cosine column fewer than sine columns.
        pe[:, 1::2] = torch.cos(position * div_term[:d_model // 2])
        # Shape (max_len, 1, d_model) so the table broadcasts over the batch axis.
        # It is registered as a buffer: it is stored in the state dict and moves
        # with the module on .to()/.cuda(), so no device is chosen here.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Add the positional signal to ``x``.

        Args:
            x: Tensor whose first dimension is the sequence position and whose
                last dimension has size ``d_model``.

        Returns:
            Tensor of the same shape as ``x``.

        Raises:
            ValueError: If ``x`` has more positions than ``max_len``.
        """
        seq_len = x.shape[0]
        if seq_len > self.pe.shape[0]:
            raise ValueError(
                f'sequence length {seq_len} exceeds the positional encoding '
                f'max_len of {self.pe.shape[0]}')
        return x + self.pe[:seq_len]

In [25]:
class TransEncoder(nn.Module):
  """Transformer encoder with two per-token classification heads.

  Five feature embeddings (lemma, POS tag, predicate, predicate flag and
  relative-position flag) are concatenated, combined with a positional
  encoding, passed through a TransformerEncoder and fed to two linear heads.

  Args:
    hparams: Object exposing ``embedding_weights``, ``dim_emb``, ``num_heads``,
      ``dim_feedforward``, ``dropout``, ``nlayers``, ``num_classes_s3`` and
      ``num_classes_s4``. An optional ``max_len`` attribute sets how many
      positions the positional encoder supports (143 when absent).
  """

  def __init__(self, hparams):
    super(TransEncoder, self).__init__()

    # Size the positional encoder from the hyper-parameters so that it always
    # matches the transformer width; 143 positions is the default when
    # hparams does not define `max_len`.
    self.pos_encoder = PositionalEncoding(d_model=hparams.dim_emb,
                                          max_len=getattr(hparams, 'max_len', 143))

    # Embeddings start from the previously generated weights
    # and are updated during training (freeze=False)
    self.lemmas_embedding = nn.Embedding.from_pretrained(hparams.embedding_weights['lemmas'],freeze=False)
    self.pos_tags_embedding = nn.Embedding.from_pretrained(hparams.embedding_weights['pos_tags'],freeze=False)
    self.predicate_embedding = nn.Embedding.from_pretrained(hparams.embedding_weights['predicates'],freeze=False)
    self.predicate_flag_embedding = nn.Embedding.from_pretrained(hparams.embedding_weights['predicate_indicator'],freeze=False)
    self.lemmas_flag_embedding = nn.Embedding.from_pretrained(hparams.embedding_weights['lemmas_indicator'],freeze=False)

    encoder_layers = nn.TransformerEncoderLayer(hparams.dim_emb,
                                                hparams.num_heads,
                                                hparams.dim_feedforward,
                                                hparams.dropout)
    self.transformer_encoder = nn.TransformerEncoder(encoder_layers, hparams.nlayers)

    # Two separate linear heads:
    # the first one scores the binary (argument / no argument) labels,
    # the second one scores the original role labels
    self.classifier_s3 = nn.Linear(hparams.dim_emb, hparams.num_classes_s3)
    self.classifier_s4 = nn.Linear(hparams.dim_emb, hparams.num_classes_s4)


  def forward(self, src):
    """Compute the logits of both heads.

    Args:
      src: Dict with the id tensors 'lemmas', 'pos_tags', 'predicates',
        'predicate_indicator' and 'lemmas_indicator', all of the same shape
        (batch first, then sequence position).

    Returns:
      ``(output_s3, output_s4)``: logits of the identification head and of the
      role head, each with the class dimension last and the batch dimension first.
    """
    lemmas  = src['lemmas']
    pos_tags = src['pos_tags']
    predicates = src['predicates']
    predicates_indicator = src['predicate_indicator']
    lemmas_indicator = src['lemmas_indicator']

    lemmas_emb = self.lemmas_embedding(lemmas)
    pos_emb = self.pos_tags_embedding(pos_tags)
    pred_emb = self.predicate_embedding(predicates)
    pred_flag_emb = self.predicate_flag_embedding(predicates_indicator)
    lemmas_flag_emb = self.lemmas_flag_embedding(lemmas_indicator)

    # Concatenate the embeddings of all 5 features along the last dimension
    embeddings = torch.cat((lemmas_emb, pos_emb, pred_emb, pred_flag_emb, lemmas_flag_emb), -1)
    # The positional encoder and the transformer are used sequence-first
    embeddings = torch.transpose(embeddings, 0, 1)

    # Add the positional signal; it is summed with the input, so the shape is unchanged
    embeddings = self.pos_encoder(embeddings)

    # NOTE(review): no padding mask is passed, so padded positions take part in
    # self-attention. Adding one would change the behaviour of trained checkpoints.
    o = self.transformer_encoder(embeddings)

    output_s3 = self.classifier_s3(o)
    output_s4 = self.classifier_s4(o)
    # Back to batch-first for the caller
    output_s3 = torch.transpose(output_s3, 0, 1)
    output_s4 = torch.transpose(output_s4, 0, 1)

    return output_s3, output_s4

In [66]:
class HParams():
  """Hyper-parameters read by TransEncoder through attribute access."""

  dim_feedforward = 1024  # feed-forward width of each TransformerEncoderLayer
  dim_emb = 250  # transformer width; the model concatenates 5 feature embeddings
  dropout = 0.2
  nlayers = 6
  num_heads = 10
  # presumably the size of the 'bi_roles' vocabulary ('_', '!_', unk, pad) — TODO confirm
  num_classes_s3 = 4
  num_classes_s4 = len(vocab2ids['roles'])
  # The right-hand side resolves to the module-level `embedding_weights` dict.
  embedding_weights = embedding_weights

params = HParams()

### Trainer Class

In [27]:
class Trainer():
  """Trains, evaluates and runs inference with the two-headed SRL model.

  The model is expected to return two sets of per-token logits: an
  identification head scored against the 'bi_roles' labels and a role head
  scored against the 'roles' labels.

  Args:
    model: The network; called with a dict of input tensors.
    loss_function_s3: Loss of the identification head. Must return one loss
      value per token (e.g. ``reduction='none'``).
    loss_function_s4: Loss of the role head, also per token.
    optimizer: Optimizer over the model parameters.
  """

  def __init__(
    self,
    model: nn.Module,
    loss_function_s3,
    loss_function_s4,
    optimizer):

    self.model = model
    self.loss_function_s3 = loss_function_s3
    self.loss_function_s4 = loss_function_s4
    self.optimizer = optimizer

  @staticmethod
  def _split_batch(batch):
    """Split one batch dict into the model inputs and the gold labels."""
    tokens = {
        'lemmas': batch['lemmas'],
        'pos_tags': batch['pos_tags'],
        'predicates': batch['predicate'],
        'predicate_indicator': batch['predicate_indicator'],
        'lemmas_indicator': batch['lemmas_indicator']
    }
    labels = {'roles': batch['roles'],
              'bi_roles': batch['bi_roles']}
    return tokens, labels

  def _batch_loss(self, batch):
    """Run the model on one batch and return the combined scalar loss."""
    tokens, labels = self._split_batch(batch)
    predictions_s3, predictions_s4 = self.model(tokens)

    # Move the class dimension to position 1, where the loss functions
    # are called with it.
    predictions_s3 = torch.transpose(predictions_s3, 1, 2)
    predictions_s4 = torch.transpose(predictions_s4, 1, 2)

    loss_s3 = self.loss_function_s3(predictions_s3, labels['bi_roles'])
    loss_s4 = self.loss_function_s4(predictions_s4, labels['roles'])

    # Weight the role loss by the bi_roles id: positions whose id is 0
    # contribute nothing, so the role head focuses on the remaining tokens.
    loss_s4 = loss_s4 * labels['bi_roles']

    return loss_s3.mean(dim=-1).mean() + loss_s4.mean(dim=-1).mean()

  def train(self, train_dataset,
            valid_dataset,
            epochs):
    """Train for ``epochs`` epochs, printing train and validation loss per epoch.

    Args:
      train_dataset: Sequence of batch dicts used for optimisation.
      valid_dataset: Sequence of batch dicts evaluated after every epoch.
      epochs: Number of passes over ``train_dataset``.

    Returns:
      The training loss averaged over all epochs (0.0 when ``epochs`` is 0).
    """
    train_loss = 0.0
    for epoch in range(epochs):
      print(f'Epoch {epoch+1}')

      epoch_loss = 0.0
      self.model.train()

      for batch in train_dataset:
        self.optimizer.zero_grad()
        loss = self._batch_loss(batch)
        loss.backward()
        self.optimizer.step()

        epoch_loss += loss.item()

      # max(..., 1) avoids a ZeroDivisionError on an empty training set.
      avg_epoch_loss = epoch_loss / max(len(train_dataset), 1)
      train_loss += avg_epoch_loss
      print(f'\t[Epoch: {epoch+1}] Training Loss = {avg_epoch_loss}')

      valid_loss = self.evaluate(valid_dataset)
      print(f'\t[Epoch: {epoch+1}] Validation Loss = {valid_loss}')

    print('Training has finished')

    return train_loss / epochs if epochs else 0.0


  def evaluate(self, valid_dataset):
    """Return the mean combined loss over ``valid_dataset`` without updating the model.

    Returns 0.0 when ``valid_dataset`` is empty.
    """
    valid_loss = 0.0
    self.model.eval()

    with torch.no_grad():
      for batch in valid_dataset:
        valid_loss += self._batch_loss(batch).item()

    n_batches = len(valid_dataset)
    return valid_loss / n_batches if n_batches else 0.0


  def predict(self, x):
    """Predict one role id per token.

    Args:
      x: Dict of model input tensors.

    Returns:
      A tensor of role ids: the role head's argmax where the identification
      head predicts a class other than 0, and 0 everywhere else.
    """
    self.model.eval()

    with torch.no_grad():
      logits_s3, logits_s4 = self.model(x)
      predictions_s3 = torch.argmax(logits_s3, -1)
      predictions_s4 = torch.argmax(logits_s4, -1)
      # Gate with a boolean mask instead of multiplying the two id tensors:
      # a product would scale the role id whenever the identification head
      # picks a class id greater than 1, producing a wrong (possibly
      # non-existent) role id.
      is_argument = predictions_s3 != 0
      predictions = torch.where(is_argument, predictions_s4, torch.zeros_like(predictions_s4))
      return predictions

### Transformer Training

In [28]:
from pprint import pprint

In [29]:
# Build the model and place it on the configured `device`
# (instead of hard-coding CUDA) so it lives where the batches do.
model = TransEncoder(params).to(device)
model

TransEncoder(
  (pos_encoder): PositionalEncoding()
  (lemmas_embedding): Embedding(27349, 50)
  (pos_tags_embedding): Embedding(50, 50)
  (predicate_embedding): Embedding(458, 50)
  (predicate_flag_embedding): Embedding(4, 50)
  (lemmas_flag_embedding): Embedding(272, 50)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=250, out_features=250, bias=True)
        )
        (linear1): Linear(in_features=250, out_features=1024, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=1024, out_features=250, bias=True)
        (norm1): LayerNorm((250,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((250,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (sel

In [30]:
# Both losses skip PAD targets (ignore_index) and use reduction='none', so the
# Trainer receives one loss value per token and does the masking/averaging itself.
# Adam is used with its default settings.
trainer = Trainer(
    model = model,
    loss_function_s3 = nn.CrossEntropyLoss(ignore_index=vocab2ids['bi_roles'][PAD_TOKEN],  reduction='none'),
    loss_function_s4 = nn.CrossEntropyLoss(ignore_index=vocab2ids['roles'][PAD_TOKEN],  reduction='none'),
    optimizer = optim.Adam(model.parameters()),
)

In [44]:
# Train for 10 epochs; the dev split is evaluated after every epoch.
trainer.train(train_batched, dev_batched, 10)

Epoch 1
	[Epoch: 1] Training Loss = 0.03601293944869955
	[Epoch: 1] Validation Loss = 0.04560981384095024
Epoch 2
	[Epoch: 2] Training Loss = 0.04052025884512807
	[Epoch: 2] Validation Loss = 0.045496053993701935
Epoch 3
	[Epoch: 3] Training Loss = 0.03509779072432128
	[Epoch: 3] Validation Loss = 0.04411199320034653
Epoch 4
	[Epoch: 4] Training Loss = 0.043398586890180536
	[Epoch: 4] Validation Loss = 0.04752588900280934
Epoch 5
	[Epoch: 5] Training Loss = 0.03709767724239031
	[Epoch: 5] Validation Loss = 0.04726496368062263
Epoch 6
	[Epoch: 6] Training Loss = 0.033151667599180125
	[Epoch: 6] Validation Loss = 0.04589682565454174
Epoch 7
	[Epoch: 7] Training Loss = 0.034131137441063414
	[Epoch: 7] Validation Loss = 0.04580041620076871
Epoch 8
	[Epoch: 8] Training Loss = 0.034380069921806006
	[Epoch: 8] Validation Loss = 0.04861097227708966
Epoch 9
	[Epoch: 9] Training Loss = 0.03427684732607247
	[Epoch: 9] Validation Loss = 0.04566615195397068
Epoch 10
	[Epoch: 10] Training Loss = 0.0

0.036155096132447204

In [45]:
# Persist only the learned parameters (state_dict) to Drive.
model_save_name = 'Transformer_Glove_2Step_30E_F1024_L6_H10_Em250_D2_relu_final_v2.pth'
path = f"/content/drive/My Drive/NLP_HW2/{model_save_name}" 
torch.save(model.state_dict(), path)

#### Loading Best Model

In [56]:
# load model
# map_location restores the checkpoint onto whatever `device` is, so weights
# saved from a GPU session can also be loaded on a CPU-only runtime.
srl_model = TransEncoder(hparams=params).to(device)
srl_model.load_state_dict(torch.load('/content/drive/My Drive/NLP_HW2/Transformer_Glove_2Step_30E_F1024_L6_H10_Em250_D2_relu.pth', map_location=device))
srl_model.eval()

TransEncoder(
  (pos_encoder): PositionalEncoding()
  (lemmas_embedding): Embedding(27349, 50)
  (pos_tags_embedding): Embedding(50, 50)
  (predicate_embedding): Embedding(458, 50)
  (predicate_flag_embedding): Embedding(4, 50)
  (lemmas_flag_embedding): Embedding(272, 50)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=250, out_features=250, bias=True)
        )
        (linear1): Linear(in_features=250, out_features=1024, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=1024, out_features=250, bias=True)
        (norm1): LayerNorm((250,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((250,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (sel

In [67]:
# Wrap the loaded model so Trainer.predict can be used on the test batches.
# NOTE(review): Trainer.predict only uses the model; the losses and the optimizer
# are created here just to satisfy the Trainer constructor.
trainer = Trainer(
    model = srl_model,
    loss_function_s3 = nn.CrossEntropyLoss(ignore_index=vocab2ids['bi_roles'][PAD_TOKEN],  reduction='none'),
    loss_function_s4 = nn.CrossEntropyLoss(ignore_index=vocab2ids['roles'][PAD_TOKEN],  reduction='none'),
    optimizer = optim.Adam(srl_model.parameters()),
)

#### Test Dataset Prediction

In [68]:
# Per-batch predictions and, index-aligned with them, the bookkeeping needed
# to map every prediction back to its sentence and predicate.
test_predictions, test_others = [], []

for batch in test_batched:
  model_inputs = {
    'lemmas': batch['lemmas'],
    'pos_tags': batch['pos_tags'],
    'predicates': batch['predicate'],
    'predicate_indicator': batch['predicate_indicator'],
    'lemmas_indicator': batch['lemmas_indicator']
  }
  batch_origin = {
    'sentence_id': batch['sentence_id'],
    'position_predicate': batch['position_predicate']
  }

  test_predictions.append(trainer.predict(model_inputs))
  test_others.append(batch_origin)

### Decoder / Merger

In [69]:
# Extracting Sentence IDs and Predicate Positions
# Every entry of test_others belongs to one batch, so both results are
# lists (one item per batch) of per-sample lists.
sentence_id = [sent_id['sentence_id'] for sent_id in test_others]
position_predicate = [batch['position_predicate'] for batch in test_others]

In [70]:
# Flatten the per-batch containers into per-sample lists; index i is meant to
# refer to the same sample in all three lists.
position_pred_flatten = [xx for x in position_predicate for xx in x]
pred_flatten = [xx for x in test_predictions for xx in x]
sentence_id_flatten = [xx for x in sentence_id for xx in x]

In [71]:
# Ids of every test sentence (the keys of test_sentences), de-duplicated through a set.
all_sentence_id = [sent_id for sent_id in test_sentences]
unique_sent_id = list(set(all_sentence_id))

In [72]:
# Sentence Merger
# Group the flat per-predicate predictions by sentence in a single pass:
#   roles_prediction[sentence_id]['roles'][predicate_position] = predicted ids
# Every known sentence gets an entry, even when it has no predicate at all.
roles_prediction = {sent_id: {'roles': {}} for sent_id in unique_sent_id}

for sent_id, pred_position, pred in zip(sentence_id_flatten, position_pred_flatten, pred_flatten):
  # Predictions of sentences outside unique_sent_id are ignored.
  if sent_id in roles_prediction:
    roles_prediction[sent_id]['roles'][pred_position] = pred

In [73]:
# Sentence Decoder
def sentence_decoder(sentences, predictions, id2vocabs):
  """Turn predicted role ids into role labels, in place.

  Every prediction is cut to the number of words of its sentence (dropping the
  padded positions) and each remaining id is looked up in ``id2vocabs``.

  Args:
    sentences: Dict of sentence id -> fields; only 'words' is read.
    predictions: Dict of sentence id -> {'roles': {predicate index: id tensor}}.
      The inner values are replaced by lists of labels.
    id2vocabs: Dict of role id -> role label.

  Returns:
    The same ``predictions`` object that was passed in.
  """
  for sent_id, sent in sentences.items():
    per_predicate = predictions[sent_id]['roles']
    n_words = len(sent['words'])  # original sentence length
    for pred_idx in per_predicate.keys():
      kept_ids = per_predicate[pred_idx][:n_words]
      per_predicate[pred_idx] = [id2vocabs[role_id.item()] for role_id in kept_ids]

  return predictions

In [74]:
# Decode the merged predictions into role labels.
# NOTE: sentence_decoder rewrites `roles_prediction` in place and returns it, so
# `decoded_predictions` and `roles_prediction` are the same object afterwards.
decoded_predictions = sentence_decoder(test_sentences, roles_prediction, id2class)

### Evaluation

In [41]:
def evaluate_argument_identification(labels, predictions, null_tag='_'):
    """Score argument identification (is a token an argument at all?).

    A token counts as positive when its role differs from ``null_tag``; the
    concrete role label is not compared.

    Args:
        labels: Dict of sentence id -> {'roles': {predicate index: [role, ...]}}.
        predictions: Same structure, holding the predicted roles.
        null_tag: Label that marks "no argument".

    Returns:
        Dict with the true/false positive and false negative counts and the
        precision, recall and F1 derived from them. A ratio whose denominator
        is zero is reported as 0.0.
    """
    true_positives, false_positives, false_negatives = 0, 0, 0
    for sentence_id in labels:
        gold = labels[sentence_id]['roles']
        pred = predictions[sentence_id]['roles']
        predicate_indices = set(gold.keys()).union(pred.keys())
        for idx in predicate_indices:
            if idx in gold and idx not in pred:
                false_negatives += sum(1 for role in gold[idx] if role != null_tag)
            elif idx in pred and idx not in gold:
                false_positives += sum(1 for role in pred[idx] if role != null_tag)
            else: # idx in both gold and pred
                for r_g, r_p in zip(gold[idx], pred[idx]):
                    if r_g != null_tag and r_p != null_tag:
                        true_positives += 1
                    elif r_g != null_tag and r_p == null_tag:
                        false_negatives += 1
                    elif r_g == null_tag and r_p != null_tag:
                        false_positives += 1

    # Guard every ratio so that data without positives does not raise ZeroDivisionError.
    predicted_positives = true_positives + false_positives
    gold_positives = true_positives + false_negatives
    precision = true_positives / predicted_positives if predicted_positives else 0.0
    recall = true_positives / gold_positives if gold_positives else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
    return {
        'true_positives': true_positives,
        'false_positives': false_positives,
        'false_negatives': false_negatives,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


def evaluate_argument_classification(labels, predictions, null_tag='_'):
    """Score argument classification (is the predicted role the gold role?).

    A predicted argument is a true positive only when its role equals the gold
    role; an argument predicted with a different role counts as both a false
    positive and a false negative.

    Args:
        labels: Dict of sentence id -> {'roles': {predicate index: [role, ...]}}.
        predictions: Same structure, holding the predicted roles.
        null_tag: Label that marks "no argument".

    Returns:
        Dict with the true/false positive and false negative counts and the
        precision, recall and F1 derived from them. A ratio whose denominator
        is zero is reported as 0.0.
    """
    true_positives, false_positives, false_negatives = 0, 0, 0
    for sentence_id in labels:
        gold = labels[sentence_id]['roles']
        pred = predictions[sentence_id]['roles']
        predicate_indices = set(gold.keys()).union(pred.keys())

        for idx in predicate_indices:
            if idx in gold and idx not in pred:
                false_negatives += sum(1 for role in gold[idx] if role != null_tag)
            elif idx in pred and idx not in gold:
                false_positives += sum(1 for role in pred[idx] if role != null_tag)
            else: # idx in both gold and pred
                for r_g, r_p in zip(gold[idx], pred[idx]):
                    if r_g != null_tag and r_p != null_tag:
                        if r_g == r_p:
                            true_positives += 1
                        else:
                            false_positives += 1
                            false_negatives += 1
                    elif r_g != null_tag and r_p == null_tag:
                        false_negatives += 1
                    elif r_g == null_tag and r_p != null_tag:
                        false_positives += 1

    # Guard every ratio so that data without positives does not raise ZeroDivisionError.
    predicted_positives = true_positives + false_positives
    gold_positives = true_positives + false_negatives
    precision = true_positives / predicted_positives if predicted_positives else 0.0
    recall = true_positives / gold_positives if gold_positives else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
    return {
        'true_positives': true_positives,
        'false_positives': false_positives,
        'false_negatives': false_negatives,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [42]:
def _get_table_line(a, b, c):
    """Format one table row of three centred, 20-character wide cells.

    Float values of the second and third cell are shown with two decimals.
    """
    def _as_cell(value):
        return '{:0.2f}'.format(value) if isinstance(value, float) else value

    return '{:^20}|{:^20}|{:^20}'.format(a, _as_cell(b), _as_cell(c))

def print_table(title, results):
    """Render the confusion counts and scores of ``results`` as text.

    Despite its name this function prints nothing; it returns the text.

    Args:
        title: Heading of the table (shown in upper case).
        results: Dict with 'true_positives', 'false_positives',
            'false_negatives', 'precision', 'recall' and 'f1'.

    Returns:
        The formatted multi-line string.
    """
    header = _get_table_line('', 'Gold Positive', 'Gold Negative')
    rows = [
        header,
        '=' * len(header),
        _get_table_line('Pred Positive', results['true_positives'], results['false_positives']),
        _get_table_line('Pred Negative', results['false_negatives'], ''),
    ]
    scores = [
        'Precision = {:0.4f}'.format(results['precision']),
        'Recall    = {:0.4f}'.format(results['recall']),
        'F1 score  = {:0.4f}'.format(results['f1']),
    ]

    return '{}\n\n{}\n{}\n{}\n{}\n\n\n{}\n{}\n{}\n\n\n'.format(title.upper(), *rows, *scores)

In [75]:
    # Score the decoded test predictions against the gold test labels and print both tables.
    # NOTE(review): these statements are indented although they sit at the top level
    # of the cell. This presumably only runs because the notebook front end strips the
    # common leading indentation; as a plain script it would not — consider dedenting.
    print('MODEL: ARGUMENT IDENTIFICATION + ARGUMENT CLASSIFICATION')
    argument_identification_results = evaluate_argument_identification(test_labels, decoded_predictions)
    argument_classification_results = evaluate_argument_classification(test_labels, decoded_predictions)
    print(print_table('argument identification', argument_identification_results))
    print(print_table('argument classification', argument_classification_results))

MODEL: ARGUMENT IDENTIFICATION + ARGUMENT CLASSIFICATION
ARGUMENT IDENTIFICATION

                    |   Gold Positive    |   Gold Negative    
   Pred Positive    |       10309        |        991         
   Pred Negative    |        1144        |                    


Precision = 0.9123
Recall    = 0.9001
F1 score  = 0.9062



ARGUMENT CLASSIFICATION

                    |   Gold Positive    |   Gold Negative    
   Pred Positive    |        9754        |        1546        
   Pred Negative    |        1699        |                    


Precision = 0.8632
Recall    = 0.8517
F1 score  = 0.8574



