# 1. import packages

In [None]:
# Mount Google Drive at /content/drive in this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# general packages
import ast
import csv
import h5py
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [None]:
# pytorch, __version__ = '1.8.1+cu101'
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils import data
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [None]:
# Use the GPU when CUDA is available; evaluate() reads this flag to decide
# whether each batch is moved onto the GPU.
using_GPU = torch.cuda.is_available()
# Bare expression so the notebook shows the flag's value as the cell output.
using_GPU

True

In [None]:
# __version__ = '2.4.0'
!pip install allennlp

Collecting allennlp
[?25l  Downloading https://files.pythonhosted.org/packages/a7/4b/e2cd1576fb8016eddd1b63f045abe7c3227a9864fad58a6a864633af9f67/allennlp-2.5.0-py3-none-any.whl (681kB)
[K     |████████████████████████████████| 686kB 6.6MB/s 
Collecting google-cloud-storage<1.39.0,>=1.38.0
[?25l  Downloading https://files.pythonhosted.org/packages/12/75/78ed0d1ef691592b94e7a3d9f58153298166486342a97df82d3c5b66cc16/google_cloud_storage-1.38.0-py2.py3-none-any.whl (103kB)
[K     |████████████████████████████████| 112kB 14.2MB/s 
Collecting overrides==3.1.0
  Downloading https://files.pythonhosted.org/packages/ff/b1/10f69c00947518e6676bbd43e739733048de64b8dd998e9c2d5a71f44c5d/overrides-3.1.0.tar.gz
Collecting tensorboardX>=1.2
[?25l  Downloading https://files.pythonhosted.org/packages/07/84/46421bd3e0e89a92682b1a38b40efc22dafb6d8e3d947e4ceefd4a5fabc7/tensorboardX-2.2-py2.py3-none-any.whl (120kB)
[K     |████████████████████████████████| 122kB 10.1MB/s 
Collecting transformers<4.7,>=4

In [None]:
from allennlp.nn.util import sort_batch_by_length

# 2. Model

## attention setting

In [None]:
class Attention(nn.Module):
  """
  Multi-head dot-product attention without learned projections.

  Modified from TorchNLP (https://github.com/kolloldas/torchnlp).

  forward() splits queries/keys/values into heads exactly as they are passed
  in, takes a softmax over the unscaled query-key products, and returns the
  weighted sum of the values with the heads merged back together. The four
  nn.Linear layers, the dropout layer and bias_mask are created/stored in
  __init__ but are not used by forward().
  """

  def __init__(self, input_depth, total_key_depth, total_value_depth, output_depth,
         num_heads, bias_mask=None, dropout=0.0):
    """
    Parameters:
      input_depth: Size of last dimension of input
      total_key_depth: Size of last dimension of keys. Must be divisible by num_heads
      total_value_depth: Size of last dimension of values. Must be divisible by num_heads
      output_depth: Size of last dimension of the final output
      num_heads: Number of attention heads
      bias_mask: Masking tensor; stored on the module, not read by forward()
      dropout: Dropout probability; the layer is created, not applied by forward()
    Raises:
      ValueError: if total_key_depth or total_value_depth is not a multiple of num_heads
    """
    super().__init__()

    # Divisibility checks borrowed from
    # https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/layers/common_attention.py
    key_remainder = total_key_depth % num_heads
    if key_remainder != 0:
      raise ValueError("Key depth (%d) must be divisible by the number of "
               "attention heads (%d)." % (total_key_depth, num_heads))
    value_remainder = total_value_depth % num_heads
    if value_remainder != 0:
      raise ValueError("Value depth (%d) must be divisible by the number of "
               "attention heads (%d)." % (total_value_depth, num_heads))

    self.num_heads = num_heads
    self.bias_mask = bias_mask

    # Projection layers are created in the original order (query, key, value,
    # output) so seeded initialisation and the state_dict layout stay the same.
    # Queries and keys share the same depth.
    self.query_linear = nn.Linear(input_depth, total_key_depth, bias=False)
    self.key_linear = nn.Linear(input_depth, total_key_depth, bias=False)
    self.value_linear = nn.Linear(input_depth, total_value_depth, bias=False)
    self.output_linear = nn.Linear(total_value_depth, output_depth, bias=False)

    self.dropout = nn.Dropout(dropout)

  def _split_heads(self, x):
    """
    Add a num_heads dimension by slicing the last dimension into equal parts.
    Input:
      x: a Tensor with shape [batch_size, seq_length, depth]
    Returns:
      A Tensor with shape [batch_size, num_heads, seq_length, depth/num_heads]
    Raises:
      ValueError: if x is not rank 3
    """
    if x.dim() != 3:
      raise ValueError("x must have rank 3")
    batch_size, seq_length, depth = x.shape
    per_head = depth // self.num_heads
    return x.view(batch_size, seq_length, self.num_heads, per_head).transpose(1, 2)

  def _merge_heads(self, x):
    """
    Fold the num_heads dimension back into the last dimension.
    Input:
      x: a Tensor with shape [batch_size, num_heads, seq_length, depth/num_heads]
    Returns:
      A Tensor with shape [batch_size, seq_length, depth]
    Raises:
      ValueError: if x is not rank 4
    """
    if x.dim() != 4:
      raise ValueError("x must have rank 4")
    batch_size, _, seq_length, per_head = x.shape
    merged_depth = per_head * self.num_heads
    return x.transpose(1, 2).contiguous().view(batch_size, seq_length, merged_depth)

  def forward(self, queries, keys, values):
    """
    Attend from queries over keys/values, head by head.

    The inputs are used as given (no linear projection), so their last
    dimension must be divisible by num_heads. Returns a tensor of shape
    [batch_size, query_length, value_depth].
    """
    q_heads = self._split_heads(queries)
    k_heads = self._split_heads(keys)
    v_heads = self._split_heads(values)

    # Unscaled dot-product scores: [batch, heads, query_len, key_len]
    scores = torch.matmul(q_heads, k_heads.transpose(-2, -1))
    # Normalise over the key positions
    attn = nn.functional.softmax(scores, dim=-1)

    # Weighted sum of the values, then put the heads back together
    return self._merge_heads(torch.matmul(attn, v_heads))

In [None]:
def get_context(input_l2r, input_r2l, window=3):
  """
  Build, for every token, the fixed-size window of neighbouring states it attends over.

  For token t of the left-to-right input the context is the `window` states
  that precede it (positions t-window .. t-1); for the right-to-left input it
  is the `window` states that follow it (positions t+1 .. t+window). Positions
  that fall outside the sequence are filled with zeros. The token's own state
  is never part of its context.

  :param input_l2r: tensor of shape (batch_size, seq_len, depth)
  :param input_r2l: tensor of shape (batch_size, seq_len, depth)
  :param window: number of neighbouring positions per token (expected >= 1)
  :return: (context_l2r, context_r2l), each of shape
           (batch_size * seq_len, window, depth); row b * seq_len + t holds
           the context of token t in batch element b
  """
  batch_size = input_l2r.size(0)
  seq_len = input_l2r.size(1)
  depth = input_l2r.size(2)

  # Allocate the zero padding on the same device/dtype as the input instead of
  # forcing CUDA, so the function also runs on CPU.
  pad_window = input_l2r.new_zeros(batch_size, window, depth)
  padded_l2r = torch.cat([pad_window, input_l2r], 1)   # (B, window + T, D)
  padded_r2l = torch.cat([input_r2l, pad_window], 1)   # (B, T + window, D)

  # unfold yields every length-`window` slice along the time axis:
  # (B, T + 1, D, window), where slice i covers padded positions i .. i+window-1.
  # With the padding in front, slice t is exactly "the window before token t";
  # with the padding behind, slice t + 1 is "the window after token t".
  windows_l2r = padded_l2r.unfold(1, window, 1)[:, :seq_len]
  windows_r2l = padded_r2l.unfold(1, window, 1)[:, 1:seq_len + 1]

  # (B, T, D, window) -> (B, T, window, D) -> (B*T, window, D)
  context_l2r = windows_l2r.permute(0, 1, 3, 2).reshape(batch_size * seq_len, window, depth)
  context_r2l = windows_r2l.permute(0, 1, 3, 2).reshape(batch_size * seq_len, window, depth)

  return context_l2r, context_r2l

In [None]:
def get_query(input):
  """
  Turn every token state into its own length-1 query sequence.

  :param input: tensor of shape (batch_size, seq_len, depth)
  :return: a view of shape (batch_size * seq_len, 1, depth)
  """
  n_queries = input.size(0) * input.size(1)
  depth = input.size(2)
  return input.unsqueeze(2).view(n_queries, 1, depth)

## RNN setting

In [None]:
class BiLSTMAtt(nn.Module):
  """
  LSTM sequence tagger whose per-token output is concatenated with a
  windowed attention encoding before the final linear classifier.
  """

  def __init__(self, embedding_dim, hidden_dim, num_layers, dropout_lstm_input,
          dropout_fc_input, dropout_rnn, bidir=True, target_size=2):
    """
    :param embedding_dim: size of each input token vector
    :param hidden_dim: LSTM hidden size per direction
    :param num_layers: number of stacked LSTM layers
    :param dropout_lstm_input: dropout probability applied to the LSTM input
    :param dropout_fc_input: dropout probability applied to the classifier input
    :param dropout_rnn: dropout between LSTM layers (use 0 when num_layers = 1)
    :param bidir: whether the LSTM is bidirectional
    :param target_size: number of output classes
    """
    # Always initialise the nn.Module machinery first
    super().__init__()

    # Sub-modules are created in a fixed order (rnn, dropouts, fc, attention)
    # so that seeded parameter initialisation is reproducible.
    self.rnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim,
                num_layers=num_layers, dropout=dropout_rnn,
                batch_first=True, bidirectional=bidir)

    self.dropout_on_input_to_LSTM = nn.Dropout(dropout_lstm_input)
    self.dropout_on_input_to_linear_layer = nn.Dropout(dropout_fc_input)

    # The classifier sees the LSTM output concatenated with an attention
    # encoding of the same width, hence the factor 2.
    num_directions = 2 if bidir else 1
    self.fc = nn.Linear(hidden_dim * num_directions * 2, target_size)

    # Attention over the local context windows; forward() feeds it halves of
    # the LSTM output, whose width must be divisible by num_heads.
    self.attention = Attention(input_depth=hidden_dim, total_key_depth=64,
                   total_value_depth=64, output_depth=hidden_dim,
                   num_heads=16, bias_mask=None, dropout=0.0)

  def forward(self, inputs, lengths):
    """
    :param inputs: padded batch of token embeddings,
                   shape (batch_size, sequence_length, embedding_dim)
    :param lengths: unpadded length of each example in the batch
    :return: log-probabilities over the classes for every token,
             shape (batch_size, sequence_length, target_size)
    """
    # 1. LSTM over the packed, length-sorted batch
    dropped_inputs = self.dropout_on_input_to_LSTM(inputs)
    # pack_padded_sequence is fed the batch in the order produced by
    # sort_batch_by_length; unsort_indices restores the caller's order afterwards.
    sorted_input, sorted_lengths, unsort_indices, _ = sort_batch_by_length(dropped_inputs, lengths)
    packed_input = pack_padded_sequence(sorted_input, sorted_lengths.data.tolist(), batch_first=True)
    packed_output, _ = self.rnn(packed_input)
    # Shape: (batch_size, sequence_length, LSTM output width)
    padded_output, _ = pad_packed_sequence(packed_output, batch_first=True)
    lstm_output = padded_output[unsort_indices]

    # 2. windowed attention on each half of the LSTM output
    batch_size = lstm_output.size(0)
    seq_len = lstm_output.size(1)
    half_width = lstm_output.size(2) // 2
    # For a bidirectional LSTM the first half is the forward (left-to-right)
    # direction and the second half the backward (right-to-left) direction.
    l2r = lstm_output[:, :, :half_width]
    r2l = lstm_output[:, :, half_width:]
    # one query per token
    query_l2r = get_query(l2r)
    query_r2l = get_query(r2l)
    # one context window per token
    context_l2r, context_r2l = get_context(l2r, r2l, window = 3)
    # Shape of each encoding: (batch_size, sequence_length, half_width)
    att_l2r = self.attention(query_l2r, context_l2r, context_l2r).view(batch_size, seq_len, -1)
    att_r2l = self.attention(query_r2l, context_r2l, context_r2l).view(batch_size, seq_len, -1)
    # Shape: (batch_size, sequence_length, 2 * half_width)
    att = torch.cat([att_l2r, att_r2l], -1)
    # LSTM output followed by its attention encoding
    attention_output = torch.cat([lstm_output, att], -1)

    # 3. classifier
    fc_input = self.dropout_on_input_to_linear_layer(attention_output)
    class_scores = self.fc(fc_input)
    return F.log_softmax(class_scores, dim=-1)

# 3. Functions

## read data

In [None]:
def get_raw_data(path):
  """
  Read a CSV file and return its data rows, skipping the header line.

  :param path: path of the CSV file
  :return: list of rows, each row a list of strings; an empty list when the
           file has no data rows (or is completely empty)
  """
  # newline='' leaves newline handling to the csv module, as its documentation
  # requires; without it, line breaks inside quoted fields get rewritten.
  with open(path, newline='') as f:
    reader = csv.reader(f)
    # Skip the header; the default keeps an empty file from raising StopIteration.
    next(reader, None)
    return list(reader)

In [None]:
def get_PoS_tag(tag):
  '''
  Tagset conversion: map a fine-grained PoS tag onto a coarse word class.

  :param tag: fine-grained tag, e.g. 'ag' or 'vn'
  :return: one of 'ADJ', 'ADP', 'ADV', 'NOUN', 'PRNOUN', 'VERB',
           or 'X' for any tag that is not listed
  '''
  tag_groups = (
    (('a', 'ag', 'an', 'b', 'z', 'zg'), 'ADJ'),
    (('p', 'f'), 'ADP'),
    (('ad', 'd', 'dg', 't'), 'ADV'),
    (('n', 'ng', 'nr', 'ns', 's', 'nz', 'g', 'j', 'nt', 'nx', 'tg'), 'NOUN'),
    (('r', 'rg'), 'PRNOUN'),
    (('v', 'vd', 'vg', 'vn'), 'VERB'),
  )
  # Groups are checked in order; the first one containing the tag wins.
  for members, coarse in tag_groups:
    if tag in members:
      return coarse
  return 'X'

In [None]:
def predict_dataset(dataset):
  '''
  Reduce raw dataset rows to the three fields used for prediction.

  :param dataset: raw rows; column 2 holds the PoS sequence, column 3 the
            metaphor labels and column 5 the original tokens, each stored as
            the string form of a Python list
  :return predict: one [tokens, metaphor labels, converted PoS tags] list per row
  '''
  predict = []
  for row in dataset:
    tokens = ast.literal_eval(row[5])
    labels = ast.literal_eval(row[3])
    # fine-grained tags are collapsed to coarse word classes
    pos_tags = [get_PoS_tag(tag) for tag in ast.literal_eval(row[2])]
    predict.append([tokens, labels, pos_tags])
  return predict

In [None]:
def get_pos2idx_idx2pos(pos_set):
  """
  Number the PoS tags in iteration order and return both lookup directions.

  :param pos_set: iterable of PoS tags
  :return: (pos2idx, idx2pos); if a tag occurs more than once, pos2idx keeps
           its last index
  """
  idx2pos = {}
  pos2idx = {}
  for idx, pos in enumerate(pos_set):
    idx2pos[idx] = pos
    pos2idx[pos] = idx
  return pos2idx, idx2pos

In [None]:
def get_token2idx_idx2token(vocab):
  """
  Number the vocabulary in iteration order and return both lookup directions.

  :param vocab: iterable of tokens
  :return: (token2idx, idx2token); if a token occurs more than once,
           token2idx keeps its last index
  """
  idx2token = {idx: token for idx, token in enumerate(vocab)}
  token2idx = {token: idx for idx, token in idx2token.items()}
  return token2idx, idx2token

In [None]:
def countX(lst, x):
  ''' Return how many elements of lst compare equal to x. '''
  return sum(1 for ele in lst if ele == x)

In [None]:
def describe(dataset):
  '''
  Print summary statistics of a dataset.

  :param dataset: a list of rows; this function reads
             - row[0]: the tokenized sentence (a list of tokens)
             - row[1]: the metaphor label sequence
             - row[2]: the PoS tag sequence
             any further items are ignored
  :return: None; prints
        1. the number of rows
        2. sentence length (mean and SD)
           # Considering the amount of data, we directly use mean and SD without distribution check
        3. the count of every (PoS tag, metaphor label) pair (1 for metaphor)
        4. the count of every PoS tag and its share of all tokens
  '''
  from collections import Counter

  print('>>> 1. Size of the dataset')
  # NOTE(review): this header lists pos_seq before met_seq, while the code
  # below reads labels from index 1 and PoS tags from index 2 - confirm the
  # intended column order before relying on it.
  print(['sentences', 'pos_seq', 'met_seq', 'genre', 'split'])
  print('Size: ', len(dataset))

  # mean and SD of sentence length
  print('\n>>> 2. Sentence length')
  sentence_lengths = [len(line[0]) for line in dataset]
  print('Mean\t{:.2f}'.format(np.mean(sentence_lengths)))
  print('SD\t{:.2f}'.format(np.std(sentence_lengths)))

  # metaphor label count per PoS tag
  print('\n>>> 3. Metaphor count (1 for metaphor):')
  met = [line[1] for line in dataset]
  pos = [line[2] for line in dataset]
  # One pass over the data instead of rescanning the full pair list for
  # every distinct pair.
  pair_counts = Counter()
  for pos_seq, met_seq in zip(pos, met):
    pair_counts.update(zip(pos_seq, met_seq))
  for pair in sorted(pair_counts):
    print('{}\t{}'.format(pair, pair_counts[pair]))

  # PoS tag counts and their share of all tokens
  print('\n>>> 4. POS tag count:')
  total_tokens = sum(sentence_lengths)
  pos_counts = Counter()
  for pos_seq in pos:
    pos_counts.update(pos_seq)
  for tag in sorted(pos_counts):
    print('{}\t{}\t{:.2%}'.format(tag, pos_counts[tag], pos_counts[tag]/total_tokens))

In [None]:
def check_genre(dataset):
  '''
  Print, for every genre in sorted order, its row count and its share of the dataset.

  :param dataset: a list of rows whose item at index 4 is the genre
  '''
  genres = [line[4] for line in dataset]
  n_rows = len(dataset)
  for genre in sorted(set(genres)):
    n_genre = countX(genres, genre)
    print('{}: {}, {:.2%}'.format(genre, n_genre, n_genre/n_rows))

## get iterators

In [None]:
def embed_sequence(sequence, elmo, FT):
  '''
  Look up the pretrained embeddings of one sentence and join them per token.

  :param sequence: the tokens of the sentence; they are joined without a
            separator to form the lookup key
  :param elmo: mapping from sentence key to an array of shape (len(sequence), 1024)
          (presumably an h5 file of pretrained ELMo vectors - verify against caller)
  :param FT: mapping from sentence key to an array of shape (len(sequence), 300)
         (presumably an h5 file of pretrained FastText vectors - verify against caller)
  :return: numpy array of shape (len(sequence), 1324): the elmo columns
        followed by the FT columns
  '''
  sentence = ''.join(sequence)
  parts = []
  # Each store must provide exactly one vector of the expected width per token.
  for store, width in ((elmo, 1024), (FT, 300)):
    part = store[sentence]
    assert (part.shape == (len(sequence), width))
    parts.append(part)
  return np.concatenate(parts, axis=1)

In [None]:
def get_dataloader(dataset, elmo, FT, batch_size, shuffle):
  '''
  Embed and index one data split and wrap it in a DataLoader.

  Relies on the notebook-level mappings pos2idx and token2idx.

  :param dataset: a split (train, val or test); each example provides the
            tokens at index 0, the metaphor labels at index 1 and the PoS
            tags at index 2
  :param elmo: first pretrained embedding store, passed on to embed_sequence
  :param FT: second pretrained embedding store, passed on to embed_sequence
  :param batch_size: size of batches
  :param shuffle: a boolean value (True or False)
  :return: a DataLoader over a MetaphorDataset, batching with
        MetaphorDataset.collate_fn
  '''
  embedded_sentences = []
  pos_index_seqs = []
  label_seqs = []
  token_index_seqs = []
  for example in tqdm(dataset):
    embedded_sentences.append(embed_sequence(example[0], elmo, FT))
    pos_index_seqs.append([pos2idx[pos] for pos in example[2]])
    token_index_seqs.append([token2idx[token] for token in example[0]])
    label_seqs.append(example[1])

  # inputs and labels are kept in parallel lists, one entry per sentence
  split_dataset = MetaphorDataset(embedded_sentences, pos_index_seqs,
                    label_seqs, token_index_seqs)
  return data.DataLoader(dataset=split_dataset, batch_size=batch_size,
                shuffle=shuffle, collate_fn=MetaphorDataset.collate_fn)

In [None]:
# Make sure to subclass torch.utils.data.Dataset
# To convert a split dataset into batches to create iterator
class MetaphorDataset(data.Dataset):
  '''
  Dataset of embedded sentences with their PoS, label and token index
  sequences; collate_fn pads a list of examples into one batch.
  '''

  def __init__(self, embedded_text, pos_seqs, tag_seqs, tokenized_text):
    '''
    :param embedded_text: sentence embeddings, one (sequence_length, embed_dim) array per sentence
    :param pos_seqs: indexed pos sequences, one list per sentence
    :param tag_seqs: metaphor label sequences, one list per sentence
    :param tokenized_text: indexed token sequences, one list per sentence
    :raises ValueError: if embedded_text and tag_seqs differ in length
    '''
    if len(embedded_text) != len(tag_seqs):
      raise ValueError('Differing number of sentences and tags!')

    # All four lists are parallel: position i describes sentence i.
    self.embedded_text = embedded_text
    self.pos_seqs = pos_seqs
    self.tag_seqs = tag_seqs
    self.tokenization = tokenized_text

  def __getitem__(self, idx):
    '''
    Return the example at index `idx` as
    (pos sequence, embedded text, length, tag sequence, token indices),
    where length is the number of rows of the embedded text.
    '''
    example_text = self.embedded_text[idx]
    example_length = example_text.shape[0]
    example_pos_seq = self.pos_seqs[idx]
    example_tag_seq = self.tag_seqs[idx]
    example_tokenization = self.tokenization[idx]
    # every per-token sequence must have one entry per embedded token
    assert (example_length == len(example_pos_seq))
    assert (example_length == len(example_tag_seq))
    assert (example_length == len(example_tokenization))
    return example_pos_seq, example_text, example_length, example_tag_seq, example_tokenization

  def __len__(self):
    '''
    Return the number of examples in the Dataset.
    '''
    return len(self.tag_seqs)

  @staticmethod
  def collate_fn(batch):
    '''
    Combine a list of examples (each from __getitem__) into one batch,
    padding every example to the longest sequence in the batch.

    Returns:
    -------
    batch_pos_seqs: list
      The unpadded indexed pos sequence of each example.
    batch_padded_example_text: FloatTensor
      Shape (batch_size, longest_sequence_length, embed_dim); rows beyond an
      example's length are zeros.
    lengths: LongTensor
      Shape (batch_size,), the unpadded length of each example.
    tags: LongTensor
      Shape (batch_size, longest_sequence_length), labels padded with 0.
    tokenization: LongTensor
      Shape (batch_size, longest_sequence_length), token indices padded with 0.
    '''
    # length of the longest sequence in the batch (-1 for an empty batch)
    max_length = max((len(tag) for _, _, _, tag, _ in batch), default=-1)

    batch_pos_seqs = []
    padded_texts = []
    batch_lengths = []
    padded_tags = []
    padded_tokenizations = []

    for pos, text, length, tag, tokenization in batch:
      batch_pos_seqs.append(pos)

      amount_to_pad = max_length - length
      zero_fill = [0] * amount_to_pad

      # token indices are padded with 0
      padded_tokenizations.append(tokenization + zero_fill)

      # The embedding is converted to a FloatTensor so it has the same type
      # as the zero rows it is concatenated with.
      pad_rows = torch.zeros(amount_to_pad, text.shape[1])
      padded_texts.append(torch.cat((torch.Tensor(text), pad_rows), dim=0))

      batch_lengths.append(length)

      # labels are padded with 0 as plain lists and converted at the end
      padded_tags.append(tag + zero_fill)

    return (batch_pos_seqs,
        torch.stack(padded_texts),
        torch.LongTensor(batch_lengths),
        torch.LongTensor(padded_tags),
        torch.LongTensor(padded_tokenizations))

## evaluate

In [None]:
def get_batch_predictions(pos_seqs, tag_seqs, predict_seqs):
  '''
  Trim the padded gold labels and predictions of a batch to each example's true length.

  :param pos_seqs: a list of unpadded indexed pos sequences, one per example
  :param tag_seqs: LongTensor of padded gold labels, one row per example
  :param predict_seqs: tensor of per-class scores with the class on dim 2
  :return: pos_tags: the pos sequences, unchanged
       gold_labels: a list of lists of ints, each cut to the length of its pos sequence
       predict_labels: a list of 1-D tensors of predicted class indices,
               each cut to the length of its pos sequence
       # number of sublists: number of examples in that batch
  '''
  # predicted class = index of the highest score on the class dimension
  _, predicted_classes = torch.max(predict_seqs.data, 2)
  # only the gold labels are moved to the CPU and turned into lists
  gold_rows = tag_seqs.cpu().numpy().tolist()
  assert (len(pos_seqs) == len(gold_rows))
  assert (len(pos_seqs) == predicted_classes.shape[0])

  pos_tags, gold_labels, predict_labels = [], [], []
  for row, pos_tag in enumerate(pos_seqs):
    # the unpadded length is the length of the pos sequence
    n_tokens = len(pos_tag)
    gold_label = gold_rows[row][:n_tokens]
    predict_label = predicted_classes[row][:n_tokens]
    assert (len(pos_tag) == len(gold_label))
    assert (len(pos_tag) == len(predict_label))

    pos_tags.append(pos_tag)
    gold_labels.append(gold_label)
    predict_labels.append(predict_label)

  return (pos_tags, gold_labels, predict_labels)

In [None]:
def update_confusion_matrix(matrix, predictions, labels, pos_seqs):
  '''
  Add the tokens of one batch to the per-pos-tag confusion matrix (in place).

  :param matrix: a 3D numpy array of shape (#pos_tags, 2, 2), indexed
            [pos tag][predicted label][gold label]
  :param predictions: per example, the predicted label of each token
  :param labels: per example, the gold label of each token
  :param pos_seqs: a list of variable-length indexed pos sequences; their
            lengths decide how many tokens of each example are counted
  :return: the same matrix object, updated
  '''
  for row, indexed_pos_sequence in enumerate(pos_seqs):
    prediction = predictions[row]
    label = labels[row]
    for col, indexed_pos in enumerate(indexed_pos_sequence):
      matrix[indexed_pos][prediction[col]][label[col]] += 1
  return matrix

In [None]:
def print_info(matrix, idx2pos):
  '''
  Print precision, recall, F1 and accuracy (as percentages) for each pos tag.

  Row i of the confusion matrix belongs to the pos tag idx2pos[i]; each 2x2
  grid is indexed [predicted label][gold label].

  :param matrix: a confusion matrix of shape (#pos_tags, 2, 2)
  :param idx2pos: a dictionary: int --> pos tag
  :return: a numpy array of shape (#pos_tags, 4); each row is the
        [precision, recall, f1, accuracy] of one pos tag
  '''
  row_template = '- PRFA performance for {}:\t{:.2f}\t\t{:.2f}\t\t{:.2f}\t\t{:.2f}\t\t{}'
  print('\t\t\t\tPrecision\tRecall\t\tF1\t\tAccuracy\tMatrix')

  result = []
  for idx in range(len(idx2pos)):
    pos_tag = idx2pos[idx]
    grid = matrix[idx]
    true_positive = grid[1, 1]
    # row 1 = everything predicted as 1, column 1 = everything labelled 1
    precision = 100 * true_positive / np.sum(grid[1])
    recall = 100 * true_positive / np.sum(grid[:, 1])
    f1 = 2 * precision * recall / (precision + recall)
    accuracy = 100 * (true_positive + grid[0, 0]) / np.sum(grid)
    scores = [precision, recall, f1, accuracy]
    print(row_template.format(pos_tag, precision, recall, f1, accuracy, grid.tolist()))
    result.append(scores)
  return np.array(result)

In [None]:
def evaluate(model, evaluation_dataloader, evaluation_dataset):
  '''
  Evaluate the model on the given evaluation_dataloader.

  Relies on the notebook-level names idx2pos, loss_criterion and using_GPU.

  :param model: the model to evaluate; called as model(text, lengths)
  :param evaluation_dataloader: yields (pos, text, lengths, tags, tokenizations) batches
  :param evaluation_dataset: the dataset behind the dataloader; only its
            length is used, to check that every example got a prediction
  :return:
  - average_eval_loss: the batch losses averaged over the number of batches
  - predictions: one predicted label sequence per example, in dataloader order
  - a matrix (#allpostags, 4)
    each row is the PRFA performance for a pos tag
  '''
  # Set model to eval mode, which turns off dropout.
  model.eval()

  total_eval_loss = 0
  confusion_matrix = np.zeros((len(idx2pos), 2, 2))
  predictions = []

  # No gradients are needed for evaluation. Without no_grad every batch_loss
  # added to total_eval_loss would keep its whole autograd graph alive until
  # the loop ends.
  with torch.no_grad():
    for (eval_pos, eval_text, eval_lengths, eval_tags, eval_tokenizations) in tqdm(evaluation_dataloader):
      if using_GPU:
        eval_text = eval_text.cuda()
        eval_lengths = eval_lengths.cuda()
        eval_tags = eval_tags.cuda()

      # loss of this batch over all (padded) token positions
      eval_prediction = model(eval_text, eval_lengths)
      batch_loss = loss_criterion(eval_prediction.view(-1, 2), eval_tags.view(-1))
      total_eval_loss += batch_loss

      # get pos tags, gold labels and predicted labels
      batch_pos, batch_gold, batch_predict = get_batch_predictions(eval_pos, eval_tags, eval_prediction)
      confusion_matrix = update_confusion_matrix(confusion_matrix, batch_predict, batch_gold, batch_pos)
      predictions.extend(batch_predict)

  average_eval_loss = total_eval_loss / len(evaluation_dataloader)

  # Set the model back to train mode, which activates dropout again.
  model.train()
  assert (len(predictions) == len(evaluation_dataset))

  return average_eval_loss, predictions, print_info(confusion_matrix, idx2pos)

In [None]:
def _prfa_from_confusion(grid):
  '''
  Return [precision, recall, f1, accuracy] as percentages for one 2x2
  confusion grid indexed [predicted label][gold label].
  With a numpy float grid, a zero denominator yields nan rather than an error.
  '''
  precision = 100 * grid[1, 1] / np.sum(grid[1])
  recall = 100 * grid[1, 1] / np.sum(grid[:, 1])
  f1 = 2 * precision * recall / (precision + recall)
  accuracy = 100 * (grid[1, 1] + grid[0, 0]) / np.sum(grid)
  return [precision, recall, f1, accuracy]


def get_performance_test(target_path):
  '''
  Read the test data and predictions
  Prints the performance of LSTM sequence model on based on genre
  Prints the performance of LSTM sequence model on regardless of genre

  :param target_path: CSV file with a header line; per row, column 0 is the
              sentence ID, column 3 the gold label sequence, column 4 the
              genre ('News', 'Fiction' or 'Academic') and the last column the
              predicted label sequence (sequences stored as list literals).
              Rows sharing an ID overwrite each other; the last one counts.
  :return: numpy array [precision, recall, f1, accuracy], each averaged
        over the three genres
  :raises ValueError: if a non-empty row has a genre outside the three known ones
  '''
  # get the prediction from LSTM sequence model
  ID2sen_labelseq = {}  # ID --> [genre, label_sequence, pred_sequence]
  # newline='' leaves newline handling to the csv module, as its documentation requires
  with open(target_path, newline='') as f:
    lines = csv.reader(f)
    next(lines, None)  # skip the header
    for line in lines:
      ID2sen_labelseq[line[0]] = [line[4],
                      ast.literal_eval(line[3]),
                      ast.literal_eval(line[-1])]

  # compute one confusion grid per genre, indexed [genre][pred][label]
  genres = ['News', 'Fiction', 'Academic']
  confusion_matrix = np.zeros((len(genres), 2, 2))
  for genre, label_sequence, pred_sequence in ID2sen_labelseq.values():
    if len(label_sequence) == 0:
      # nothing to count for this sentence
      continue
    genre_idx = genres.index(genre)
    for i in range(len(label_sequence)):
      confusion_matrix[genre_idx][pred_sequence[i]][label_sequence[i]] += 1

  print('Genre-specific Performance:')
  print('\t\tPrecision\tRecall\t\tF1\t\tAccuracy\tMatrix')
  avg_performance = []
  for i in range(len(genres)):
    precision, recall, f1, accuracy = _prfa_from_confusion(confusion_matrix[i])
    # the first genre name is shorter and needs one more tab to line up
    if i == 0:
      print('- {}:\t\t{:.2f}\t\t{:.2f}\t\t{:.2f}\t\t{:.2f}\t\t{}'.format(genres[i], precision, recall, f1, accuracy, confusion_matrix[i].tolist()))
    else:
      print('- {}:\t{:.2f}\t\t{:.2f}\t\t{:.2f}\t\t{:.2f}\t\t{}'.format(genres[i], precision, recall, f1, accuracy, confusion_matrix[i].tolist()))
    avg_performance.append([precision, recall, f1, accuracy])
  avg_performance = np.array(avg_performance)

  print('\nGeneral Performance')
  # pool all genres into a single 2x2 grid
  confusion_matrix = confusion_matrix.sum(axis=0)
  precision, recall, f1, accuracy = _prfa_from_confusion(confusion_matrix)
  print('Precision\tRecall\t\tF1\t\tAccuracy\tMatrix')
  print('{:.2f}\t\t{:.2f}\t\t{:.2f}\t\t{:.2f}\t\t{}\n'.format(precision, recall, f1, accuracy, confusion_matrix.tolist()))

  return avg_performance.mean(0)


In [None]:
def _prfa_from_confusion(matrix):
  '''
  Compute precision, recall, F1 and accuracy (all in percent) from one
  2x2 confusion matrix indexed as matrix[predicted label][gold label].
  A ratio with a zero denominator is returned as nan; the numpy
  RuntimeWarning that would normally accompany it is suppressed.
  :param matrix: numpy array of shape (2, 2)
  :return: tuple (precision, recall, f1, accuracy)
  '''
  with np.errstate(divide='ignore', invalid='ignore'):
    precision = 100 * matrix[1, 1] / np.sum(matrix[1])
    recall = 100 * matrix[1, 1] / np.sum(matrix[:, 1])
    f1 = 2 * precision * recall / (precision + recall)
    accuracy = 100 * (matrix[1, 1] + matrix[0, 0]) / np.sum(matrix)
  return precision, recall, f1, accuracy


def get_performance_pos(target_path, pos):
  '''
  Token-level performance restricted to one word class.
  Prints precision / recall / F1 / accuracy and the confusion matrix for
  every genre, then the same figures with all genres pooled.
  :param target_path: the csv file with predictions; the first row is a
              header and is skipped. Columns are read by position:
              0 = sentence ID, 2 = pos_seq, 3 = metaphor_seq (gold labels),
              4 = genre, last = predicted label sequence
              ['sent_index', 'sent_txt', 'pos_seq', 'metaphor_seq', 'genre', 'sent_txt_tokenized',  'sent_bert_tokens', 'sent_txt_tokenized_bert', 'BIO_seq', 'prediction']
  :param pos: the pos to investigate
  :return: numpy array [precision, recall, f1, accuracy] averaged over the
              genres (an entry is nan if it is undefined for any genre)
  :raises ValueError: if a row has a genre other than News/Fiction/Academic
  '''
  genres = ['News', 'Fiction', 'Academic']

  # collect gold and predicted label of every token tagged `pos`
  ID2info = {}  # (ID, pos_idx) --> (genre, gold label, predict label)
  # newline='' is what the csv module asks for on files given to csv.reader
  with open(target_path, newline='') as f:
    lines = csv.reader(f)
    next(lines)  # skip the header row
    for line in lines:
      pos_seq = ast.literal_eval(line[2])
      if pos not in pos_seq:
        continue
      # parse the label columns once per sentence, not once per matching token
      gold_seq = ast.literal_eval(line[3])
      pred_seq = ast.literal_eval(line[-1])
      for index, tag in enumerate(pos_seq):
        if tag == pos:
          ID2info[(line[0], index)] = (line[4], gold_seq[index], pred_seq[index])

  # compute confusion_matrix: one [pred][gold] matrix per genre
  confusion_matrix = np.zeros((len(genres), 2, 2))
  for genre, gold, pred in ID2info.values():
    confusion_matrix[genres.index(genre)][pred][gold] += 1
  assert (np.sum(confusion_matrix) == len(ID2info))

  print(pos)
  print('Genre-specific Performance:')
  print('\t\tPrecision\tRecall\t\tF1\t\tAccuracy\tMatrix')
  avg_performance = []
  for i, genre in enumerate(genres):
    precision, recall, f1, accuracy = _prfa_from_confusion(confusion_matrix[i])
    # the first genre name is short and needs one more tab to line up with the header
    pad = '\t\t' if i == 0 else '\t'
    print('- {}:{}{:.2f}\t\t{:.2f}\t\t{:.2f}\t\t{:.2f}\t\t{}'.format(genre, pad, precision, recall, f1, accuracy, confusion_matrix[i].tolist()))
    avg_performance.append([precision, recall, f1, accuracy])
  avg_performance = np.array(avg_performance)

  print('\nGeneral Performance')
  overall_matrix = confusion_matrix.sum(axis=0)  # pool all genres
  precision, recall, f1, accuracy = _prfa_from_confusion(overall_matrix)
  print('Precision\tRecall\t\tF1\t\tAccuracy\tMatrix')
  print('{:.2f}\t\t{:.2f}\t\t{:.2f}\t\t{:.2f}\t\t{}\n'.format(precision, recall, f1, accuracy, overall_matrix.tolist()))

  return avg_performance.mean(0)

# 4. Prepare data

## preprocess data 

In [None]:
# data file paths on the mounted Google Drive:
# the full PSUCMC sequence file, followed by its train / test / validation split files
psu = '/content/drive/MyDrive/Metaphor Detection/data/PSUCMC_sequence_new.csv'
psu_train = '/content/drive/MyDrive/Metaphor Detection/data/PSUCMC_sequence_train.csv'
psu_test = '/content/drive/MyDrive/Metaphor Detection/data/PSUCMC_sequence_test.csv'
psu_val = '/content/drive/MyDrive/Metaphor Detection/data/PSUCMC_sequence_val.csv'

In [None]:
# get raw data: load the full file and each of the three split files
raw_psu = get_raw_data(psu)
raw_train = get_raw_data(psu_train)
raw_test = get_raw_data(psu_test)
raw_val = get_raw_data(psu_val)

In [None]:
# convert the raw rows into the datasets the model is trained and evaluated on
# NOTE(review): predict_dataset is presumably a row-wise conversion; later cells read
# row[0] as the token list and row[2] as the POS-tag list of a sentence — confirm
pred_psu = predict_dataset(raw_psu)
pred_train = predict_dataset(raw_train)
pred_test = predict_dataset(raw_test)
pred_val = predict_dataset(raw_val)
# number of sentences in each split
print(f'Train/test/val split: {len(pred_train)}, {len(pred_test)}, {len(pred_val)}')

Train/test/val split: 1034, 345, 345


In [None]:
# summary statistics of the whole dataset
describe(pred_psu)

# genre proportion of the whole dataset and of every split
print('\n-----------------------------------------------------')
print('\nGenre Ratio:')
genre_reports = (
  ('\nall', raw_psu),
  ('\ntrain', raw_train),
  ('\ntest', raw_test),
  ('\nval', raw_val),
)
for report_heading, report_rows in genre_reports:
  print(report_heading)
  check_genre(report_rows)

>>> 1. Size of the dataset
['sentences', 'pos_seq', 'met_seq', 'genre', 'split']
Size:  1724

>>> 2. Sentence length
Mean	20.73
SD	13.93

>>> 3. Metaphor count (1 for metaphor):
('ADJ', 0)	1252
('ADJ', 1)	242
('ADP', 0)	1134
('ADP', 1)	692
('ADV', 0)	2650
('ADV', 1)	20
('NOUN', 0)	7767
('NOUN', 1)	570
('PRNOUN', 0)	1710
('PRNOUN', 1)	232
('VERB', 0)	5891
('VERB', 1)	1343
('X', 0)	12069
('X', 1)	175

>>> 4. POS tag count:
ADJ	1494	4.18%
ADP	1826	5.11%
ADV	2670	7.47%
NOUN	8337	23.32%
PRNOUN	1942	5.43%
VERB	7234	20.24%
X	12244	34.25%

-----------------------------------------------------

Genre Ratio:

all
Academic: 487, 28.25%
Fiction: 709, 41.13%
News: 528, 30.63%

train
Academic: 292, 28.24%
Fiction: 425, 41.10%
News: 317, 30.66%

test
Academic: 98, 28.41%
Fiction: 142, 41.16%
News: 105, 30.43%

val
Academic: 97, 28.12%
Fiction: 142, 41.16%
News: 106, 30.72%


In [None]:
# pos2idx
# Sorted list of the distinct POS tags in the dataset (row[2] is a sentence's POS sequence).
# A set comprehension visits every tag once; sum(list_of_lists, []) copies the
# growing list on every addition and is quadratic in the number of sentences.
pos_set = sorted({tag for row in pred_psu for tag in row[2]})
pos2idx, idx2pos = get_pos2idx_idx2pos(pos_set)

In [None]:
pos_set

['ADJ', 'ADP', 'ADV', 'NOUN', 'PRNOUN', 'VERB', 'X']

In [None]:
# token2idx
# Sorted list of the distinct tokens in the dataset (row[0] is a sentence's token list).
# A set comprehension visits every token once; sum(list_of_lists, []) copies the
# growing list on every addition and is quadratic in the number of sentences.
vocab = sorted({token for row in pred_psu for token in row[0]})
# 0: [PAD], used to fill sentences to the max length in batch
vocab.insert(0, '[PAD]')
token2idx, idx2token = get_token2idx_idx2token(vocab)
# [PAD] is not counted as a vocabulary item
print(f'Vocab size: {len(vocab)-1}')

Vocab size: 7124


## get dataloader

In [None]:
# pretrained embeddings stored as HDF5 files
elmo = '/content/drive/MyDrive/Metaphor Detection/embeddings/elmo/PSUCMC_elmo.h5'
FT = '/content/drive/MyDrive/Metaphor Detection/embeddings/FastText/PSUCMC_FT.h5'
# Open explicitly read-only: older h5py versions default to append mode, which
# silently creates an empty file when the path is mistyped and allows accidental writes.
psu_elmo = h5py.File(elmo, 'r')
psu_FT = h5py.File(FT, 'r')

In [None]:
# Embed the datasets and build the DataLoaders.
# Shuffle only the training set; keep the test and validation sets in their original order,
# because predictions are later matched back to the sentences by position
# when they are written out.
# With shuffled evaluation data the PRFA performance scores would still be correct,
# but it would no longer be known which line each prediction belongs to.
batch_size = 10
train_dataloader = get_dataloader(dataset=pred_train, elmo=psu_elmo, FT=psu_FT, batch_size=batch_size, shuffle=True)
test_dataloader = get_dataloader(dataset=pred_test, elmo=psu_elmo, FT=psu_FT, batch_size=batch_size, shuffle=False)
val_dataloader = get_dataloader(dataset=pred_val, elmo=psu_elmo, FT=psu_FT, batch_size=batch_size, shuffle=False)

HBox(children=(FloatProgress(value=0.0, max=1034.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=345.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=345.0), HTML(value='')))




# 5. Train 

## setting

In [None]:
# Instantiate the model
# embedding_dim = size of the ELMo and FastText vectors combined
#   (presumably 1024 + 300 = 1324 — TODO confirm against the embedding files)
# dropout_rnn is only meant to be activated when num_layers != 1
# hidden_dim % 16 == 0 is required
#   NOTE(review): presumably because of how the attention layer splits heads — confirm
# target_size=2: the training loop reshapes the output with .view(-1, 2), one score per class
rnn_model = BiLSTMAtt(embedding_dim=1324, hidden_dim=256, num_layers=1, target_size=2,
            dropout_lstm_input=0.5, dropout_fc_input=0.5, dropout_rnn=0, bidir=True)
# move the model to the GPU when one is available
if using_GPU:
  rnn_model = rnn_model.cuda()

In [None]:
# Set up criterion for calculating loss
# nn.NLLLoss expects log-probabilities as input
# NOTE(review): this presumably relies on the model ending in log_softmax — confirm
loss_criterion = nn.NLLLoss()

In [None]:
# optimizer for updating the parameters of rnn_model
rnn_optimizer = torch.optim.AdamW(rnn_model.parameters(), lr=5e-3, eps=1e-10)

In [None]:
# Number of epochs (passes through the dataset) to train the model for.
num_epochs = 15

## Train and evaluate on val

In [None]:
# Per-epoch history: average training loss and validation loss (0-dim tensors)
train_loss = []
val_loss = []
performance_matrix = None

num_iter = 0  # total number of gradient updates so far
for epoch in range(num_epochs):
  print(">>> Starting epoch {}".format(epoch + 1))
  # Make sure dropout is active for training in every epoch.
  # NOTE(review): evaluate() is not shown here; if it switches the model to eval mode
  # without switching back, every epoch after the first would otherwise train without dropout.
  rnn_model.train()
  total_train_loss = 0
  for (example_pos, example_text, example_lengths, example_labels, example_tokenizations) in tqdm(train_dataloader):
    # torch.autograd.Variable is a deprecated no-op wrapper, tensors are used directly
    if using_GPU:
      example_text = example_text.cuda()
      example_lengths = example_lengths.cuda()
      example_labels = example_labels.cuda()

    # calculate training loss for this batch and update the parameters
    train_prediction = rnn_model(example_text, example_lengths)
    batch_loss = loss_criterion(train_prediction.view(-1, 2), example_labels.view(-1))
    rnn_optimizer.zero_grad()
    batch_loss.backward()
    rnn_optimizer.step()
    # detach before accumulating: summing the raw loss tensors keeps autograd
    # history of every batch alive for the whole epoch (and inside train_loss)
    total_train_loss += batch_loss.detach()
    num_iter += 1

  average_train_loss = total_train_loss / len(train_dataloader)
  train_loss.append(average_train_loss)

  # Evaluate on the validation set once per epoch, i.e. after every
  # len(train_dataloader) gradient updates
  print('Validation Set')
  average_val_loss, predictions, performance_matrix = evaluate(rnn_model, val_dataloader, pred_val)
  val_loss.append(average_val_loss)
  # note: the values printed below are the natural log of the average losses
  print('Iteration {}. Training Loss {:.4f}. Validation Loss {:.4f}.\n'.
         format(num_iter, torch.Tensor.log(average_train_loss), torch.Tensor.log(average_val_loss)))


>>> Starting epoch 1


HBox(children=(FloatProgress(value=0.0, max=104.0), HTML(value='')))


Validation Set


HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))


				Precision	Recall		F1		Accuracy	Matrix
- PRFA performance for ADJ:	95.45		38.18		54.55		88.18		[[240.0, 34.0], [1.0, 21.0]]
- PRFA performance for ADP:	86.67		52.00		65.00		79.17		[[201.0, 60.0], [10.0, 65.0]]
- PRFA performance for ADV:	0.00		0.00		nan		98.72		[[542.0, 6.0], [1.0, 0.0]]
- PRFA performance for NOUN:	87.50		5.65		10.61		93.07		[[1577.0, 117.0], [1.0, 7.0]]
- PRFA performance for PRNOUN:	87.50		67.31		76.09		94.46		[[340.0, 17.0], [5.0, 35.0]]
- PRFA performance for VERB:	82.09		19.23		31.16		83.16		[[1145.0, 231.0], [12.0, 55.0]]
- PRFA performance for X:	55.56		12.82		20.83		98.48		[[2456.0, 34.0], [4.0, 5.0]]
Iteration 104. Training Loss -1.0159. Validation Loss -1.3934.

>>> Starting epoch 2




HBox(children=(FloatProgress(value=0.0, max=104.0), HTML(value='')))


Validation Set


HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))


				Precision	Recall		F1		Accuracy	Matrix
- PRFA performance for ADJ:	95.00		34.55		50.67		87.50		[[240.0, 36.0], [1.0, 19.0]]
- PRFA performance for ADP:	90.28		52.00		65.99		80.06		[[204.0, 60.0], [7.0, 65.0]]
- PRFA performance for ADV:	nan		0.00		nan		98.91		[[543.0, 6.0], [0.0, 0.0]]
- PRFA performance for NOUN:	90.48		15.32		26.21		93.71		[[1576.0, 105.0], [2.0, 19.0]]
- PRFA performance for PRNOUN:	92.50		71.15		80.43		95.47		[[342.0, 15.0], [3.0, 37.0]]
- PRFA performance for VERB:	76.00		33.22		46.23		84.68		[[1127.0, 191.0], [30.0, 95.0]]
- PRFA performance for X:	76.47		33.33		46.43		98.80		[[2456.0, 26.0], [4.0, 13.0]]
Iteration 208. Training Loss -1.5390. Validation Loss -1.7777.

>>> Starting epoch 3


  app.launch_new_instance()


HBox(children=(FloatProgress(value=0.0, max=104.0), HTML(value='')))


Validation Set


HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))


				Precision	Recall		F1		Accuracy	Matrix
- PRFA performance for ADJ:	84.00		38.18		52.50		87.16		[[237.0, 34.0], [4.0, 21.0]]
- PRFA performance for ADP:	87.93		81.60		84.65		88.99		[[197.0, 23.0], [14.0, 102.0]]
- PRFA performance for ADV:	0.00		0.00		nan		98.72		[[542.0, 6.0], [1.0, 0.0]]
- PRFA performance for NOUN:	79.49		25.00		38.04		94.07		[[1570.0, 93.0], [8.0, 31.0]]
- PRFA performance for PRNOUN:	94.44		65.38		77.27		94.96		[[343.0, 18.0], [2.0, 34.0]]
- PRFA performance for VERB:	73.26		47.90		57.93		86.21		[[1107.0, 149.0], [50.0, 137.0]]
- PRFA performance for X:	60.00		53.85		56.76		98.72		[[2446.0, 18.0], [14.0, 21.0]]
Iteration 312. Training Loss -1.8821. Validation Loss -2.0309.

>>> Starting epoch 4


HBox(children=(FloatProgress(value=0.0, max=104.0), HTML(value='')))


Validation Set


HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))


				Precision	Recall		F1		Accuracy	Matrix
- PRFA performance for ADJ:	88.46		41.82		56.79		88.18		[[238.0, 32.0], [3.0, 23.0]]
- PRFA performance for ADP:	93.62		70.40		80.37		87.20		[[205.0, 37.0], [6.0, 88.0]]
- PRFA performance for ADV:	nan		0.00		nan		98.91		[[543.0, 6.0], [0.0, 0.0]]
- PRFA performance for NOUN:	86.49		25.81		39.75		94.30		[[1573.0, 92.0], [5.0, 32.0]]
- PRFA performance for PRNOUN:	92.11		67.31		77.78		94.96		[[342.0, 17.0], [3.0, 35.0]]
- PRFA performance for VERB:	85.11		27.97		42.11		84.75		[[1143.0, 206.0], [14.0, 80.0]]
- PRFA performance for X:	60.00		30.77		40.68		98.60		[[2452.0, 27.0], [8.0, 12.0]]
Iteration 416. Training Loss -2.1434. Validation Loss -2.1458.

>>> Starting epoch 5


HBox(children=(FloatProgress(value=0.0, max=104.0), HTML(value='')))


Validation Set


HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))


				Precision	Recall		F1		Accuracy	Matrix
- PRFA performance for ADJ:	70.59		43.64		53.93		86.15		[[231.0, 31.0], [10.0, 24.0]]
- PRFA performance for ADP:	90.20		73.60		81.06		87.20		[[201.0, 33.0], [10.0, 92.0]]
- PRFA performance for ADV:	0.00		0.00		nan		98.72		[[542.0, 6.0], [1.0, 0.0]]
- PRFA performance for NOUN:	81.54		42.74		56.08		95.12		[[1566.0, 71.0], [12.0, 53.0]]
- PRFA performance for PRNOUN:	84.91		86.54		85.71		96.22		[[337.0, 7.0], [8.0, 45.0]]
- PRFA performance for VERB:	77.27		47.55		58.87		86.83		[[1117.0, 150.0], [40.0, 136.0]]
- PRFA performance for X:	68.75		56.41		61.97		98.92		[[2450.0, 17.0], [10.0, 22.0]]
Iteration 520. Training Loss -2.3413. Validation Loss -2.3110.

>>> Starting epoch 6


HBox(children=(FloatProgress(value=0.0, max=104.0), HTML(value='')))


Validation Set


HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))


				Precision	Recall		F1		Accuracy	Matrix
- PRFA performance for ADJ:	72.73		43.64		54.55		86.49		[[232.0, 31.0], [9.0, 24.0]]
- PRFA performance for ADP:	91.07		81.60		86.08		90.18		[[201.0, 23.0], [10.0, 102.0]]
- PRFA performance for ADV:	0.00		0.00		nan		98.72		[[542.0, 6.0], [1.0, 0.0]]
- PRFA performance for NOUN:	83.87		41.94		55.91		95.18		[[1568.0, 72.0], [10.0, 52.0]]
- PRFA performance for PRNOUN:	90.91		76.92		83.33		95.97		[[341.0, 12.0], [4.0, 40.0]]
- PRFA performance for VERB:	75.73		54.55		63.41		87.53		[[1107.0, 130.0], [50.0, 156.0]]
- PRFA performance for X:	71.88		58.97		64.79		99.00		[[2451.0, 16.0], [9.0, 23.0]]
Iteration 624. Training Loss -2.4945. Validation Loss -2.3735.

>>> Starting epoch 7


HBox(children=(FloatProgress(value=0.0, max=104.0), HTML(value='')))


Validation Set


HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))


				Precision	Recall		F1		Accuracy	Matrix
- PRFA performance for ADJ:	73.53		45.45		56.18		86.82		[[232.0, 30.0], [9.0, 25.0]]
- PRFA performance for ADP:	86.29		85.60		85.94		89.58		[[194.0, 18.0], [17.0, 107.0]]
- PRFA performance for ADV:	0.00		0.00		nan		98.54		[[541.0, 6.0], [2.0, 0.0]]
- PRFA performance for NOUN:	88.14		41.94		56.83		95.36		[[1571.0, 72.0], [7.0, 52.0]]
- PRFA performance for PRNOUN:	81.48		84.62		83.02		95.47		[[335.0, 8.0], [10.0, 44.0]]
- PRFA performance for VERB:	78.31		45.45		57.52		86.69		[[1121.0, 156.0], [36.0, 130.0]]
- PRFA performance for X:	69.23		46.15		55.38		98.84		[[2452.0, 21.0], [8.0, 18.0]]
Iteration 728. Training Loss -2.6116. Validation Loss -2.4134.

>>> Starting epoch 8


HBox(children=(FloatProgress(value=0.0, max=104.0), HTML(value='')))


Validation Set


HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))


				Precision	Recall		F1		Accuracy	Matrix
- PRFA performance for ADJ:	82.76		43.64		57.14		87.84		[[236.0, 31.0], [5.0, 24.0]]
- PRFA performance for ADP:	94.85		73.60		82.88		88.69		[[206.0, 33.0], [5.0, 92.0]]
- PRFA performance for ADV:	nan		0.00		nan		98.91		[[543.0, 6.0], [0.0, 0.0]]
- PRFA performance for NOUN:	89.58		34.68		50.00		94.95		[[1573.0, 81.0], [5.0, 43.0]]
- PRFA performance for PRNOUN:	95.45		80.77		87.50		96.98		[[343.0, 10.0], [2.0, 42.0]]
- PRFA performance for VERB:	81.95		38.11		52.03		86.07		[[1133.0, 177.0], [24.0, 109.0]]
- PRFA performance for X:	75.00		38.46		50.85		98.84		[[2455.0, 24.0], [5.0, 15.0]]
Iteration 832. Training Loss -2.7299. Validation Loss -2.3287.

>>> Starting epoch 9


HBox(children=(FloatProgress(value=0.0, max=104.0), HTML(value='')))


Validation Set


HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))


				Precision	Recall		F1		Accuracy	Matrix
- PRFA performance for ADJ:	65.91		52.73		58.59		86.15		[[226.0, 26.0], [15.0, 29.0]]
- PRFA performance for ADP:	89.81		77.60		83.26		88.39		[[200.0, 28.0], [11.0, 97.0]]
- PRFA performance for ADV:	0.00		0.00		nan		98.72		[[542.0, 6.0], [1.0, 0.0]]
- PRFA performance for NOUN:	86.57		46.77		60.73		95.59		[[1569.0, 66.0], [9.0, 58.0]]
- PRFA performance for PRNOUN:	93.94		59.62		72.94		94.21		[[343.0, 21.0], [2.0, 31.0]]
- PRFA performance for VERB:	75.73		54.55		63.41		87.53		[[1107.0, 130.0], [50.0, 156.0]]
- PRFA performance for X:	75.86		56.41		64.71		99.04		[[2453.0, 17.0], [7.0, 22.0]]
Iteration 936. Training Loss -2.8098. Validation Loss -2.4464.

>>> Starting epoch 10


HBox(children=(FloatProgress(value=0.0, max=104.0), HTML(value='')))


Validation Set


HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))


				Precision	Recall		F1		Accuracy	Matrix
- PRFA performance for ADJ:	76.47		47.27		58.43		87.50		[[233.0, 29.0], [8.0, 26.0]]
- PRFA performance for ADP:	87.83		80.80		84.17		88.69		[[197.0, 24.0], [14.0, 101.0]]
- PRFA performance for ADV:	0.00		0.00		nan		98.54		[[541.0, 6.0], [2.0, 0.0]]
- PRFA performance for NOUN:	83.08		43.55		57.14		95.24		[[1567.0, 70.0], [11.0, 54.0]]
- PRFA performance for PRNOUN:	84.62		84.62		84.62		95.97		[[337.0, 8.0], [8.0, 44.0]]
- PRFA performance for VERB:	75.26		51.05		60.83		86.97		[[1109.0, 140.0], [48.0, 146.0]]
- PRFA performance for X:	70.97		56.41		62.86		98.96		[[2451.0, 17.0], [9.0, 22.0]]
Iteration 1040. Training Loss -2.9540. Validation Loss -2.4632.

>>> Starting epoch 11


HBox(children=(FloatProgress(value=0.0, max=104.0), HTML(value='')))


Validation Set


HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))


				Precision	Recall		F1		Accuracy	Matrix
- PRFA performance for ADJ:	79.41		49.09		60.67		88.18		[[234.0, 28.0], [7.0, 27.0]]
- PRFA performance for ADP:	90.48		76.00		82.61		88.10		[[201.0, 30.0], [10.0, 95.0]]
- PRFA performance for ADV:	nan		0.00		nan		98.91		[[543.0, 6.0], [0.0, 0.0]]
- PRFA performance for NOUN:	87.93		41.13		56.04		95.30		[[1571.0, 73.0], [7.0, 51.0]]
- PRFA performance for PRNOUN:	95.00		73.08		82.61		95.97		[[343.0, 14.0], [2.0, 38.0]]
- PRFA performance for VERB:	79.50		44.76		57.27		86.76		[[1124.0, 158.0], [33.0, 128.0]]
- PRFA performance for X:	79.31		58.97		67.65		99.12		[[2454.0, 16.0], [6.0, 23.0]]
Iteration 1144. Training Loss -3.0633. Validation Loss -2.4397.

>>> Starting epoch 12


HBox(children=(FloatProgress(value=0.0, max=104.0), HTML(value='')))


Validation Set


HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))


				Precision	Recall		F1		Accuracy	Matrix
- PRFA performance for ADJ:	81.82		49.09		61.36		88.51		[[235.0, 28.0], [6.0, 27.0]]
- PRFA performance for ADP:	91.26		75.20		82.46		88.10		[[202.0, 31.0], [9.0, 94.0]]
- PRFA performance for ADV:	nan		0.00		nan		98.91		[[543.0, 6.0], [0.0, 0.0]]
- PRFA performance for NOUN:	83.87		41.94		55.91		95.18		[[1568.0, 72.0], [10.0, 52.0]]
- PRFA performance for PRNOUN:	80.39		78.85		79.61		94.71		[[335.0, 11.0], [10.0, 41.0]]
- PRFA performance for VERB:	81.55		47.90		60.35		87.53		[[1126.0, 149.0], [31.0, 137.0]]
- PRFA performance for X:	72.73		61.54		66.67		99.04		[[2451.0, 15.0], [9.0, 24.0]]
Iteration 1248. Training Loss -3.1177. Validation Loss -2.3960.

>>> Starting epoch 13


HBox(children=(FloatProgress(value=0.0, max=104.0), HTML(value='')))


Validation Set


HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))


				Precision	Recall		F1		Accuracy	Matrix
- PRFA performance for ADJ:	77.14		49.09		60.00		87.84		[[233.0, 28.0], [8.0, 27.0]]
- PRFA performance for ADP:	90.99		80.80		85.59		89.88		[[201.0, 24.0], [10.0, 101.0]]
- PRFA performance for ADV:	nan		0.00		nan		98.91		[[543.0, 6.0], [0.0, 0.0]]
- PRFA performance for NOUN:	78.95		48.39		60.00		95.30		[[1562.0, 64.0], [16.0, 60.0]]
- PRFA performance for PRNOUN:	87.23		78.85		82.83		95.72		[[339.0, 11.0], [6.0, 41.0]]
- PRFA performance for VERB:	77.30		50.00		60.72		87.18		[[1115.0, 143.0], [42.0, 143.0]]
- PRFA performance for X:	63.89		58.97		61.33		98.84		[[2447.0, 16.0], [13.0, 23.0]]
Iteration 1352. Training Loss -3.1746. Validation Loss -2.4158.

>>> Starting epoch 14


HBox(children=(FloatProgress(value=0.0, max=104.0), HTML(value='')))


Validation Set


HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))


				Precision	Recall		F1		Accuracy	Matrix
- PRFA performance for ADJ:	78.12		45.45		57.47		87.50		[[234.0, 30.0], [7.0, 25.0]]
- PRFA performance for ADP:	89.57		82.40		85.83		89.88		[[199.0, 22.0], [12.0, 103.0]]
- PRFA performance for ADV:	nan		0.00		nan		98.91		[[543.0, 6.0], [0.0, 0.0]]
- PRFA performance for NOUN:	88.57		50.00		63.92		95.89		[[1570.0, 62.0], [8.0, 62.0]]
- PRFA performance for PRNOUN:	91.67		63.46		75.00		94.46		[[342.0, 19.0], [3.0, 33.0]]
- PRFA performance for VERB:	75.12		53.85		62.73		87.32		[[1106.0, 132.0], [51.0, 154.0]]
- PRFA performance for X:	75.86		56.41		64.71		99.04		[[2453.0, 17.0], [7.0, 22.0]]
Iteration 1456. Training Loss -3.2011. Validation Loss -2.4778.

>>> Starting epoch 15


HBox(children=(FloatProgress(value=0.0, max=104.0), HTML(value='')))


Validation Set


HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))


				Precision	Recall		F1		Accuracy	Matrix
- PRFA performance for ADJ:	86.21		45.45		59.52		88.51		[[237.0, 30.0], [4.0, 25.0]]
- PRFA performance for ADP:	89.38		80.80		84.87		89.29		[[199.0, 24.0], [12.0, 101.0]]
- PRFA performance for ADV:	nan		0.00		nan		98.91		[[543.0, 6.0], [0.0, 0.0]]
- PRFA performance for NOUN:	84.06		46.77		60.10		95.48		[[1567.0, 66.0], [11.0, 58.0]]
- PRFA performance for PRNOUN:	90.48		73.08		80.85		95.47		[[341.0, 14.0], [4.0, 38.0]]
- PRFA performance for VERB:	76.70		55.24		64.23		87.80		[[1109.0, 128.0], [48.0, 158.0]]
- PRFA performance for X:	61.54		41.03		49.23		98.68		[[2450.0, 23.0], [10.0, 16.0]]
Iteration 1560. Training Loss -3.2312. Validation Loss -2.4317.



# 6. Evaluate on test

In [None]:
# Evaluate the trained model on the test set.
# NOTE: the loss returned here is the test loss although it is stored in
# average_val_loss, overwriting the value from the last validation run.
average_val_loss, predictions, performance_matrix = evaluate(rnn_model, test_dataloader, pred_test)

HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))


				Precision	Recall		F1		Accuracy	Matrix
- PRFA performance for ADJ:	71.43		40.82		51.95		86.83		[[224.0, 29.0], [8.0, 20.0]]
- PRFA performance for ADP:	87.16		86.58		86.87		90.08		[[225.0, 20.0], [19.0, 129.0]]
- PRFA performance for ADV:	0.00		0.00		nan		98.76		[[559.0, 4.0], [3.0, 0.0]]
- PRFA performance for NOUN:	72.22		48.15		57.78		95.37		[[1514.0, 56.0], [20.0, 52.0]]
- PRFA performance for PRNOUN:	92.31		72.00		80.90		95.86		[[358.0, 14.0], [3.0, 36.0]]
- PRFA performance for VERB:	76.14		55.15		63.97		88.61		[[1165.0, 122.0], [47.0, 150.0]]
- PRFA performance for X:	64.29		23.68		34.62		98.66		[[2498.0, 29.0], [5.0, 9.0]]




In [None]:
# Rows of the prediction file: a header row, then one row per test sentence
preds = [[
  'sent_index',
  'sent_txt',
  'pos_seq',
  'metaphor_seq',
  'genre',
  'sent_txt_tokenized',
  'sent_bert_tokens',
  'sent_txt_tokenized_bert',
  'BIO_seq',
  'predicitons',
]]
for i, sentence_prediction in enumerate(predictions):
  pred = [
    *raw_test[i][:2],                                   # first two raw columns unchanged
    str(pred_test[i][-1]),                              # third column comes from the processed dataset
    *raw_test[i][3:],                                   # remaining raw columns unchanged
    str(sentence_prediction.cpu().numpy().tolist()),    # predicted labels as a list literal
  ]
  preds.append(pred)

In [None]:
# write the prediction rows into a csv file
psu_pred = '/content/drive/MyDrive/Metaphor Detection/predict/PSUCMC_Elmo+FT_BiLSTM_LM.csv'
# the with-block closes the file even if writing fails;
# newline='' is what the csv module asks for on files given to csv.writer
with open(psu_pred, 'w', newline='') as fh:
  writer = csv.writer(fh)
  writer.writerows(preds)

In [None]:
# ADJ evaluation
get_performance_pos(psu_pred, 'ADJ')

ADJ
Genre-specific Performance:
		Precision	Recall		F1		Accuracy	Matrix
- News:		75.00		60.00		66.67		92.31		[[66.0, 4.0], [2.0, 6.0]]
- Fiction:	75.00		18.75		30.00		84.27		[[72.0, 13.0], [1.0, 3.0]]
- Academic:	68.75		47.83		56.41		85.09		[[86.0, 12.0], [5.0, 11.0]]

General Performance
Precision	Recall		F1		Accuracy	Matrix
71.43		40.82		51.95		86.83		[[224.0, 29.0], [8.0, 20.0]]



array([72.91666667, 42.19202899, 51.02564103, 87.22169151])

In [None]:
# ADP evaluation
get_performance_pos(psu_pred, 'ADP')

ADP
Genre-specific Performance:
		Precision	Recall		F1		Accuracy	Matrix
- News:		80.65		78.12		79.37		89.17		[[82.0, 7.0], [6.0, 25.0]]
- Fiction:	87.50		87.50		87.50		93.70		[[91.0, 4.0], [4.0, 28.0]]
- Academic:	89.41		89.41		89.41		87.67		[[52.0, 9.0], [9.0, 76.0]]

General Performance
Precision	Recall		F1		Accuracy	Matrix
87.16		86.58		86.87		90.08		[[225.0, 20.0], [19.0, 129.0]]



array([85.85230867, 85.0122549 , 85.42561469, 90.17956231])

In [None]:
# ADV evaluation
get_performance_pos(psu_pred, 'ADV')

ADV
Genre-specific Performance:
		Precision	Recall		F1		Accuracy	Matrix
- News:		0.00		0.00		nan		97.86		[[183.0, 2.0], [2.0, 0.0]]
- Fiction:	0.00		nan		nan		99.52		[[209.0, 0.0], [1.0, 0.0]]
- Academic:	nan		0.00		nan		98.82		[[167.0, 2.0], [0.0, 0.0]]

General Performance
Precision	Recall		F1		Accuracy	Matrix
0.00		0.00		nan		98.76		[[559.0, 4.0], [3.0, 0.0]]





array([        nan,         nan,         nan, 98.73378005])

In [None]:
# All-Pos evaluation
print('All')
get_performance_test(psu_pred)

All
Genre-specific Performance:
		Precision	Recall		F1		Accuracy	Matrix
- News:		74.16		47.48		57.89		96.01		[[2242.0, 73.0], [23.0, 66.0]]
- Fiction:	73.73		47.28		57.62		94.70		[[2200.0, 97.0], [31.0, 87.0]]
- Academic:	82.65		70.03		75.82		93.80		[[2101.0, 104.0], [51.0, 243.0]]

General Performance
Precision	Recall		F1		Accuracy	Matrix
79.04		59.10		67.63		94.82		[[6543.0, 274.0], [105.0, 396.0]]



array([76.84639272, 54.93114718, 63.77655455, 94.83465585])

In [None]:
# NOUN evaluation
get_performance_pos(psu_pred, 'NOUN')

NOUN
Genre-specific Performance:
		Precision	Recall		F1		Accuracy	Matrix
- News:		75.00		37.50		50.00		96.95		[[563.0, 15.0], [3.0, 9.0]]
- Fiction:	70.00		24.14		35.90		94.54		[[426.0, 22.0], [3.0, 7.0]]
- Academic:	72.00		65.45		68.57		94.44		[[525.0, 19.0], [14.0, 36.0]]

General Performance
Precision	Recall		F1		Accuracy	Matrix
72.22		48.15		57.78		95.37		[[1514.0, 56.0], [20.0, 52.0]]



array([72.33333333, 42.36415883, 51.48962149, 95.3116939 ])

In [None]:
# Pronoun evaluation
get_performance_pos(psu_pred, 'PRNOUN')

PRNOUN
Genre-specific Performance:
		Precision	Recall		F1		Accuracy	Matrix
- News:		100.00		71.43		83.33		97.94		[[90.0, 2.0], [0.0, 5.0]]
- Fiction:	70.00		53.85		60.87		95.93		[[205.0, 6.0], [3.0, 7.0]]
- Academic:	100.00		80.00		88.89		93.55		[[63.0, 6.0], [0.0, 24.0]]

General Performance
Precision	Recall		F1		Accuracy	Matrix
92.31		72.00		80.90		95.86		[[358.0, 14.0], [3.0, 36.0]]



array([90.        , 68.42490842, 77.69726248, 95.80471108])

In [None]:
# VERB evaluation
get_performance_pos(psu_pred, 'VERB')

VERB
Genre-specific Performance:
		Precision	Recall		F1		Accuracy	Matrix
- News:		66.67		33.96		45.00		90.24		[[389.0, 35.0], [9.0, 18.0]]
- Fiction:	71.19		50.00		58.74		88.18		[[398.0, 42.0], [17.0, 42.0]]
- Academic:	81.08		66.67		73.17		87.64		[[378.0, 45.0], [21.0, 90.0]]

General Performance
Precision	Recall		F1		Accuracy	Matrix
76.14		55.15		63.97		88.61		[[1165.0, 122.0], [47.0, 150.0]]



array([72.97806281, 50.20964361, 58.97066348, 88.68690153])