# COMP5046 Assignment 2

##Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

**Read raw data**

In [2]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

id = '10rWPMnFgLkVTbmDGzukP7ZLtBEN7AZdh'
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('train.csv')  

raw_training_df = pd.read_csv('train.csv')

id = '1y7EDO2Thqn5mtsbQgx-YjAvWzkx9xqvn'
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('test_without_lables.csv')  

testing_df = pd.read_csv('test_without_lables.csv')

id = '18lUFTMp7PxIBjqU8ImfMrymRJeysljjl'
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('val.csv')  

raw_val_df = pd.read_csv('val.csv')

**Shorten Sentence**

In [3]:
MAX_LENGTH = 64 # max no. words for each sentence.
OVERLAP = 20 # if a sentence exceeds MAX_LENGTH, we split it to multiple sentences with overlapping

def shorten_sentences(df):
    short_sentences = []
    short_labels = []
    for i, sentence, sentence_label in df[['sents','labels']].itertuples():
        words = sentence.split()
        labels = sentence_label.split()
        if len(words) > MAX_LENGTH:
            for p in range(0, len(words), MAX_LENGTH - OVERLAP):
                short_sentences.append(' '.join(words[p:p+MAX_LENGTH]))
                short_labels.append(' '.join(labels[p:p+MAX_LENGTH]))
        else:
            short_sentences.append(sentence)
            short_labels.append(sentence_label)
    return pd.DataFrame({"sents":short_sentences, "labels":short_labels})

# sentence is shorten

training_df = shorten_sentences(raw_training_df)
print(f'==================== No. training rows: {len(training_df)}=======================')
print("The max length of trainging sentences is :", max(map(lambda k : len(k.split()),training_df['sents'])))


val_df = shorten_sentences(raw_val_df)
print(f'\n==================== No. validation rows: {len(val_df)}=======================')
print("The max length of validation sentences is :", max(map(lambda k : len(k.split()),val_df['sents'])))


print(f'\n==================== No. testing rows: {len(testing_df)}=======================')
print("The max length of testing sentences is :", max(map(lambda k : len(k.split()),testing_df['sents'])))


The max length of trainging sentences is : 64

The max length of validation sentences is : 64

The max length of testing sentences is : 65


**Tokenise sentences into word tokens**

In [4]:
train_data, target_y_train = list(training_df['sents']),list(training_df['labels'])
validation_data, target_y_validation = list(val_df['sents']),list(val_df['labels'])
test_data = list(testing_df['sents'])

uncased_training_data = [text.split() for text in train_data]
uncased_val_data = [text.split() for text in validation_data]
uncased_test_data = [text.split() for text in test_data]
all_uncased_data = uncased_training_data + uncased_val_data + uncased_test_data

train_data = [text.split() for text in train_data]
target_y_train = [text.split() for text in target_y_train]
validation_data = [text.split() for text in validation_data]
target_y_validation = [text.split() for text in target_y_validation]
test_data = [text.split() for text in test_data]

##Input Embedding

**Semantic Textual Feature Embedding**
- Word Embedding: WordtoVec

In [5]:
import gensim.downloader as api
word_emb_model4 = api.load("glove-wiki-gigaword-50")

# word to vector embedding example
# word_emb_model4['iraq']

**Syntactic Textual Feature Embedding**
- PoS tag
- Dependency path



In [6]:
# Get PoS tagging and dependency information about a word from spacy
import spacy
from spacy.tokenizer import Tokenizer
nlp = spacy.load('en_core_web_sm')
nlp.tokenizer = Tokenizer(nlp.vocab)

In [7]:
!pip install pymagnitude



In [8]:
# Get the vector representations of PoS tagging and dependency information from pymagnitude
from pymagnitude import *
pos_vectors = FeaturizerMagnitude(300, namespace = "PartsOfSpeech")
dependency_vectors = FeaturizerMagnitude(300, namespace = "SyntaxDependencies")
vectors = Magnitude(pos_vectors, dependency_vectors) # concatenate word2vec with pos and dependencies

**Domain Feature - Case features represented as a list**

In [9]:
# Customise case feature embedding of a word
import re
def case_features(text):
    # return case feature of a work
    return [
        int(text[0].isupper()), # Start with capital letter
        int(text.upper() == text), # All capital letters
        int(text.lower() == text), # All lower letters
        int(all([ch.isdigit() for ch in text])), # all digits
        int(bool((re.match('^(?=.*[0-9]$)(?=.*[a-zA-Z])',text)))), # 'is_alphanumeric' 
        1 if '-' in text else 0  #'word_has_hyphen'
        ]

# Embedding Choice for Sentence-Level Embedding
Embedding can be customised by choosing the desirable arguments.

Syntactic feature
* w1: word-level PoS embedding from Spacy
* w2: word-level dependy embedding from Spacy

Domain feature
* w3: word-level case features embedding

Only w1 and w3 are chosen as elements of sentence level embedding for best model.


In [10]:
import torch
import numpy

# Get word2vec embedding from glove,  token PoS tag embedding and dependency embedding from Space
def get_sentence_embedding(all_uncased_data, embedding_choice):
  dependency_embedding = {}
  pos_embedding = {}
  case_embedding = {}
  sentence_embeds = {}

  train_dp = all_uncased_data

  for sentence in train_dp:
    sentence = ' '.join(sentence)
    doc = nlp(sentence)

    embeds = []

    for token in doc:
      # Word-level syntactic/domain features
      pos_embedding[token.text] = token.pos_
      dependency_embedding[token.text] = token.dep_
      case_embedding[token.text] = case_features(token.text)

      w1 = torch.tensor([pos_vectors.query(token.pos_)]).float()
      w2 = torch.tensor([case_features(token.text)]).float()
      w3 = torch.tensor([dependency_vectors.query(token.dep_)]).float()

      if embedding_choice == ["PoS"]:
        embeds.append(numpy.array(w1).tolist())
      elif embedding_choice == ["PoS","case_feature"]:
        embeds.append(numpy.array(torch.cat((w1,w2),1).tolist()[0]))
      elif embedding_choice == ["PoS","dependency"]:
        embeds.append(numpy.array(torch.cat((w1,w3),1).tolist()[0]))
      elif embedding_choice == ["PoS","case_feature",'dependency']:
        embeds.append(numpy.array(torch.cat((w1,w2,w3),1).tolist()[0]))

    sentence_embeds[sentence] = np.array(embeds)
  
  # print("DP length:", len(dependency_embedding))
  # print("PoS length:", len(pos_embedding))
  # print("Case length:", len(case_embedding))
  return sentence_embeds

In [11]:
# Pos + case features + dependency of the word are the possible candidates for the sentence level embedding
all_choices = ["PoS","case_feature",'dependency']
# possible combinations are [0],[0,1],[0,2],[0,1,2] 

# Change this line to choose your desired word-level embedding choices
sentence_embedding_choices = ["PoS","case_feature"]

In [12]:
sentence_embeds = get_sentence_embedding(all_uncased_data, sentence_embedding_choices)

**Map word/tag in training/validation/test data to index**

In [13]:
# Map word to index and index to word
ix_to_word = {}
word_to_ix = {}
for sentence in all_uncased_data:
    for word in sentence:
        # word = word.lower()
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
            ix_to_word[word_to_ix[word]] = word

# Store all words in word_list
word_list = list(word_to_ix.keys())

# Map word to index and index to word
START_TAG = "<START>"
STOP_TAG = "<STOP>"
tag_to_ix = {START_TAG:0, STOP_TAG:1}
ix_to_tag = {}
for tags in target_y_train+target_y_validation:
    for tag in tags:
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)
            ix_to_tag[tag_to_ix[tag]] = tag

# Check word_list length
len(word_list)

4291

**Construct embedding matrix for word embedding**


In [14]:
import torch

word_embedding_matrix = []
for i in range(len(word_list)):
    word = word_list[i]

    if word.lower() in word_emb_model4:
      v = word_emb_model4.wv[word.lower()]
      w1 = torch.tensor([v.tolist()]).float()
    # word embedding for OOV
    else:
      w1 = torch.tensor([[0]*50]).float()

    word_embedding_matrix.append(w1.tolist()[0])


word_embedding_matrix = np.array(word_embedding_matrix)
word_embedding_matrix.shape

  


(4291, 50)

**Map all words data to indices data**

In [15]:
def word_to_index(data, to_ix, tag=False):
    input_index_list = []
    for sent in data:
        if tag:
            input_index_list.append([to_ix[w] for w in sent])
        else:
            input_index_list.append([to_ix[w] for w in sent])
    return input_index_list

train_input_index =  word_to_index(uncased_training_data,word_to_ix)
train_output_index = word_to_index(target_y_train,tag_to_ix,True)
val_input_index = word_to_index(uncased_val_data,word_to_ix)
val_output_index = word_to_index(target_y_validation,tag_to_ix,True)
test_uncased_input_index  = word_to_index(uncased_test_data,word_to_ix)

In [16]:
print("word to index example", word_to_ix['Abdulbaset'])
print("test input index example", test_uncased_input_index[:10])

word to index example 3265
test input index example [[1179, 3752, 1906, 49, 138, 184, 140, 35, 20, 193, 372, 2609, 53, 40, 494, 33, 2089, 704, 1063, 33, 431, 435, 40, 33, 213, 90, 340, 25, 97, 1064, 27, 118, 377, 155, 764, 31, 2699, 33, 1169, 84, 53, 2919, 31, 33, 183, 44], [3753, 3754, 24, 33, 46, 3755, 350, 3756, 53, 3757, 53, 3758, 53, 3759, 53, 40, 3760, 3761, 777, 479, 47, 20, 3762, 3763, 3764, 44], [45, 768, 680, 35, 33, 3725, 166, 470, 639, 27, 2294, 53, 3765, 44, 3315, 44, 79, 3404, 3006, 53, 1784, 53, 415, 33, 3766, 1095, 44], [92, 92, 92, 519, 96, 1864, 158, 53, 96, 107, 3767, 31, 3559, 33, 543, 26, 59, 391, 635, 486, 53, 92, 92, 493, 160, 53, 2258, 59, 33, 184, 3768, 1568, 3769, 44, 92], [1516, 3770, 684, 20, 3771, 49, 3772, 27, 20, 2563, 1385, 35, 33, 399, 79, 2266, 53, 3649, 1841, 2980, 3773, 160, 44], [45, 229, 148, 1976, 31, 3729, 3774, 3775, 53, 1745, 352, 3776, 40, 74, 3777, 27, 206, 33, 1800, 34, 35, 3778, 40, 3779, 35, 3780, 44], [2301, 33, 1907, 398, 35, 3781, 40, 6

In [17]:
WORD_EMBEDDING_DIM = word_embedding_matrix.shape[1] # 50
choices_dim = {"PoS":4,"case_feature":6,'dependency':4}
SENTENCE_EMBEDDING_DIM = sum([choices_dim[choice] for choice in sentence_embedding_choices])

EMBEDDING_DIM = WORD_EMBEDDING_DIM + SENTENCE_EMBEDDING_DIM
EMBEDDING_DIM

60

# NER Model

In [61]:
# Reference lab9
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

torch.manual_seed(1)

def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim,crf = True,attention = 'general',layer = 1,sent_embed_status = True):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.crf = crf
        self.attention = attention
        self.layer = layer
        self.sent_embed_status = sent_embed_status

        self.word_embeds = nn.Embedding(vocab_size, WORD_EMBEDDING_DIM)
        self.fc = nn.Linear(hidden_dim, hidden_dim, bias=False)

        """Here we use the embedding matrix as the initial weights of nn.Embedding"""
        self.word_embeds.weight.data.copy_(torch.from_numpy(word_embedding_matrix))
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=layer, dropout = 0.1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        if attention == 'no':
          self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
        else:
          self.hidden2tag = nn.Linear(hidden_dim*2, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.dropout_lstm=nn.Dropout(p=0.5)
        self.dropout = nn.Dropout(p=0.3)

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2*self.layer, 1, self.hidden_dim // 2).to(device),
                torch.randn(2*self.layer, 1, self.hidden_dim // 2).to(device))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def self_attention(self,lstm_out, l, method=None):
    
        embeds = lstm_out.view(l,-1)
        # print("emb",embeds.shape)

        k = torch.empty(self.hidden_dim,self.hidden_dim)
        #nn.init.uniform_(k,-np.sqrt(self.hidden_dim),np.sqrt(self.hidden_dim))

        q = torch.empty(self.hidden_dim,self.hidden_dim)
        #nn.init.uniform_(q,-np.sqrt(self.hidden_dim),np.sqrt(self.hidden_dim))

        v = torch.empty(self.hidden_dim,self.hidden_dim)
        #nn.init.uniform_(v,-np.sqrt(self.hidden_dim),np.sqrt(self.hidden_dim))

        torch.nn.init.xavier_normal_(k)
        torch.nn.init.xavier_normal_(q)
        torch.nn.init.xavier_normal_(v)

        
        keys = embeds @ k
        querys = embeds @ q
        values = embeds @ v

        method = self.attention

        if method == "dot":
          # For the dot scoring method, no weights or linear layers are involved
          attn_scores = querys @ keys.T
          attn_scores_softmax = F.softmax(attn_scores, dim=-1)

        
        elif method == "general":
          # For general scoring, decoder hidden state is passed through linear layers to introduce a weight matrix
          querys = self.fc(querys)
          attn_scores = querys @ keys.T
          # return encoder_outputs.bmm(out.view(1,-1,1)).squeeze(-1)
          attn_scores_softmax = F.softmax(attn_scores, dim=-1)
        
        elif method == "scaled":
          # For concat scoring, decoder hidden state and encoder outputs are concatenated first
          # out = torch.tanh(self.fc(decoder_hidden+encoder_outputs))
          # return out.bmm(self.weight.unsqueeze(-1)).squeeze(-1)
          attn_scores = querys @ keys.T
          attn_scores_softmax = F.softmax(attn_scores/np.sqrt(self.hidden_dim), dim=-1)

      
        weighted_values = values[:,None] * attn_scores_softmax.T[:,:,None]
        outputs = weighted_values.sum(dim=0)

        outputs = outputs.view(-1,1,self.hidden_dim)
        #outputs = self.dropout(outputs)

        concat_output = torch.cat((outputs, lstm_out), 1)

        return concat_output
        

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        if self.sent_embed_status:
          embeds = self.word_embeds(sentence).view(len(sentence), -1)
          sent_embed = torch.tensor(sentence_embeds[tag_to_sent(sentence.tolist())]).float().view(len(sentence), -1)
          embeds = torch.cat((embeds,sent_embed),1).view(len(sentence), 1, -1)
        else:
          embeds = self.word_embeds(sentence).view(len(sentence),1, -1)
          
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)

        if self.attention == 'no':
          lstm_out = lstm_out.view(-1, self.hidden_dim)
        else:
          lstm_out = self.self_attention(lstm_out,len(sentence))
          lstm_out = lstm_out.view(-1, self.hidden_dim*2)

        lstm_out = self.dropout_lstm(lstm_out)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)
        
        if not self.crf:
          return 0,torch.argmax(lstm_feats,dim=1)

        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [19]:
# Reference lab9
import numpy as np
import itertools

def tag_to_sent(sentence):
  sent = []
  for idx in sentence:
    sent.append(ix_to_word[idx])
  return ' '.join(sent)

def cal_acc(model, input_index, output_index):
  predicted = []
  for sentence in input_index:
    score, tag_seq = model.forward(torch.tensor(sentence).to(device))
    predicted.append(tag_seq)
  predicted =  list(itertools.chain.from_iterable(predicted))
  ground_truth = list(itertools.chain.from_iterable(output_index))
  correct = 0
  for i in range(0,len(predicted)):
    if predicted[i] == ground_truth[i]:
      correct += 1
  return predicted,ground_truth,((correct/len(predicted)))

In [20]:
# IMPORTANT TIPS:
# Try this if running NER models shows a 

# RuntimeError: PyTorch was compiled without NumPy support

!pip install numpy==1.16.4
!pip install torch==1.6.0

Collecting numpy==1.16.4
  Using cached https://files.pythonhosted.org/packages/fc/d1/45be1144b03b6b1e24f9a924f23f66b4ad030d834ad31fb9e5581bd328af/numpy-1.16.4-cp37-cp37m-manylinux1_x86_64.whl
[31mERROR: xarray 0.18.2 has requirement numpy>=1.17, but you'll have numpy 1.16.4 which is incompatible.[0m
[31mERROR: torchvision 0.9.1+cu101 has requirement torch==1.8.1, but you'll have torch 1.6.0 which is incompatible.[0m
[31mERROR: torchtext 0.9.1 has requirement torch==1.8.1, but you'll have torch 1.6.0 which is incompatible.[0m
[31mERROR: tensorflow 2.5.0 has requirement h5py~=3.1.0, but you'll have h5py 3.2.1 which is incompatible.[0m
[31mERROR: tensorflow 2.5.0 has requirement numpy~=1.19.2, but you'll have numpy 1.16.4 which is incompatible.[0m
[31mERROR: tensorflow 2.5.0 has requirement six~=1.15.0, but you'll have six 1.16.0 which is incompatible.[0m
[31mERROR: pyerfa 2.0.0 has requirement numpy>=1.17, but you'll have numpy 1.16.4 which is incompatible.[0m
[31mERROR: 



In [21]:
import torch

device = torch.device("cpu")
HIDDEN_DIM = 128

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

  "num_layers={}".format(dropout, num_layers))


In [50]:
"""Each epoch will take about 1-2 minutes"""

import datetime
# Early stopping
def start_training(model):
  the_last_loss = 100
  patience = 2
  trigger_times = 0
  best = model

  for epoch in range(15):  
      time1 = datetime.datetime.now()
      train_loss = 0

      model.train()
      for i, idxs in enumerate(train_input_index):
          tags_index = train_output_index[i]

          # Step 1. Remember that Pytorch accumulates gradients.
          # We need to clear them out before each instance
          model.zero_grad()

          # Step 2. Get our inputs ready for the network, that is,
          # turn them into Tensors of word indices.
          sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
          targets = torch.tensor(tags_index, dtype=torch.long).to(device)

          # Step 3. Run our forward pass.
          loss = model.neg_log_likelihood(sentence_in, targets)

          # Step 4. Compute the loss, gradients, and update the parameters by
          # calling optimizer.step()
          loss.backward()
          optimizer.step()

          train_loss+=loss.item()

      model.eval()
      # Call the cal_acc functions you implemented as required
      _, _, train_acc = cal_acc(model,train_input_index,train_output_index)
      _, _, val_acc = cal_acc(model,val_input_index,val_output_index)

      val_loss = 0
      for i, idxs in enumerate(val_input_index):
          tags_index = val_output_index[i]
          sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
          targets = torch.tensor(tags_index, dtype=torch.long).to(device)
          loss = model.neg_log_likelihood(sentence_in, targets)
          val_loss+=loss.item()
      time2 = datetime.datetime.now()

      # # Early stopping
      if val_loss > the_last_loss:
          trigger_times += 1
          # print('trigger times:', trigger_times)
          if trigger_times >= patience:
              print('Early stopping!\nProceed to prediciton process.')
              break
      else:
          # print('trigger times: 0')
          trigger_times = 0
          best = model
      the_last_loss = val_loss

      print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs, Early Stopping trigger times: %d" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds(), trigger_times))

  # use the model that did not show increase in loss
  model = best
  # The log below is the sample output for this section

In [None]:
start_training(model)

In [None]:
predicted = []
for sentence in test_uncased_input_index:
  score, tag_seq = model.forward(torch.tensor(sentence).to(device))
  #tag_seq = [int(x)-2 for x in tag_seq]
  tag_seq = [ix_to_tag[int(x)] for x in tag_seq]
  predicted.append(tag_seq)
predicted =  list(itertools.chain.from_iterable(predicted))

In [None]:
print("There are a total of {} tags".format(len(predicted)))

To submission

In [None]:
# import pandas as pd
# df = pd.read_csv('submisson-pos-glove-dp--wiki50.csv', index_col=0)

ids = [i for i in range(len(predicted))]
df = pd.DataFrame(
    {
        'Id':ids,
     'Predicted': predicted
    })

df.to_csv('submission-bestmodel.csv', index=False)

##Testing and Evaluation - Setup

**DO NOT run the code under setup secetion, those are just evidence that showing the results are come form our model** 


**1. Baseline model**


In [85]:
# import gensim.downloader as api
# word_emb_model_base = api.load("glove-twitter-100")

# # word to vector embedding example
# # word_emb_model4['iraq']



In [112]:
# import torch

# word_embedding_matrix = []
# for i in range(len(word_list)):
#     word = word_list[i]

#     if word.lower() in word_emb_model_base:
#       v = word_emb_model_base.wv[word.lower()]
#       w1 = torch.tensor([v.tolist()]).float()
#     # word embedding for OOV
#     else:
#       w1 = torch.tensor([[0]*100]).float()

#     word_embedding_matrix.append(w1.tolist()[0])


# word_embedding_matrix = np.array(word_embedding_matrix)
# word_embedding_matrix.shape

  


(4291, 50)

In [90]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# HIDDEN_DIM = 128
# model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM,attention='no',sent_embed_status = False,crf=True,layer=1).to(device)
# optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

# start_training(model)
# torch.save(model,'baseline-model.pt')

  "num_layers={}".format(dropout, num_layers))


Epoch:1, Training loss: 15635.23, train acc: 0.7620, val loss: 4550.30, val acc: 0.7026, time: 119.71s, Early Stopping trigger times: 1
Epoch:2, Training loss: 8860.65, train acc: 0.8094, val loss: 3534.04, val acc: 0.7523, time: 120.27s, Early Stopping trigger times: 0
Epoch:3, Training loss: 6922.89, train acc: 0.8340, val loss: 3070.07, val acc: 0.7707, time: 120.12s, Early Stopping trigger times: 0
Epoch:4, Training loss: 5886.88, train acc: 0.8538, val loss: 2730.00, val acc: 0.7827, time: 120.17s, Early Stopping trigger times: 0
Epoch:5, Training loss: 5101.98, train acc: 0.8546, val loss: 2660.27, val acc: 0.7858, time: 120.06s, Early Stopping trigger times: 0
Epoch:6, Training loss: 4513.29, train acc: 0.8745, val loss: 2468.29, val acc: 0.7961, time: 119.57s, Early Stopping trigger times: 0
Epoch:7, Training loss: 4138.76, train acc: 0.8845, val loss: 2375.49, val acc: 0.8033, time: 120.03s, Early Stopping trigger times: 0
Epoch:8, Training loss: 3641.68, train acc: 0.9018, va

**2. Ablation Study - different input embedding model**

~~~
Base model -glove + 1 layer + crf
~~~

In [59]:
# # Change this line to choose your desired word-level embedding choices
# sentence_embedding_choices = []
# sentence_embeds = get_sentence_embedding(all_uncased_data, sentence_embedding_choices)
# WORD_EMBEDDING_DIM = word_embedding_matrix.shape[1] # 50
# choices_dim = {"PoS":4,"case_feature":6,'dependency':4}
# SENTENCE_EMBEDDING_DIM = sum([choices_dim[choice] for choice in sentence_embedding_choices])

# EMBEDDING_DIM = WORD_EMBEDDING_DIM + SENTENCE_EMBEDDING_DIM
# EMBEDDING_DIM = 50

In [67]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# HIDDEN_DIM = 128
# model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM,attention='scaled',sent_embed_status = False,crf=True).to(device)
# optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

# start_training(model)
# torch.save(model,'basemodel-glove.pt')

  "num_layers={}".format(dropout, num_layers))


Epoch:1, Training loss: 14747.81, train acc: 0.7401, val loss: 4795.25, val acc: 0.6799, time: 123.16s, Early Stopping trigger times: 1
Epoch:2, Training loss: 9425.27, train acc: 0.7848, val loss: 3543.07, val acc: 0.7205, time: 122.07s, Early Stopping trigger times: 0
Epoch:3, Training loss: 7480.08, train acc: 0.7982, val loss: 3071.07, val acc: 0.7438, time: 122.95s, Early Stopping trigger times: 0
Epoch:4, Training loss: 6393.17, train acc: 0.8213, val loss: 2716.78, val acc: 0.7620, time: 122.72s, Early Stopping trigger times: 0
Epoch:5, Training loss: 5686.42, train acc: 0.8364, val loss: 2471.11, val acc: 0.7767, time: 122.65s, Early Stopping trigger times: 0
Epoch:6, Training loss: 4954.25, train acc: 0.8523, val loss: 2243.60, val acc: 0.7862, time: 123.12s, Early Stopping trigger times: 0
Epoch:7, Training loss: 4518.76, train acc: 0.8646, val loss: 2122.03, val acc: 0.7992, time: 123.79s, Early Stopping trigger times: 0
Epoch:8, Training loss: 4184.88, train acc: 0.8670, va

~~~
Syntactic Textual Feature Embedding:  POS tag information
~~~

In [113]:
# sentence_embedding_choices = ["PoS"]
# sentence_embeds = get_sentence_embedding(all_uncased_data, sentence_embedding_choices)
# WORD_EMBEDDING_DIM = word_embedding_matrix.shape[1] # 50
# choices_dim = {"PoS":4,"case_feature":6,'dependency':4}
# SENTENCE_EMBEDDING_DIM = sum([choices_dim[choice] for choice in sentence_embedding_choices])

# EMBEDDING_DIM = WORD_EMBEDDING_DIM + SENTENCE_EMBEDDING_DIM
# EMBEDDING_DIM

54

In [114]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# HIDDEN_DIM = 128
# model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM,attention='general',sent_embed_status = True,crf=True).to(device)
# optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

# start_training(model)
# torch.save(model,'basemodel-glove-pos.pt')

  "num_layers={}".format(dropout, num_layers))


Epoch:1, Training loss: 15527.45, train acc: 0.7433, val loss: 4830.98, val acc: 0.6905, time: 122.15s, Early Stopping trigger times: 1
Epoch:2, Training loss: 9293.35, train acc: 0.7902, val loss: 3669.94, val acc: 0.7376, time: 121.64s, Early Stopping trigger times: 0
Epoch:3, Training loss: 7306.16, train acc: 0.8178, val loss: 3008.75, val acc: 0.7665, time: 123.27s, Early Stopping trigger times: 0
Epoch:4, Training loss: 6280.31, train acc: 0.8241, val loss: 2832.48, val acc: 0.7633, time: 121.73s, Early Stopping trigger times: 0
Epoch:5, Training loss: 5418.59, train acc: 0.8445, val loss: 2451.62, val acc: 0.7841, time: 122.30s, Early Stopping trigger times: 0
Epoch:6, Training loss: 4814.53, train acc: 0.8613, val loss: 2286.99, val acc: 0.7927, time: 123.12s, Early Stopping trigger times: 0
Epoch:7, Training loss: 4419.86, train acc: 0.8670, val loss: 2182.21, val acc: 0.7987, time: 122.34s, Early Stopping trigger times: 0
Epoch:8, Training loss: 4058.12, train acc: 0.8830, va

~~~
Semantic Textual Feature Embedding: glove-wiki-gigaword-50
~~~

~~~
Domain Feature Embedding: case features
~~~

In [116]:
# sentence_embedding_choices = ["PoS","case_feature"]
# sentence_embeds = get_sentence_embedding(all_uncased_data, sentence_embedding_choices)
# WORD_EMBEDDING_DIM = word_embedding_matrix.shape[1] # 50
# choices_dim = {"PoS":4,"case_feature":6,'dependency':4}
# SENTENCE_EMBEDDING_DIM = sum([choices_dim[choice] for choice in sentence_embedding_choices])

# EMBEDDING_DIM = WORD_EMBEDDING_DIM + SENTENCE_EMBEDDING_DIM
# EMBEDDING_DIM

60

In [117]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# HIDDEN_DIM = 128
# model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM,attention='scaled',sent_embed_status = True,crf=True).to(device)
# optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

# start_training(model)
# torch.save(model,'basemodel-glove-pos-case.pt')

  "num_layers={}".format(dropout, num_layers))


Epoch:1, Training loss: 15073.15, train acc: 0.7712, val loss: 4474.64, val acc: 0.7153, time: 123.13s, Early Stopping trigger times: 1
Epoch:2, Training loss: 8799.65, train acc: 0.8042, val loss: 3426.09, val acc: 0.7581, time: 122.83s, Early Stopping trigger times: 0
Epoch:3, Training loss: 7033.62, train acc: 0.8218, val loss: 3002.46, val acc: 0.7652, time: 123.81s, Early Stopping trigger times: 0
Epoch:4, Training loss: 5999.13, train acc: 0.8304, val loss: 2767.18, val acc: 0.7737, time: 122.18s, Early Stopping trigger times: 0
Epoch:5, Training loss: 5263.33, train acc: 0.8477, val loss: 2436.75, val acc: 0.7968, time: 122.65s, Early Stopping trigger times: 0
Epoch:6, Training loss: 4828.98, train acc: 0.8561, val loss: 2311.63, val acc: 0.7988, time: 122.41s, Early Stopping trigger times: 0
Epoch:7, Training loss: 4415.15, train acc: 0.8623, val loss: 2158.07, val acc: 0.8113, time: 122.95s, Early Stopping trigger times: 0
Epoch:8, Training loss: 4051.91, train acc: 0.8768, va

**3. Ablation Study - different attention strategy**

```
Base model - glove + case features + pos + 1 layer + crf
```

In [80]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# HIDDEN_DIM = 128
# model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM,attention='no').to(device)
# optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

# start_training(model)
# torch.save(model,'basemodel-no-attention.pt')

~~~
Dot-product
~~~

In [36]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# HIDDEN_DIM = 128
# model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM,attention='dot').to(device)
# optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

# start_training(model)
# torch.save(model,'basemodel-dot-product.pt')

  "num_layers={}".format(dropout, num_layers))


Epoch:1, Training loss: 14709.39, train acc: 0.7714, val loss: 4406.71, val acc: 0.7265, time: 120.75s, Early Stopping trigger times: 1
Epoch:2, Training loss: 8709.91, train acc: 0.7994, val loss: 3534.63, val acc: 0.7536, time: 120.73s, Early Stopping trigger times: 0
Epoch:3, Training loss: 7113.20, train acc: 0.8212, val loss: 3103.72, val acc: 0.7704, time: 120.48s, Early Stopping trigger times: 0
Epoch:4, Training loss: 6073.18, train acc: 0.8364, val loss: 2788.99, val acc: 0.7825, time: 120.90s, Early Stopping trigger times: 0
Epoch:5, Training loss: 5423.05, train acc: 0.8498, val loss: 2474.28, val acc: 0.7942, time: 121.25s, Early Stopping trigger times: 0
Epoch:6, Training loss: 4861.61, train acc: 0.8583, val loss: 2276.70, val acc: 0.8050, time: 121.14s, Early Stopping trigger times: 0
Epoch:7, Training loss: 4430.91, train acc: 0.8683, val loss: 2145.25, val acc: 0.8167, time: 121.35s, Early Stopping trigger times: 0
Epoch:8, Training loss: 4092.48, train acc: 0.8766, va

~~~
Scaled dot-product
~~~

In [38]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# HIDDEN_DIM = 128
# model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM,attention='scaled').to(device)
# optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

# start_training(model)
# torch.save(model,'basemodel-scaled-dot-product.pt')

  "num_layers={}".format(dropout, num_layers))


Epoch:1, Training loss: 14982.51, train acc: 0.7638, val loss: 4509.81, val acc: 0.7114, time: 120.45s, Early Stopping trigger times: 1
Epoch:2, Training loss: 8784.68, train acc: 0.8005, val loss: 3532.51, val acc: 0.7441, time: 119.52s, Early Stopping trigger times: 0
Epoch:3, Training loss: 6980.21, train acc: 0.8168, val loss: 2994.10, val acc: 0.7691, time: 119.54s, Early Stopping trigger times: 0
Epoch:4, Training loss: 5872.61, train acc: 0.8430, val loss: 2633.98, val acc: 0.7838, time: 120.02s, Early Stopping trigger times: 0
Epoch:5, Training loss: 5280.14, train acc: 0.8571, val loss: 2336.38, val acc: 0.8033, time: 120.52s, Early Stopping trigger times: 0
Epoch:6, Training loss: 4811.48, train acc: 0.8640, val loss: 2212.96, val acc: 0.8063, time: 120.54s, Early Stopping trigger times: 0
Epoch:7, Training loss: 4326.30, train acc: 0.8726, val loss: 2087.79, val acc: 0.8122, time: 120.37s, Early Stopping trigger times: 0
Epoch:8, Training loss: 3898.82, train acc: 0.8804, va

~~~
General
~~~

In [39]:
# device = torch.device("cpu")
# HIDDEN_DIM = 128
# model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM,attention='general').to(device)
# optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

# start_training(model)
# torch.save(model,'basemodel-general.pt')

  "num_layers={}".format(dropout, num_layers))


Epoch:1, Training loss: 14591.74, train acc: 0.7662, val loss: 4407.14, val acc: 0.7138, time: 120.53s, Early Stopping trigger times: 1
Epoch:2, Training loss: 8879.62, train acc: 0.7971, val loss: 3530.93, val acc: 0.7492, time: 121.71s, Early Stopping trigger times: 0
Epoch:3, Training loss: 7212.03, train acc: 0.8217, val loss: 2988.05, val acc: 0.7670, time: 121.47s, Early Stopping trigger times: 0
Epoch:4, Training loss: 6206.05, train acc: 0.8366, val loss: 2623.92, val acc: 0.7832, time: 120.49s, Early Stopping trigger times: 0
Epoch:5, Training loss: 5489.73, train acc: 0.8370, val loss: 2553.20, val acc: 0.7841, time: 120.80s, Early Stopping trigger times: 0
Epoch:6, Training loss: 4924.69, train acc: 0.8576, val loss: 2249.89, val acc: 0.8033, time: 120.27s, Early Stopping trigger times: 0
Epoch:7, Training loss: 4500.39, train acc: 0.8678, val loss: 2183.46, val acc: 0.8072, time: 120.22s, Early Stopping trigger times: 0
Epoch:8, Training loss: 4162.30, train acc: 0.8735, va

**4. Ablation Study - different Stacked layer or # of encoder/decoder strategy**

~~~
Base model: glove + pos + case features + scaled dot-product + 1-layer + crf
~~~

In [41]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# HIDDEN_DIM = 128
# model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM,attention='scaled', layer = 1).to(device)
# optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

# start_training(model)
# torch.save(model,'basemodel-layer&crf.pt')

  "num_layers={}".format(dropout, num_layers))


Epoch:1, Training loss: 14726.83, train acc: 0.7620, val loss: 4203.32, val acc: 0.7099, time: 121.27s, Early Stopping trigger times: 1
Epoch:2, Training loss: 8709.66, train acc: 0.8020, val loss: 3280.22, val acc: 0.7568, time: 120.55s, Early Stopping trigger times: 0
Epoch:3, Training loss: 6979.37, train acc: 0.8191, val loss: 2848.00, val acc: 0.7737, time: 120.30s, Early Stopping trigger times: 0
Epoch:4, Training loss: 5955.63, train acc: 0.8367, val loss: 2608.30, val acc: 0.7810, time: 120.26s, Early Stopping trigger times: 0
Epoch:5, Training loss: 5268.82, train acc: 0.8514, val loss: 2299.05, val acc: 0.7955, time: 120.36s, Early Stopping trigger times: 0
Epoch:6, Training loss: 4776.85, train acc: 0.8626, val loss: 2128.08, val acc: 0.8083, time: 120.89s, Early Stopping trigger times: 0
Epoch:7, Training loss: 4332.62, train acc: 0.8714, val loss: 2089.16, val acc: 0.8119, time: 120.54s, Early Stopping trigger times: 0
Epoch:8, Training loss: 4033.14, train acc: 0.8748, va

~~~
2-layer
~~~

In [77]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# HIDDEN_DIM = 128
# model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM,attention='general', layer = 2).to(device)
# optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

# start_training(model)
# torch.save(model,'basemodel-2-layer')

Epoch:1, Training loss: 16146.15, train acc: 0.7273, val loss: 5279.17, val acc: 0.6738, time: 132.91s, Early Stopping trigger times: 1
Epoch:2, Training loss: 10234.20, train acc: 0.7836, val loss: 3803.95, val acc: 0.7410, time: 132.35s, Early Stopping trigger times: 0
Epoch:3, Training loss: 8007.77, train acc: 0.8035, val loss: 3374.34, val acc: 0.7518, time: 132.64s, Early Stopping trigger times: 0
Epoch:4, Training loss: 6683.39, train acc: 0.8242, val loss: 2958.45, val acc: 0.7773, time: 132.64s, Early Stopping trigger times: 0
Epoch:5, Training loss: 5784.23, train acc: 0.8450, val loss: 2535.71, val acc: 0.7908, time: 132.88s, Early Stopping trigger times: 0
Epoch:6, Training loss: 5143.91, train acc: 0.8562, val loss: 2404.02, val acc: 0.8007, time: 132.93s, Early Stopping trigger times: 0
Epoch:7, Training loss: 4591.42, train acc: 0.8678, val loss: 2240.57, val acc: 0.8082, time: 132.43s, Early Stopping trigger times: 0
Epoch:8, Training loss: 4164.64, train acc: 0.8722, v

~~~
3-layer
~~~

In [45]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# HIDDEN_DIM = 128
# model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM,attention='scaled', layer = 3).to(device)
# optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

# start_training(model)
# torch.save(model,'basemodel-3-layer')

Epoch:1, Training loss: 17996.30, train acc: 0.7008, val loss: 6071.56, val acc: 0.6479, time: 142.34s, Early Stopping trigger times: 1
Epoch:2, Training loss: 12688.39, train acc: 0.7459, val loss: 4889.34, val acc: 0.6885, time: 143.73s, Early Stopping trigger times: 0
Epoch:3, Training loss: 9727.87, train acc: 0.7705, val loss: 3958.76, val acc: 0.7216, time: 145.60s, Early Stopping trigger times: 0
Epoch:4, Training loss: 8150.32, train acc: 0.8014, val loss: 3423.74, val acc: 0.7516, time: 145.25s, Early Stopping trigger times: 0
Epoch:5, Training loss: 6950.12, train acc: 0.8116, val loss: 3169.41, val acc: 0.7575, time: 146.09s, Early Stopping trigger times: 0
Epoch:6, Training loss: 6115.05, train acc: 0.8316, val loss: 2868.89, val acc: 0.7758, time: 144.25s, Early Stopping trigger times: 0
Epoch:7, Training loss: 5512.18, train acc: 0.8397, val loss: 2651.63, val acc: 0.7828, time: 144.78s, Early Stopping trigger times: 0
Epoch:8, Training loss: 5078.23, train acc: 0.8515, v

**5. Ablation Study - with/without CRF**

~~~
Base model: glove + pos + case features + scaled + 1-layer + crf
~~~

In [None]:
# device = torch.device("cpu")
# HIDDEN_DIM = 128
# model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM,attention='scaled', layer = 1,crf = True).to(device)
# optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

# start_training(model)
# torch.save(model,'basemodel-with-crf')

~~~
With out crf
~~~

In [46]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# HIDDEN_DIM = 128
# model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM,attention='general', layer = 1,crf = False).to(device)
# optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

# start_training(model)
# torch.save(model,'basemodel-without-crf')

  "num_layers={}".format(dropout, num_layers))


Epoch:1, Training loss: 14754.39, train acc: 0.7497, val loss: 4565.45, val acc: 0.7039, time: 107.68s, Early Stopping trigger times: 1
Epoch:2, Training loss: 8826.29, train acc: 0.7654, val loss: 3467.57, val acc: 0.7131, time: 105.69s, Early Stopping trigger times: 0
Epoch:3, Training loss: 7129.69, train acc: 0.7777, val loss: 3033.94, val acc: 0.7268, time: 106.52s, Early Stopping trigger times: 0
Epoch:4, Training loss: 6123.78, train acc: 0.7848, val loss: 2668.77, val acc: 0.7367, time: 105.01s, Early Stopping trigger times: 0
Epoch:5, Training loss: 5367.21, train acc: 0.7954, val loss: 2459.93, val acc: 0.7428, time: 104.77s, Early Stopping trigger times: 0
Epoch:6, Training loss: 4890.44, train acc: 0.7994, val loss: 2301.04, val acc: 0.7475, time: 105.20s, Early Stopping trigger times: 0
Epoch:7, Training loss: 4356.73, train acc: 0.8028, val loss: 2061.03, val acc: 0.7493, time: 106.10s, Early Stopping trigger times: 0
Epoch:8, Training loss: 4038.99, train acc: 0.8020, va

##Evaluation

**1. Performance Comparison**

~~~
Baseline model:

Model: Bi-lstm+crf
 
Input embedding dimension: word embedding (glove-twitter) 100
 
Epochs: 15 with early stopping
 
Learing rate: 0.01
 
Weight decay: 1e-4
 
Optimiser: SGD
 
LSTM dropout: 0.1

Hidden dimensions: 50
~~~

~~~
Best model:

Model: Bi-lstm + crf + scaled dot-product attention + 1 layer
 
Input embedding dimension: word embedding (glove-wiki) 50 + sentence embedding (pos tag) 4 + domain embedding (case features) 6
 
Epochs: 15 with early stopping
 
Learing rate: 0.01
 
Weight decay: 1e-4
 
Optimiser: SGD
 
LSTM dropout: 0.1

Hidden dimensions: 128
~~~


In [91]:
!pip install tabulate
from tabulate import tabulate

def get_accuracy(model_name,is_gpu=False):
  # Load Model
  model = torch.load(model_name)
  model.eval()

  from sklearn.metrics import classification_report
  #print(classification_report(ground_truth, predicted))
  
  _, _, val_acc = cal_acc(model,val_input_index,val_output_index)
  print("val acc: %.4f"%val_acc)

  return val_acc



In [94]:
title = ['Models','Accuracy']
acc_base = get_accuracy('baseline-model.pt')
acc_2 = get_accuracy('best-model.pt')

table = [title,
         ['Base model (lab 9)',acc_base],
          ['Our model:',acc_2]]
print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

val acc: 0.8201
val acc: 0.8377
╒════════════════════╤════════════╕
│ Models             │   Accuracy │
╞════════════════════╪════════════╡
│ Base model (lab 9) │   0.82006  │
├────────────────────┼────────────┤
│ Our model:         │   0.837737 │
╘════════════════════╧════════════╛


**2. Ablation Study - different input embedding model**

~~~
Model: Bi-lstm+crf

Base model: word embedding (glove-wiki) 50 + scaled dot-product + crf + 1 layer

Input embedding dimension: word embedding (glove-wiki) 50 + sentence embedding (pos tag) 4 + domain embedding (case features) 6

Epochs: 15 with early stopping

Learing rate: 0.01

Weight decay: 1e-4

Optimiser: SGD

LSTM dropout: 0.1

Hidden dimensions: 128
~~~

In [124]:

title = ['Models','Accuracy']
acc_base = get_accuracy('basemodel-glove.pt')
acc_1 = get_accuracy('basemodel-glove-pos.pt')
acc_2 = get_accuracy('best-model.pt')

table = [title,
         ['Base model (glove-wiki)',acc_base],
          ['Syntactic Textual Feature Embedding: PoS tag information',acc_1],
          ['Domain Feature Embedding: case features',acc_2]]
print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

val acc: 0.8128
val acc: 0.8253
val acc: 0.8390
╒══════════════════════════════════════════════════════════╤════════════╕
│ Models                                                   │   Accuracy │
╞══════════════════════════════════════════════════════════╪════════════╡
│ Base model (glove-wiki)                                  │   0.812802 │
├──────────────────────────────────────────────────────────┼────────────┤
│ Syntactic Textual Feature Embedding: PoS tag information │   0.82527  │
├──────────────────────────────────────────────────────────┼────────────┤
│ Domain Feature Embedding: case features                  │   0.83904  │
╘══════════════════════════════════════════════════════════╧════════════╛


**3. Ablation Study - different attention strategy** 

~~~
Model: Bi-lstm+crf

Base model: word embedding (glove-wiki) 50 + sentence embedding (pos tag) 4 + domain embedding (case features) 6 + scaled dot-product + crf + 1 layer

Input embedding dimension: word embedding (glove-wiki) 50 + sentence embedding (pos tag) 4 + domain embedding (case features) 6

Epochs: 15 with early stopping

Learing rate: 0.01

Weight decay: 1e-4

Optimiser: SGD

LSTM dropout: 0.1

Hidden dimensions: 128
~~~

In [98]:
title = ['Models','Accuracy']
acc_base = get_accuracy('basemodel-no-attention.pt')
acc_dot = get_accuracy('basemodel-dot-product.pt')
acc_scaled = get_accuracy('basemodel-scaled-dot-product.pt')
acc_general = get_accuracy('basemodel-general.pt')

table = [title,
         ['Base model (no attention)',acc_base],
          ['Dot product:',acc_dot],
          ['Scaled dot product:',acc_scaled],
          ['General:',acc_general]]
print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

val acc: 0.8282
val acc: 0.8229
val acc: 0.8374
val acc: 0.8249
╒═══════════════════════════╤════════════╕
│ Models                    │   Accuracy │
╞═══════════════════════════╪════════════╡
│ Base model (no attention) │   0.828247 │
├───────────────────────────┼────────────┤
│ Dot product:              │   0.822851 │
├───────────────────────────┼────────────┤
│ Scaled dot product:       │   0.837365 │
├───────────────────────────┼────────────┤
│ General:                  │   0.824898 │
╘═══════════════════════════╧════════════╛


**4. Ablation Study - different Stacked layer or # of encoder/decoder strategy**

~~~
Model: Bi-lstm+crf

Base model: word embedding (glove-wiki) 50 + sentence embedding (pos tag) 4 + domain embedding (case features) 6 + scaled dot-product + crf + 1 layer

Input embedding dimension: word embedding (glove-wiki) 50 + sentence embedding (pos tag) 4 + domain embedding (case features) 6

Epochs: 15 with early stopping

Learing rate: 0.01

Weight decay: 1e-4

Optimiser: SGD

LSTM dropout: 0.1

Hidden dimensions: 128
~~~

In [99]:
title = ['Models','Accuracy']
acc_base = get_accuracy('best-model.pt')
acc_2 = get_accuracy('basemodel-2-layer')
acc_3 = get_accuracy('basemodel-3-layer')

table = [title,
         ['Base model (1 layer)',acc_base],
          ['2 Stacked layer:',acc_2],
          ['3 Stacked layer:',acc_3]]
print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

val acc: 0.8389
val acc: 0.8262
val acc: 0.8214
╒══════════════════════╤════════════╕
│ Models               │   Accuracy │
╞══════════════════════╪════════════╡
│ Base model (1 layer) │   0.838854 │
├──────────────────────┼────────────┤
│ 2 Stacked layer:     │   0.8262   │
├──────────────────────┼────────────┤
│ 3 Stacked layer:     │   0.821362 │
╘══════════════════════╧════════════╛


**5. Ablation Study - with/without CRF**

~~~
Model: Bi-lstm+crf

Base model: word embedding (glove-wiki) 50 + sentence embedding (pos tag) 4 + domain embedding (case features) 6 + scaled dot-product + no crf + 1 layer

Input embedding dimension: word embedding (glove-wiki) 50 + sentence embedding (pos tag) 4 + domain embedding (case features) 6

Epochs: 15 with early stopping

Learing rate: 0.01

Weight decay: 1e-4

Optimiser: SGD

LSTM dropout: 0.1

Hidden dimensions: 128
~~~

In [104]:
title = ['Models','Accuracy']
acc_base = get_accuracy('best-model.pt')
acc_2 = get_accuracy('basemodel-without-crf')

table = [title,
         ['Base model (with CRF)',acc_base],
          ['Without CRF:',acc_2]]
print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

val acc: 0.8398
val acc: 0.7540
╒═══════════════════════╤════════════╕
│ Models                │   Accuracy │
╞═══════════════════════╪════════════╡
│ Base model (with CRF) │   0.839784 │
├───────────────────────┼────────────┤
│ Without CRF:          │   0.754001 │
╘═══════════════════════╧════════════╛
