#Named Entity Recognition


In [2]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate
drive = None
def authenticate():
    global drive
    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    drive = GoogleDrive(gauth)

#Download files
def downloadFiles(fileIds):
    authenticate()
    for fileId in fileIds:    
        downloaded = drive.CreateFile({"id": fileId[1]})
        downloaded.GetContentFile(fileId[0])

In [3]:
#Download file if not existing

try:
  _ = open("train.csv", "r")
except:
  downloadFiles([["train.csv", "1Jvl2fKMwW5ASI1VjdJp_suGDVmHyLmCs"]])

try:
  _ = open("val.csv", "r")
except:
  downloadFiles([["val.csv", "1el2udPaXY7d5T6KjPW9h12JQtWzTxw8f"]])

try:
  _ = open("test_without_labels.csv", "r")
except:
  downloadFiles([["test_without_labels.csv", "1-AvWF2a9s0PQxq_ek2KxTgt0CgFFxLme"]])

In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import SGDClassifier
# from sklearn.linear_model import PassiveAggressiveClassifier
# from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [5]:
dftrain = pd.read_csv("train.csv")
dfval = pd.read_csv("val.csv")
dftest = pd.read_csv("test_without_labels.csv")

dftrain['sents'] = dftrain['sents'].apply(lambda x: x.lower())
dfval['sents'] = dfval['sents'].apply(lambda x: x.lower())
dftest['sents'] = dftest['sents'].apply(lambda x: x.lower())
dftrain.head()

Unnamed: 0,sents,labels
0,operation steel curtain ( arabic : ا ل ح ج ا ب...,O O O O O O O O O O O O O O O O O O O O O O O ...
1,the hospital has facilities for mri and ct sca...,B-Location I-Location O O O O O O O O O O O O ...
2,the operation was important in that it was the...,O O O O O O O O O O O O O O B-Organisation I-O...
3,this was my first visit to uzbekistan and an i...,O O B-Person O O O B-Location O O O O O O O O ...
4,the group was founded by sheikh abu omar al - ...,B-Organisation I-Organisation O O O B-Person I...


In [6]:
dftest.head()

Unnamed: 0,sents
0,carter thanked abadi for nearly two years of a...
1,analyses performed by the hospital lab include...
2,the last meeting of the small group took place...
3,""" "" "" as we meet here , we are hoping to gener..."
4,one jet bombed a school for girls in a souther...


## 1 Data Preprocessing


In [7]:
# Generate word_to_ix and tag_to_ix
#from lab 09
word_to_ix = {}
for sentence in list(dftrain.sents)+list(dftest.sents)+list(dfval.sents):
    for word in sentence.split():
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
word_list = list(word_to_ix.keys())
print(word_list[:5])

START_TAG = "<START>"
STOP_TAG = "<STOP>"
tag_to_ix = {START_TAG:0, STOP_TAG:1}
for tags in list(dftrain.labels)+list(dfval.labels):
    for tag in tags.split():
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)
print(tag_to_ix)

['operation', 'steel', 'curtain', '(', 'arabic']
{'<START>': 0, '<STOP>': 1, 'O': 2, 'B-Organisation': 3, 'I-Organisation': 4, 'B-Temporal': 5, 'I-Temporal': 6, 'B-Nationality': 7, 'B-Location': 8, 'I-Location': 9, 'B-Person': 10, 'I-Person': 11, 'B-DocumentReference': 12, 'I-DocumentReference': 13, 'B-Money': 14, 'I-Money': 15, 'B-Quantity': 16, 'B-MilitaryPlatform': 17, 'I-MilitaryPlatform': 18, 'B-Weapon': 19, 'I-Weapon': 20, 'I-Quantity': 21, 'I-Nationality': 22}


In [10]:
#from lab 9
def to_index(data, to_ix):
    input_index_list = []
    for sent in data:
        input_index_list.append([to_ix[w] for w in sent.split()])
    return input_index_list

train_input_index =  to_index(dftrain.sents,word_to_ix)
train_output_index = to_index(dftrain.labels,tag_to_ix)
val_input_index = to_index(dfval.sents,word_to_ix)
val_output_index = to_index(dfval.labels,tag_to_ix)
test_input_index = to_index(dftest.sents,word_to_ix)
print(train_input_index[0])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 10, 6, 7, 11, 12, 7, 6, 13, 14, 15, 16, 17, 18, 19, 20, 21, 0, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 32, 38, 39, 40, 32, 41, 42, 43]


## 2 Input Embeddings


####2.1 Syntactic Textual Feature Embedding (PoS tag information, Depedency Path)

In [9]:
# from lab 7 and lab 6
import spacy
nlp = spacy.load("en_core_web_sm")
class myTokenizer(object):
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words=text.split()
        # All tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(words)
        return spacy.tokens.doc.Doc(self.vocab, words=words, spaces=spaces)
nlp.tokenizer = myTokenizer(nlp.vocab)
print(nlp.pipe_names)

def tagging(data):
    POS, DEP = [],[]
    for sent in data:
        parse = nlp(sent)
        assert len(parse)==len(sent.split()),"should be the same length"
        postags = []
        deptags = []
        for token in parse:
            postags.append(token.pos_)
            deptags.append(token.dep_)
        assert len(postags)==len(sent.split()),"should be the same length"
        POS.append(postags)
        DEP.append(deptags)   
    return POS, DEP

trainPOS, trainDEP = tagging(dftrain.sents)
valPOS, valDEP = tagging(dfval.sents)
testPOS, testDEP = tagging(dftest.sents)


def get_tagix(taglist):
  tmp = {}
  for tags in taglist:
    for tag in tags:
      if tag not in tmp:
        tmp[tag] = len(tmp)
  return tmp
tag_to_ix_POS = get_tagix(trainPOS)
tag_to_ix_DEP = get_tagix(trainDEP)
POS_list = list(tag_to_ix_POS.keys())
DEP_list = list(tag_to_ix_DEP.keys())

def to_index_1(data, to_ix):
    input_index_list = []
    for sent in data:
        input_index_list.append([to_ix[w] for w in sent])
    return input_index_list

train_POS_index =  to_index_1(trainPOS,tag_to_ix_POS)
train_DEP_index =  to_index_1(trainDEP,tag_to_ix_DEP)
val_POS_index =  to_index_1(valPOS,tag_to_ix_POS)
val_DEP_index =  to_index_1(valDEP,tag_to_ix_DEP)
test_POS_index =  to_index_1(testPOS,tag_to_ix_POS)
test_DEP_index =  to_index_1(testDEP,tag_to_ix_DEP)

['tagger', 'parser', 'ner']


In [11]:
embedding_matrix_POS = []
EMBEDDING_DIM_POS = len(POS_list)
for i,word in enumerate(POS_list):
  a = [0]*EMBEDDING_DIM_POS
  a[i] = 1
  embedding_matrix_POS.append(a)
embedding_matrix_POS = np.array(embedding_matrix_POS)
print(embedding_matrix_POS.shape)

embedding_matrix_DEP = []
EMBEDDING_DIM_DEP = len(DEP_list)
for i,word in enumerate(DEP_list):
  a = [0]*EMBEDDING_DIM_DEP
  a[i] = 1
  embedding_matrix_DEP.append(a)
embedding_matrix_DEP = np.array(embedding_matrix_DEP)
print(embedding_matrix_DEP.shape)


(17, 17)
(44, 44)


###2.2 Semantic Textual Feature Embedding: Word Embeddings (FasText, Pretrained Glove-twitter-25)

In [None]:
# FasText 100
# For data processing
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
stoplist = stopwords.words('english')
import re
# For parsing our XML data
from lxml import etree 

# For implementing the word2vec family of algorithms
from gensim.models import Word2Vec,FastText

# Data preprocessing for word embedding does't remove stopwords, but should remove pure numbers
id = '1B47OiEiG2Lo1jUY6hy_zMmHBxfKQuJ8-'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('ted_en-20160408.xml') 

# Please comment your code
targetXML=open('ted_en-20160408.xml', 'r', encoding='UTF8')
# Getting contents of <content> tag from the xml file
target_text = etree.parse(targetXML)
parse_text = '\n'.join(target_text.xpath('//content/text()'))
# Removing "Sound-effect labels" using regular expression (regex) (i.e. (Audio), (Laughter))
content_text = re.sub(r'\([^)]*\)', '', parse_text)
# Tokenising the sentence to process it by using NLTK library
sent_text=sent_tokenize(content_text)

# Removing punctuation and changing all characters to lower case
normalized_text = []
for string in sent_text:
     tokens = re.sub(r"[^a-z0-9]+", " ", string.lower())
     normalized_text.append(tokens)
sentences=[]
# Tokenising each sentence to process individual word
sentences=[[t for t in word_tokenize(sentence) if re.match(r"[a-z]",t) is not None] for sentence in normalized_text]

# Prints only 2 (tokenised) sentences
print(sentences[:2])

EMBEDDING_DIM_Fast = 100
fast_sg_model = FastText(sentences, size=EMBEDDING_DIM_Fast, window=5, min_count=5, workers=2, sg=1)


embedding_matrix_fast = []
for word in word_list:
    try:
        embedding_matrix_fast.append(fast_sg_model.wv[word])
    except:
        embedding_matrix_fast.append([0]*EMBEDDING_DIM_Fast)
embedding_matrix_fast = np.array(embedding_matrix_fast)
embedding_matrix_fast.shape
# from my ass1

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[['here', 'are', 'two', 'reasons', 'companies', 'fail', 'they', 'only', 'do', 'more', 'of', 'the', 'same', 'or', 'they', 'only', 'do', 'what', 's', 'new'], ['to', 'me', 'the', 'real', 'real', 'solution', 'to', 'quality', 'growth', 'is', 'figuring', 'out', 'the', 'balance', 'between', 'two', 'activities', 'exploration', 'and', 'exploitation']]


(3957, 100)

In [12]:
# Pretrained Glove-twitter-25
import gensim.downloader as api
word_emb_model = api.load("glove-twitter-25") 

EMBEDDING_DIM = 25

embedding_matrix = []
for word in word_list:
    try:
        embedding_matrix.append(word_emb_model.wv[word])
    except:
        embedding_matrix.append([0]*EMBEDDING_DIM)
embedding_matrix = np.array(embedding_matrix)
embedding_matrix.shape



  # Remove the CWD from sys.path while we load stuff.


(3957, 25)

In [13]:
# Pretrained Glove-twitter-50
import gensim.downloader as api
word_emb_model = api.load("glove-twitter-50") 

EMBEDDING_DIM_50 = 50

embedding_matrix_50 = []
for word in word_list:
    try:
        embedding_matrix_50.append(word_emb_model.wv[word])
    except:
        embedding_matrix_50.append([0]*EMBEDDING_DIM_50)
embedding_matrix_50 = np.array(embedding_matrix_50)
embedding_matrix_50.shape



  # Remove the CWD from sys.path while we load stuff.


(3957, 50)

## 3 Baseline Bi-LSTM CRF
Use Glove-twitter-25 as baseline input embedding

#### 3.1 help Function

In [14]:
HIDDEN_DIM = 50
max_epoch = 20
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

def decode_output(output_list):
    ix_to_tag = {v:k for k,v in tag_to_ix.items()}
    return [ix_to_tag[output] for output in output_list]
    
from sklearn.metrics import classification_report

In [15]:
#from lab 9
def cal_acc(model, input_index, output_index):
    ground_truth = []
    predicted = []
    for i,idxs in enumerate(input_index):
        ground_truth += output_index[i]
        score, pred = model(torch.tensor(idxs, dtype=torch.long).to(device))
        predicted += pred
    accuracy = sum(np.array(ground_truth) == np.array(predicted))/len(ground_truth)
    return predicted, ground_truth, accuracy

torch.manual_seed(1)

def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()


def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)
    
# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, embedding_matrix):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.embedding_matrix = embedding_matrix

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)

        """Here we use the embedding matrix as the initial weights of nn.Embedding"""
        self.word_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix))
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2).to(device),
                torch.randn(2, 1, self.hidden_dim // 2).to(device))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

#### 3.2 Initialize Model

In [None]:
#from lab9
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 50

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM,embedding_matrix).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

####3.3 Train the model

In [None]:
"""Each epoch will take about 2-3 minutes"""

import datetime

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()
    _, _, train_acc = cal_acc(model,train_input_index,train_output_index)
    _, _, val_acc = cal_acc(model,val_input_index,val_output_index)

    val_loss = 0
    for i, idxs in enumerate(val_input_index):
        tags_index = val_output_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        loss = model.neg_log_likelihood(sentence_in, targets)
        val_loss+=loss.item()
    time2 = datetime.datetime.now()

    print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))


Epoch:1, Training loss: 9071.29, train acc: 0.7905, val loss: 3699.75, val acc: 0.7357, time: 184.14s
Epoch:2, Training loss: 6982.21, train acc: 0.8123, val loss: 3260.01, val acc: 0.7573, time: 182.79s
Epoch:3, Training loss: 5854.00, train acc: 0.8298, val loss: 2939.77, val acc: 0.7711, time: 181.24s
Epoch:4, Training loss: 5123.93, train acc: 0.8426, val loss: 2678.40, val acc: 0.7829, time: 181.81s
Epoch:5, Training loss: 4525.96, train acc: 0.8532, val loss: 2552.76, val acc: 0.7884, time: 181.55s
Epoch:6, Training loss: 4080.88, train acc: 0.8629, val loss: 2416.15, val acc: 0.7911, time: 180.66s
Epoch:7, Training loss: 3686.76, train acc: 0.8737, val loss: 2275.35, val acc: 0.7977, time: 180.70s
Epoch:8, Training loss: 3345.84, train acc: 0.8834, val loss: 2224.93, val acc: 0.8013, time: 181.17s
Epoch:9, Training loss: 3047.98, train acc: 0.8924, val loss: 2188.44, val acc: 0.8039, time: 181.91s
Epoch:10, Training loss: 2797.06, train acc: 0.8986, val loss: 2190.67, val acc: 0

###3.4 Testing

In [None]:
y_true,y_pred,_ = cal_acc(model,val_input_index,val_output_index)

y_true_decode = decode_output(y_true)
y_pred_decode = decode_output(y_pred)

In [None]:
print(classification_report(y_true_decode,y_pred_decode,digits=4))

                     precision    recall  f1-score   support

B-DocumentReference     0.2000    0.2667    0.2286        15
         B-Location     0.5860    0.6566    0.6193       166
 B-MilitaryPlatform     0.0625    0.2000    0.0952         5
            B-Money     0.2000    0.5000    0.2857         2
      B-Nationality     0.1250    0.1429    0.1333         7
     B-Organisation     0.6857    0.7138    0.6995       269
           B-Person     0.7745    0.8061    0.7900        98
         B-Quantity     0.6182    0.6939    0.6538        49
         B-Temporal     0.6809    0.6154    0.6465        52
           B-Weapon     0.1579    0.2500    0.1935        24
I-DocumentReference     0.2771    0.3898    0.3239        59
         I-Location     0.4453    0.5784    0.5032       204
 I-MilitaryPlatform     0.1250    1.0000    0.2222         2
            I-Money     0.6000    1.0000    0.7500         6
      I-Nationality     0.0000    0.0000    0.0000         0
     I-Organisation    

  _warn_prf(average, modifier, msg_start, len(result))


## 4 Evaluate Input Embeddings
Setting different input embeddings

####4.1 Bi-LSTM CRF with FastText

In [None]:
p_EMBEDDING_DIM = EMBEDDING_DIM_Fast
p_embedding_matrix = embedding_matrix_fast
HIDDEN_DIM = 50


# initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, p_EMBEDDING_DIM, HIDDEN_DIM, p_embedding_matrix).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)


"""Each epoch will take about 2-3 minutes"""

import datetime

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()
    _, _, train_acc = cal_acc(model,train_input_index,train_output_index)
    _, _, val_acc = cal_acc(model,val_input_index,val_output_index)

    val_loss = 0
    for i, idxs in enumerate(val_input_index):
        tags_index = val_output_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        loss = model.neg_log_likelihood(sentence_in, targets)
        val_loss+=loss.item()
    time2 = datetime.datetime.now()

    print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))

y_true,y_pred,_ = cal_acc(model,val_input_index,val_output_index)
y_true_decode = decode_output(y_true)
y_pred_decode = decode_output(y_pred)

print(classification_report(y_true_decode,y_pred_decode,digits=4))

Epoch:1, Training loss: 14502.48, train acc: 0.7547, val loss: 4573.59, val acc: 0.7031, time: 180.45s
Epoch:2, Training loss: 8867.72, train acc: 0.8143, val loss: 3619.82, val acc: 0.7518, time: 180.21s
Epoch:3, Training loss: 6687.58, train acc: 0.8390, val loss: 3194.51, val acc: 0.7632, time: 179.21s
Epoch:4, Training loss: 5434.80, train acc: 0.8568, val loss: 2924.22, val acc: 0.7711, time: 178.98s
Epoch:5, Training loss: 4574.38, train acc: 0.8714, val loss: 2738.96, val acc: 0.7797, time: 178.17s
Epoch:6, Training loss: 3907.04, train acc: 0.8897, val loss: 2567.59, val acc: 0.7929, time: 177.97s
Epoch:7, Training loss: 3369.70, train acc: 0.9022, val loss: 2514.16, val acc: 0.7935, time: 177.58s
Epoch:8, Training loss: 2945.96, train acc: 0.9146, val loss: 2406.40, val acc: 0.7984, time: 177.84s
Epoch:9, Training loss: 2542.80, train acc: 0.9253, val loss: 2440.97, val acc: 0.8047, time: 177.37s
Epoch:10, Training loss: 2208.59, train acc: 0.9341, val loss: 2491.27, val acc: 

  _warn_prf(average, modifier, msg_start, len(result))


####4.2 Bi-LSTM CRF with Glove-twitter-25 and POS information

In [None]:
p_EMBEDDING_DIM = EMBEDDING_DIM
p_embedding_matrix = embedding_matrix
t__EMBEDDING_DIM = EMBEDDING_DIM_POS
t_embedding_matrix = embedding_matrix_POS
t_vocab = embedding_matrix_POS.shape[0]
t_train_input_index = train_POS_index
t_val_input_index = val_POS_index
HIDDEN_DIM = 50

def cal_acc(model, input_index, syn_input_index, output_index):
    ground_truth = []
    predicted = []
    for i,idxs in enumerate(input_index):
        syn_idxs = syn_input_index[i]
        ground_truth += output_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        synlist_in = torch.tensor(syn_idxs, dtype=torch.long).to(device)
        score, pred = model(sentence_in,synlist_in)
        predicted += pred
    accuracy = sum(np.array(ground_truth) == np.array(predicted))/len(ground_truth)
    return predicted, ground_truth, accuracy

class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        """Here we use the embedding matrix as the initial weights of nn.Embedding"""
        self.word_embeds.weight.data.copy_(torch.from_numpy(p_embedding_matrix))

        self.tag_embeds = nn.Embedding(t_vocab, t__EMBEDDING_DIM)
        self.tag_embeds.weight.data.copy_(torch.from_numpy(t_embedding_matrix))
        
        self.lstm = nn.LSTM(embedding_dim+t__EMBEDDING_DIM, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2).to(device),
                torch.randn(2, 1, self.hidden_dim // 2).to(device))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence, syntaglist):
        self.hidden = self.init_hidden()
        wordembeds = self.word_embeds(sentence)
        tagembeds = self.tag_embeds(syntaglist)
        embeds = torch.cat((wordembeds, tagembeds), 1).view(len(sentence), 1, -1)
        
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, syntaglist, tags):
        feats = self._get_lstm_features(sentence,syntaglist)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence, syntaglist):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence,syntaglist)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

# initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, p_EMBEDDING_DIM, HIDDEN_DIM).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)


"""Each epoch will take about 2-3 minutes"""

import datetime

for epoch in range(20):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]
        syn_idxs = t_train_input_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        syntaglist_in = torch.tensor(syn_idxs, dtype=torch.long).to(device)

        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in,syntaglist_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()
    _, _, train_acc = cal_acc(model,train_input_index,t_train_input_index,train_output_index)
    _, _, val_acc = cal_acc(model,val_input_index,t_val_input_index,val_output_index)

    val_loss = 0
    for i, idxs in enumerate(val_input_index):
        tags_index = val_output_index[i]
        syn_idxs = t_val_input_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        synlist_in = torch.tensor(syn_idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        loss = model.neg_log_likelihood(sentence_in, synlist_in, targets)
        val_loss+=loss.item()
    time2 = datetime.datetime.now()

    print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))

y_true,y_pred,_ = cal_acc(model,val_input_index,t_val_input_index,val_output_index)
y_true_decode = decode_output(y_true)
y_pred_decode = decode_output(y_pred)

print(classification_report(y_true_decode,y_pred_decode,digits=4))

Epoch:1, Training loss: 13951.74, train acc: 0.7423, val loss: 4467.23, val acc: 0.6942, time: 182.01s
Epoch:2, Training loss: 8346.71, train acc: 0.7810, val loss: 3581.83, val acc: 0.7222, time: 181.68s
Epoch:3, Training loss: 6438.22, train acc: 0.8180, val loss: 3057.40, val acc: 0.7558, time: 182.00s
Epoch:4, Training loss: 5413.80, train acc: 0.8361, val loss: 2745.62, val acc: 0.7719, time: 182.02s
Epoch:5, Training loss: 4732.87, train acc: 0.8502, val loss: 2507.99, val acc: 0.7806, time: 181.28s
Epoch:6, Training loss: 4212.91, train acc: 0.8612, val loss: 2386.85, val acc: 0.7874, time: 181.29s
Epoch:7, Training loss: 3807.86, train acc: 0.8692, val loss: 2278.51, val acc: 0.7952, time: 181.07s
Epoch:8, Training loss: 3460.80, train acc: 0.8780, val loss: 2176.11, val acc: 0.8043, time: 181.99s
Epoch:9, Training loss: 3169.74, train acc: 0.8844, val loss: 2161.56, val acc: 0.8051, time: 181.18s
Epoch:10, Training loss: 2918.18, train acc: 0.8878, val loss: 2130.93, val acc: 

  _warn_prf(average, modifier, msg_start, len(result))


####4.3 Bi-LSTM CRF with Glove-twitter-25 and Dependency information

In [None]:
p_EMBEDDING_DIM = EMBEDDING_DIM
p_embedding_matrix = embedding_matrix
t__EMBEDDING_DIM = EMBEDDING_DIM_DEP
t_embedding_matrix = embedding_matrix_DEP
t_vocab = embedding_matrix_DEP.shape[0]
t_train_input_index = train_DEP_index
t_val_input_index = val_DEP_index

def cal_acc(model, input_index, syn_input_index, output_index):
    ground_truth = []
    predicted = []
    for i,idxs in enumerate(input_index):
        syn_idxs = syn_input_index[i]
        ground_truth += output_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        synlist_in = torch.tensor(syn_idxs, dtype=torch.long).to(device)
        score, pred = model(sentence_in,synlist_in)
        predicted += pred
    accuracy = sum(np.array(ground_truth) == np.array(predicted))/len(ground_truth)
    return predicted, ground_truth, accuracy

class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        """Here we use the embedding matrix as the initial weights of nn.Embedding"""
        self.word_embeds.weight.data.copy_(torch.from_numpy(p_embedding_matrix))

        self.tag_embeds = nn.Embedding(t_vocab, t__EMBEDDING_DIM)
        self.tag_embeds.weight.data.copy_(torch.from_numpy(t_embedding_matrix))
        
        self.lstm = nn.LSTM(embedding_dim+t__EMBEDDING_DIM, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2).to(device),
                torch.randn(2, 1, self.hidden_dim // 2).to(device))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence, syntaglist):
        self.hidden = self.init_hidden()
        wordembeds = self.word_embeds(sentence)
        tagembeds = self.tag_embeds(syntaglist)
        embeds = torch.cat((wordembeds, tagembeds), 1).view(len(sentence), 1, -1)
        
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, syntaglist, tags):
        feats = self._get_lstm_features(sentence,syntaglist)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence, syntaglist):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence,syntaglist)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

# initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, p_EMBEDDING_DIM, HIDDEN_DIM).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)


"""Each epoch will take about 2-3 minutes"""

import datetime

for epoch in range(max_epoch):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]
        syn_idxs = t_train_input_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        syntaglist_in = torch.tensor(syn_idxs, dtype=torch.long).to(device)

        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in,syntaglist_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()
    _, _, train_acc = cal_acc(model,train_input_index,t_train_input_index,train_output_index)
    _, _, val_acc = cal_acc(model,val_input_index,t_val_input_index,val_output_index)

    val_loss = 0
    for i, idxs in enumerate(val_input_index):
        tags_index = val_output_index[i]
        syn_idxs = t_val_input_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        synlist_in = torch.tensor(syn_idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        loss = model.neg_log_likelihood(sentence_in, synlist_in, targets)
        val_loss+=loss.item()
    time2 = datetime.datetime.now()

    print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))

y_true,y_pred,_ = cal_acc(model,val_input_index,t_val_input_index,val_output_index)
y_true_decode = decode_output(y_true)
y_pred_decode = decode_output(y_pred)

print(classification_report(y_true_decode,y_pred_decode,digits=4))

Epoch:1, Training loss: 14555.92, train acc: 0.7416, val loss: 4528.22, val acc: 0.6881, time: 175.18s
Epoch:2, Training loss: 8348.77, train acc: 0.7980, val loss: 3488.86, val acc: 0.7347, time: 175.62s
Epoch:3, Training loss: 6404.40, train acc: 0.8257, val loss: 2920.13, val acc: 0.7672, time: 175.18s
Epoch:4, Training loss: 5449.75, train acc: 0.8443, val loss: 2553.14, val acc: 0.7799, time: 175.35s
Epoch:5, Training loss: 4776.67, train acc: 0.8578, val loss: 2371.16, val acc: 0.7882, time: 174.12s
Epoch:6, Training loss: 4297.62, train acc: 0.8715, val loss: 2216.87, val acc: 0.8011, time: 174.86s
Epoch:7, Training loss: 3887.90, train acc: 0.8763, val loss: 2144.47, val acc: 0.8049, time: 175.19s
Epoch:8, Training loss: 3514.81, train acc: 0.8872, val loss: 2075.80, val acc: 0.8098, time: 175.36s
Epoch:9, Training loss: 3210.18, train acc: 0.8895, val loss: 2097.20, val acc: 0.8089, time: 175.70s
Epoch:10, Training loss: 2930.49, train acc: 0.8939, val loss: 2066.72, val acc: 

  _warn_prf(average, modifier, msg_start, len(result))


###4.4 Bi-LSTM CRF with Glove-twitter-25 and POS & Dependency information

In [None]:
p_EMBEDDING_DIM = EMBEDDING_DIM
p_embedding_matrix = embedding_matrix

def cal_acc(model, input_index, val_POS_index ,val_DEP_index, output_index):
    ground_truth = []
    predicted = []
    for i,idxs in enumerate(input_index):
        ground_truth += output_index[i]
        pos_idxs = val_POS_index[i]
        dep_idxs = val_DEP_index[i]

        postaglist_in = torch.tensor(pos_idxs, dtype=torch.long).to(device)
        deptaglist_in = torch.tensor(dep_idxs, dtype=torch.long).to(device)
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)

        score, pred = model(sentence_in,postaglist_in,deptaglist_in)
        predicted += pred
    accuracy = sum(np.array(ground_truth) == np.array(predicted))/len(ground_truth)
    return predicted, ground_truth, accuracy

class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        """Here we use the embedding matrix as the initial weights of nn.Embedding"""
        self.word_embeds.weight.data.copy_(torch.from_numpy(p_embedding_matrix))

        self.postag_embeds = nn.Embedding(embedding_matrix_POS.shape[0], EMBEDDING_DIM_POS)
        self.postag_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix_POS))

        self.deptag_embeds = nn.Embedding(embedding_matrix_DEP.shape[0], EMBEDDING_DIM_DEP)
        self.deptag_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix_DEP))
        
        self.lstm = nn.LSTM(embedding_dim+EMBEDDING_DIM_POS+EMBEDDING_DIM_DEP, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2).to(device),
                torch.randn(2, 1, self.hidden_dim // 2).to(device))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence, postaglist, deptaglist):
        self.hidden = self.init_hidden()
        wordembeds = self.word_embeds(sentence)
        postagembeds = self.postag_embeds(postaglist)
        deptagembeds = self.deptag_embeds(deptaglist)
        embeds = torch.cat((wordembeds, postagembeds, deptagembeds), 1).view(len(sentence), 1, -1)
        
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, postaglist,deptaglist, tags):
        feats = self._get_lstm_features(sentence,postaglist,deptaglist)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence, postaglist,deptaglist):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence,postaglist,deptaglist)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

# initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, p_EMBEDDING_DIM, HIDDEN_DIM).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)


"""Each epoch will take about 2-3 minutes"""

import datetime

for epoch in range(max_epoch):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]
        pos_idxs = train_POS_index[i]
        dep_idxs = train_DEP_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        postaglist_in = torch.tensor(pos_idxs, dtype=torch.long).to(device)
        deptaglist_in = torch.tensor(dep_idxs, dtype=torch.long).to(device)

        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in,postaglist_in,deptaglist_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()
    _, _, train_acc = cal_acc(model,train_input_index,train_POS_index,train_DEP_index,train_output_index)
    _, _, val_acc = cal_acc(model,val_input_index,val_POS_index,val_DEP_index,val_output_index)

    val_loss = 0
    for i, idxs in enumerate(val_input_index):
        tags_index = val_output_index[i]
        pos_idxs = val_POS_index[i]
        dep_idxs = val_DEP_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        postaglist_in = torch.tensor(pos_idxs, dtype=torch.long).to(device)
        deptaglist_in = torch.tensor(dep_idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        loss = model.neg_log_likelihood(sentence_in, postaglist_in,deptaglist_in, targets)
        val_loss+=loss.item()
    time2 = datetime.datetime.now()

    print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))

y_true,y_pred,_ = cal_acc(model,val_input_index,val_POS_index,val_DEP_index,val_output_index)
y_true_decode = decode_output(y_true)
y_pred_decode = decode_output(y_pred)

print(classification_report(y_true_decode,y_pred_decode,digits=4))

Epoch:1, Training loss: 14156.66, train acc: 0.7435, val loss: 4384.63, val acc: 0.6892, time: 176.46s
Epoch:2, Training loss: 7907.80, train acc: 0.7907, val loss: 3482.87, val acc: 0.7328, time: 172.61s
Epoch:3, Training loss: 6180.36, train acc: 0.8097, val loss: 3090.25, val acc: 0.7482, time: 171.93s
Epoch:4, Training loss: 5265.32, train acc: 0.8390, val loss: 2701.72, val acc: 0.7778, time: 176.32s
Epoch:5, Training loss: 4621.83, train acc: 0.8487, val loss: 2521.27, val acc: 0.7878, time: 178.57s
Epoch:6, Training loss: 4147.45, train acc: 0.8658, val loss: 2286.06, val acc: 0.7969, time: 178.10s
Epoch:7, Training loss: 3713.32, train acc: 0.8773, val loss: 2148.69, val acc: 0.8041, time: 178.05s
Epoch:8, Training loss: 3371.51, train acc: 0.8863, val loss: 2081.74, val acc: 0.8129, time: 178.24s
Epoch:9, Training loss: 3107.78, train acc: 0.8948, val loss: 2031.29, val acc: 0.8130, time: 178.18s
Epoch:10, Training loss: 2840.75, train acc: 0.9032, val loss: 2001.92, val acc: 

  _warn_prf(average, modifier, msg_start, len(result))


###4.5 Bi-LSTM CRF with Glove-twitter-50 and POS & Dependency information

In [None]:
p_EMBEDDING_DIM = EMBEDDING_DIM_50
p_embedding_matrix = embedding_matrix_50

def cal_acc(model, input_index, val_POS_index ,val_DEP_index, output_index):
    ground_truth = []
    predicted = []
    for i,idxs in enumerate(input_index):
        ground_truth += output_index[i]
        pos_idxs = val_POS_index[i]
        dep_idxs = val_DEP_index[i]

        postaglist_in = torch.tensor(pos_idxs, dtype=torch.long).to(device)
        deptaglist_in = torch.tensor(dep_idxs, dtype=torch.long).to(device)
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)

        score, pred = model(sentence_in,postaglist_in,deptaglist_in)
        predicted += pred
    accuracy = sum(np.array(ground_truth) == np.array(predicted))/len(ground_truth)
    return predicted, ground_truth, accuracy

class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        """Here we use the embedding matrix as the initial weights of nn.Embedding"""
        self.word_embeds.weight.data.copy_(torch.from_numpy(p_embedding_matrix))

        self.postag_embeds = nn.Embedding(embedding_matrix_POS.shape[0], EMBEDDING_DIM_POS)
        self.postag_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix_POS))

        self.deptag_embeds = nn.Embedding(embedding_matrix_DEP.shape[0], EMBEDDING_DIM_DEP)
        self.deptag_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix_DEP))
        
        self.lstm = nn.LSTM(embedding_dim+EMBEDDING_DIM_POS+EMBEDDING_DIM_DEP, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2).to(device),
                torch.randn(2, 1, self.hidden_dim // 2).to(device))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence, postaglist, deptaglist):
        self.hidden = self.init_hidden()
        wordembeds = self.word_embeds(sentence)
        postagembeds = self.postag_embeds(postaglist)
        deptagembeds = self.deptag_embeds(deptaglist)
        embeds = torch.cat((wordembeds, postagembeds, deptagembeds), 1).view(len(sentence), 1, -1)
        
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, postaglist,deptaglist, tags):
        feats = self._get_lstm_features(sentence,postaglist,deptaglist)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence, postaglist,deptaglist):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence,postaglist,deptaglist)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

# initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, p_EMBEDDING_DIM, HIDDEN_DIM).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)


"""Each epoch will take about 2-3 minutes"""

import datetime

for epoch in range(max_epoch):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]
        pos_idxs = train_POS_index[i]
        dep_idxs = train_DEP_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        postaglist_in = torch.tensor(pos_idxs, dtype=torch.long).to(device)
        deptaglist_in = torch.tensor(dep_idxs, dtype=torch.long).to(device)

        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in,postaglist_in,deptaglist_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()
    _, _, train_acc = cal_acc(model,train_input_index,train_POS_index,train_DEP_index,train_output_index)
    _, _, val_acc = cal_acc(model,val_input_index,val_POS_index,val_DEP_index,val_output_index)

    val_loss = 0
    for i, idxs in enumerate(val_input_index):
        tags_index = val_output_index[i]
        pos_idxs = val_POS_index[i]
        dep_idxs = val_DEP_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        postaglist_in = torch.tensor(pos_idxs, dtype=torch.long).to(device)
        deptaglist_in = torch.tensor(dep_idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        loss = model.neg_log_likelihood(sentence_in, postaglist_in,deptaglist_in, targets)
        val_loss+=loss.item()
    time2 = datetime.datetime.now()

    print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))

y_true,y_pred,_ = cal_acc(model,val_input_index,val_POS_index,val_DEP_index,val_output_index)
y_true_decode = decode_output(y_true)
y_pred_decode = decode_output(y_pred)

print(classification_report(y_true_decode,y_pred_decode,digits=4))

Epoch:1, Training loss: 13822.45, train acc: 0.7527, val loss: 4337.94, val acc: 0.6976, time: 93.83s
Epoch:2, Training loss: 7556.03, train acc: 0.7953, val loss: 3375.18, val acc: 0.7397, time: 94.76s
Epoch:3, Training loss: 5821.77, train acc: 0.8234, val loss: 2811.22, val acc: 0.7651, time: 95.31s
Epoch:4, Training loss: 4894.83, train acc: 0.8535, val loss: 2453.44, val acc: 0.7901, time: 95.44s
Epoch:5, Training loss: 4262.00, train acc: 0.8639, val loss: 2282.00, val acc: 0.7956, time: 98.15s
Epoch:6, Training loss: 3752.40, train acc: 0.8734, val loss: 2164.16, val acc: 0.8053, time: 96.61s
Epoch:7, Training loss: 3354.75, train acc: 0.8837, val loss: 2118.13, val acc: 0.8093, time: 96.89s
Epoch:8, Training loss: 2974.44, train acc: 0.8946, val loss: 2073.51, val acc: 0.8115, time: 96.96s
Epoch:9, Training loss: 2686.66, train acc: 0.9038, val loss: 2036.06, val acc: 0.8113, time: 97.14s
Epoch:10, Training loss: 2379.47, train acc: 0.9071, val loss: 2082.58, val acc: 0.8108, t

  _warn_prf(average, modifier, msg_start, len(result))


Save model

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Save the Model
torch.save(model, '/content/gdrive/MyDrive/2021-comp5046-a2/BestInputEmbedding_NER_Model.pt')

load model

In [None]:
# Load the model
# Code to download file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

id = '1BbOFJee2_-cZzrBQb0_gSn3Ft5XG8NxM'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('BestInputEmbedding_NER_Model.pt')

BestInputEmbedding_NER_Model = torch.load('BestInputEmbedding_NER_Model.pt')

In [None]:
y_true,y_pred,_ = cal_acc(BestInputEmbedding_NER_Model,val_input_index,val_POS_index,val_DEP_index,val_output_index)
y_true_decode = decode_output(y_true)
y_pred_decode = decode_output(y_pred)

print(classification_report(y_true_decode,y_pred_decode,digits=4))

                     precision    recall  f1-score   support

B-DocumentReference     0.1000    0.2500    0.1429         8
         B-Location     0.6452    0.6742    0.6593       178
 B-MilitaryPlatform     0.1250    0.3333    0.1818         6
            B-Money     0.2000    0.5000    0.2857         2
      B-Nationality     0.1250    0.5000    0.2000         2
     B-Organisation     0.6750    0.7269    0.7000       260
           B-Person     0.8333    0.8173    0.8252       104
         B-Quantity     0.8000    0.6984    0.7458        63
         B-Temporal     0.7872    0.7115    0.7475        52
           B-Weapon     0.1316    0.2000    0.1587        25
I-DocumentReference     0.2048    0.6071    0.3063        28
         I-Location     0.5509    0.6348    0.5899       230
 I-MilitaryPlatform     0.0625    1.0000    0.1176         1
            I-Money     0.6000    1.0000    0.7500         6
      I-Nationality     0.0000    0.0000    0.0000         0
     I-Organisation    

  _warn_prf(average, modifier, msg_start, len(result))


#### Discussion
Which one is the optimal (according to the accuracy and weighted average F1 score)?

In this section, we test four models that use Bi-LSTM CRF architecture and vary the input embeddings. 

1. FastText vs. Glove-twitter-25: \
Comparing Bi-LSTM CRF with FastText and baseline model which uses Glove-twitter-25 pretrained embeddings, we can find using Glove-twitter-25 can achieve better performance. 

2. Baseline vs. Incorporating POS or Dependency information:\
Incorporating POS or Dependency information can improve performance.

3. POS information vs. Dependency information vs. Combining all:\
Using POS information achieved better performance than using Dependency information.\
Using both POS and Dependecy information achieved the best performance.

4. Glove-twitter-25 vs. Glove-twitter-50
Using Glove-twitter-50 incorporating POS and Dependecy information  achieved the best performance.


Therefore, the optimal input embedding model is ***Glove-twitter-50 incorporating POS and Dependency information***. The subsequent experiments are using the optimal input embedding model.


## 5 Stacked Bi-LSTM CRF
provide the optimal number of stacked layers\
Note: You should justify the optimal number of stacked layers. (NOTE: at least 2 different numbers should be tested).

Use Glove-twitter-25 incorporating POS and Dependency information as input embedding. because Glove-twitter-25 can save more time than Glove-twitter-50.\
Setting different number of stacked layers: ***2, 3***

Functions

In [None]:

def cal_acc(model, input_index, val_POS_index ,val_DEP_index, output_index):
    ground_truth = []
    predicted = []
    for i,idxs in enumerate(input_index):
        ground_truth += output_index[i]
        pos_idxs = val_POS_index[i]
        dep_idxs = val_DEP_index[i]

        postaglist_in = torch.tensor(pos_idxs, dtype=torch.long).to(device)
        deptaglist_in = torch.tensor(dep_idxs, dtype=torch.long).to(device)
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)

        score, pred = model(sentence_in,postaglist_in,deptaglist_in)
        predicted += pred
    accuracy = sum(np.array(ground_truth) == np.array(predicted))/len(ground_truth)
    return predicted, ground_truth, accuracy

class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, NUM_LAYERS ):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.NUM_LAYERS = NUM_LAYERS

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        """Here we use the embedding matrix as the initial weights of nn.Embedding"""
        self.word_embeds.weight.data.copy_(torch.from_numpy(p_embedding_matrix))

        self.postag_embeds = nn.Embedding(embedding_matrix_POS.shape[0], EMBEDDING_DIM_POS)
        self.postag_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix_POS))

        self.deptag_embeds = nn.Embedding(embedding_matrix_DEP.shape[0], EMBEDDING_DIM_DEP)
        self.deptag_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix_DEP))
        
        self.lstm = nn.LSTM(embedding_dim+EMBEDDING_DIM_POS+EMBEDDING_DIM_DEP, hidden_dim // 2,
                            num_layers=NUM_LAYERS, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2*NUM_LAYERS, 1, self.hidden_dim // 2).to(device),
                torch.randn(2*NUM_LAYERS, 1, self.hidden_dim // 2).to(device))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence, postaglist, deptaglist):
        self.hidden = self.init_hidden()
        wordembeds = self.word_embeds(sentence)
        postagembeds = self.postag_embeds(postaglist)
        deptagembeds = self.deptag_embeds(deptaglist)
        embeds = torch.cat((wordembeds, postagembeds, deptagembeds), 1).view(len(sentence), 1, -1)
        
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, postaglist,deptaglist, tags):
        feats = self._get_lstm_features(sentence,postaglist,deptaglist)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence, postaglist,deptaglist):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence,postaglist,deptaglist)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq




"""Each epoch will take about 2-3 minutes"""
import datetime
def model_performance(model):

  for epoch in range(max_epoch):  
      time1 = datetime.datetime.now()
      train_loss = 0

      model.train()
      for i, idxs in enumerate(train_input_index):
          tags_index = train_output_index[i]
          pos_idxs = train_POS_index[i]
          dep_idxs = train_DEP_index[i]

          # Step 1. Remember that Pytorch accumulates gradients.
          # We need to clear them out before each instance
          model.zero_grad()

          # Step 2. Get our inputs ready for the network, that is,
          # turn them into Tensors of word indices.
          sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
          postaglist_in = torch.tensor(pos_idxs, dtype=torch.long).to(device)
          deptaglist_in = torch.tensor(dep_idxs, dtype=torch.long).to(device)

          targets = torch.tensor(tags_index, dtype=torch.long).to(device)

          # Step 3. Run our forward pass.
          loss = model.neg_log_likelihood(sentence_in,postaglist_in,deptaglist_in, targets)

          # Step 4. Compute the loss, gradients, and update the parameters by
          # calling optimizer.step()
          loss.backward()
          optimizer.step()

          train_loss+=loss.item()

      model.eval()
      _, _, train_acc = cal_acc(model,train_input_index,train_POS_index,train_DEP_index,train_output_index)
      _, _, val_acc = cal_acc(model,val_input_index,val_POS_index,val_DEP_index,val_output_index)

      val_loss = 0
      for i, idxs in enumerate(val_input_index):
          tags_index = val_output_index[i]
          pos_idxs = val_POS_index[i]
          dep_idxs = val_DEP_index[i]
          sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
          postaglist_in = torch.tensor(pos_idxs, dtype=torch.long).to(device)
          deptaglist_in = torch.tensor(dep_idxs, dtype=torch.long).to(device)
          targets = torch.tensor(tags_index, dtype=torch.long).to(device)
          loss = model.neg_log_likelihood(sentence_in, postaglist_in,deptaglist_in, targets)
          val_loss+=loss.item()
      time2 = datetime.datetime.now()

      print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))

  y_true,y_pred,_ = cal_acc(model,val_input_index,val_POS_index,val_DEP_index,val_output_index)
  y_true_decode = decode_output(y_true)
  y_pred_decode = decode_output(y_pred)

  print(classification_report(y_true_decode,y_pred_decode,digits=4))

####5.1: 2 layers

In [None]:
p_EMBEDDING_DIM = EMBEDDING_DIM
p_embedding_matrix = embedding_matrix
NUM_LAYERS = 2


# initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, p_EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

"""Each epoch will take about 2-3 minutes"""
#performance
model_performance(model)

Epoch:1, Training loss: 15860.98, train acc: 0.7145, val loss: 5151.72, val acc: 0.6640, time: 104.28s
Epoch:2, Training loss: 9815.57, train acc: 0.7564, val loss: 3988.28, val acc: 0.7000, time: 104.31s
Epoch:3, Training loss: 7439.39, train acc: 0.7945, val loss: 3450.17, val acc: 0.7395, time: 104.44s
Epoch:4, Training loss: 6191.40, train acc: 0.8175, val loss: 3078.48, val acc: 0.7617, time: 105.22s
Epoch:5, Training loss: 5391.57, train acc: 0.8292, val loss: 2777.27, val acc: 0.7681, time: 104.80s
Epoch:6, Training loss: 4764.60, train acc: 0.8456, val loss: 2529.09, val acc: 0.7802, time: 106.53s
Epoch:7, Training loss: 4296.01, train acc: 0.8537, val loss: 2371.35, val acc: 0.7880, time: 105.70s
Epoch:8, Training loss: 3871.04, train acc: 0.8665, val loss: 2220.47, val acc: 0.7971, time: 105.99s
Epoch:9, Training loss: 3554.33, train acc: 0.8755, val loss: 2137.31, val acc: 0.8043, time: 104.64s
Epoch:10, Training loss: 3240.16, train acc: 0.8853, val loss: 2068.16, val acc: 

  _warn_prf(average, modifier, msg_start, len(result))


####5.2: 3 layers

In [None]:
p_EMBEDDING_DIM = EMBEDDING_DIM
p_embedding_matrix = embedding_matrix
NUM_LAYERS = 3

# initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, p_EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

#performance
model_performance(model)


Epoch:1, Training loss: 16740.55, train acc: 0.6999, val loss: 5743.02, val acc: 0.6485, time: 180.96s
Epoch:2, Training loss: 12467.94, train acc: 0.7173, val loss: 4554.24, val acc: 0.6629, time: 180.45s
Epoch:3, Training loss: 9435.74, train acc: 0.7366, val loss: 3937.10, val acc: 0.6885, time: 180.63s
Epoch:4, Training loss: 8102.19, train acc: 0.7563, val loss: 3575.14, val acc: 0.7052, time: 180.00s
Epoch:5, Training loss: 7051.72, train acc: 0.7851, val loss: 3216.82, val acc: 0.7251, time: 180.31s
Epoch:6, Training loss: 6127.77, train acc: 0.8048, val loss: 2970.64, val acc: 0.7473, time: 179.60s
Epoch:7, Training loss: 5423.33, train acc: 0.8246, val loss: 2673.93, val acc: 0.7666, time: 179.83s
Epoch:8, Training loss: 4841.56, train acc: 0.8419, val loss: 2424.91, val acc: 0.7852, time: 180.05s
Epoch:9, Training loss: 4372.80, train acc: 0.8563, val loss: 2271.90, val acc: 0.7926, time: 179.80s
Epoch:10, Training loss: 4016.38, train acc: 0.8690, val loss: 2129.14, val acc:

  _warn_prf(average, modifier, msg_start, len(result))


#### Discussion
Which one is the optimal?

In this section, we test 2 models that set 2 or 3 hidden layers, using Glove-twitter-25 incorporating POS and Dependency information as input embedding. 

Comparing with the one with only one hidden layer, we can find stacking 2 or 3 hidden layers doesn't achieve better performance.

The optimal input embedding model is the one that set one hidden layer. The subsequent experiments are using the optimal input embedding model.


## 6 Attention
consider to use attention\
Note: You may need to justify your attention score calculation method (at least 3 methods should be tested), the position, and the number of attention you used. This is really depends on your model.

Use Glove-twitter-25 incorporating POS and Dependency information as input embedding. because Glove-twitter-25 can save more time than Glove-twitter-50\
Set the optimal number of stacked layers: 1. \
Setting different attention score calculation methods: ***dot product, scaled dot product, and cosine similarity***.

In [None]:
import torch.nn.functional as F

p_EMBEDDING_DIM = EMBEDDING_DIM
p_embedding_matrix = embedding_matrix
NUM_LAYERS = 1

def cal_acc(model, input_index, val_POS_index ,val_DEP_index, output_index,method):
    ground_truth = []
    predicted = []
    for i,idxs in enumerate(input_index):
        ground_truth += output_index[i]
        pos_idxs = val_POS_index[i]
        dep_idxs = val_DEP_index[i]

        postaglist_in = torch.tensor(pos_idxs, dtype=torch.long).to(device)
        deptaglist_in = torch.tensor(dep_idxs, dtype=torch.long).to(device)
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)

        score, pred = model(sentence_in,postaglist_in,deptaglist_in,method)
        predicted += pred
    accuracy = sum(np.array(ground_truth) == np.array(predicted))/len(ground_truth)
    return predicted, ground_truth, accuracy

class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        """Here we use the embedding matrix as the initial weights of nn.Embedding"""
        self.word_embeds.weight.data.copy_(torch.from_numpy(p_embedding_matrix))

        self.postag_embeds = nn.Embedding(embedding_matrix_POS.shape[0], EMBEDDING_DIM_POS)
        self.postag_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix_POS))

        self.deptag_embeds = nn.Embedding(embedding_matrix_DEP.shape[0], EMBEDDING_DIM_DEP)
        self.deptag_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix_DEP))
        
        self.lstm = nn.LSTM(embedding_dim+EMBEDDING_DIM_POS+EMBEDDING_DIM_DEP, hidden_dim // 2,
                            num_layers=NUM_LAYERS, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim*2, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2*NUM_LAYERS, 1, self.hidden_dim // 2).to(device),
                torch.randn(2*NUM_LAYERS, 1, self.hidden_dim // 2).to(device))
    # from lab 10    
    def cal_attention(self, decoder_hiddens, encoder_hiddens, method):
        if method == "Dot Product":
            # bmm: https://pytorch.org/docs/master/generated/torch.bmm.html
            attn_weights = F.softmax(torch.bmm(decoder_hiddens.unsqueeze(0), encoder_hiddens.T.unsqueeze(0)),dim=-1)
            attn_output = torch.bmm(attn_weights, encoder_hiddens.unsqueeze(0))
            concat_output = torch.cat((attn_output[0], decoder_hiddens), 1)

        elif method == "Scaled Dot Product":
            # COMPLETE THIS PART - Scale Dot Product calculation method

            attn_weights = F.softmax(1/np.sqrt(self.hidden_dim)*torch.bmm(decoder_hiddens.unsqueeze(0), encoder_hiddens.T.unsqueeze(0)),dim=-1)
            attn_output = torch.bmm(attn_weights, encoder_hiddens.unsqueeze(0))
            concat_output = torch.cat((attn_output[0], decoder_hiddens), 1)


        elif method == "Cosine Similarity":
          attn_weights = torch.cosine_similarity(decoder_hiddens.unsqueeze(1), decoder_hiddens.unsqueeze(0), dim=-1)
          attn_output = torch.bmm(attn_weights.unsqueeze(0), encoder_hiddens.unsqueeze(0))
          concat_output = torch.cat((attn_output[0], decoder_hiddens), 1)

        return concat_output
    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence, postaglist, deptaglist, method):
        self.hidden = self.init_hidden()
        wordembeds = self.word_embeds(sentence)
        postagembeds = self.postag_embeds(postaglist)
        deptagembeds = self.deptag_embeds(deptaglist)
        embeds = torch.cat((wordembeds, postagembeds, deptagembeds), 1).view(len(sentence), 1, -1)
        
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)

        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        concat_output = self.cal_attention(lstm_out, lstm_out, method) #concat lstm output and attention output

        lstm_feats = self.hidden2tag(concat_output)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, postaglist,deptaglist, tags,method):
        feats = self._get_lstm_features(sentence,postaglist,deptaglist,method)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence, postaglist,deptaglist,method):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence,postaglist,deptaglist,method)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq


import datetime
"""Each epoch will take about 2-3 minutes"""
def model_performance_attention(model, ATTENTION_METHOD):

  for epoch in range(max_epoch):  
      time1 = datetime.datetime.now()
      train_loss = 0

      model.train()
      for i, idxs in enumerate(train_input_index):
          tags_index = train_output_index[i]
          pos_idxs = train_POS_index[i]
          dep_idxs = train_DEP_index[i]

          # Step 1. Remember that Pytorch accumulates gradients.
          # We need to clear them out before each instance
          model.zero_grad()

          # Step 2. Get our inputs ready for the network, that is,
          # turn them into Tensors of word indices.
          sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
          postaglist_in = torch.tensor(pos_idxs, dtype=torch.long).to(device)
          deptaglist_in = torch.tensor(dep_idxs, dtype=torch.long).to(device)

          targets = torch.tensor(tags_index, dtype=torch.long).to(device)

          # Step 3. Run our forward pass.
          loss = model.neg_log_likelihood(sentence_in,postaglist_in,deptaglist_in,targets,ATTENTION_METHOD)

          # Step 4. Compute the loss, gradients, and update the parameters by
          # calling optimizer.step()
          loss.backward()
          optimizer.step()

          train_loss+=loss.item()

      model.eval()
      _, _, train_acc = cal_acc(model,train_input_index,train_POS_index,train_DEP_index,train_output_index,ATTENTION_METHOD)
      _, _, val_acc = cal_acc(model,val_input_index,val_POS_index,val_DEP_index,val_output_index,ATTENTION_METHOD)

      val_loss = 0
      for i, idxs in enumerate(val_input_index):
          tags_index = val_output_index[i]
          pos_idxs = val_POS_index[i]
          dep_idxs = val_DEP_index[i]
          sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
          postaglist_in = torch.tensor(pos_idxs, dtype=torch.long).to(device)
          deptaglist_in = torch.tensor(dep_idxs, dtype=torch.long).to(device)
          targets = torch.tensor(tags_index, dtype=torch.long).to(device)
          loss = model.neg_log_likelihood(sentence_in, postaglist_in, deptaglist_in, targets, ATTENTION_METHOD)
          val_loss+=loss.item()
      time2 = datetime.datetime.now()

      print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))

  y_true,y_pred,_ = cal_acc(model,val_input_index,val_POS_index,val_DEP_index,val_output_index,ATTENTION_METHOD)
  y_true_decode = decode_output(y_true)
  y_pred_decode = decode_output(y_pred)

  print(classification_report(y_true_decode,y_pred_decode,digits=4))


#### Dot Product

In [None]:
ATTENTION_METHOD = "Dot Product"

# initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_Dot_Product= BiLSTM_CRF(len(word_to_ix), tag_to_ix, p_EMBEDDING_DIM, HIDDEN_DIM).to(device)
optimizer = optim.SGD(model_Dot_Product.parameters(), lr=0.01, weight_decay=1e-4)

#model performance dot product
model_performance_attention(model_Dot_Product,ATTENTION_METHOD)


Epoch:1, Training loss: 17592.84, train acc: 0.6999, val loss: 6022.23, val acc: 0.6485, time: 112.52s
Epoch:2, Training loss: 13087.11, train acc: 0.7190, val loss: 4715.50, val acc: 0.6701, time: 113.72s
Epoch:3, Training loss: 9697.44, train acc: 0.7525, val loss: 3822.24, val acc: 0.6951, time: 114.42s
Epoch:4, Training loss: 8114.91, train acc: 0.7696, val loss: 3409.19, val acc: 0.7196, time: 114.36s
Epoch:5, Training loss: 7011.42, train acc: 0.7946, val loss: 3126.66, val acc: 0.7353, time: 114.48s
Epoch:6, Training loss: 6091.08, train acc: 0.8139, val loss: 2874.91, val acc: 0.7474, time: 114.37s
Epoch:7, Training loss: 5395.33, train acc: 0.8261, val loss: 2638.07, val acc: 0.7700, time: 114.24s
Epoch:8, Training loss: 4832.35, train acc: 0.8446, val loss: 2406.10, val acc: 0.7819, time: 114.69s
Epoch:9, Training loss: 4379.74, train acc: 0.8557, val loss: 2302.64, val acc: 0.7867, time: 114.43s
Epoch:10, Training loss: 3953.69, train acc: 0.8697, val loss: 2167.67, val acc:

  _warn_prf(average, modifier, msg_start, len(result))


#### Scaled Dot Product

In [None]:
ATTENTION_METHOD = "Scaled Dot Product"

# initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_Scaled_Dot_Product = BiLSTM_CRF(len(word_to_ix), tag_to_ix, p_EMBEDDING_DIM, HIDDEN_DIM).to(device)
optimizer = optim.SGD(model_Scaled_Dot_Product.parameters(), lr=0.01, weight_decay=1e-4)

#model performance dot product
model_performance_attention(model_Scaled_Dot_Product,ATTENTION_METHOD)



Epoch:1, Training loss: 17534.20, train acc: 0.6999, val loss: 5936.52, val acc: 0.6485, time: 114.33s
Epoch:2, Training loss: 12771.68, train acc: 0.7204, val loss: 4632.85, val acc: 0.6665, time: 114.55s
Epoch:3, Training loss: 9545.10, train acc: 0.7491, val loss: 3878.73, val acc: 0.7067, time: 115.31s
Epoch:4, Training loss: 8064.55, train acc: 0.7771, val loss: 3487.94, val acc: 0.7243, time: 116.27s
Epoch:5, Training loss: 7123.36, train acc: 0.7945, val loss: 3201.57, val acc: 0.7387, time: 115.98s
Epoch:6, Training loss: 6311.17, train acc: 0.8131, val loss: 2949.42, val acc: 0.7543, time: 116.11s
Epoch:7, Training loss: 5529.25, train acc: 0.8276, val loss: 2750.50, val acc: 0.7645, time: 115.25s
Epoch:8, Training loss: 4965.70, train acc: 0.8376, val loss: 2597.05, val acc: 0.7700, time: 115.85s
Epoch:9, Training loss: 4477.49, train acc: 0.8478, val loss: 2398.49, val acc: 0.7812, time: 115.73s
Epoch:10, Training loss: 4069.45, train acc: 0.8560, val loss: 2305.41, val acc:

  _warn_prf(average, modifier, msg_start, len(result))


#### Cosine Similarity

In [None]:

ATTENTION_METHOD = "Cosine Similarity"

# initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_Cosine_Similarity = BiLSTM_CRF(len(word_to_ix), tag_to_ix, p_EMBEDDING_DIM, HIDDEN_DIM).to(device)
optimizer = optim.SGD(model_Cosine_Similarity.parameters(), lr=0.01, weight_decay=1e-4)


model_performance_attention(model_Cosine_Similarity, ATTENTION_METHOD)

Epoch:1, Training loss: 552502.34, train acc: 0.6546, val loss: 9736.20, val acc: 0.6166, time: 110.89s
Epoch:2, Training loss: 17699.41, train acc: 0.6872, val loss: 5784.57, val acc: 0.6314, time: 110.67s
Epoch:3, Training loss: 13730.71, train acc: 0.6781, val loss: 5393.62, val acc: 0.6333, time: 111.36s
Epoch:4, Training loss: 12815.46, train acc: 0.6934, val loss: 5384.46, val acc: 0.6458, time: 110.78s
Epoch:5, Training loss: 11776.24, train acc: 0.6885, val loss: 4544.03, val acc: 0.6543, time: 111.01s
Epoch:6, Training loss: 11064.05, train acc: 0.6923, val loss: 4334.23, val acc: 0.6371, time: 111.10s
Epoch:7, Training loss: 10456.40, train acc: 0.6961, val loss: 4334.06, val acc: 0.6454, time: 110.64s
Epoch:8, Training loss: 10090.16, train acc: 0.6953, val loss: 3977.95, val acc: 0.6439, time: 111.34s
Epoch:9, Training loss: 9765.88, train acc: 0.7039, val loss: 3906.54, val acc: 0.6481, time: 111.24s
Epoch:10, Training loss: 9624.21, train acc: 0.6835, val loss: 3994.84, v

  _warn_prf(average, modifier, msg_start, len(result))


#### Discussion
Which one is the optimal?

In this section, we test 3 models that set different attention score calculation methods: ***dot product, scaled dot product, and cosine similarity, using Glove-twitter-25 incorporating POS and Dependency information as input embedding. 

The optimal  attention score calculation methods is dot product, which get 0.8115 accuracy, which is higher than others.

## 7 CRF Attachment
NOTE: You should test your NER model with CRF/ without CRF.

Use Glove-twitter-50 as baseline input embedding.\
Set the optimal number of stacked layers = 1\
Setting BiLSTM model without CRF \

In [None]:
# Stacked Attentional Bi-LSTM without CRF
p_EMBEDDING_DIM = EMBEDDING_DIM_50
p_embedding_matrix = embedding_matrix_50

def cal_acc(model, input_index, val_POS_index ,val_DEP_index, output_index):
    ground_truth = []
    predicted = []
    truth = 0
    for i,idxs in enumerate(input_index):
        ground_truth += output_index[i]
        pos_idxs = val_POS_index[i]
        dep_idxs = val_DEP_index[i]

        postaglist_in = torch.tensor(pos_idxs, dtype=torch.long).to(device)
        deptaglist_in = torch.tensor(dep_idxs, dtype=torch.long).to(device)
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)

        _, pred = model(sentence_in,postaglist_in,deptaglist_in)
        predicted += list(pred.cpu().numpy())
        
    accuracy = sum(np.array(ground_truth) == np.array(predicted))/len(ground_truth)
    return predicted, ground_truth, accuracy

class BiLSTM(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        """Here we use the embedding matrix as the initial weights of nn.Embedding"""
        self.word_embeds.weight.data.copy_(torch.from_numpy(p_embedding_matrix))

        self.postag_embeds = nn.Embedding(embedding_matrix_POS.shape[0], EMBEDDING_DIM_POS)
        self.postag_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix_POS))

        self.deptag_embeds = nn.Embedding(embedding_matrix_DEP.shape[0], EMBEDDING_DIM_DEP)
        self.deptag_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix_DEP))
        
        self.lstm = nn.LSTM(embedding_dim+EMBEDDING_DIM_POS+EMBEDDING_DIM_DEP, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2).to(device),
                torch.randn(2, 1, self.hidden_dim // 2).to(device))

    def _get_lstm_features(self, sentence, postaglist, deptaglist):
        self.hidden = self.init_hidden()
        wordembeds = self.word_embeds(sentence)
        postagembeds = self.postag_embeds(postaglist)
        deptagembeds = self.deptag_embeds(deptaglist)
        embeds = torch.cat((wordembeds, postagembeds, deptagembeds), 1).view(len(sentence), 1, -1)
        
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def forward(self, sentence, postaglist,deptaglist):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence,postaglist,deptaglist)
        return lstm_feats,torch.argmax(lstm_feats,dim=-1)


In [None]:
# initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTM(len(word_to_ix), tag_to_ix, p_EMBEDDING_DIM, HIDDEN_DIM).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()

In [None]:
"""Each epoch will take about 2-3 minutes"""

import datetime

for epoch in range(max_epoch):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]
        pos_idxs = train_POS_index[i]
        dep_idxs = train_DEP_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        postaglist_in = torch.tensor(pos_idxs, dtype=torch.long).to(device)
        deptaglist_in = torch.tensor(dep_idxs, dtype=torch.long).to(device)

        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        #print(targets.shape)
        # Step 3. Run our forward pass.
        pred, _ = model(sentence_in,postaglist_in,deptaglist_in)
        #print(pred.shape)
        loss = criterion(pred,targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()
    _, _, train_acc = cal_acc(model,train_input_index,train_POS_index,train_DEP_index,train_output_index)
    _, _, val_acc = cal_acc(model,val_input_index,val_POS_index,val_DEP_index,val_output_index)

    val_loss = 0
    for i, idxs in enumerate(val_input_index):
        tags_index = val_output_index[i]
        pos_idxs = val_POS_index[i]
        dep_idxs = val_DEP_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        postaglist_in = torch.tensor(pos_idxs, dtype=torch.long).to(device)
        deptaglist_in = torch.tensor(dep_idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        pred,_ = model(sentence_in,postaglist_in,deptaglist_in)
        loss = criterion(pred,targets)
        val_loss+=loss.item()
    time2 = datetime.datetime.now()

    print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))

Epoch:1, Training loss: 913.49, train acc: 0.6998, val loss: 281.71, val acc: 0.6485, time: 9.42s
Epoch:2, Training loss: 736.66, train acc: 0.6999, val loss: 264.97, val acc: 0.6485, time: 10.20s
Epoch:3, Training loss: 692.16, train acc: 0.6999, val loss: 253.58, val acc: 0.6485, time: 11.34s
Epoch:4, Training loss: 660.24, train acc: 0.7010, val loss: 244.82, val acc: 0.6492, time: 9.87s
Epoch:5, Training loss: 633.34, train acc: 0.7068, val loss: 237.61, val acc: 0.6530, time: 9.37s
Epoch:6, Training loss: 611.03, train acc: 0.7135, val loss: 231.65, val acc: 0.6585, time: 9.18s
Epoch:7, Training loss: 589.94, train acc: 0.7189, val loss: 225.96, val acc: 0.6642, time: 9.24s
Epoch:8, Training loss: 574.24, train acc: 0.7254, val loss: 221.33, val acc: 0.6706, time: 9.56s
Epoch:9, Training loss: 558.52, train acc: 0.7307, val loss: 216.74, val acc: 0.6786, time: 9.19s
Epoch:10, Training loss: 548.77, train acc: 0.7350, val loss: 213.60, val acc: 0.6824, time: 9.48s
Epoch:11, Trainin

In [None]:
y_true,y_pred,_ = cal_acc(model,val_input_index,val_POS_index,val_DEP_index,val_output_index)
y_true_decode = decode_output(y_true)
y_pred_decode = decode_output(y_pred)
print(y_true_decode)
print(y_pred_decode)
print(classification_report(y_true_decode,y_pred_decode,digits=4))

['O', 'I-Person', 'O', 'O', 'O', 'B-Person', 'B-Person', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-Person', 'I-Person', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Person', 'I-Location', 'O', 'O', 'O', 'O', 'O', 'I-Organisation', 'O', 'O', 'O', 'O', 'O', 'I-Location', 'O', 'O', 'O', 'O', 'I-Person', 'I-Person', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Location', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-Organisation', 'I-Person', 'I-Person', 'I-Person', 'I-Person', 'O', 'B-Organisation', 'I-Person', 'I-Organisation', 'O', 'I-Organisation', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-Location', 'O', 'O', 'O', 'O', 'O', 'B-Location', 'B-Location', 'O', 'B-Location', 'O', 'I-Location', 'B-Location', 'O', 'O', 'I-Location', 'O', 'O', 'I-Person', 'O', 'O', 'B-Person', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Person', 'O', 'O', 'O', 'O', 'O', 'B-Person', 'I-Person', 'I-Person', 'O', 'B-Organisation', 'I-Organisation', 

  _warn_prf(average, modifier, msg_start, len(result))


#### Discussion
As we can see from above result, the performance is poor witout CRF. This indicate that the LSTM model with CRF can handle the dependency between predicted entity names, which has a significant impact to improve prediction accuracy.

## 8 Predicting Test Data with the Best Model
using which one?

the best model we choosed is:

BiLSRM with CRF,

num_layers = 1,

attention way = dot product,

input embedding = pos-tagging + dependency parsing + glove-50

In [17]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

p_EMBEDDING_DIM = EMBEDDING_DIM_50
p_embedding_matrix = embedding_matrix_50
NUM_LAYERS = 1

def cal_acc(model, input_index, val_POS_index ,val_DEP_index, output_index,method):
    ground_truth = []
    predicted = []
    for i,idxs in enumerate(input_index):
        ground_truth += output_index[i]
        pos_idxs = val_POS_index[i]
        dep_idxs = val_DEP_index[i]

        postaglist_in = torch.tensor(pos_idxs, dtype=torch.long).to(device)
        deptaglist_in = torch.tensor(dep_idxs, dtype=torch.long).to(device)
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)

        score, pred = model(sentence_in,postaglist_in,deptaglist_in,method)
        predicted += pred
    accuracy = sum(np.array(ground_truth) == np.array(predicted))/len(ground_truth)
    return predicted, ground_truth, accuracy

class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        """Here we use the embedding matrix as the initial weights of nn.Embedding"""
        self.word_embeds.weight.data.copy_(torch.from_numpy(p_embedding_matrix))

        self.postag_embeds = nn.Embedding(embedding_matrix_POS.shape[0], EMBEDDING_DIM_POS)
        self.postag_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix_POS))

        self.deptag_embeds = nn.Embedding(embedding_matrix_DEP.shape[0], EMBEDDING_DIM_DEP)
        self.deptag_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix_DEP))
        
        self.lstm = nn.LSTM(embedding_dim+EMBEDDING_DIM_POS+EMBEDDING_DIM_DEP, hidden_dim // 2,
                            num_layers=NUM_LAYERS, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim*2, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2*NUM_LAYERS, 1, self.hidden_dim // 2).to(device),
                torch.randn(2*NUM_LAYERS, 1, self.hidden_dim // 2).to(device))
    # from lab 10    
    def cal_attention(self, decoder_hiddens, encoder_hiddens, method):
        if method == "Dot Product":
            # bmm: https://pytorch.org/docs/master/generated/torch.bmm.html
            attn_weights = F.softmax(torch.bmm(decoder_hiddens.unsqueeze(0), encoder_hiddens.T.unsqueeze(0)),dim=-1)
            attn_output = torch.bmm(attn_weights, encoder_hiddens.unsqueeze(0))
            concat_output = torch.cat((attn_output[0], decoder_hiddens), 1)


        return concat_output
    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence, postaglist, deptaglist, method):
        self.hidden = self.init_hidden()
        wordembeds = self.word_embeds(sentence)
        postagembeds = self.postag_embeds(postaglist)
        deptagembeds = self.deptag_embeds(deptaglist)
        embeds = torch.cat((wordembeds, postagembeds, deptagembeds), 1).view(len(sentence), 1, -1)
        
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)

        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        concat_output = self.cal_attention(lstm_out, lstm_out, method) #concat lstm output and attention output

        lstm_feats = self.hidden2tag(concat_output)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, postaglist,deptaglist, tags,method):
        feats = self._get_lstm_features(sentence,postaglist,deptaglist,method)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence, postaglist,deptaglist,method):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence,postaglist,deptaglist,method)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq


import datetime
"""Each epoch will take about 2-3 minutes"""
def model_performance_attention(model, ATTENTION_METHOD):

  for epoch in range(max_epoch):  
      time1 = datetime.datetime.now()
      train_loss = 0

      model.train()
      for i, idxs in enumerate(train_input_index):
          tags_index = train_output_index[i]
          pos_idxs = train_POS_index[i]
          dep_idxs = train_DEP_index[i]

          # Step 1. Remember that Pytorch accumulates gradients.
          # We need to clear them out before each instance
          model.zero_grad()

          # Step 2. Get our inputs ready for the network, that is,
          # turn them into Tensors of word indices.
          sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
          postaglist_in = torch.tensor(pos_idxs, dtype=torch.long).to(device)
          deptaglist_in = torch.tensor(dep_idxs, dtype=torch.long).to(device)

          targets = torch.tensor(tags_index, dtype=torch.long).to(device)

          # Step 3. Run our forward pass.
          loss = model.neg_log_likelihood(sentence_in,postaglist_in,deptaglist_in,targets,ATTENTION_METHOD)

          # Step 4. Compute the loss, gradients, and update the parameters by
          # calling optimizer.step()
          loss.backward()
          optimizer.step()

          train_loss+=loss.item()

      model.eval()
      _, _, train_acc = cal_acc(model,train_input_index,train_POS_index,train_DEP_index,train_output_index,ATTENTION_METHOD)
      _, _, val_acc = cal_acc(model,val_input_index,val_POS_index,val_DEP_index,val_output_index,ATTENTION_METHOD)

      val_loss = 0
      for i, idxs in enumerate(val_input_index):
          tags_index = val_output_index[i]
          pos_idxs = val_POS_index[i]
          dep_idxs = val_DEP_index[i]
          sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
          postaglist_in = torch.tensor(pos_idxs, dtype=torch.long).to(device)
          deptaglist_in = torch.tensor(dep_idxs, dtype=torch.long).to(device)
          targets = torch.tensor(tags_index, dtype=torch.long).to(device)
          loss = model.neg_log_likelihood(sentence_in, postaglist_in, deptaglist_in, targets, ATTENTION_METHOD)
          val_loss+=loss.item()
      time2 = datetime.datetime.now()

      print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))

  y_true,y_pred,_ = cal_acc(model,val_input_index,val_POS_index,val_DEP_index,val_output_index,ATTENTION_METHOD)
  y_true_decode = decode_output(y_true)
  y_pred_decode = decode_output(y_pred)

  print(classification_report(y_true_decode,y_pred_decode,digits=4))

In [19]:
ATTENTION_METHOD = "Dot Product"

# initialize model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_Dot_Product= BiLSTM_CRF(len(word_to_ix), tag_to_ix, p_EMBEDDING_DIM, HIDDEN_DIM).to(device)
optimizer = optim.SGD(model_Dot_Product.parameters(), lr=0.01, weight_decay=1e-4)

#model performance dot product
model_performance_attention(model_Dot_Product,ATTENTION_METHOD)

Epoch:1, Training loss: 13884.36, train acc: 0.7761, val loss: 4404.93, val acc: 0.7184, time: 183.28s
Epoch:2, Training loss: 7731.58, train acc: 0.8117, val loss: 3439.68, val acc: 0.7488, time: 183.66s
Epoch:3, Training loss: 5888.50, train acc: 0.8354, val loss: 2967.22, val acc: 0.7704, time: 187.03s
Epoch:4, Training loss: 4868.83, train acc: 0.8557, val loss: 2626.43, val acc: 0.7865, time: 188.17s
Epoch:5, Training loss: 4199.38, train acc: 0.8699, val loss: 2430.96, val acc: 0.7964, time: 187.55s
Epoch:6, Training loss: 3676.37, train acc: 0.8840, val loss: 2307.70, val acc: 0.8051, time: 187.35s
Epoch:7, Training loss: 3244.92, train acc: 0.8954, val loss: 2227.89, val acc: 0.8127, time: 187.15s
Epoch:8, Training loss: 2864.24, train acc: 0.9057, val loss: 2171.74, val acc: 0.8195, time: 183.65s
Epoch:9, Training loss: 2571.14, train acc: 0.9140, val loss: 2155.57, val acc: 0.8189, time: 185.07s
Epoch:10, Training loss: 2235.05, train acc: 0.9224, val loss: 2235.16, val acc: 

  _warn_prf(average, modifier, msg_start, len(result))


save model

In [20]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [23]:
# Save the Model
torch.save(model_Dot_Product, '/content/gdrive/MyDrive/2021-comp5046-a2/Best_NER_Model.pt')

load model

In [24]:
# Load the model
# Code to download file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

id = '16366bjjdy3fyEEsDGiTWcg2bXYGwk-sQ'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('Best_NER_Model.pt')

Best_NER_Model = torch.load('Best_NER_Model.pt')

In [26]:
y_true,y_pred,_ = cal_acc(Best_NER_Model,val_input_index,val_POS_index,val_DEP_index,val_output_index,ATTENTION_METHOD)
y_true_decode = decode_output(y_true)
y_pred_decode = decode_output(y_pred)

print(classification_report(y_true_decode,y_pred_decode,digits=4))

                     precision    recall  f1-score   support

B-DocumentReference     0.2500    0.5000    0.3333        10
         B-Location     0.6667    0.6739    0.6703       184
 B-MilitaryPlatform     0.0000    0.0000    0.0000         1
            B-Money     0.2000    0.5000    0.2857         2
      B-Nationality     0.1250    0.1429    0.1333         7
     B-Organisation     0.6321    0.7024    0.6654       252
           B-Person     0.8725    0.8241    0.8476       108
         B-Quantity     0.7636    0.6885    0.7241        61
         B-Temporal     0.7660    0.7500    0.7579        48
           B-Weapon     0.1579    0.2500    0.1935        24
I-DocumentReference     0.3253    0.6923    0.4426        39
         I-Location     0.5434    0.7236    0.6207       199
 I-MilitaryPlatform     0.0000    0.0000    0.0000         0
            I-Money     0.6000    1.0000    0.7500         6
      I-Nationality     0.0000    0.0000    0.0000         0
     I-Organisation    

  _warn_prf(average, modifier, msg_start, len(result))


get prediction files

In [28]:
#create test predictions to submit
test_input_index = to_index(dftest.sents, word_to_ix)
t_test_input_index = test_POS_index
d_test_input_index = test_DEP_index

import numpy as np

def cal_acc(model, input_index, val_POS_index ,val_DEP_index):
    ground_truth = []
    predicted = []
    for i,idxs in enumerate(input_index):
        pos_idxs = val_POS_index[i]
        dep_idxs = val_DEP_index[i]

        postaglist_in = torch.tensor(pos_idxs, dtype=torch.long).to(device)
        deptaglist_in = torch.tensor(dep_idxs, dtype=torch.long).to(device)
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)

        score, pred = model(sentence_in,postaglist_in,deptaglist_in,ATTENTION_METHOD)
        predicted += pred
    
    return predicted

y_pred_test = cal_acc(Best_NER_Model, test_input_index,t_test_input_index,d_test_input_index)

def decode_output(output_list):
    ix_to_tag = {v:k for k,v in tag_to_ix.items()}
    return [ix_to_tag[output] for output in output_list]

test_output = decode_output(y_pred_test)

In [29]:
df = pd.DataFrame({'ID' :  np.arange(0, len(test_output)),'Predicted': test_output})
df.to_csv('Best_predictions.csv',index=False)