In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import time

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')
# directory = r"drive/MyDrive/50.040 NLP/final project/dataset"
directory = r"./dataset"

In [3]:
START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBED_DIM = 5
HIDDEN_DIM = 4
LR = 1e-2
DECAY = 1e-4
EPOCHS = 150 # 100 epochs took abt 2 hrs

In [4]:
def read_train_file(directory):
    x_train = []
    y_train = []
    all_tags = []
    all_words = []
    train_data = []

    with open(directory) as f:
        x_sent = []
        y = []
        for line in f:
            if line == '\n': # end of a sentence
                x_train.append(x_sent)
                y_train.append(y)
                x_sent = []
                y = []
            else:
                temp = line.strip().split()
                x_sent.append(temp[0]) # word
                y.append(temp[1]) # tag

                if temp[1] not in all_tags:
                    all_tags.append(temp[1])
                if temp[0] not in all_words:
                    all_words.append(temp[0])
    
    for i in range(len(x_train)):
        train_data.append((x_train[i], y_train[i]))

    return train_data, x_train, y_train, all_tags, all_words

In [5]:
def read_validation_file(directory):
    dev = []
    with open(directory) as f:
        s = []
        for line in f:
            if line == "\n":
                if len(s) == 0: # there are two consecutive blank spaces
                    continue
                else:
                    dev.append(s)
                    s = []
            else:
                temp = line.strip()
                s.append(temp)
    return dev

In [6]:
def word_index_mapping(data):
    index_mapping = {'UNK':0}
    for sent, _ in data:
        for w in sent:
            if w not in index_mapping:
                index_mapping[w] = len(index_mapping)
    
    return index_mapping

In [7]:
def tag_index_mapping(data):
    tag_mapping = {}
    for label in data:
        if label not in tag_mapping:
            tag_mapping[label] = len(tag_mapping)
            
    tag_mapping[START_TAG] = len(tag_mapping)
    tag_mapping[STOP_TAG] = len(tag_mapping)
    return tag_mapping

In [8]:
train_data, x_train, y_train, TAGS, _ = read_train_file(directory + '/train')
index_mapping = word_index_mapping(train_data)
tag_mapping = tag_index_mapping(TAGS)
# print(TAGS)
# print(tag_mapping)

In [9]:
def write_output(directory, x, y):
    with open(directory, 'w') as f:
        for i in range(len(x)):
            for j in range(len(x[i])):
                f.write(f"{x[i][j]} {y[i][j]}\n")
            f.write("\n")

In [10]:
def predict_and_write_output(inp_dir, out_dir, model, index_mapping):
    x = []
    test_file = read_validation_file(inp_dir)
    
    for i in range(len(test_file)):
        x.append(process_input(test_file[i], index_mapping))
    
    predict = []
    with torch.no_grad():
        for i in range(len(x)):
            predict.append(model(x[i])[1])
            
    for i in range(len(predict)):
        for j in range(len(predict[i])):
            predict[i][j] = TAGS[predict[i][j]]
    
    write_output(out_dir, x, predict)

In [11]:
def argmax(vec):
    return vec.argmax().item()

In [12]:
def log_sum_exp(vec):
    _max = vec[0, argmax(vec)]
    _max_vec = _max.view(1, -1).expand(1, vec.size()[1])
    return _max + torch.log(torch.sum(torch.exp(vec - _max_vec)))

In [13]:
class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_mapping, embed_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.target_size = len(tag_mapping)
        self.tag_mapping = tag_mapping
        self.hidden = (torch.randn(2, 1, self.hidden_dim // 2),
                       torch.randn(2, 1, self.hidden_dim // 2))
        
        self.word_embedding = nn.Embedding(vocab_size, self.embed_dim)
        self.bilstm = nn.LSTM(self.embed_dim, self.hidden_dim // 2,
                            num_layers=1, bidirectional=True)
        self.hidden_to_tag = nn.Linear(hidden_dim, self.target_size)

        # transition params
        self.transitions = nn.Parameter(torch.randn(self.target_size, self.target_size))

        # To prevent transition to START and from STOP
        self.transitions.data[tag_mapping[START_TAG], :] = -9999
        self.transitions.data[:, tag_mapping[STOP_TAG]] = -9999

    def _get_forward_score(self, feature):
        alpha = torch.full((1, self.target_size), -10000.)
        
        # init START_TAG score
        alpha[0][self.tag_mapping[START_TAG]] = 0.
        placeholder = alpha

        for f in feature:
            forward_t = [] 
            for v in range(self.target_size):
                emit_score = f[v].view(1, -1)
                emit_score = emit_score.expand(1, self.target_size)

                # transition of i -> next label
                trans_score = self.transitions[v].view(1, -1)
                
                next_label = placeholder + trans_score + emit_score
                # total score
                total_score = log_sum_exp(next_label).view(1)
                forward_t.append(total_score)
            placeholder = torch.cat(forward_t).view(1, -1)

        result = placeholder + self.transitions[self.tag_mapping[STOP_TAG]]
        return log_sum_exp(result)

    def _get_features(self, sent):
        # reset
        sent_length = len(sent)
        self.hidden = (torch.randn(2, 1, self.hidden_dim // 2),
                       torch.randn(2, 1, self.hidden_dim // 2))
        
        embeds = self.word_embedding(sent).view(sent_length, 1, -1)
        out, self.hidden = self.bilstm(embeds, self.hidden)
        feature = self.hidden_to_tag(out.view(sent_length, self.hidden_dim))
        return feature

    def _get_gold_score(self, feature, labels):
        # return the score of the actual tag seq
        score = torch.zeros(1)
        temp_t = torch.tensor([self.tag_mapping[START_TAG]], dtype=torch.long)
        labels = torch.cat([temp_t, labels])
        for idx, f in enumerate(feature):
            score += self.transitions[labels[idx + 1], labels[idx]] + f[labels[idx + 1]]
        score += self.transitions[self.tag_mapping[STOP_TAG], labels[-1]]
        return score

    def _decode(self, feature):
        pointer = []

        beta = torch.full((1, self.target_size), -10000.)
        beta[0][self.tag_mapping[START_TAG]] = 0
        
        placeholder = beta

        for f in feature:
            temp_pointer = []
            temp = []
            
            # transition
            for label in range(self.target_size):
                # next_label[i] -> label i from the previous step + score of transition(label i -> next label)
                next_label = placeholder + self.transitions[label]
                best_label = argmax(next_label)

                temp_pointer.append(best_label)
                temp.append(next_label[0][best_label].view(1))
            
            # emission score
            _temp = torch.cat(temp) + f
            placeholder = _temp.view(1, -1)
            pointer.append(temp_pointer)

        # v -> STOP
        terminal = placeholder + self.transitions[self.tag_mapping[STOP_TAG]]
        best_label = argmax(terminal)
        best_path_score = terminal[0][best_label]

        # trace back best path
        best_tag_seq = [best_label]
        for bp in reversed(pointer):
            best_label = bp[best_label]
            best_tag_seq.append(best_label)
        
        # remove start tag
        best_tag_seq = best_tag_seq[:-1]
        best_tag_seq.reverse()
        
        return best_path_score, best_tag_seq

    def nll_loss(self, sent, tags):
        features = self._get_features(sent)
        forward = self._get_forward_score(features)
        target_score = self._get_gold_score(features, tags)
        loss = forward - target_score
        return loss

    def forward(self, sent):
        features = self._get_features(sent)
        score, label_seq = self._decode(features)
        return score, label_seq

In [14]:
def format_time(t):
    m, s = divmod(t, 60)
    h, m = divmod(m, 60)
    return f"{h:.0f}h {m:.0f}m {s:.2f}s"

In [15]:
def process_input(sent, index_mapping):
    inp = []
    for i in range(len(sent)):
        try:
            inp.append(index_mapping[sent[i]])
        except KeyError:
            inp.append(index_mapping['UNK'])
    
    return torch.tensor(inp, dtype=torch.long)

In [16]:
model = BiLSTM_CRF(len(index_mapping), tag_mapping, EMBED_DIM, HIDDEN_DIM)
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=DECAY)

In [17]:
start_time = time.time()
for epoch in range(EPOCHS): 
    loss_num = 0
    num = 0
    for sent, labels in train_data:
        model.zero_grad()
        
        model_input = process_input(sent, index_mapping)
        temp = []
        for t in labels:
            temp.append(tag_mapping[t])
        labels = torch.tensor(temp, dtype=torch.long)
        
        # run model
        loss = model.nll_loss(model_input, labels)
        
        loss.backward()
        optimizer.step()
        
        loss_num += loss.item()
        num += len(sent)
    loss_num /= num
    
    if epoch % 10 == 0:
        print(f"Epoch:\t\t\t{epoch}")
        print(f"loss:\t\t\t{loss_num}")
        print(f"time taken so far:\t{format_time(time.time() - start_time)}")
print(f"total time taken:\t{format_time(time.time()- start_time)}")

Epoch:			0
loss:			0.2519407210582644
time taken so far:	0h 1m 10.44s
Epoch:			10
loss:			0.0697127307179113
time taken so far:	0h 12m 53.15s
Epoch:			20
loss:			0.06254346862724892
time taken so far:	0h 24m 38.17s
Epoch:			30
loss:			0.061056729821626785
time taken so far:	0h 36m 23.55s
Epoch:			40
loss:			0.06177653920786302
time taken so far:	0h 48m 8.55s
Epoch:			50
loss:			0.0590605245113531
time taken so far:	0h 59m 50.95s
Epoch:			60
loss:			0.058663638871194586
time taken so far:	1h 11m 33.25s
Epoch:			70
loss:			0.0591599003877714
time taken so far:	1h 23m 15.69s
Epoch:			80
loss:			0.05773980858130712
time taken so far:	1h 34m 58.24s
Epoch:			90
loss:			0.062226535138081344
time taken so far:	1h 46m 40.00s
Epoch:			100
loss:			0.05966374061042671
time taken so far:	1h 58m 11.14s
Epoch:			110
loss:			0.05892419811547108
time taken so far:	2h 9m 37.19s
Epoch:			120
loss:			0.05752089959735307
time taken so far:	2h 21m 2.81s
Epoch:			130
loss:			0.06293952841748354
time taken so

In [18]:
torch.save(model.state_dict(), directory+"/model.pt")

In [19]:
# inp_dir = directory + '/dev.in'
inp_dir = directory + '/test.in'
out_dir = directory + '/test.p6.model.out'
predict_and_write_output(inp_dir, out_dir,  model, index_mapping)

In [20]:
inp_dir = directory + '/dev.in'
out_dir = directory + '/model.p6.model.out'
predict_and_write_output(inp_dir, out_dir,  model, index_mapping)