In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import copy
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.metrics import classification_report
import gzip

import time

# Prepare dataset

In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# All three splits are whitespace-delimited files without a header row.
# `sep=r'\s+'` is the documented replacement for `delim_whitespace=True`,
# which pandas deprecated in 2.2. `keep_default_na=False` keeps tokens such
# as "NA" or "null" as literal strings instead of converting them to NaN.
_READ_OPTIONS = dict(sep=r'\s+', keep_default_na=False, engine='python')

train_df = pd.read_csv('./data/train', names=['s_idx', 'word', 'tag'], **_READ_OPTIONS)
dev_df = pd.read_csv('./data/dev', names=['s_idx', 'word', 'tag'], **_READ_OPTIONS)
# The test split is read with two columns only (no tag column).
test_df = pd.read_csv('./data/test', names=['s_idx', 'word'], **_READ_OPTIONS)

# Prepare vocabulary

In [4]:
def create_dataset(df, word_vocab, tags_vocab):
    """Convert a token-per-row frame into per-sentence index tensors.

    A row whose ``s_idx`` equals 1 marks the first token of a new sentence;
    all rows up to the next such marker belong to the same sentence.

    Args:
        df: DataFrame with ``s_idx``, ``word`` and ``tag`` columns, one row
            per token, in sentence order.
        word_vocab: Mapping from word to integer index. Words that are not
            in the mapping are encoded with ``word_vocab['<unk>']``.
        tags_vocab: Mapping from tag to integer index.

    Returns:
        A pair ``(x, y)`` of parallel lists. ``x[i]`` holds the word indices
        and ``y[i]`` the tag indices of sentence ``i``, both as 1-D
        ``torch.long`` tensors. Both lists are empty for an empty frame.

    Raises:
        KeyError: If a tag is missing from ``tags_vocab``, or an unknown word
            is met while ``word_vocab`` has no ``'<unk>'`` entry.
    """
    x, y = [], []
    sentence, tags = [], []

    def flush():
        # Close the sentence collected so far and store it as tensors.
        x.append(torch.tensor(sentence, dtype=torch.long))
        y.append(torch.tensor(tags, dtype=torch.long))

    # Iterate the columns directly instead of calling ``df.iloc[idx]``, which
    # constructs a full row Series (twice per token in the previous version).
    for s_idx, word, tag in zip(df['s_idx'], df['word'], df['tag']):
        # A non-empty buffer means this is not the very first row, so the
        # marker closes the previous sentence.
        if s_idx == 1 and sentence:
            flush()
            sentence, tags = [], []
        sentence.append(word_vocab[word] if word in word_vocab else word_vocab['<unk>'])
        tags.append(tags_vocab[tag])

    # The last sentence has no following marker; emit it explicitly.
    if sentence:
        flush()
    return x, y



def create_test_dataset(df, word_vocab):
    """Convert an untagged token-per-row frame into per-sentence index tensors.

    A row whose ``s_idx`` equals 1 marks the first token of a new sentence;
    all rows up to the next such marker belong to the same sentence.

    Args:
        df: DataFrame with ``s_idx`` and ``word`` columns, one row per token,
            in sentence order.
        word_vocab: Mapping from word to integer index. Words that are not
            in the mapping are encoded with ``word_vocab['<unk>']``.

    Returns:
        A list with one 1-D ``torch.long`` tensor of word indices per
        sentence. The list is empty for an empty frame.

    Raises:
        KeyError: If an unknown word is met while ``word_vocab`` has no
            ``'<unk>'`` entry.
    """
    x = []
    sentence = []
    # Iterate the columns directly instead of calling ``df.iloc[idx]``, which
    # constructs a full row Series for every token.
    for s_idx, word in zip(df['s_idx'], df['word']):
        # A non-empty buffer means this is not the very first row, so the
        # marker closes the previous sentence.
        if s_idx == 1 and sentence:
            x.append(torch.tensor(sentence, dtype=torch.long))
            sentence = []
        sentence.append(word_vocab[word] if word in word_vocab else word_vocab['<unk>'])

    # The last sentence has no following marker; emit it explicitly.
    if sentence:
        x.append(torch.tensor(sentence, dtype=torch.long))
    return x

def decode_ner(data, tags_vocab):
    """Flatten per-sentence tag ids into a single list of decoded tags.

    Args:
        data: Iterable of sentences, each an iterable of tag ids (anything
            ``int()`` accepts, e.g. ints or 0-d tensors).
        tags_vocab: Mapping from integer tag id to tag label.

    Returns:
        The decoded labels of every token, in sentence order.
    """
    return [tags_vocab[int(tag_id)] for sentence in data for tag_id in sentence]

In [5]:
# Index words and tags in the order produced by value_counts() on the
# training split (most frequent entry first with pandas' default sorting).
word_vocab = {
    word: idx for idx, word in enumerate(train_df['word'].value_counts().keys())
}
tags_vocab = {
    tag: idx for idx, tag in enumerate(train_df['tag'].value_counts().keys())
}
# Inverse mapping, used to turn predicted tag ids back into labels.
decode_tags_vocab = {idx: tag for tag, idx in tags_vocab.items()}
# Reserve the next free index for words unseen during training.
word_vocab['<unk>'] = len(word_vocab)

In [6]:
# --- Random fallback vectors: one 100-d row per training-vocabulary word ---
# (100 is assumed to match the GloVe file's dimensionality, as its name
# suggests; the rows are stacked into one tensor below.)
# Reading the weight matrix once yields exactly the same rows as calling the
# embedding layer for each index, without one forward pass per word.
tmp_embed = nn.Embedding(len(word_vocab), 100)
random_word_embeddings = dict(zip(word_vocab.keys(), tmp_embed.weight.detach().tolist()))

# --- Load the pretrained GloVe vectors ---
glove_vocab = {}
with gzip.open('glove.6B.100d.gz', 'rb') as f:
    for line in f:
        # Split the raw bytes (not decoded text): bytes.split() only breaks on
        # ASCII whitespace, so tokens containing other Unicode whitespace stay
        # intact.
        fields = line.split()
        glove_vocab[fields[0].decode('utf-8')] = [float(v.decode('utf-8')) for v in fields[1:]]

# --- Add training words that GloVe does not contain ---
not_list_word = [word for word in random_word_embeddings if word not in glove_vocab]
for word in not_list_word:
    lowered = word.lower()
    # Reuse the lower-cased word's vector when one exists, otherwise use the
    # random fallback. The lookup deliberately sees entries added earlier in
    # this loop, so the processing order matters.
    if lowered in glove_vocab:
        glove_vocab[word] = glove_vocab[lowered]
    else:
        glove_vocab[word] = random_word_embeddings[word]

# --- Word -> row index, and the matching embedding matrix ---
# Both follow glove_vocab's insertion order, so row i belongs to the word
# whose index is i.
glove_vocab_list = {word: idx for idx, word in enumerate(glove_vocab)}
glove_embedding = torch.FloatTensor(list(glove_vocab.values()))

# Task 1: Simple Bidirectional LSTM model

In [7]:
class LSTM(nn.Module):
    """Bidirectional LSTM tagger with trainable word embeddings.

    Pipeline: embedding -> BiLSTM -> dropout -> linear -> ELU -> linear ->
    log-softmax. One sentence is processed per call (batch size fixed to 1).

    Args:
        embedding_dim: Size of each word embedding.
        hidden_dim: Width of the concatenated BiLSTM output; each direction
            uses ``hidden_dim // 2`` units.
        linear_dim: Width of the intermediate linear layer.
        vocab_size: Number of rows in the embedding table.
        tagset_size: Number of output tags.
    """

    def __init__(self, embedding_dim, hidden_dim, linear_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.linear_dim = linear_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # No ``dropout=`` argument here: nn.LSTM applies it only between
        # stacked layers, so with num_layers=1 it has no effect and merely
        # triggers a UserWarning. Regularisation comes from self.dropout.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=1, bidirectional=True)
        self.dropout = nn.Dropout(0.33)
        self.fc = nn.Linear(hidden_dim, linear_dim)
        self.elu = nn.ELU()
        self.hidden2tag = nn.Linear(linear_dim, tagset_size)
        self.h, self.c = self.init_hidden()

    def init_hidden(self, device=None):
        """Sample a random initial ``(h0, c0)`` pair.

        Args:
            device: Optional device to move the states to. They are sampled
                on the CPU first and moved afterwards.

        Returns:
            Two tensors of shape ``(2, 1, hidden_dim // 2)``: two directions,
            batch size 1.
        """
        shape = (2, 1, self.hidden_dim // 2)
        h0, c0 = torch.randn(*shape), torch.randn(*shape)
        if device is not None:
            h0, c0 = h0.to(device), c0.to(device)
        return h0, c0

    def forward(self, sentence):
        """Score every token of one sentence.

        Args:
            sentence: 1-D ``torch.long`` tensor of word indices.

        Returns:
            Tensor of shape ``(len(sentence), tagset_size)`` holding
            log-probabilities over the tags for each token.
        """
        seq_len = len(sentence)
        # Place the initial state on the input's device instead of relying on
        # a module-level ``device`` global.
        # NOTE: the state is random in eval mode too, so repeated inference on
        # the same sentence can differ unless the RNG is seeded.
        h0, c0 = self.init_hidden(sentence.device)
        embeds = self.word_embeddings(sentence)
        # nn.LSTM expects (seq_len, batch, features); the batch size is 1.
        lstm_out, (self.h, self.c) = self.lstm(embeds.view(seq_len, 1, -1), (h0, c0))
        drop_out = self.dropout(lstm_out)
        fc_out = self.fc(drop_out)
        elu_out = self.elu(fc_out)
        tag_space = self.hidden2tag(elu_out.view(seq_len, -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [8]:
# Build the Task 1 "loaders": plain Python lists that are iterated one
# sentence at a time (no batching).
train_x, train_y = create_dataset(train_df, word_vocab, tags_vocab)
train_loader = list(zip(train_x, train_y))

dev_x, dev_y = create_dataset(dev_df, word_vocab, tags_vocab)
dev_loader = list(zip(dev_x, dev_y))

# The test split has no tags, so its loader holds the word tensors only.
test_x = create_test_dataset(test_df, word_vocab)
test_loader = list(test_x)

In [12]:
# Task 1 model, loss and optimiser. The model emits log-probabilities, which
# is the input NLLLoss expects.
model = LSTM(
    embedding_dim=100,
    hidden_dim=256,
    linear_dim=128,
    vocab_size=len(word_vocab),
    tagset_size=len(tags_vocab),
).to(device)
loss_function = nn.NLLLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

In [13]:
# train model
# One optimiser step per sentence, with gradient-norm clipping at 5.
model.train()
for epoch in tqdm(range(200)):
    epoch_loss = 0.0
    for sentence, tags in train_loader:
        model.zero_grad()
        tag_scores = model(sentence.to(device))
        loss = loss_function(tag_scores, tags.to(device))
        # Accumulate a plain Python float: adding the loss tensor itself would
        # make the running total carry autograd history for the whole epoch.
        epoch_loss += loss.item()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()
    # Report the mean per-sentence loss every 10th epoch.
    if (epoch + 1) % 10 == 0:
        print(f"epoch {epoch}: ", f"{epoch_loss / len(train_loader):.4f}")

  5%|███▉                                                                           | 10/200 [06:36<2:05:02, 39.49s/it]

epoch 9:  tensor(0.0929, device='cuda:0', grad_fn=<DivBackward0>)


 10%|███████▉                                                                       | 20/200 [13:10<1:58:19, 39.44s/it]

epoch 19:  tensor(0.0456, device='cuda:0', grad_fn=<DivBackward0>)


 15%|███████████▊                                                                   | 30/200 [19:44<1:51:43, 39.43s/it]

epoch 29:  tensor(0.0246, device='cuda:0', grad_fn=<DivBackward0>)


 20%|███████████████▊                                                               | 40/200 [26:18<1:44:53, 39.33s/it]

epoch 39:  tensor(0.0233, device='cuda:0', grad_fn=<DivBackward0>)


 25%|███████████████████▊                                                           | 50/200 [32:52<1:38:24, 39.36s/it]

epoch 49:  tensor(0.0155, device='cuda:0', grad_fn=<DivBackward0>)


 30%|███████████████████████▋                                                       | 60/200 [39:26<1:32:07, 39.48s/it]

epoch 59:  tensor(0.0118, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███████████████████████████▋                                                   | 70/200 [46:01<1:25:24, 39.42s/it]

epoch 69:  tensor(0.0128, device='cuda:0', grad_fn=<DivBackward0>)


 40%|███████████████████████████████▌                                               | 80/200 [52:34<1:18:40, 39.33s/it]

epoch 79:  tensor(0.0099, device='cuda:0', grad_fn=<DivBackward0>)


 45%|███████████████████████████████████▌                                           | 90/200 [59:08<1:12:12, 39.39s/it]

epoch 89:  tensor(0.0068, device='cuda:0', grad_fn=<DivBackward0>)


 50%|██████████████████████████████████████                                      | 100/200 [1:05:41<1:05:36, 39.36s/it]

epoch 99:  tensor(0.0088, device='cuda:0', grad_fn=<DivBackward0>)


 55%|██████████████████████████████████████████▉                                   | 110/200 [1:12:15<59:01, 39.35s/it]

epoch 109:  tensor(0.0069, device='cuda:0', grad_fn=<DivBackward0>)


 60%|██████████████████████████████████████████████▊                               | 120/200 [1:18:48<52:31, 39.39s/it]

epoch 119:  tensor(0.0065, device='cuda:0', grad_fn=<DivBackward0>)


 65%|██████████████████████████████████████████████████▋                           | 130/200 [1:25:22<45:54, 39.34s/it]

epoch 129:  tensor(0.0056, device='cuda:0', grad_fn=<DivBackward0>)


 70%|██████████████████████████████████████████████████████▌                       | 140/200 [1:31:55<39:21, 39.36s/it]

epoch 139:  tensor(0.0052, device='cuda:0', grad_fn=<DivBackward0>)


 75%|██████████████████████████████████████████████████████████▌                   | 150/200 [1:38:29<32:47, 39.35s/it]

epoch 149:  tensor(0.0055, device='cuda:0', grad_fn=<DivBackward0>)


 80%|██████████████████████████████████████████████████████████████▍               | 160/200 [1:45:03<26:15, 39.38s/it]

epoch 159:  tensor(0.0047, device='cuda:0', grad_fn=<DivBackward0>)


 85%|██████████████████████████████████████████████████████████████████▎           | 170/200 [1:51:36<19:40, 39.36s/it]

epoch 169:  tensor(0.0042, device='cuda:0', grad_fn=<DivBackward0>)


 90%|██████████████████████████████████████████████████████████████████████▏       | 180/200 [1:58:10<13:06, 39.33s/it]

epoch 179:  tensor(0.0031, device='cuda:0', grad_fn=<DivBackward0>)


 95%|██████████████████████████████████████████████████████████████████████████    | 190/200 [2:04:44<06:33, 39.39s/it]

epoch 189:  tensor(0.0030, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████████████████████████████████████████████████████████████████████████| 200/200 [2:11:18<00:00, 39.39s/it]

epoch 199:  tensor(0.0031, device='cuda:0', grad_fn=<DivBackward0>)





In [14]:
# evaluate model
# eval() disables dropout; no_grad() skips autograd bookkeeping. Each entry of
# dev_predit holds the argmax tag id of every token in one dev sentence.
model.eval()
with torch.no_grad():
    dev_predit = [
        torch.argmax(model(sentence.to(device)), dim=1)
        for sentence, _tags in dev_loader
    ]

In [15]:
# Decode the predicted tag ids and store them next to the gold tags.
dev_decode_pred = decode_ner(dev_predit, decode_tags_vocab)
dev_df['pred'] = dev_decode_pred
# Overwrite rather than append (mode='a'): re-running this cell must not add
# a second copy of the predictions to an existing dev1.out.
dev_df.to_csv('dev1.out', header=False, index=False, sep=' ', mode='w')

In [16]:
# prediction
# Same procedure as on the dev split, but the test loader yields word tensors
# only (there are no tags to unpack).
model.eval()
with torch.no_grad():
    test_predit = [
        torch.argmax(model(sentence.to(device)), dim=1)
        for sentence in test_loader
    ]

In [17]:
# Decode the predicted tag ids and attach them to the test tokens.
test_decode_pred = decode_ner(test_predit, decode_tags_vocab)
test_df['pred'] = test_decode_pred
# Overwrite rather than append (mode='a'): re-running this cell must not add
# a second copy of the predictions to an existing test1.out.
test_df.to_csv('test1.out', header=False, index=False, sep=' ', mode='w')

In [18]:
# Persist the trained Task 1 model to 'blstm1.pt'. The whole module object is
# passed to torch.save (not just its state_dict), so loading the file back
# presumably requires the LSTM class definition to be available — confirm
# against whatever code loads it.
torch.save(model, 'blstm1.pt')

# Task 2: Bidirectional LSTM with pretrained GloVe embeddings

In [58]:
class LSTM_t2(nn.Module):
    """Bidirectional LSTM tagger on top of frozen pretrained embeddings.

    Pipeline: frozen embedding -> BiLSTM -> dropout -> linear -> ELU ->
    linear -> log-softmax. One sentence is processed per call (batch size
    fixed to 1).

    Args:
        glove_embedding: 2-D float tensor with the pretrained vectors, one
            row per word index.
        embedding_dim: Input size of the LSTM; must equal the width of
            ``glove_embedding``.
        hidden_dim: Width of the concatenated BiLSTM output; each direction
            uses ``hidden_dim // 2`` units.
        linear_dim: Width of the intermediate linear layer.
        tagset_size: Number of output tags.
    """

    def __init__(self, glove_embedding, embedding_dim, hidden_dim, linear_dim, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.linear_dim = linear_dim
        # freeze=True keeps the pretrained vectors fixed during training.
        self.word_embeddings = nn.Embedding.from_pretrained(glove_embedding, freeze=True)
        # No ``dropout=`` argument here: nn.LSTM applies it only between
        # stacked layers, so with num_layers=1 it has no effect and merely
        # triggers a UserWarning. Regularisation comes from self.dropout.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=1, bidirectional=True)
        self.dropout = nn.Dropout(0.33)
        self.fc = nn.Linear(hidden_dim, linear_dim)
        self.elu = nn.ELU()
        self.hidden2tag = nn.Linear(linear_dim, tagset_size)
        self.h, self.c = self.init_hidden()

    def init_hidden(self, device=None):
        """Sample a random initial ``(h0, c0)`` pair.

        Args:
            device: Optional device to move the states to. They are sampled
                on the CPU first and moved afterwards.

        Returns:
            Two tensors of shape ``(2, 1, hidden_dim // 2)``: two directions,
            batch size 1.
        """
        shape = (2, 1, self.hidden_dim // 2)
        h0, c0 = torch.randn(*shape), torch.randn(*shape)
        if device is not None:
            h0, c0 = h0.to(device), c0.to(device)
        return h0, c0

    def forward(self, sentence):
        """Score every token of one sentence.

        Args:
            sentence: 1-D ``torch.long`` tensor of word indices into the
                pretrained embedding table.

        Returns:
            Tensor of shape ``(len(sentence), tagset_size)`` holding
            log-probabilities over the tags for each token.
        """
        seq_len = len(sentence)
        # Place the initial state on the input's device instead of relying on
        # a module-level ``device`` global.
        # NOTE: the state is random in eval mode too, so repeated inference on
        # the same sentence can differ unless the RNG is seeded.
        h0, c0 = self.init_hidden(sentence.device)
        embeds = self.word_embeddings(sentence)
        # nn.LSTM expects (seq_len, batch, features); the batch size is 1.
        lstm_out, (self.h, self.c) = self.lstm(embeds.view(seq_len, 1, -1), (h0, c0))
        drop_out = self.dropout(lstm_out)
        fc_out = self.fc(drop_out)
        elu_out = self.elu(fc_out)
        tag_space = self.hidden2tag(elu_out.view(seq_len, -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [43]:
# Build the Task 2 "loaders". Words are indexed through glove_vocab_list so
# that the indices address rows of the GloVe embedding matrix.
train_x_t2, train_y_t2 = create_dataset(train_df, glove_vocab_list, tags_vocab)
train_loader_t2 = list(zip(train_x_t2, train_y_t2))

dev_x_t2, dev_y_t2 = create_dataset(dev_df, glove_vocab_list, tags_vocab)
dev_loader_t2 = list(zip(dev_x_t2, dev_y_t2))

# The test split has no tags, so its loader holds the word tensors only.
test_x_t2 = create_test_dataset(test_df, glove_vocab_list)
test_loader_t2 = list(test_x_t2)

In [59]:
# Task 2 model, loss and optimiser. The model emits log-probabilities, which
# is the input NLLLoss expects.
task2_model = LSTM_t2(
    glove_embedding,
    embedding_dim=100,
    hidden_dim=256,
    linear_dim=128,
    tagset_size=len(tags_vocab),
).to(device)
task2_loss_function = nn.NLLLoss().to(device)
task2_optimizer = torch.optim.SGD(task2_model.parameters(), lr=0.01)

In [None]:
# train model
# One optimiser step per sentence, with gradient-norm clipping at 5.
task2_model.train()
for epoch in tqdm(range(100)):
    epoch_loss = 0.0
    for sentence, tags in train_loader_t2:
        task2_model.zero_grad()
        tag_scores = task2_model(sentence.to(device))
        loss = task2_loss_function(tag_scores, tags.to(device))
        # Accumulate a plain Python float: adding the loss tensor itself would
        # make the running total carry autograd history for the whole epoch.
        epoch_loss += loss.item()
        loss.backward()
        nn.utils.clip_grad_norm_(task2_model.parameters(), 5)
        task2_optimizer.step()
    # Report the mean per-sentence loss every 10th epoch.
    if (epoch + 1) % 10 == 0:
        print(f"epoch {epoch+1}: ", f"{epoch_loss / len(train_loader_t2):.4f}")

 10%|███████▉                                                                       | 10/100 [07:19<1:05:53, 43.92s/it]

epoch 10:  tensor(0.1612, device='cuda:0', grad_fn=<DivBackward0>)


 20%|████████████████▏                                                                | 20/100 [14:37<58:22, 43.78s/it]

epoch 20:  tensor(0.1163, device='cuda:0', grad_fn=<DivBackward0>)


 30%|████████████████████████▎                                                        | 30/100 [21:40<48:39, 41.71s/it]

epoch 30:  tensor(0.0906, device='cuda:0', grad_fn=<DivBackward0>)


 40%|████████████████████████████████▍                                                | 40/100 [28:33<41:17, 41.30s/it]

epoch 40:  tensor(0.0792, device='cuda:0', grad_fn=<DivBackward0>)


 50%|████████████████████████████████████████▌                                        | 50/100 [35:26<34:29, 41.39s/it]

epoch 50:  tensor(0.0657, device='cuda:0', grad_fn=<DivBackward0>)


 60%|████████████████████████████████████████████████▌                                | 60/100 [42:19<27:33, 41.33s/it]

epoch 60:  tensor(0.0589, device='cuda:0', grad_fn=<DivBackward0>)


 63%|███████████████████████████████████████████████████                              | 63/100 [44:24<25:37, 41.56s/it]

In [None]:
# evaluate model
# eval() disables dropout; no_grad() skips autograd bookkeeping. Each entry of
# dev_predit_t2 holds the argmax tag id of every token in one dev sentence.
task2_model.eval()
with torch.no_grad():
    dev_predit_t2 = [
        torch.argmax(task2_model(sentence.to(device)), dim=1)
        for sentence, _tags in dev_loader_t2
    ]

In [None]:
# Decode the predicted tag ids and store them next to the gold tags.
dev_decode_pred_t2 = decode_ner(dev_predit_t2, decode_tags_vocab)
# NOTE: plain assignment makes dev_df_t2 an alias of dev_df, not a copy, so
# the 'pred' column set below is written into dev_df as well.
dev_df_t2 = dev_df
dev_df_t2['pred'] = dev_decode_pred_t2
# Overwrite rather than append (mode='a'): re-running this cell must not add
# a second copy of the predictions to an existing dev2.out.
dev_df_t2.to_csv('dev2.out', header=False, index=False, sep=' ', mode='w')

In [None]:
# prediction
# Same procedure as on the dev split, but the test loader yields word tensors
# only (there are no tags to unpack).
task2_model.eval()
with torch.no_grad():
    test_predit_t2 = [
        torch.argmax(task2_model(sentence.to(device)), dim=1)
        for sentence in test_loader_t2
    ]

In [None]:
# Decode the predicted tag ids and attach them to the test tokens.
test_decode_pred_t2 = decode_ner(test_predit_t2, decode_tags_vocab)
# NOTE: plain assignment makes test_df_t2 an alias of test_df, not a copy, so
# the 'pred' column set below is written into test_df as well.
test_df_t2 = test_df
test_df_t2['pred'] = test_decode_pred_t2
# Overwrite rather than append (mode='a'): re-running this cell must not add
# a second copy of the predictions to an existing test2.out.
test_df_t2.to_csv('test2.out', header=False, index=False, sep=' ', mode='w')

In [None]:
# Persist the trained Task 2 model to 'blstm2.pt'. The whole module object is
# passed to torch.save (not just its state_dict), so loading the file back
# presumably requires the LSTM_t2 class definition to be available — confirm
# against whatever code loads it.
torch.save(task2_model, 'blstm2.pt')

In [None]:
# Reference:
# https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
# https://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html