In [None]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
import random
import time

random.seed(1)
torch.manual_seed(1)

device = torch.device('cuda')

START_TAG = "<START>"
STOP_TAG = "<STOP>"
MINIUM = -2147483640
PAD_TAG = "<PAD>"
EMBEDDING_DIM = 400
HIDDEN_DIM = 2048
AMOUNT = 8000
BATCH_SIZE = 16
NUM_LAYERS = 3

In [1]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

def prepare_sequence_batch(data ,word_to_ix, tag_to_ix):
    seqs = [i[0] for i in data]
    tags = [i[1] for i in data]
    max_len = max([len(seq) for seq in seqs])
    seqs_pad=[]
    tags_pad=[]
    for seq,tag in zip(seqs, tags):
        seq_pad = seq + ['<PAD>'] * (max_len-len(seq))
        tag_pad = tag + ['<PAD>'] * (max_len-len(tag))
        seqs_pad.append(seq_pad)
        tags_pad.append(tag_pad)
    idxs_pad = torch.tensor([[word_to_ix[w] for w in seq] for seq in seqs_pad], dtype=torch.long).to(device)
    tags_pad = torch.tensor([[tag_to_ix[t] for t in tag] for tag in tags_pad], dtype=torch.long).to(device)
    return idxs_pad, tags_pad


class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, num_layers):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.num_layers = num_layers
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=num_layers, bidirectional=True, batch_first=True)
#         self.hidden2tag = nn.Sequential(
#             nn.Linear(hidden_dim, hidden_dim // 2),
#             nn.Linear(hidden_dim // 2, hidden_dim // 4),
#             nn.Linear(hidden_dim // 4, self.tagset_size)
#         )
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))
        self.transitions.data[tag_to_ix[START_TAG], :] = MINIUM
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = MINIUM
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2 * self.num_layers, 1, self.hidden_dim // 2).to(device),
                torch.randn(2 * self.num_layers, 1, self.hidden_dim // 2).to(device))

    def _forward_alg(self, feats):
        init_alphas = torch.full([feats.shape[0], self.tagset_size], MINIUM).to(device)
        init_alphas[:, self.tag_to_ix[START_TAG]] = 0.
        forward_var_list = []
        forward_var_list.append(init_alphas)
        for feat_index in range(feats.shape[1]):
            gamar_r_l = torch.stack([forward_var_list[feat_index]] * feats.shape[2]).transpose(0, 1).to(device)
            t_r1_k = torch.unsqueeze(feats[:, feat_index, :], 1).transpose(1, 2).to(device)
            aa = gamar_r_l + t_r1_k + torch.unsqueeze(self.transitions, 0)
            forward_var_list.append(torch.logsumexp(aa, dim=2))
        terminal_var = forward_var_list[-1] + self.transitions[self.tag_to_ix[STOP_TAG]].repeat([feats.shape[0], 1]).to(device)
        alpha = torch.logsumexp(terminal_var, dim=1)
        return alpha
    
    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).unsqueeze(dim=0).to(device)
        lstm_out, self.hidden = self.lstm(embeds)
        lstm_out = lstm_out.squeeze().to(device)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _get_lstm_features_train(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).to(device)
        lstm_out, self.hidden = self.lstm(embeds)
        lstm_feats = self.hidden2tag(lstm_out.to(device))
        return lstm_feats

    def _score_sentence(self, feats, tags):
        score = torch.zeros(tags.shape[0]).to(device)
        tags = torch.cat([torch.full([tags.shape[0],1],self.tag_to_ix[START_TAG], dtype=torch.long).to(device),tags],dim=1).to(device)
        for i in range(feats.shape[1]):
            feat=feats[:,i,:]
            score = score + \
                    self.transitions[tags[:,i + 1], tags[:,i]] + feat[range(feat.shape[0]),tags[:,i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[:,-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []
        init_vvars = torch.full((1, self.tagset_size), MINIUM).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0
        forward_var_list = []
        forward_var_list.append(init_vvars)

        for feat_index in range(feats.shape[0]):
            gamar_r_l = torch.stack([forward_var_list[feat_index]] * feats.shape[1])
            gamar_r_l = torch.squeeze(gamar_r_l).to(device)
            next_tag_var = gamar_r_l + self.transitions
            viterbivars_t, bptrs_t = torch.max(next_tag_var, dim=1)
            t_r1_k = torch.unsqueeze(feats[feat_index], 0).to(device)
            forward_var_new = torch.unsqueeze(viterbivars_t, 0).to(device) + t_r1_k
            forward_var_list.append(forward_var_new)
            backpointers.append(bptrs_t.tolist())
        terminal_var = forward_var_list[-1] + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = torch.argmax(terminal_var).tolist()
        path_score = terminal_var[0][best_tag_id]
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood_parallel(self, sentences, tags):
        feats = self._get_lstm_features_train(sentences).to(device)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return torch.sum(forward_score - gold_score)

    def forward(self, sentence):
        lstm_feats = self._get_lstm_features(sentence).to(device)
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [2]:
train_file = open('sents.train', 'r')
test_file = open('sents.test', 'r')
answer_file = open('sents.answer', 'r')

def generate(file):
    cur = file.readline()
    sentences = []
    words = []
    labels = []
    while cur:
        if cur == '\n':
            cur = file.readline()
            continue
        ws = cur.split(' ')
        for word in ws:
            w, label = word.split('/') if len(word.split('/')) == 2 else (''.join(word.split('/')[:-1]), word.split('/')[-1])
            if label.endswith('\n'): label = label[:-1]
            if (w or label) and not (w and label):
                print(file, word)
                continue
            words.append(w)
            labels.append(label)
        sentences.append((words, labels))
        words = []
        labels = []
        cur = file.readline()
    return sentences

training_data = generate(train_file)
testing_data = generate(answer_file)

train_file.close()
test_file.close()
answer_file.close()

In [3]:
word_to_ix = {}
word_to_ix['<PAD>'] = 0
tag_to_ix = {}

word_count = {}
for sentence, tags in training_data:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
            word_count[word] = 1
        else: word_count[word] += 1
    for tag in tags:
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)

word_to_ix['ONCE_OTHER_EXCEPTION'] = len(word_to_ix)
for i, (sentence, tags) in enumerate(training_data):
    for j, word in enumerate(sentence):
        if word_count[word] == 1: 
            training_data[i][0][j] = 'ONCE_OTHER_EXCEPTION'

print(len(training_data))
print(len(testing_data))
tag_to_ix[START_TAG] = len(tag_to_ix)
tag_to_ix[STOP_TAG] = len(tag_to_ix)
tag_to_ix[PAD_TAG] = len(tag_to_ix)
print(len(word_to_ix), len(tag_to_ix))

39832
1993
44368 48


In [4]:
def prepare_test_sequence(sentence, to_ix):
    idxs = []
    for w in sentence:
        if w in to_ix: idxs.append(to_ix[w])
        else:
            idxs.append(to_ix['ONCE_OTHER_EXCEPTION'])
    return torch.tensor(idxs, dtype=torch.long)
def accuracy():
    cor = cnt = 0
    test_data = testing_data
    for sentence, tags in test_data:
        precheck_sent = prepare_test_sequence(sentence, word_to_ix).to(device)
        answer = [tag_to_ix[t] for t in tags]
        score, prediction = model(precheck_sent)
        for i in range(len(answer)):
            if answer[i] == prediction[i]: cor += 1
        cnt += len(tags)
    return cor, cnt

In [None]:
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS).to(device)
# optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
optimizer = optim.Adam(model.parameters(), lr=0.001)

with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix).to(device)
    precheck_tags = torch.tensor([tag_to_ix[t] for t in training_data[0][1]], dtype=torch.long).to(device)
    print(model(precheck_sent))

start_time = time.time()
for epoch in range(200): 
    model.zero_grad()
    epoch_data = random.sample(training_data, AMOUNT)
    for i in range(len(epoch_data) // BATCH_SIZE):
        train_data = epoch_data[i * BATCH_SIZE:(i + 1) * BATCH_SIZE]
        sentence_in_pad, targets_pad = prepare_sequence_batch(train_data, word_to_ix, tag_to_ix)
        sentence_in_pad = sentence_in_pad.to(device)
        targets_pad = targets_pad.to(device)
        loss = model.neg_log_likelihood_parallel(sentence_in_pad, targets_pad).to(device)
        loss.backward()
        optimizer.step()
    cor, cnt = accuracy()
    print("epoch : ", str(epoch), " accuracy : ", cor / cnt, " elapse time : ", time.time() - start_time)

In [None]:
torch.save(model, '.\model.pkl')
temp_model = torch.load('.\model.pkl')
start = time.time()
def temp_accuracy():
    cor = cnt = 0
    test_data = testing_data
    for sentence, tags in test_data:
        precheck_sent = prepare_test_sequence(sentence, word_to_ix).to(device)
        answer = [tag_to_ix[t] for t in tags]
        score, prediction = temp_model(precheck_sent)
        for i in range(len(answer)):
            if answer[i] == prediction[i]: cor += 1
        cnt += len(tags)
    return cor, cnt
cor, cnt = temp_accuracy()
print(time.time() - start, cor / cnt)

In [6]:
torch.cuda.empty_cache()