In [1]:
import pandas as pd

diorisis_df = pd.read_excel('output.xlsx')

print(diorisis_df.columns)

Index(['B_Word', 'B_Lemma', 'I_Lemma', 'B_Analysis'], dtype='object')


In [33]:
import os
from tqdm import tqdm_notebook as tqdm
import numpy as np
import torch
import torch.nn as nn
from torch.utils import data
import torch.optim as optim


In [34]:
import torch
from transformers import BertTokenizer, BertForTokenClassification

In [14]:
b_lemmas: list[str] = diorisis_df['B_Lemma'].tolist()
i_lemmas: list[str] = diorisis_df['I_Lemma'].tolist()

In [17]:
i_lemmas

['proper',
 'preposition',
 'noun',
 'adjective',
 'proper',
 'article',
 'noun',
 'noun',
 'proper',
 'article',
 'adjective',
 'proper',
 'article',
 'noun',
 'noun',
 'adjective',
 'noun',
 'preposition',
 'noun',
 'adjective',
 'adverb',
 'verb',
 'article',
 'noun',
 'adverb',
 'particle',
 'article',
 'noun',
 'preposition',
 'noun',
 'preposition',
 'adjective',
 'verb',
 'noun',
 'adjective',
 'verb',
 'conjunction',
 'article',
 'noun',
 'adverb',
 'verb',
 'conjunction',
 'verb',
 'article',
 'noun',
 'adjective',
 'noun',
 'conjunction',
 'verb',
 'particle',
 'adverb',
 'article',
 'noun',
 'preposition',
 'noun',
 'verb',
 'particle',
 'article',
 'noun',
 'preposition',
 'article',
 'noun',
 'adverb',
 'verb',
 'preposition',
 'adjective',
 'noun',
 'noun',
 'verb',
 'pronoun',
 'article',
 'article',
 'proper',
 'noun',
 nan,
 'pronoun',
 'article',
 'proper',
 'verb',
 'verb',
 'particle',
 'conjunction',
 'article',
 'adjective',
 'adjective',
 'conjunction',
 'verb',


In [38]:
tagged_data = diorisis_df[['B_Lemma', 'I_Lemma']]
tagged_sents = [tuple(x) for x in tagged_data.values]

In [39]:
tags = list(set(tag for _, tag in tagged_sents))
tags = ["<pad>"] + tags

tag2idx = {tag: idx for idx, tag in enumerate(tags)}
idx2tag = {idx: tag for idx, tag in enumerate(tags)}

In [43]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(tagged_sents, test_size=0.2)
len(train_data), len(test_data)

(33329, 8333)

Data loader

In [44]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

class PosDataset(data.Dataset):
    def __init__(self, tagged_sents):
        sents, tags_li = [], []
        for b_lemma, i_lemma in tagged_sents:
            words = [b_lemma]
            tags = [i_lemma]
            sents.append(["[CLS]"] + words + ["[SEP]"])
            tags_li.append(["<pad>"] + tags + ["<pad>"])
        self.sents, self.tags_li = sents, tags_li

    def __len__(self):
        return len(self.sents)

    def __getitem__(self, idx):
        words, tags = self.sents[idx], self.tags_li[idx]

        x, y = [], []
        is_heads = []
        for w, t in zip(words, tags):
            tokens = tokenizer.tokenize(w) if w not in ("[CLS]", "[SEP]") else [w]
            xx = tokenizer.convert_tokens_to_ids(tokens)

            is_head = [1] + [0]*(len(tokens) - 1)

            t = [t] + ["<pad>"] * (len(tokens) - 1)
            yy = [tag2idx[each] for each in t]

            x.extend(xx)
            is_heads.extend(is_head)
            y.extend(yy)

        assert len(x) == len(y) == len(is_heads), "len(x)={}, len(y)={}, len(is_heads)={}".format(len(x), len(y), len(is_heads))

        seqlen = len(y)

        words = " ".join(words)
        tags = " ".join(tags)
        return words, x, is_heads, tags, y, seqlen

def pad(batch):
    f = lambda x: [sample[x] for sample in batch]
    words = f(0)
    is_heads = f(2)
    tags = f(3)
    seqlens = f(-1)
    maxlen = np.array(seqlens).max()

    f = lambda x, seqlen: [sample[x] + [0] * (seqlen - len(sample[x])) for sample in batch]
    x = f(1, maxlen)
    y = f(-2, maxlen)

    f = torch.LongTensor

    return words, f(x), is_heads, tags, f(y), seqlens

In [29]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [35]:
from pytorch_pretrained_bert import BertModel

Model

In [36]:
class BertNet(nn.Module):
    def __init__(self, vocab_size=None):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')

        self.fc = nn.Linear(768, vocab_size)
        self.device = device
    
    def forward(self, x, y):
        x = x.to(device)
        y = y.to(device)

        if self.training:
            self.bert.train()
            encoded_layers, _ = self.bert(x)
            enc = encoded_layers[-1]
        else:
            self.bert.eval()
            with torch.no_grad():
                encoded_layers, _ = self.bert(x)
                enc = encoded_layers[-1]
        
        logits = self.fc(enc)
        y_hat = logits.argmax(-1)
        return logits, y, y_hat

In [46]:
from tqdm import tqdm

def train(model, iterator, optimizer, criterion):
    model.train()
    for i, batch in enumerate(tqdm(iterator, desc="Training Progress")):
        words, x, is_heads, tags, y, seqlens = batch
        _y = y
        optimizer.zero_grad()
        logits, y, _ = model(x, y)

        logits = logits.view(-1, logits.shape[-1])
        y = y.view(-1)

        loss = criterion(logits, y)
        loss.backward()

        optimizer.step()

        if i % 10 == 0: # monitoring
            print("step: {}, loss: {}".format(i, loss.item()))

In [45]:
def eval(model, iterator):
    model.eval()

    Words, Is_heads, Tags, Y, Y_hat = [], [], [], [], []
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            words, x, is_heads, tags, y, seqlens = batch

            _, _, y_hat = model(x, y)  # y_hat: (N, T)

            Words.extend(words)
            Is_heads.extend(is_heads)
            Tags.extend(tags)
            Y.extend(y.numpy().tolist())
            Y_hat.extend(y_hat.cpu().numpy().tolist())

    ## gets results and save
    with open("result", 'w') as fout:
        for words, is_heads, tags, y_hat in zip(Words, Is_heads, Tags, Y_hat):
            y_hat = [hat for head, hat in zip(is_heads, y_hat) if head == 1]
            preds = [idx2tag[hat] for hat in y_hat]
            assert len(preds)==len(words.split())==len(tags.split())
            for w, t, p in zip(words.split()[1:-1], tags.split()[1:-1], preds[1:-1]):
                fout.write("{} {} {}\n".format(w, t, p))
            fout.write("\n")
            
    ## calc metric
    y_true =  np.array([tag2idx[line.split()[1]] for line in open('result', 'r').read().splitlines() if len(line) > 0])
    y_pred =  np.array([tag2idx[line.split()[2]] for line in open('result', 'r').read().splitlines() if len(line) > 0])

    acc = (y_true==y_pred).astype(np.int32).sum() / len(y_true)

    print("acc=%.2f"%acc)

In [48]:
model = BertNet(vocab_size=len(tag2idx))
model.to(device)
model = nn.DataParallel(model)

train_dataset = PosDataset(train_data)
eval_dataset = PosDataset(test_data)

train_iter = data.DataLoader(dataset=train_dataset,
                             batch_size=8,
                             shuffle=True,
                             num_workers=1,
                             collate_fn=pad)
test_iter = data.DataLoader(dataset=eval_dataset,
                             batch_size=8,
                             shuffle=False,
                             num_workers=1,
                             collate_fn=pad)

optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss(ignore_index=0)

train(model, train_iter, optimizer, criterion)
eval(model, test_iter)



KeyboardInterrupt: 