In [1]:
import torch
from transformers import AutoTokenizer
from utils.saver import tokenizer_loader
from utils.NERDataset import NERDataset
from utils.NERModel import NERModel

# File-name prefix of the corpus splits and the pretrained checkpoint whose
# tokenizer is loaded for them.
DATASET_NAME = "msra.min"
PRETRAINED_NAME = "bert-base-chinese"
tokenizer = tokenizer_loader(AutoTokenizer, PRETRAINED_NAME)

In [2]:
# Build the train/dev/test splits from ./data/<DATASET_NAME>.<split>, all sharing
# the same tokenizer.
train_dataset = NERDataset(tokenizer=tokenizer, filename=f"./data/{DATASET_NAME}.train")
dev_dataset = NERDataset(tokenizer=tokenizer, filename=f"./data/{DATASET_NAME}.dev")
test_dataset = NERDataset(tokenizer=tokenizer, filename=f"./data/{DATASET_NAME}.test")

# Fail fast: the dev and test splits must expose the same `id_label` as the
# training split (presumably the id -> label mapping; confirm in NERDataset).
# This is checked before the model is built so a mismatch is reported without
# first paying for model construction and the transfer to the GPU.
assert train_dataset.id_label == dev_dataset.id_label, \
    "dev split id_label differs from the training split"
assert train_dataset.id_label == test_dataset.id_label, \
    "test split id_label differs from the training split"

# The classifier is sized from the training split's label count.
model = NERModel(train_dataset.num_labels, model_name="bert-base-chinese")
if torch.cuda.is_available():
    model.to(torch.device("cuda:0"))

In [3]:
import tqdm
from utils.metrics import calc_acc

# Training hyper-parameters read by train_loop() and train() below.
# Learning rate handed to torch.optim.SGD.
LEARNING_RATE = 5e-3
# Number of train-then-evaluate rounds performed by train_loop().
EPOCH = 1
# Batch size of the training DataLoader.
BATCH_SIZE = 4

def train_loop(train_dataset, dev_dataset, model):
    """Train ``model`` for ``EPOCH`` rounds, printing the dev score after each.

    One SGD optimizer (``lr=LEARNING_RATE``) is created up front and reused for
    every round. Each round runs ``train`` on the training split and then
    prints whatever ``test`` returns for the dev split.

    Args:
        train_dataset: dataset passed to ``train``.
        dev_dataset: dataset passed to ``test`` after each round.
        model: model whose parameters are optimised.

    Returns:
        None. The per-round dev score is only printed.
    """
    sgd = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
    for _epoch in range(EPOCH):
        train(train_dataset, model, sgd)
        print(test(dev_dataset, model))

def train(dataset, model, optimizer):
    """Run one pass over ``dataset``, taking one optimizer step per mini-batch.

    Args:
        dataset: dataset whose items unpack into an ``(X, Y)`` pair once
            batched by a ``DataLoader``.
        model: module called as ``model(batch_X, batch_Y)``; the third element
            of its return value is used as the loss.
        optimizer: optimizer that updates ``model``'s parameters.

    Returns:
        None. ``model`` is updated in place through ``optimizer``.
    """
    # Set training mode explicitly so the pass does not depend on whatever mode
    # a previous evaluation left the model in.
    model.train()

    # Present the samples in a fresh random order on every pass. DataLoader
    # rejects shuffle=True for iterable-style datasets, so only enable it for
    # map-style ones.
    shuffle = not isinstance(dataset, torch.utils.data.IterableDataset)
    dataloader = torch.utils.data.DataLoader(
        dataset, batch_size=BATCH_SIZE, shuffle=shuffle
    )

    for batch_X, batch_Y in dataloader:
        # Clear gradients left over from the previous step before accumulating
        # new ones.
        optimizer.zero_grad()
        _, _, loss = model(batch_X, batch_Y)
        loss.backward()
        optimizer.step()

def test(dataset, model):
    """Score ``model`` on the whole of ``dataset`` and return ``calc_acc``'s result.

    The forward pass runs in evaluation mode with gradient tracking disabled,
    and the model's previous train/eval mode is restored afterwards, so calling
    this between training epochs does not alter subsequent training.

    Args:
        dataset: dataset exposing a ``length`` attribute; its items unpack into
            an ``(X, Y)`` pair once batched by a ``DataLoader``.
        model: module called as ``model(X, Y)`` and returning a
            ``(predict, ans, loss)`` triple.

    Returns:
        The value of ``calc_acc(predict, ans)``.
    """
    # The entire split is loaded as a single batch (batch_size = dataset.length).
    # NOTE(review): this can exhaust memory on large splits; batching would
    # require knowing how calc_acc results should be aggregated.
    test_loader = torch.utils.data.DataLoader(dataset, batch_size=dataset.length)
    X, Y = next(iter(test_loader))

    was_training = model.training
    model.eval()
    try:
        # No backward pass happens here, so skip building the autograd graph.
        with torch.no_grad():
            predict, ans, _ = model(X, Y)
    finally:
        # Put the model back in whichever mode the caller had it in.
        model.train(was_training)
    return calc_acc(predict, ans)

In [4]:
# Fine-tune on the training split; train_loop prints the dev-split score after each epoch.
train_loop(train_dataset, dev_dataset, model)

1.0


In [5]:
# Final evaluation on the test split; the returned score is displayed as the cell output.
# NOTE(review): a perfect score after a single epoch deserves a sanity check —
# confirm what calc_acc counts (e.g. whether padding positions are included).
test(test_dataset, model)

1.0