In [1]:
import comet_ml

import os
import collections

from transformers import BertTokenizer, BertModel
import torch
import numpy as np
import random

import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from ner.utils import create_dataset_and_document_dataloader
from ner.trainer import Trainer
from ner.model import BertNERBiLSTM, BertNER, DocumentContextBertBaseNER

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

SEED = 693

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(SEED)

comet_ml is installed but `COMET_API_KEY` is not set.


In [2]:
torch.cuda.get_device_name(device=1)

'TITAN V'

In [3]:
TOKENIZER = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
torch.cuda.set_device(1)
DEVICE = 'cuda' if torch.cuda.is_available else 'cpu'
BATCH_SIZE = 32

### CoNLL

In [4]:
train_dataset, train_documents, train_dataloader = create_dataset_and_document_dataloader('conll', 'data/conll2003/train.txt', batch_size=BATCH_SIZE, shuffle=False, tokenizer=TOKENIZER)
eval_dataset, eval_documents, eval_dataloader = create_dataset_and_document_dataloader('conll', 'data/conll2003/valid.txt', batch_size=BATCH_SIZE, shuffle=False, tokenizer=TOKENIZER)
test_dataset, test_documents, test_dataloader = create_dataset_and_document_dataloader('conll', 'data/conll2003/test.txt', batch_size=BATCH_SIZE, shuffle=False, tokenizer=TOKENIZER)

In [5]:
eval_dataset.idx2tag = train_dataset.idx2tag
eval_dataset.tag2idx = train_dataset.tag2idx
test_dataset.idx2tag = train_dataset.idx2tag
test_dataset.tag2idx = train_dataset.tag2idx

### Experiment

In [10]:
classes = len(train_dataset.ner_tags)

params = {
    'model': 'Bert-Base-Cased',
    'corpus': 'conll',
    'document_context': True,
    'hidden_size': 768,
    'batch_size': BATCH_SIZE,
    'shuffle_batch': False,
    'optimizer': 'AdamW',
    'learning_rate': 1e-6,
    'epochs': 5,
    'last_epoch_lstm': False,
    'seed': SEED
}

model = DocumentContextBertBaseNER(classes, DEVICE).to(DEVICE)
optimizer = optim.AdamW(model.parameters(), lr=params['learning_rate'])
criterion = nn.CrossEntropyLoss(ignore_index=-100).to(DEVICE)

experiment = comet_ml.Experiment(api_key='fxEY7T7JQW6R5I9DkDazSYRpp', project_name='ner-with-nonlocal-features', workspace='ryzhtus', log_graph=True)
experiment.set_model_graph(model)

trainer = Trainer(experiment, model, params, optimizer, criterion, None, False, params['epochs'], False, train_dataloader, eval_dataloader, test_dataloader,
                  train_documents, eval_documents, test_documents, train_dataset.tag2idx, train_dataset.idx2tag, DEVICE)

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/ryzhtus/ner-with-nonlocal-features/1ea9ab862cd248608df978e65e4404de



In [11]:
trainer.fit()

[1 / 5] Train: Loss = 1.09839, Token F1-score = 0.45%, Span F1-score = 0.44%: 100%|██████████| 439/439 [03:23<00:00,  2.16it/s]
[1 / 5] Eval : Loss = 0.41761, Token F1-score = 0.00%, Span F1-score = 0.00%: 100%|██████████| 102/102 [00:39<00:00,  2.55it/s]


ZeroDivisionError: float division by zero

In [None]:
trainer.test()

In [12]:
experiment.end()

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/ryzhtus/ner-with-nonlocal-features/1ea9ab862cd248608df978e65e4404de
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     train_Train Precision : 0.014698596201486376
COMET INFO:     train_Train Recall    : 0.0026760478681820917
COMET INFO:     train_Train Span F1   : 0.0043834115270025
COMET INFO:     train_Train Token F1  : 0.00452776435275863
COMET INFO:     train_loss [44]       : (0.4678955674171448, 2.2160940170288086)
COMET INFO:   Parameters:
COMET INFO:     batch_size       : 32
COMET INFO:     corpus           : conll
COMET INFO:     document_context : True
COMET INFO:     epochs           : 5
COMET INFO:     hidden_size      : 768
COMET INFO:     last_epoch_lstm  : 1
COMET INFO:     learning_rate    : 1e-06
COMET INFO:     model    