In [1]:
import os
import collections

from transformers import BertTokenizer, BertModel
import torch
import numpy as np
import random

import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from named_entity_recognition.utils import create_dataset_and_document_dataloader
from named_entity_recognition.train_document_word_context import train_model, test_model
from named_entity_recognition.model import DocumentWordContextBertNER

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

# Seed for this run; handed to seed_everything() below so that every RNG
# (random, NumPy, PyTorch) is initialised in one place.
SEED = 693

def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU generator plus
    the current and all CUDA devices), exports ``PYTHONHASHSEED``, and
    configures cuDNN with ``deterministic=True`` and ``benchmark=False``.

    Args:
        seed: Value applied to every generator. Defaults to 42.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Each entry takes the seed as its single argument.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Favour repeatable cuDNN behaviour over speed.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Apply the notebook-wide seed in the first cell, before anything else runs.
seed_everything(seed=SEED)

comet_ml is installed but `COMET_API_KEY` is not set.


In [2]:
# Sanity check: the name of CUDA device 0 is displayed as this cell's output.
torch.cuda.get_device_name(0)

'TITAN V'

In [3]:
# Tokenizer for the cased BERT checkpoint; do_lower_case=False is passed
# explicitly so the input text is not lower-cased.
TOKENIZER = BertTokenizer.from_pretrained(
    'bert-base-cased',
    do_lower_case=False,
)
# Pick the compute device. `torch.cuda.is_available` has to be *called*: the
# bare function object is always truthy, which made DEVICE 'cuda' on every
# machine. The GPU is pinned only when CUDA is actually present, so the CPU
# fallback is reachable.
if torch.cuda.is_available():
    torch.cuda.set_device(0)  # use the first GPU
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# Training hyperparameters.
EPOCHS = 5
BATCH_SIZE = 32

In [4]:
def _load_conll_split(path):
    """Build the (dataset, documents, dataloader) triple for one CoNLL file.

    Every split uses the same settings: the shared tokenizer, BATCH_SIZE,
    and shuffle=False.
    """
    return create_dataset_and_document_dataloader(
        'conll',
        path,
        batch_size=BATCH_SIZE,
        shuffle=False,
        tokenizer=TOKENIZER,
    )


train_dataset, train_documents, train_dataloader = _load_conll_split("data/conll2003/train.txt")
eval_dataset, eval_documents, eval_dataloader = _load_conll_split("data/conll2003/valid.txt")
test_dataset, test_documents, test_dataloader = _load_conll_split("data/conll2003/test.txt")

In [5]:
# Make the validation and test datasets use the training split's tag <-> index
# mappings, so all three splits agree on label ids.
# NOTE(review): the dataloaders were already built in the previous cell; if
# labels are encoded at construction time this reassignment will not re-encode
# them -- confirm against the dataset implementation.
for held_out_dataset in (eval_dataset, test_dataset):
    held_out_dataset.idx2tag = train_dataset.idx2tag
    held_out_dataset.tag2idx = train_dataset.tag2idx

## Shuffled Sentence Batches

### SEED 693

In [None]:
# One output class per NER tag of the training split.
classes = len(train_dataset.ner_tags)

# Model, optimizer and loss for this run.
model = DocumentWordContextBertNER(classes, DEVICE)
model = model.to(DEVICE)
optimizer = optim.AdamW(model.parameters(), lr=1e-5)
# Targets equal to 0 are excluded from the loss.
# NOTE(review): presumably index 0 is the padding tag -- confirm in tag2idx.
criterion = nn.CrossEntropyLoss(ignore_index=0)
criterion = criterion.to(DEVICE)

# Train for EPOCHS epochs with validation on the eval split, then score the
# test split with the same tag mappings.
train_model(
    model,
    criterion,
    optimizer,
    train_dataloader,
    eval_dataloader,
    train_documents,
    eval_documents,
    train_dataset.tag2idx,
    train_dataset.idx2tag,
    DEVICE,
    False,  # NOTE(review): meaning of this positional flag is defined in train_model -- confirm
    None,   # NOTE(review): same; see the train_model signature
    EPOCHS,
)
test_model(
    model,
    criterion,
    test_dataloader,
    train_dataset.tag2idx,
    train_dataset.idx2tag,
    DEVICE,
    test_documents,
)

### SEED 1793