In [1]:
import numpy as np
import re
import json
import torch
from pathlib import Path
from transformers import BertTokenizerFast, Trainer, TrainingArguments, BertForTokenClassification, TokenClassificationPipeline
from sklearn.model_selection import train_test_split

In [2]:
p = Path('.').glob('data/markup/*.json')
data = []
for filename in p:
    with open(filename, 'rt') as file:
        data += json.load(file)

In [3]:
def includes(in_start, in_end, out_start, out_end):
    if out_start <= in_start and in_end <= out_end:
        return True
    return False

In [4]:
def make_tags(data):
    total_result = []
    for query in data:
        if 'ner' not in query.keys() or len(query['ner']) == 3:
            continue
        words_with_pos = [(m.group(0), m.start(), m.end()) for m in re.finditer(r'\S+', query['text'])]
        ner_list = [(x['htmllabels'][0], x['startOffset'], x['endOffset']) for x in query['ner']]
        if ner_list[0][0] == 'Пропущено':
            ner_list = []
        if len(ner_list) == 2:
            ner_list[1] = (ner_list[1][0], ner_list[0][2] + ner_list[1][1], ner_list[0][2] + ner_list[1][2])
        sentence_wtags = []
        for word, w_start, w_end in words_with_pos:
            tagged = False
            for tag, t_start, t_end in ner_list:
                if includes(w_start, w_start, t_start, t_end):
                    sentence_wtags.append((word, tag))
                    tagged = True
                    break
            if not tagged:
                sentence_wtags.append((word, 'O'))
        total_result.append(sentence_wtags)
    texts = [[x[0] for x in y] for y in total_result]
    tags = [[x[1] for x in y] for y in total_result]
    return texts, tags

In [5]:
texts, tags = make_tags(data)

In [6]:
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=0.05, random_state=42)

In [7]:
unique_tags = set(tag for doc in tags for tag in doc)
tag2id = {'O': 0, 'Товар': 1, 'Бренд': 2}
id2tag = {id: tag for tag, id in tag2id.items()}

In [8]:
print(unique_tags)
print(tag2id)
print(id2tag)

{'Товар', 'Бренд', 'O'}
{'O': 0, 'Товар': 1, 'Бренд': 2}
{0: 'O', 1: 'Товар', 2: 'Бренд'}


In [9]:
tokenizer = BertTokenizerFast.from_pretrained('tokenizer')

In [10]:
train_encodings = tokenizer(train_texts, return_token_type_ids=False, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, return_token_type_ids=False, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [11]:
def encode_tags(tags, encodings):
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    i = 0
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
        arr_offset = np.array(doc_offset)
        
        if ((arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)).sum() != len(doc_labels):
            print(i)
            print(doc_labels)
            print(arr_offset)
            continue
        # set labels whose first offset position is 0 and the second is not 0
        doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())
        i += 1
    
    return encoded_labels

In [12]:
val_labels = encode_tags(val_tags, val_encodings)

In [13]:
train_labels = encode_tags(train_tags, train_encodings)

In [14]:
class NerDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_encodings.pop("offset_mapping") # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
train_dataset = NerDataset(train_encodings, train_labels)
val_dataset = NerDataset(val_encodings, val_labels)

In [15]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=50,                # number of warmup steps for learning rate scheduler
    weight_decay=0.1,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    dataloader_num_workers=8,
    evaluation_strategy='steps',
    eval_steps=50,
    save_steps=100
)

In [16]:
model = BertForTokenClassification.from_pretrained("pre-trained", num_labels=len(unique_tags))

Some weights of the model checkpoint at pre-trained were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at pre-trained and are newly initialized: ['clas

In [17]:
#  Замораживаем ли  слои?
#for i, p in enumerate(model.parameters()):
#    if i > 85:#i == 99 or i == 100 or i == 101 or i == 102:
#        continue
#    else:
#        p.requires_grad = False

In [18]:
model.config

BertConfig {
  "_name_or_path": "pre-trained",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 6,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.5.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}

In [19]:
model.config.id2label = id2tag
model.config.label2id = tag2id
model.config.attention_probs_dropout_prob = 0.2
model.config.hidden_dropout_prob = 0.2

In [20]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

In [21]:
trainer.train()

Step,Training Loss,Validation Loss,Runtime,Samples Per Second
50,0.2601,0.26052,0.5108,978.946
100,0.2161,0.197065,0.5204,960.765
150,0.1899,0.177777,0.5644,885.83
200,0.1007,0.184997,0.513,974.727
250,0.11,0.186718,0.5239,954.341
300,0.077,0.184482,0.4774,1047.375
350,0.0569,0.203369,0.488,1024.499
400,0.0478,0.197296,0.4829,1035.315


TrainOutput(global_step=447, training_loss=0.1689124878754285, metrics={'train_runtime': 92.4521, 'train_samples_per_second': 4.835, 'total_flos': 732669906981870.0, 'epoch': 3.0, 'init_mem_cpu_alloc_delta': 2473680896, 'init_mem_gpu_alloc_delta': 264506880, 'init_mem_cpu_peaked_delta': 203825152, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 32595968, 'train_mem_gpu_alloc_delta': 811728896, 'train_mem_cpu_peaked_delta': 184422400, 'train_mem_gpu_peaked_delta': 1480026112})