In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# local packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import datasets
# Load the CoNLL-2003 dataset through the Hugging Face `datasets` library.
conll = datasets.load_dataset("conll2003")
# Human-readable label names; the integer `ner_tags` of a sample index into this list.
CONLL_NER_TAGS = conll['train'].features['ner_tags'].feature.names
print(CONLL_NER_TAGS)
# Display one raw test sample to show the record layout.
conll["test"][2]

Found cached dataset conll2003 (/Users/kirillsergeev/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)
100%|██████████| 3/3 [00:00<00:00, 143.09it/s]

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']





{'id': '2',
 'tokens': ['AL-AIN', ',', 'United', 'Arab', 'Emirates', '1996-12-06'],
 'pos_tags': [22, 6, 22, 22, 23, 11],
 'chunk_tags': [11, 0, 11, 12, 12, 12],
 'ner_tags': [5, 0, 5, 6, 6, 0]}

In [3]:
from transformers import (pipeline, 
        AutoModelForTokenClassification, AutoTokenizer, 
        BertForTokenClassification, BertTokenizer)

# Load pretrained model and tokenizer for English NER task (dslim/bert-base-NER)
model_name = "dslim/bert-base-NER"
# Both objects are loaded from the same checkpoint name so that the tokenizer
# vocabulary matches the one the model was trained with.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name)

In [4]:
# Entity classes without the B-/I- prefix; the outside tag 'O' is not a class.
sorted({tag.rpartition('-')[2] for tag in CONLL_NER_TAGS} - {'O'})

['LOC', 'MISC', 'ORG', 'PER']

In [5]:
import numpy as np
import torch
from tqdm import tqdm

class ModelWrapper():
    """Adapter exposing a token-classification model as a per-token BIO tagger.

    The wrapped Hugging Face ``'ner'`` pipeline is created with
    ``aggregation_strategy=None``, so its predictions are per word piece.
    ``predict`` maps them back onto the whitespace-separated tokens of each
    input string, producing exactly one tag per token.

    Attributes:
        model: the Hugging Face ``'ner'`` pipeline used for inference.
        classes_: the entity classes passed by the caller (stored as-is).
    """

    # Keys of a raw prediction dict that the post-processing relies on.
    _ENTITY_KEYS = ('entity', 'word', 'start', 'end')

    def __init__(self, model, tokenizer, classes):
        """Build the inference pipeline.

        Args:
            model: token-classification model handed to ``pipeline``.
            tokenizer: tokenizer matching ``model``.
            classes: entity class names, stored in ``classes_``.
        """
        self.model = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy=None)
        self.classes_ = classes

    def _unite_entities(self, entities):
        """Merge ``##`` continuation pieces into the preceding prediction.

        A piece is merged only when its word starts with ``'##'`` and, if
        character offsets are available, it begins exactly where the previous
        piece ended. The merged entry keeps the label of its first piece.

        Args:
            entities: list of prediction dicts with at least the keys
                ``'entity'``, ``'word'``, ``'start'`` and ``'end'``.

        Returns:
            A new list of dicts restricted to those four keys; the input
            dicts are not modified.
        """
        united_result = []
        cur_entity = None
        for entity in entities:
            is_continuation = False
            if cur_entity is not None and entity['word'].startswith('##'):
                offsets_known = (entity['start'] is not None
                                 and cur_entity['end'] is not None)
                # Without offsets adjacency cannot be checked, so trust the marker.
                is_continuation = (not offsets_known
                                   or entity['start'] == cur_entity['end'])
            if is_continuation:
                # Drop only the two-character marker: the piece itself may
                # legitimately start with '#' (e.g. 'C' + '###' -> 'C#').
                cur_entity['word'] += entity['word'][2:]
                cur_entity['end'] = entity['end']
            else:
                if cur_entity is not None:
                    united_result.append(cur_entity)
                cur_entity = {key: entity[key] for key in self._ENTITY_KEYS}
        if cur_entity is not None:
            united_result.append(cur_entity)
        return united_result

    @staticmethod
    def _token_spans(tokens, text=None):
        """Return ``(start, end)`` character offsets for every token.

        When ``text`` is given, each token is located in it left to right;
        otherwise the tokens are assumed to be joined by single spaces.
        """
        spans = []
        cursor = 0
        for token in tokens:
            start = cursor
            if text is not None:
                located = text.find(token, cursor)
                if located != -1:
                    start = located
            end = start + len(token)
            spans.append((start, end))
            # At least one separator character follows every token.
            cursor = end + 1
        return spans

    @staticmethod
    def _align_by_offsets(spans, entities):
        """Tag each token with the label of the piece covering its first character."""
        ordered = sorted(entities, key=lambda item: item['start'])
        bio_tags = []
        idx = 0
        for tok_start, _tok_end in spans:
            # Discard predictions that end before this token begins; this is
            # what keeps one unmatched prediction from blocking all later ones.
            while idx < len(ordered) and ordered[idx]['end'] <= tok_start:
                idx += 1
            if idx < len(ordered) and ordered[idx]['start'] <= tok_start:
                bio_tags.append(ordered[idx]['entity'])
            else:
                bio_tags.append('O')
        return bio_tags

    @staticmethod
    def _align_by_words(tokens, entities):
        """Best-effort alignment by exact word match, used when offsets are missing."""
        bio_tags = ['O'] * len(tokens)
        cursor = 0
        for entity in entities:
            try:
                position = tokens.index(entity['word'], cursor)
            except ValueError:
                # No token equals this prediction: skip it instead of stalling.
                continue
            bio_tags[position] = entity['entity']
            cursor = position + 1
        return bio_tags

    def _convert_entities_to_bio(self, tokens, entities, text=None):
        """Project predictions onto ``tokens``, one tag per token.

        A token receives the label of the prediction that covers its first
        character and ``'O'`` when there is none.

        Args:
            tokens: whitespace-separated tokens of the input string.
            entities: predictions, normally the output of ``_unite_entities``.
            text: optional original string, used to locate the tokens exactly;
                when omitted, tokens are assumed to be joined by single spaces.

        Returns:
            List of tags with ``len(tokens)`` elements.
        """
        tokens = list(tokens)
        offsets_missing = any(
            entity.get('start') is None or entity.get('end') is None
            for entity in entities
        )
        if offsets_missing:
            return self._align_by_words(tokens, entities)
        return self._align_by_offsets(self._token_spans(tokens, text), entities)

    def _postprocessing(self, tokens, model_output, text=None):
        """Turn the raw pipeline output for one string into per-token tags."""
        entities = self._unite_entities(model_output)
        bio_tags = self._convert_entities_to_bio(tokens, entities, text=text)
        return bio_tags

    def predict(self, X):
        """Predict tags for a batch of strings.

        Args:
            X: sequence of strings; each is split on whitespace into tokens.

        Returns:
            List with one list of tags per input string, each as long as
            ``text.split()`` for that string.
        """
        with torch.no_grad():
            ner_entities = self.model(X)
            tags = []
            for text, entities in tqdm(zip(X, ner_entities)):
                tags.append(self._postprocessing(text.split(), entities, text=text))
            return tags

# Entity classes are the CoNLL-2003 tag names without their B-/I- prefix.
wrapped_model = ModelWrapper(
    model=model,
    tokenizer=tokenizer,
    classes=['LOC', 'MISC', 'ORG', 'PER'],
)

In [6]:
def preprocessing_dataset(dataset):
    """Convert dataset samples into model inputs and gold tag sequences.

    Args:
        dataset: iterable of samples, each providing ``'tokens'`` (list of
            strings) and ``'ner_tags'`` (indices into ``CONLL_NER_TAGS``).

    Returns:
        Dict with ``'X'`` (each sample's tokens joined by a single space) and
        ``'y_true'`` (each sample's tag names), in dataset order.
    """
    texts, gold_tags = [], []
    for record in dataset:
        texts.append(' '.join(record['tokens']))
        gold_tags.append([CONLL_NER_TAGS[tag_id] for tag_id in record['ner_tags']])
    return {'X': texts, 'y_true': gold_tags}

In [7]:
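# One-off inference run (slow): computes the `y_pred` predictions for the train and
# test splits. It is kept commented out because the results are cached on disk;
# uncomment to regenerate them.
# from itertools import islice
# train_data = preprocessing_dataset(conll['train'])
# train_data['y_pred'] = wrapped_model.predict(train_data['X'])

# oos_data = preprocessing_dataset(conll['test'])
# oos_data['y_pred'] = wrapped_model.predict(oos_data['X'])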

In [8]:
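# Writes the computed predictions to disk so that later cells can load them
# without re-running the model. Uncomment together with the inference cell above.
# import pickle

# with open('./train_data.pkl', 'wb') as f:
#     pickle.dump(train_data, f)
# with open('./oos_data.pkl', 'wb') as f:
#     pickle.dump(oos_data, f)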

In [9]:
import pickle

# Load the cached train data from the working directory.
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# pickles that you created yourself.
with open('./train_data.pkl', 'rb') as f:
    train_data = pickle.load(f)

# Same for the out-of-sample data.
with open('./oos_data.pkl', 'rb') as f:
    oos_data = pickle.load(f)

In [10]:
# Sanity check: every predicted tag sequence is as long as its gold sequence.
print(all(len(gold) == len(pred)
          for gold, pred in zip(train_data['y_true'], train_data['y_pred'])))
print(all(len(gold) == len(pred)
          for gold, pred in zip(oos_data['y_true'], oos_data['y_pred'])))

True
True


In [11]:
from sbe_vallib.validation.sampler.ner_sampler import NerSampler

# Hand the cached train and out-of-sample data to the sampler used by the tests below.
sampler = NerSampler(train=train_data, oos=oos_data)
# Fixed seed so that repeated runs draw the same samples.
# NOTE(review): `bootstrap=True` presumably switches on bootstrap resampling — confirm in sbe_vallib.
sampler.set_seed(42, bootstrap=True)

In [12]:
from sbe_vallib.validation.scorer.ner_scorer import NerScorer
from sbe_vallib.validation.utils.metrics import NER_IOB_METRICS


# Scorer configured with the library's IOB metric set.
scorer = NerScorer(metrics=NER_IOB_METRICS)
# Score the cached out-of-sample predictions against the gold tags.
scores = scorer.score(oos_data['y_true'], oos_data['y_pred'])
# Last expression of the cell: displays the metric dictionary.
scores

{'precision_score': {'LOC': 0.9363057324840764,
  'MISC': 0.804920913884007,
  'ORG': 0.8944262295081967,
  'PER': 0.9319965126416739,
  'micro': 0.9043806982432733,
  'macro': 0.8919123471294885,
  'weighted': 0.9064257739880867},
 'f1_score': {'LOC': 0.8043775649794801,
  'MISC': 0.7206923682140047,
  'ORG': 0.8562460765850596,
  'PER': 0.7735166425470332,
  'micro': 0.8017742730409069,
  'macro': 0.7887081630813945,
  'weighted': 0.800394646791491},
 'recall_score': {'LOC': 0.7050359712230215,
  'MISC': 0.6524216524216524,
  'ORG': 0.8211920529801324,
  'PER': 0.6611008039579468,
  'micro': 0.7200779036827195,
  'macro': 0.7099376201456882,
  'weighted': 0.7200779036827195},
 'support': {'LOC': 1668,
  'MISC': 702,
  'ORG': 1661,
  'PER': 1617,
  'micro': 5648,
  'macro': 5648,
  'weighted': 5648}}

In [13]:
# NER-specific metrics on the out-of-sample data; rendered as a table in the next cell.
ner_res = scorer.ner_metrics(oos_data['y_true'], oos_data['y_pred'], wrapped_model)

In [14]:
import pandas as pd

# Show the metrics as a table, moving the index into a named column.
pd.DataFrame(ner_res).reset_index(names='Наименование класса')

Unnamed: 0,Наименование класса,f1_ent_type,f1_partial,f1_strict,f1_exact
0,LOC,0.806265,0.847804,0.800817,0.840994
1,MISC,0.761218,0.790865,0.733974,0.767628
2,ORG,0.882945,0.911894,0.858402,0.892385
3,PER,0.803019,0.807692,0.768512,0.788641
4,macro,0.823854,0.849877,0.801774,0.833711


In [15]:
from sbe_vallib.validation.nlp.general_tests.model_quality.test_key_metric import test_key_metric

In [16]:
# Run the key-metric test on macro-averaged precision and display the first
# of the result tables it returns.
test_key_metric(wrapped_model, scorer, sampler,
                metric_name='precision_score',
                average='macro')['result_dataframes'][0]

Unnamed: 0,type,support,precision_score,semaphore
0,LOC,1668,0.936306,green
1,MISC,702,0.804921,green
2,ORG,1661,0.894426,green
3,PER,1617,0.931997,green
4,agg_macro,5648,0.891912,green


In [17]:
from sbe_vallib.validation.nlp.general_tests.model_quality.test_ner_metric import test_ner_metric

In [18]:
# Run the NER-metric test and display the first of the result tables it returns.
test_ner_metric(wrapped_model, scorer, sampler)['result_dataframes'][0]

Unnamed: 0,Наименование класса,f1_ent_type,f1_partial,f1_strict,f1_exact
0,LOC,0.806265,0.847804,0.800817,0.840994
1,MISC,0.761218,0.790865,0.733974,0.767628
2,ORG,0.882945,0.911894,0.858402,0.892385
3,PER,0.803019,0.807692,0.768512,0.788641
4,macro,0.823854,0.849877,0.801774,0.833711


In [19]:
from sbe_vallib import BaseValidation

# Assemble the validation run from the model wrapper, scorer and sampler.
# The pipeline config is given as a relative path, so it is resolved against
# the current working directory of the kernel.
ner_validation = BaseValidation(model = wrapped_model, scorer=scorer, sampler=sampler, pipeline='sbe_vallib/validation/nlp/pipelines/Config_45.xlsx')

In [22]:
# Execute the configured validation pipeline and keep its results.
res = ner_validation.validate()

	Train/oos samples are too huge for independence test, calculating on 0.4287 fraction
(7499, 85)


100%|██████████| 20/20 [00:01<00:00, 12.44it/s]


	Train/oos samples are too huge for independence test, calculating on 0.4287 fraction
(7499, 85)


100%|██████████| 20/20 [00:01<00:00, 12.90it/s]


	Train/oos samples are too huge for independence test, calculating on 0.4287 fraction
(7499, 85)


100%|██████████| 20/20 [00:01<00:00, 12.90it/s]


	Train/oos samples are too huge for independence test, calculating on 0.4287 fraction
(7499, 85)


100%|██████████| 20/20 [00:01<00:00, 12.80it/s]


	Train/oos samples are too huge for independence test, calculating on 0.4287 fraction
(7499, 85)


100%|██████████| 20/20 [00:01<00:00, 11.88it/s]


	Train/oos samples are too huge for independence test, calculating on 0.4287 fraction
(7499, 85)


100%|██████████| 20/20 [00:01<00:00, 12.49it/s]


	Train/oos samples are too huge for independence test, calculating on 0.4287 fraction
(7499, 85)


100%|██████████| 20/20 [00:01<00:00, 12.42it/s]


	Train/oos samples are too huge for independence test, calculating on 0.4287 fraction
(7499, 85)


100%|██████████| 20/20 [00:01<00:00, 13.16it/s]


	Train/oos samples are too huge for independence test, calculating on 0.4287 fraction
(7499, 85)


100%|██████████| 20/20 [00:01<00:00, 13.38it/s]


	Train/oos samples are too huge for independence test, calculating on 0.4287 fraction
(7499, 85)


100%|██████████| 20/20 [00:01<00:00, 13.34it/s]


In [24]:
# List the names of the tests for which the validation run returned a result.
res.keys()

dict_keys(['train_test_independence_test', 'test_extremal_missing_values', 'psi_factor_classes_test_oos', 'psi_factor_classes_test_oot', 'test_key_metric', 'test_confidence_inteval', 'test_presicion', 'test_recall', 'test_2_5', 'test_3_1', 'test_3_2', 'test_feature_importance', 'test_4_2', 'test_key_metric_stability', 'test_presicion_stability', 'test_recall_stability', 'test_5_4', 'test_5_6', 'test_5_7'])