In [1]:
%load_ext autoreload
%autoreload 2
from ner_doccano_utils import (
    load_json_lines, extract_entities, Tokenizers, tokenize,
    distribute_labels, iob, doccano_to_iob_tokens, display_annotations,
    split_doc_into_sentences
)

To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html
Neither PyTorch nor TensorFlow >= 2.0 have been found.Models won't be available and only tokenizers, configurationand file/data utilities can be used.


## `load_json_lines()`

In [2]:
# Load the example doccano export. Going by the helper's name the file is
# presumably read as one JSON document per line -- TODO confirm.
# The following cells show the result is a list of dicts with 'text' and 'labels'.
doccano_data = load_json_lines('examples/danish.json')

In [3]:
# Number of documents loaded from the example file.
len(doccano_data)

4

In [4]:
# Container type returned by load_json_lines().
type(doccano_data)

list

In [5]:
# Inspect one document: 'text' is the raw string and 'labels' is a list of
# [start, end, label] spans. In this example the offsets behave like
# end-exclusive character indices (52..65 covers 'Statsminister').
doccano_data[1]

{'text': 'Der er ikke fejet noget ind under gulvtæppet, sagde Statsminister Poul Schlüter fra Folketingets talerstol 25. april 1989.',
 'labels': [[52, 65, 'title'],
  [66, 79, 'name'],
  [84, 95, 'org'],
  [107, 121, 'date']]}

## `extract_entities()`

In [6]:
# Group the annotated spans by label: the result maps each label to the list
# of annotated surface strings taken from the document text.
extract_entities(doccano_data[1])

{'title': ['Statsminister'],
 'org': ['Folketinget'],
 'name': ['Poul Schlüter'],
 'date': ['25. april 1989']}

In [7]:
# With include_indices=True each entry is a (start, end, text) tuple instead
# of just the text.
extract_entities(doccano_data[1], include_indices=True)

{'title': [(52, 65, 'Statsminister')],
 'org': [(84, 95, 'Folketinget')],
 'name': [(66, 79, 'Poul Schlüter')],
 'date': [(107, 121, '25. april 1989')]}

## `Tokenizers()`

In [8]:
# Build the tokenizer bundle that the helper calls below receive
# (as `tokenizers=models` or positionally). 'da' is the spaCy language code
# for Danish; the BERT side uses the multilingual cased model name.
models = Tokenizers(
    spacy_lang_class='da',
    bert_model='bert-base-multilingual-cased',
)

## `tokenize()`

In [9]:
# Tokenize document 1. Each item in the output is a (start, end, token)
# triple with offsets into the document text; tokens starting with '##' are
# word-piece continuations of the preceding token.
tokens = tokenize(doccano_document=doccano_data[1], tokenizers=models)
tokens

[(0, 3, 'Der'),
 (4, 6, 'er'),
 (7, 11, 'ikke'),
 (12, 14, 'fe'),
 (14, 17, '##jet'),
 (18, 23, 'noget'),
 (24, 27, 'ind'),
 (28, 33, 'under'),
 (34, 37, 'gul'),
 (37, 39, '##vt'),
 (39, 40, '##æ'),
 (40, 44, '##ppet'),
 (44, 45, ','),
 (46, 48, 'sa'),
 (48, 51, '##gde'),
 (52, 56, 'Stat'),
 (56, 65, '##sminister'),
 (66, 70, 'Poul'),
 (71, 73, 'Sc'),
 (73, 75, '##hl'),
 (75, 79, '##üter'),
 (80, 83, 'fra'),
 (84, 88, 'Folk'),
 (88, 92, '##etin'),
 (92, 95, '##get'),
 (95, 96, '##s'),
 (97, 101, 'tale'),
 (101, 104, '##rst'),
 (104, 106, '##ol'),
 (107, 109, '25'),
 (109, 110, '.'),
 (111, 116, 'april'),
 (117, 121, '1989'),
 (121, 122, '.')]

## `distribute_labels()`

In [10]:
# Assign one label per token: the output has one entry for every item in
# `tokens` (from the tokenize() cell), with '' for tokens outside any
# annotated span.
labels = distribute_labels(doccano_data[1], tokens)
labels

['',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'title',
 'title',
 'name',
 'name',
 'name',
 'name',
 '',
 'org',
 'org',
 'org',
 '',
 '',
 '',
 '',
 'date',
 'date',
 'date',
 'date',
 '']

## `iob()`

In [11]:
# Convert the per-token labels to IOB tags. As the output shows, '' becomes
# 'O', the first token of a labelled run gets 'B-<LABEL>' and the following
# tokens 'I-<LABEL>', with the label upper-cased.
iob(labels)

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-TITLE',
 'I-TITLE',
 'B-NAME',
 'I-NAME',
 'I-NAME',
 'I-NAME',
 'O',
 'B-ORG',
 'I-ORG',
 'I-ORG',
 'O',
 'O',
 'O',
 'O',
 'B-DATE',
 'I-DATE',
 'I-DATE',
 'I-DATE',
 'O']

## `doccano_to_iob_tokens()`

In [12]:
# One call from a doccano document to (token, IOB tag) pairs; for this
# document the output lines up with the tokenize() and iob() results above.
doccano_to_iob_tokens(doccano_document=doccano_data[1], tokenizers=models)

[('Der', 'O'),
 ('er', 'O'),
 ('ikke', 'O'),
 ('fe', 'O'),
 ('##jet', 'O'),
 ('noget', 'O'),
 ('ind', 'O'),
 ('under', 'O'),
 ('gul', 'O'),
 ('##vt', 'O'),
 ('##æ', 'O'),
 ('##ppet', 'O'),
 (',', 'O'),
 ('sa', 'O'),
 ('##gde', 'O'),
 ('Stat', 'B-TITLE'),
 ('##sminister', 'I-TITLE'),
 ('Poul', 'B-NAME'),
 ('Sc', 'I-NAME'),
 ('##hl', 'I-NAME'),
 ('##üter', 'I-NAME'),
 ('fra', 'O'),
 ('Folk', 'B-ORG'),
 ('##etin', 'I-ORG'),
 ('##get', 'I-ORG'),
 ('##s', 'O'),
 ('tale', 'O'),
 ('##rst', 'O'),
 ('##ol', 'O'),
 ('25', 'B-DATE'),
 ('.', 'I-DATE'),
 ('april', 'I-DATE'),
 ('1989', 'I-DATE'),
 ('.', 'O')]

## `display_annotations()`

In [13]:
# Display the annotations of document 0 (no text output is captured for
# this cell in the export).
display_annotations(doccano_data[0])

In [14]:
# Display the annotations of document 1, the sentence used in the cells
# above. Presumably a rich (non-text) rendering, since no text output is
# captured here -- TODO confirm.
display_annotations(doccano_data[1])

In [15]:
# Display the annotations of document 2.
display_annotations(doccano_data[2])

In [16]:
# Display the annotations of document 3 with the default settings; the next
# cell shows the same document with break_lines=False for comparison.
display_annotations(doccano_data[3])

In [17]:
# Same document as the previous cell, but with break_lines=False.
# NOTE(review): presumably this stops the rendering from preserving the
# line breaks of the source text -- confirm against display_annotations().
display_annotations(doccano_data[3], break_lines=False)

## `split_doc_into_sentences()`

In [18]:
# Split the raw text of document 0 into sentences; the output is a list with
# one list of word-piece tokens per sentence.
# NOTE(review): MAX_LEN=128 is presumably a per-sentence token limit -- confirm.
document = doccano_data[0]['text']
split_doc_into_sentences(document, models, MAX_LEN=128)

[['Og', 'så', 'sky', '##der', 'han', '.'],
 ['Ne', '##j', ',', 'han', 'vente', '##r', '.'],
 ['Ja', '##aa', '##a', ',', 'Michael', 'Lau', '##dru', '##p', '.'],
 ['Det', 'er', 'geni', '##alt', ',', 'dé', '##t', 'der', '!']]

In [19]:
# Same split for document 3. Note that `document` is rebound here, so any
# later cell reading it sees the text of document 3.
# The second sentence in the output is very long compared with document 0.
# NOTE(review): check how sentences longer than MAX_LEN are handled.
document = doccano_data[3]['text']
split_doc_into_sentences(document, models, MAX_LEN=128)

[['DE', '##N', 'H', '##Æ', '##SL', '##IG', '##E', 'BY', '.'],
 ['S',
  '##kr',
  '##ig',
  'er',
  'ud',
  '##sp',
  '##æ',
  '##ndt',
  'mellem',
  'hus',
  '##ene',
  'i',
  'den',
  'h',
  '##æ',
  '##sli',
  '##ge',
  'by',
  'Men',
  '##nes',
  '##kene',
  'er',
  'sky',
  '##gger',
  'af',
  'fa',
  '##brik',
  '##kern',
  '##e',
  'og',
  'kon',
  '##tore',
  '##rne',
  ',',
  'En',
  'pi',
  '##ge',
  'l',
  '##ø',
  '##fter',
  'sin',
  'kjo',
  '##le',
  'og',
  'hendes',
  'k',
  '##øn',
  'er',
  'et',
  'ur',
  'Fragment',
  '##er',
  'af',
  'endnu',
  'fri',
  'be',
  '##vid',
  '##st',
  '##hed',
  'h',
  '##ænge',
  '##r',
  'som',
  'en',
  't',
  '##åg',
  '##e',
  'i',
  'lu',
  '##ften',
  'kun',
  'gennem',
  '##lys',
  '##t',
  'af',
  'neo',
  '##n',
  '##øj',
  '##nene',
  'Gen',
  '##nem',
  'gade',
  '##rne',
  'fl',
  '##axe',
  '##r',
  'ans',
  '##kud',
  '##te',
  'dr',
  '##ømme',
  'med',
  'vin',
  '##ger',
  'som',
  'vi',
  '##mp',
  '##ler',
  'af',