## `Alexander Kurdyukov BS21-AI-01`

## Loading and processing the data

### The `ru_core_news_lg` vectors were used for training the tok2vec model

In [1]:
# !python -m spacy download ru_core_news_lg

### Importing needed packages and loading the HuggingFace dataset to use its part as test data

In [1]:
import spacy
from datasets import load_dataset
from tqdm import tqdm

# Make spaCy allocate its data on the GPU; per the spaCy docs this raises when
# no GPU is available instead of silently falling back to the CPU.
spacy.require_gpu()

# Load the 'data' configuration of the RuNNE dataset from the Hugging Face Hub;
# its 'test' split is converted further down and used as evaluation data.
dataset = load_dataset('MalakhovIlya/RuNNE', 'data')

  from .autonotebook import tqdm as notebook_tqdm
Using the latest cached version of the module from C:\Users\Ario\.cache\huggingface\modules\datasets_modules\datasets\MalakhovIlya--RuNNE\5c0467600cde2a64546227a05688adcfcdcb583c442a7cac64b864313a68e588 (last modified on Thu Apr 11 20:23:35 2024) since it couldn't be found locally at MalakhovIlya/RuNNE, or remotely on the Hugging Face Hub.


### Creating a function for processing the data

In [2]:
from spacy.tokens import DocBin

def get_spacy_doc(data):
  """Convert annotated records into a spaCy ``DocBin`` with NER spans.

  Every record is tokenized with a blank Russian pipeline, its character-offset
  annotations are mapped onto token-aligned spans, overlapping/duplicate spans
  are dropped with ``spacy.util.filter_spans`` and the resulting ``Doc`` is
  added to the returned ``DocBin``.

  Args:
    data: iterable of mappings with a ``"sentences"`` entry (the raw text) and
      a ``"ners"`` entry (iterable of ``(start, end, label)`` triples whose
      offsets are ints or numeric strings).

  Returns:
    A ``DocBin`` with one ``Doc`` per record that could be stored.

  Side effects:
    Prints ``<annotations seen> <spans kept after filtering> <annotations
    ignored>``, plus one extra line if any document had to be skipped.
  """
  ### Create a blank spaCy pipeline
  nlp = spacy.blank('ru')

  ### Create DocBin object to store processed texts
  db = DocBin()

  ### Some counters for statistics
  counter_all, counter_filt, counter_ign = 0, 0, 0
  skipped_docs = 0

  ### Iterate through the data
  for line in tqdm(data):
    doc = nlp.make_doc(line["sentences"])
    ents = []

    ### Extract entities from the annotations
    for start, end, label in line['ners']:
      start, end = int(start), int(end)
      counter_all += 1

      ### The annotated offsets are sometimes off by one, so besides the exact
      ### range also try one extra character on the right, then on the left.
      span = None
      for cand_start, cand_end in ((start, end), (start, end + 1), (start - 1, end)):
        try:
          span = doc.char_span(cand_start, cand_end, label=label, alignment_mode='strict')
        except Exception:
          ### Best effort: a failing annotation is treated as unalignable, but
          ### KeyboardInterrupt/SystemExit are no longer swallowed.
          span = None
          break
        if span is not None:
          break

      if span is None:
        ### Annotation could not be aligned to token boundaries
        counter_ign += 1
      else:
        ents.append(span)

    try:
      ### Filtering overlapping and repeating spans and then saving the text and NER-data
      filtered_ents = spacy.util.filter_spans(ents)
      doc.ents = filtered_ents
      db.add(doc)
    except Exception:
      ### Keep going with the remaining records, but remember the loss
      skipped_docs += 1
      continue

    ### Only count spans of documents that actually ended up in the DocBin
    counter_filt += len(filtered_ents)

  ### Printing some global statistics about the spans
  print(counter_all, counter_filt, counter_ign)
  if skipped_docs:
    print(f"Skipped {skipped_docs} document(s) that could not be stored")
  return db

### Loading the train and test data with some formatting 

In [3]:
import json

# The corpus is Russian text: decode it explicitly as UTF-8 rather than with
# the locale-dependent default codec, otherwise the text (and with it the
# character offsets of the annotations) can be corrupted. Blank lines are
# skipped so a trailing newline does not break json.loads.
with open('data/train.jsonl', encoding='utf-8') as f:
    c_train_data = [json.loads(line) for line in f if line.strip()]

# Bring the Hugging Face test split into the same record layout as the
# training data: "sentences" holds the text, "ners" the whitespace-split
# annotation strings. Iterating over rows avoids rebuilding the complete
# "text"/"entities" columns for every single index.
test_data = [
    {
        "sentences": row["text"],
        "ners": [triple.split() for triple in row["entities"]],
    }
    for row in dataset['test']
]

### Processing all the data and creating data files for spacy

In [4]:
# Convert both splits (train first, then test); every call prints its own
# span statistics.
train_db, test_db = [get_spacy_doc(records) for records in (c_train_data, test_data)]

# Serialise the collections to the .spacy files passed to `spacy train` below.
for doc_bin, target in ((train_db, "train_data.spacy"), (test_db, "test_data.spacy")):
    doc_bin.to_disk(target)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
100%|██████████| 519/519 [00:04<00:00, 129.26it/s]


30835 23963 227


100%|██████████| 93/93 [00:00<00:00, 95.00it/s] 


5843 4605 30


## Tok2Vec model train

In [5]:
# Auto-fill base_config_tok2vec.cfg with all remaining settings and save the
# complete training config as config_tok2vec.cfg.
!python -m spacy init fill-config base_config_tok2vec.cfg config_tok2vec.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config_tok2vec.cfg
You can now add your data and train your pipeline:
python -m spacy train config_tok2vec.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


### Training tok2vec model for 20 epochs

In [6]:
# Train the tok2vec + NER pipeline on GPU 0; the converted test split is
# passed as the dev set used for scoring during training.
# NOTE(review): the transformer run further down also uses `--output ./`, so
# both runs write into the same directory — confirm the saved pipelines were
# renamed/moved between the runs.
!python -m spacy train --gpu-id 0 config_tok2vec.cfg --output ./ --paths.train ./train_data.spacy --paths.dev ./test_data.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    152.76    1.35    0.79    4.47    0.01
  0     200       1543.45  14883.35   34.71   42.27   29.45    0.35
  0     400        892.28  10505.61   51.55   58.94   45.80    0.52
  1     600       1873.34   9459.88   55.96   57.66   54.35    0.56
  1     800        478.80   7769.56   58.08   59.17   57.02    0.58
  1    1000       1333.20   8323.12   60.80   62.59   59.11    0.61
  2    1200       1422.10   6718.21   59.77   63.19   56.70    0.60
  2    1400        501.70   6666.92   59.24   65.15   54.31    0.59
  3    1600        565.10   5965.39   61.70   65.93   57.98    0.62
  3    1800        576.40   4938.65   63.92   66

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


## Transformer model train

In [None]:
# Auto-fill base_config_trans.cfg with all remaining settings and save the
# complete training config as config_trans.cfg.
!python -m spacy init fill-config base_config_trans.cfg config_trans.cfg

### Training transformer model for 20 epochs

In [23]:
# Train the transformer + NER pipeline on GPU 0; the converted test split is
# passed as the dev set used for scoring during training.
# NOTE(review): the tok2vec run above also uses `--output ./`, so both runs
# write into the same directory — confirm the saved pipelines were
# renamed/moved between the runs.
!python -m spacy train --gpu-id 0 config_trans.cfg --output ./ --paths.train ./train_data.spacy --paths.dev ./test_data.spacy

[38;5;4mℹ Saving to output directory: .[0m

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(



[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  --------  ------  ------  ------  ------
  0       0         808.42    820.40    0.12    0.08    0.26    0.00
  2     200      806782.36  173838.81   37.19   62.23   26.51    0.37
  5     400       89629.25  58944.69   73.55   73.83   73.27    0.74
  7     600       20693.63  33460.93   77.81   77.16   78.48    0.78
 10     800        7900.66  23606.74   78.06   77.68   78.44    0.78
 12    1000        5990.45  19744.57   78.06   75.80   80.46    0.78
 15    1200        2786.91  17490.43   78.31   76.53   80.17    0.78
 17    1400        2571.47  16501.79   77.54   75.10   80.15    0.78
[38;5;2m✔ Saved pipeline to output directory[0m
model-last


## Loading and testing the best model

In [18]:
# Load both trained pipelines from disk so they can be compared side by side.
# NOTE(review): no cell in this notebook creates "prev_best_trans" —
# presumably a manually renamed copy of the transformer run's best pipeline;
# confirm before re-running.
tok2vec_model = spacy.load("model-best")
transformer_model = spacy.load("prev_best_trans")

  _torch_pytree._register_pytree_node(


In [19]:
import json 

# Decode explicitly as UTF-8 (the texts are Russian) instead of relying on the
# locale-dependent default codec, and skip blank lines so a trailing newline
# does not break json.loads.
with open('data/dev.jsonl', encoding='utf-8') as f:
    dev_data = [json.loads(line) for line in f if line.strip()]

### Check the model on sentences from the set 

In [26]:
# Take one dev record as a qualitative example.
# NOTE(review): the key is spelled 'senences' here, while the training records
# use 'sentences' — presumably this matches the dev file's schema; confirm.
text = dev_data[5]['senences']
# print(text)

# Run the tok2vec pipeline and list every predicted entity with its label.
out = tok2vec_model(text)
for ent in out.ents:
    print (ent.text, ent.label_)

Путин PERSON
закона Димы Яковлева LAW
Президент России PROFESSION
Владимир Путин PERSON
закона Димы Яковлева LAW
Советом Федерации ORGANIZATION
в четверг 27 декабря DATE
Владимир Путин PERSON
О мерах воздействия на лиц LAW
Димы Яковлева ORGANIZATION
президент РФ PROFESSION
заседании EVENT
Госсовета ORGANIZATION
Путина PERSON
американские власти ORGANIZATION
Российский Президент ORGANIZATION
российских COUNTRY
Госдумой ORGANIZATION
Димы Яковлева ORGANIZATION
американцам NATIONALITY
России COUNTRY
Советом Федерации ORGANIZATION
российских законодателей ORGANIZATION
Конгрессом США ORGANIZATION
Россией COUNTRY


In [29]:
# Same dev record as for the tok2vec check, so the outputs are comparable.
# NOTE(review): the key is spelled 'senences' here, while the training records
# use 'sentences' — presumably this matches the dev file's schema; confirm.
text = dev_data[5]['senences']
# print(text)

# Run the transformer pipeline and list every predicted entity with its label.
out = transformer_model(text)
for ent in out.ents:
    print (ent.text, ent.label_)

Путин PERSON
закона Димы Яковлева LAW
Президент России PROFESSION
Владимир Путин PERSON
закона Димы Яковлева LAW
Советом Федерации ORGANIZATION
в четверг 27 декабря DATE
Владимир Путин PERSON
О мерах воздействия на лиц, причастных к нарушениям основополагающих прав и свобод человека, прав и свобод граждан РФ LAW
закон Димы Яковлева LAW
президент РФ PROFESSION
заседании EVENT
Госсовета ORGANIZATION
Путина PERSON
американские власти ORGANIZATION
Российский Президент PROFESSION
российских COUNTRY
тяжелых заболеваний DISEASE
Госдумой ORGANIZATION
закон Димы Яковлева LAW
американцам NATIONALITY
России COUNTRY
Советом Федерации ORGANIZATION
российских COUNTRY
законодателей PROFESSION
Конгрессом США ORGANIZATION
Закон о нормализации торговых отношений с Россией, LAW
закон Магнитского LAW


### Using the model to get NERS from all the sentences of the set

In [14]:
# `model` was never assigned anywhere in the notebook, so this cell only worked
# with leftover kernel state. Bind it explicitly to one of the loaded
# pipelines; the transformer pipeline is used because it reached the higher
# ENTS_F in the training logs.
# NOTE(review): switch to `tok2vec_model` if that was the intended submission.
model = transformer_model

# Collect the predicted entities of every dev record as
# [start_char, end_char, label] triples, keyed by the record id.
out_data = []
for line in tqdm(dev_data):
    out = model(line["senences"])
    ents = [[ent.start_char, ent.end_char, ent.label_] for ent in out.ents]
    out_data.append({"id": line["id"], "ners": ents})

100%|██████████| 65/65 [00:04<00:00, 14.95it/s]


### Save the results in `test.jsonl`

In [15]:
# Dump the predictions as JSON Lines: one serialized record per line.
with open("test.jsonl", 'w') as out_file:
    out_file.writelines(json.dumps(record) + "\n" for record in out_data)

### Checking the format

In [2]:
# with open('test.jsonl') as f:
#     data = [json.loads(line) for line in f]

# data[0]