## Model

In [None]:
!pip install flair

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting flair
  Downloading flair-0.11.3-py3-none-any.whl (401 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.9/401.9 KB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting transformers>=4.0.0
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m88.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hyperopt>=0.2.7
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m77.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting konoha<5.0.0,>=4.0.0
  Downloading konoha-4.6.5-py3-none-any.whl (20 kB)
Collecting bpemb>=0.3.2
  Downloading bpemb-0.3.4-py3-none-any.whl (19 kB)
Collecting wikipedia-api
  Downloading Wikipedia_API-0.5.8-py3-none-any.whl (13 kB)
Collecting sentencepiece==0.1.95
  Downl

In [None]:
import flair, torch
from flair.models import SequenceTagger
from flair.tokenization import SegtokSentenceSplitter
from flair.data import Sentence

from collections import defaultdict

In [None]:
# Check whether PyTorch can see a CUDA device before selecting one for flair.
torch.cuda.is_available()

In [None]:
# Print the NVIDIA driver's report for the GPU(s) attached to this runtime.
!nvidia-smi

In [None]:
# Run flair on the first GPU when one is available; otherwise fall back to the
# CPU so the notebook still works (slowly) on a runtime without CUDA.
flair.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Sentence splitter used by ner_eval to cut article text into flair sentences.
splitter = SegtokSentenceSplitter()
# Pre-trained flair sequence tagger loaded by model name; ner_eval reads its
# predictions from the "ner" label and buckets them into PER/LOC/ORG/MISC.
tagger = SequenceTagger.load('ner')
# Alternative model name kept for reference (presumably a faster variant):
# tagger = SequenceTagger.load('ner-fast')

In [None]:
def ner_eval(stats: dict, new: dict):
    """Run NER over one news item and merge its entity counts into ``stats``.

    The item's ``"content"`` is split into sentences, its ``"title"`` is added
    as one extra sentence, and its ``"description"`` is split and added too,
    but only when it is a string. All sentences are tagged with the
    module-level ``tagger`` and the mentions are counted per tag and per
    entity text.

    Args:
        stats: Mapping with the keys ``"PER"``, ``"LOC"``, ``"ORG"`` and
            ``"MISC"``; ``stats[tag][entity_text]`` must support ``append``.
            Mutated in place: every entity found in this item gets a tuple
            ``(new["index"], occurrences_in_this_item)`` appended.
        new: News item with the keys ``"content"``, ``"title"``,
            ``"description"`` and ``"index"``.

    Raises:
        KeyError: If the tagger emits a tag other than the four above. This
            happens while counting, before ``stats`` has been modified.
    """
    sentences = splitter.split(new["content"])
    sentences.append(Sentence(new["title"]))
    if isinstance(new["description"], str):
        sentences.extend(splitter.split(new["description"]))

    tagger.predict(sentences)

    # Per-item scratch counters, one per expected tag (the original author
    # flagged this fixed table as a workaround).
    item_counts = {tag: defaultdict(int) for tag in ("PER", "LOC", "ORG", "MISC")}

    # Count how many times each entity text occurs in this item.
    for sentence in sentences:
        for span in sentence.get_spans("ner"):
            item_counts[span.tag][span.text] += 1

    # Fold the per-item counts into the global statistics.
    for tag, counts in item_counts.items():
        for text, occurrences in counts.items():
            stats[tag][text].append((new["index"], occurrences))

## Data

In [None]:
from pprint import pprint
import json
from google.colab import drive

In [None]:
# Mount Google Drive at /content/up_nlp/ (forcing a re-mount if it is already
# mounted); the data and stats JSON files below are read from this mount.
drive.mount('/content/up_nlp/', force_remount=True)

In [None]:
# Load the news items to be tagged. JSON text is UTF-8, so the encoding is
# given explicitly instead of depending on the runtime's locale default.
with open("/content/up_nlp/MyDrive/up_nlp/data.json", "r", encoding="utf-8") as data_file:
    data = json.load(data_file)

# Number of items loaded.
len(data)

42635

In [None]:
# Restore the NER statistics accumulated by earlier runs.
with open("/content/up_nlp/MyDrive/up_nlp/stats.json", "r") as stats_file:
    stats = json.loads(stats_file.read())

# JSON gives the per-tag tables back as plain dicts; wrap each one in a
# defaultdict(list) so ner_eval can append to entities it has not seen before.
stats.update(
    (tag, defaultdict(list, stats[tag])) for tag in ("PER", "LOC", "ORG", "MISC")
)

# Number of distinct entities recorded so far for each tag.
{tag: len(stats[tag]) for tag in ("PER", "LOC", "ORG", "MISC")}

{'PER': 27780, 'LOC': 7065, 'ORG': 18536, 'MISC': 11428}

## Evaluating

In [None]:
from tqdm import tqdm

In [None]:
# Number of items to process in this session, counted from the item after
# stats["last_new_indx"]. With the values shown in the recorded outputs
# (42635 items, resume index 39999) 2635 covers exactly the remaining items.
N_SAMPLES = 2635

In [None]:
# Size of the loaded dataset, shown for comparison with the resume index.
len(data)

42635

In [None]:
# Resume marker stored in the stats file; the evaluation loop starts at the
# item right after this index.
stats["last_new_indx"]

39999

In [None]:
# Resume NER evaluation right after the last checkpointed item, process up to
# N_SAMPLES items, and checkpoint `stats` to Drive along the way.
STATS_PATH = "/content/up_nlp/MyDrive/up_nlp/stats.json"
CHECKPOINT_EVERY = 1000

first_indx = stats["last_new_indx"] + 1
# Clamp the end so an over-sized N_SAMPLES cannot index past the end of `data`.
stop_indx = min(first_indx + N_SAMPLES, len(data))

ner_samples_failed = []
for i in tqdm(range(first_indx, stop_indx)):
    try:
        ner_eval(stats, data[i])
    except Exception:
        # Best effort: remember the failing item and keep going. Catching
        # Exception (not a bare except) lets a kernel interrupt stop the run.
        ner_samples_failed.append(data[i]["index"])

    # Checkpoint on every CHECKPOINT_EVERY boundary and after the final item,
    # so the tail of the range is persisted too and a re-run of this cell
    # does not process (and double-count) it again.
    if (i + 1) % CHECKPOINT_EVERY == 0 or i == stop_indx - 1:
        # Record the index actually reached instead of adding a fixed step,
        # which is only right when the start index is aligned to the step.
        stats["last_new_indx"] = i
        # Serialise before opening: "w" truncates the file, so a failure in
        # json.dumps must not be able to wipe the previous checkpoint.
        serialized = json.dumps(stats)
        with open(STATS_PATH, "w") as stats_file:
            stats_file.write(serialized)
        print(f"Dumped after {(stats['last_new_indx'])}'th sample")
print(ner_samples_failed)

 38%|███▊      | 1000/2635 [07:10<30:27,  1.12s/it]

Dumped after 40999'th sample


 76%|███████▌  | 2000/2635 [13:44<12:55,  1.22s/it]

Dumped after 41999'th sample


100%|██████████| 2635/2635 [18:08<00:00,  2.42it/s]

[]





In [None]:
# Number of distinct entities recorded per tag after the evaluation run.
{key: len(stats[key]) for key in ["PER", "LOC", "ORG", "MISC"]}

{'PER': 185089, 'LOC': 52875, 'ORG': 108032, 'MISC': 88084}

In [None]:
# Remove the resume marker so it is not included in the stats file written by
# the next cell (presumably because all items have now been processed).
del stats['last_new_indx']

In [None]:
# Write the final statistics back to Drive. Serialise first, then open the
# file: opening with "w" truncates it immediately, so a failure inside
# json.dumps must not be able to wipe the accumulated stats.
stats_json = json.dumps(stats)
with open("/content/up_nlp/MyDrive/up_nlp/stats.json", "w") as stats_file:
    stats_file.write(stats_json)

In [None]:
# Sanity check that the resume marker is gone. Indexing the key directly would
# raise KeyError and abort a "Run All", so assert its absence instead.
assert "last_new_indx" not in stats, "resume marker should have been removed"

KeyError: ignored