# Named Entity Recognition

## Data

In [1]:
import pandas as pd
import numpy as np
import pymongo
from pprint import pprint

from collections import defaultdict
import json

In [2]:
# Connect to the MongoDB server on this machine and select the `data`
# collection of the `news` database; the articles are read from it below.
client = pymongo.MongoClient(host='localhost', port=27017)
db = client["news"]
data = db["data"]

## Model

In [3]:
import flair, torch
from flair.models import SequenceTagger
from flair.tokenization import SegtokSentenceSplitter
from flair.data import Sentence

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Check whether PyTorch can see a CUDA device before one is selected for flair.
torch.cuda.is_available()

True

In [5]:
# Show the GPU model, driver/CUDA versions and current memory usage.
!nvidia-smi

Thu Feb 16 20:45:23 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.89.02    Driver Version: 525.89.02    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
| N/A   55C    P0    N/A /  N/A |      6MiB /  2048MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [6]:
# Run flair on the first GPU when CUDA is usable; otherwise fall back to the
# CPU so the notebook still runs on a machine without a working CUDA setup.
flair.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [7]:
# Splitter that turns raw article text into a list of flair sentences.
splitter = SegtokSentenceSplitter()
# 'ner-fast' is the model in use; the 'ner' variant is kept as an alternative.
#tagger = SequenceTagger.load('ner')
tagger = SequenceTagger.load('ner-fast')



2023-02-16 20:45:25,813 loading file /home/scurrra/.flair/models/ner-english-fast/4c58e7191ff952c030b82db25b3694b58800b0e722ff15427f527e1631ed6142.e13c7c4664ffe2bbfa8f1f5375bd0dced866b8c1dd7ff89a6d705518abf0a611
2023-02-16 20:45:27,428 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [8]:
# Seed document for the accumulated statistics.
# `last_new_indx` is the checkpoint used to pick the next batch of articles
# (-1 here, so the first batch starts at index 0); each tag gets its own
# entity -> list mapping.
stats = {"last_new_indx": -1}
stats.update({tag: defaultdict(list) for tag in ("PER", "LOC", "ORG", "MISC")})

In [9]:
def ner_eval(stats: dict, new: dict) -> None:
    """Tag one news document and record its per-entity mention counts.

    The document's ``content`` is split into sentences, its ``title`` is added
    as one more sentence, and its ``description`` is split and added as well
    when it is a string (any other value is skipped). All sentences are then
    passed to the module-level ``tagger``.

    Args:
        stats: Mapping with the keys ``"PER"``, ``"LOC"``, ``"ORG"`` and
            ``"MISC"``; each value maps entity text to a list. It is updated
            in place: for every entity found, ``(new["index"], count)`` is
            appended to ``stats[tag][entity_text]``.
        new: Document with the keys ``"content"``, ``"title"``,
            ``"description"`` and ``"index"``.

    Raises:
        KeyError: If a predicted span carries a tag other than the four above,
            or if ``new`` lacks a required key.
    """
    sentences = splitter.split(new["content"])
    sentences.append(Sentence(new["title"]))
    if isinstance(new["description"], str):
        sentences.extend(splitter.split(new["description"]))

    tagger.predict(sentences)

    # Number of occurrences of each entity text in this document, per tag.
    # The outer dict is deliberately a plain dict so an unexpected tag fails
    # loudly instead of silently creating a new bucket.
    counts = {tag: defaultdict(int) for tag in ("PER", "LOC", "ORG", "MISC")}
    for sentence in sentences:
        for span in sentence.get_spans("ner"):
            counts[span.tag][span.text] += 1

    # One (document index, occurrences) record per entity seen in the document.
    for tag, per_entity in counts.items():
        for text, n_mentions in per_entity.items():
            stats[tag][text].append((new["index"], n_mentions))

In [10]:
# Collection that holds the accumulated NER statistics document.
ner_stats = db["ner_stats"]
# One-time seeding of the collection with the empty `stats` document;
# presumably already done and left commented out so it is not inserted again.
# ner_stats.insert_one(stats)

### Test on a single news article

In [11]:
# Fresh, empty statistics for the single-article experiment below:
# one entity -> list mapping per tag.
stats = {tag: defaultdict(list) for tag in ("PER", "LOC", "ORG", "MISC")}

In [12]:
# Fetch the article with index 4712 to try the pipeline on a single document.
new_test = data.find_one(filter={"index": 4712})
# The trailing ';' keeps the (long) document from being echoed as cell output.
new_test;

In [13]:
# Build the sentence list for the test article: the sentences of the body,
# then the title as one sentence, then the description's sentences when the
# description is a string.
test_splitted = [
    *splitter.split(new_test["content"]),
    Sentence(new_test["title"]),
]
if isinstance(new_test["description"], str):
    test_splitted += splitter.split(new_test["description"])

test_splitted

[Sentence: "( CNN ) Right now , there 's a shortage of truck drivers in the US and worldwide , exacerbated by the e-commerce boom brought on by the pandemic .",
 Sentence: "One solution to the problem is autonomous trucks , and several companies are in a race to be the first to launch one .",
 Sentence: "Among them is San Diego-based TuSimple.Founded in 2015 , TuSimple has completed about 2 million miles of road tests with its 70 prototype trucks across the US , China and Europe .",
 Sentence: "Although these are simply commercially available trucks retrofitted with its technology , TuSimple has deals in place with two of the world 's largest truck manufacturers -- Navistar in the US and Traton , Volkswagen 's trucking business , in Europe -- to design and build fully autonomous models , which it hopes to launch by 2024 .",
 Sentence: "Photos : The Yara Birkeland is what its builders call the world 's first zero-emission , autonomous cargo ship .",
 Sentence: "The ship is scheduled to 

In [14]:
# Run the tagger over every sentence; the predicted spans are read back from
# the same sentence objects via get_spans("ner") in the following cells.
tagger.predict(test_splitted)

In [15]:
# Label of the first entity span found in the first sentence.
test_splitted[0].get_spans("ner")[0].get_label()

'Span[15:16]: "US"'/'LOC' (0.999)

In [16]:
# Text of the first span in the first sentence; this is what the statistics
# use as the per-entity key.
test_splitted[0].get_spans("ner")[0].text

'US'

In [17]:
# Tag of the first span in the first sentence; this selects the top-level
# bucket (PER / LOC / ORG / MISC) in the statistics.
test_splitted[0].get_spans("ner")[0].tag

'LOC'

In [18]:
# Score attached to the first span's predicted label (not used by the statistics).
test_splitted[0].get_spans("ner")[0].score

0.9989510774612427

In [19]:
# Index of the article used in the single-article test; recorded next to each
# entity count below.
INDX = 4712

In [20]:
# Number of occurrences of each entity text in the test article, per tag.
# Counted directly as integers instead of collecting a list of 1s to sum.
buf_stats = {tag: defaultdict(int) for tag in ("PER", "LOC", "ORG", "MISC")}

for sentence in test_splitted:
    for entity in sentence.get_spans("ner"):
        buf_stats[entity.tag][entity.text] += 1

# Append one (article index, occurrences) record per entity, then show the
# bucket for each tag.
for key, counts in buf_stats.items():
    for entity, n_mentions in counts.items():
        stats[key][entity].append((INDX, n_mentions))
    print(key)
    pprint(stats[key])

stats

PER
defaultdict(<class 'list'>,
            {'Cheng Lu': [(4712, 1)],
             'Grayson Brulte': [(4712, 1)],
             'Lu': [(4712, 9)],
             'Steve Lee': [(4712, 1)],
             'Stretch': [(4712, 1)],
             'TuSimple': [(4712, 2)]})
LOC
defaultdict(<class 'list'>,
            {'Arizona': [(4712, 2)],
             'Aurora': [(4712, 1)],
             'Boston': [(4712, 1)],
             'China': [(4712, 5)],
             'Dallas': [(4712, 1)],
             'Denmark': [(4712, 1)],
             'Dubai': [(4712, 1)],
             'El Paso': [(4712, 1)],
             'Europe': [(4712, 3)],
             'Germany': [(4712, 1)],
             'London': [(4712, 1)],
             'Maglev': [(4712, 1)],
             'Netherlands': [(4712, 1)],
             'Nogales': [(4712, 1)],
             'Oklahoma City': [(4712, 1)],
             'Phoenix': [(4712, 1)],
             'Qingdao': [(4712, 1)],
             'Shandong Province': [(4712, 1)],
             'Sharjah': [(4712,

{'PER': defaultdict(list,
             {'Steve Lee': [(4712, 1)],
              'Cheng Lu': [(4712, 1)],
              'TuSimple': [(4712, 2)],
              'Lu': [(4712, 9)],
              'Stretch': [(4712, 1)],
              'Grayson Brulte': [(4712, 1)]}),
 'LOC': defaultdict(list,
             {'US': [(4712, 9)],
              'China': [(4712, 5)],
              'Europe': [(4712, 3)],
              'Maglev': [(4712, 1)],
              'Qingdao': [(4712, 1)],
              'Shandong Province': [(4712, 1)],
              'Netherlands': [(4712, 1)],
              'UAE': [(4712, 1)],
              'Sharjah': [(4712, 1)],
              'Dubai': [(4712, 1)],
              'Germany': [(4712, 1)],
              'Nogales': [(4712, 1)],
              'Arizona': [(4712, 2)],
              'Oklahoma City': [(4712, 1)],
              'Tucson': [(4712, 1)],
              'Dallas': [(4712, 1)],
              'TuSimple': [(4712, 2)],
              'Texas': [(4712, 1)],
              'Phoenix': [

## Evaluating

In [21]:
from tqdm import tqdm

In [22]:
# Resume from the last document returned by the stats collection.
stats = list(ner_stats.find())[-1]

# Re-wrap each tag bucket as a defaultdict(list) so that entities not seen
# before can be appended to without being initialised first.
stats.update(
    {tag: defaultdict(list, stats[tag]) for tag in ("PER", "LOC", "ORG", "MISC")}
)

stats

{'_id': ObjectId('63ee6b49dc6dad6ddfaa79c1'),
 'last_new_indx': -1,
 'PER': defaultdict(list, {}),
 'LOC': defaultdict(list, {}),
 'ORG': defaultdict(list, {}),
 'MISC': defaultdict(list, {})}

In [23]:
# Size of one batch: how many article indices past the checkpoint are queried
# and by how much the checkpoint advances per run.
N_SAMPLES = 10

In [24]:
# Next batch of articles: indices strictly after the checkpoint, up to and
# including checkpoint + N_SAMPLES.
samples = data.find(
    {
        "index": {
            "$gt": stats["last_new_indx"],
            "$lte": stats["last_new_indx"] + N_SAMPLES,
        }
    }
)

In [25]:
# Run NER over the current batch. A document that raises is skipped and its
# index is collected so it can be inspected afterwards.
ner_samples_failed = []
for new in tqdm(samples, total=N_SAMPLES):
    try:
        ner_eval(stats, new)
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit still stop the loop instead of being recorded as a
        # failed sample.
        ner_samples_failed.append(new["index"])
# Advance the checkpoint by the batch size so the next query starts after it.
stats["last_new_indx"] += N_SAMPLES
print(ner_samples_failed)

100%|██████████| 10/10 [00:05<00:00,  1.98it/s]

[]





In [26]:
stats

{'_id': ObjectId('63ee6b49dc6dad6ddfaa79c1'),
 'last_new_indx': 9,
 'PER': defaultdict(list,
             {'Alexander': [(0, 1)],
              'Richard Parsons': [(0, 1)],
              'Alan Greenspan': [(1, 1)],
              'Greenspan': [(1, 2)],
              'Robert Sinche': [(1, 1)],
              'Jamie Firestone': [(2, 1)],
              'Tim Osborne': [(2, 1)],
              'Mikhail Khodorkovsky': [(2, 1)],
              'Rod Eddington': [(3, 1)],
              'Mike Powell': [(3, 1)],
              'Martin Broughton': [(3, 1)],
              'Nick Van den Brul': [(3, 1)],
              'Eddington': [(3, 1)],
              'Heizo Takenaka': [(5, 1)],
              'Paul Sheard': [(5, 1)],
              'Bush': [(6, 1)],
              'Herbert Hoover': [(6, 1)],
              'Rick Egelton': [(6, 1)],
              'Ken Mayland': [(6, 1)],
              'Palaniappan Chidambaram': [(7, 1)],
              'Chidambaram': [(7, 2)],
              'Gordon Brown': [(7, 1)],
       

In [29]:
# Write the updated statistics back onto the stored document they were
# loaded from (matched by its `_id`).
ner_stats.update_one(
    filter={"_id": stats["_id"]},
    update={"$set": stats},
)

<pymongo.results.UpdateResult at 0x7fa7635c0500>

In [30]:
# To discard the stored stats document (Collection.remove() no longer exists in PyMongo 4):
# ner_stats.delete_one({"_id": stats["_id"]})