# NER

In [1]:
# import libraries
from pathlib import Path
import pandas as pd
import spacy
from spacy import displacy
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
BASE_DIR = Path.cwd().parent

In [2]:
# load row reviews
df = pd.read_csv(BASE_DIR / "row_data" / "reviews_2023_03.csv").head(1000)

In [3]:
# identify Persons
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
host_names = []
for row in df['comments']:
    ner_results = nlp(row)
    if len(ner_results) > 0 and ner_results[0]['entity'] == 'B-PER' and ner_results[0]['word'] not in host_names:
        host_names.append(ner_results[0]['word'])

In [4]:
host_names

['Chris',
 'Peter',
 'Li',
 'Lydia',
 'W',
 'Mo',
 'Tina',
 '##T',
 'Linda',
 'Rolf',
 'dad',
 'Jan',
 'Antonio',
 'Ta',
 'St',
 'Stock',
 'Hu',
 'An',
 'Adele',
 'Bar',
 'Marie',
 'Hi',
 'Dan',
 'Mama',
 'Sa',
 'Fred',
 'Frederic',
 'John',
 'Ella',
 'Mia',
 'Ellen',
 'Fu',
 'el',
 'Car',
 'Gross',
 'Le']

In [5]:
# en_core_web_sm - English, sv_core_news_sm - Swedish, xx_ent_wiki_sm - general
spacy_dict = ['en_core_web_sm', 'sv_core_news_sm', 'xx_ent_wiki_sm']

In [6]:
# display results
for dict in spacy_dict:
    for row in range(125, 127):
        nlp = spacy.load(dict)
        text_string = nlp(df['comments'][row])
        displacy.render(text_string, style="ent", jupyter=True)
        print(dict)

en_core_web_sm


en_core_web_sm


sv_core_news_sm


sv_core_news_sm


xx_ent_wiki_sm


xx_ent_wiki_sm
