In [1]:
from pathlib import Path
import pandas as pd
import spacy
from spacy import displacy
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
BASE_DIR = Path.cwd().parent

In [2]:
df = pd.read_csv(BASE_DIR / "row_data" / "reviews_2023_03.csv").head(1000)

In [3]:
df

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,32984,91258,2010-09-06,200247,Keenan,Great little apartment in the perfect spot in ...
1,32984,95401,2010-09-12,126543,Olaf,Nice flat in a great area. Chris sorted things...
2,32984,273707,2011-05-21,294223,Max,"Great location, Chris was responsive by email ..."
3,32984,285148,2011-05-28,577190,Eli,Chris' place is super great and close to every...
4,32984,308649,2011-06-11,521818,Jonathan,This was a \very nice apartment in a GREAT loc...
...,...,...,...,...,...,...
995,238411,3068287,2012-12-11,3609245,XiangWei,My friend and I really enjoyed our stay in Sto...
996,238411,3105484,2012-12-17,4215262,Jacopo,Mia's apartment is in a fantastic position in ...
997,238411,3279513,2013-01-05,2635023,Tati,"Great apartment in a perfect location, well fu..."
998,238411,3460644,2013-02-02,1144103,Rosa,Mia's apartment is super centrally located clo...


In [4]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
host_names = []
for row in df['comments']:
    ner_results = nlp(row)
    if len(ner_results) > 0 and ner_results[0]['entity'] == 'B-PER' and ner_results[0]['word'] not in host_names:
        host_names.append(ner_results[0]['word'])

In [5]:
host_names

['Chris',
 'Peter',
 'Li',
 'Lydia',
 'W',
 'Mo',
 'Tina',
 '##T',
 'Linda',
 'Rolf',
 'dad',
 'Jan',
 'Antonio',
 'Ta',
 'St',
 'Stock',
 'Hu',
 'An',
 'Adele',
 'Bar',
 'Marie',
 'Hi',
 'Dan',
 'Mama',
 'Sa',
 'Fred',
 'Frederic',
 'John',
 'Ella',
 'Mia',
 'Ellen',
 'Fu',
 'el',
 'Car',
 'Gross',
 'Le']

In [6]:
spacy_dict = ['en_core_web_sm', 'sv_core_news_sm', 'xx_ent_wiki_sm']

In [7]:
for dict in spacy_dict:
    for row in range(125, 127):
        nlp = spacy.load(dict)
        text_string = nlp(df['comments'][row])
        displacy.render(text_string, style="ent", jupyter=True)
        print(dict)

en_core_web_sm


en_core_web_sm


sv_core_news_sm


sv_core_news_sm


xx_ent_wiki_sm


xx_ent_wiki_sm
