In [30]:
import pandas as pd
ner_test = pd.read_csv(r'NER-test.tsv', sep="\t")
ner_test.head()

Unnamed: 0,sentence_id,token_id,token,BIO_NER_tag
0,0,0,If,O
1,0,1,you're,O
2,0,2,visiting,O
3,0,3,Paris,B-LOCATION
4,0,4,",",O


In [31]:
sentences = ner_test.groupby('sentence_id')['token'].apply(lambda tokens: ' '.join(tokens)).reset_index()
sentences.columns = ['sentence_id', 'sentence']

In [32]:
import spacy
import stanza

In [33]:
nlp_spacy = spacy.load("en_core_web_sm")

In [34]:
stanza.download("en")
nlp_stanza = stanza.Pipeline("en")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-05-24 22:45:10 INFO: Downloaded file to C:\Users\munzu\stanza_resources\resources.json
2025-05-24 22:45:10 INFO: Downloading default packages for language: en (English) ...
2025-05-24 22:45:11 INFO: File exists: C:\Users\munzu\stanza_resources\en\default.zip
2025-05-24 22:45:12 INFO: Finished downloading models and saved to C:\Users\munzu\stanza_resources
2025-05-24 22:45:12 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-05-24 22:45:12 INFO: Downloaded file to C:\Users\munzu\stanza_resources\resources.json
2025-05-24 22:45:13 INFO: Loading these models for language: en (English):
| Processor    | Package                   |
--------------------------------------------
| tokenize     | combined                  |
| mwt          | combined                  |
| pos          | combined_charlm           |
| lemma        | combined_nocharlm         |
| constituency | ptb3-revised_charlm       |
| depparse     | combined_charlm           |
| sentiment    | sstplus_charlm            |
| ner          | ontonotes-ww-multi_charlm |

2025-05-24 22:45:13 INFO: Using device: cpu
2025-05-24 22:45:13 INFO: Loading: tokenize
2025-05-24 22:45:13 INFO: Loading: mwt
2025-05-24 22:45:13 INFO: Loading: pos
2025-05-24 22:45:14 INFO: Loading: lemma
2025-05-24 22:45:15 INFO: Loading: constituency
2025-05-24 22:45:15 INFO: Loading: depparse
2025-05-24 22:45:15 INFO: Loading: sentiment
2025-05-24 22:45:16 INFO: Loading: ner

In [35]:
def extract_entities_spacy(text):
    doc = nlp_spacy(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

In [36]:
def extract_entities_stanza(text):
    doc = nlp_stanza(text)
    return [(ent.text, ent.type) for sent in doc.sentences for ent in sent.ents]

In [37]:
sentences['spacy_entities'] = sentences['sentence'].apply(extract_entities_spacy)
sentences['stanza_entities'] = sentences['sentence'].apply(extract_entities_stanza)

In [38]:
pd.set_option('display.max_colwidth', None)
print(sentences[['sentence', 'spacy_entities', 'stanza_entities']])

                                                                                                           sentence  \
0                          If you're visiting Paris , make sure to see the Louvre , as they exhibit the Mona Lisa !   
1                                 Amazon , Google and Meta control a huge share of the technology market globally .   
2                                             Did you hear Pharoah Sanders recorded an album with Floating Points ?   
3                                                                Madvillainy is still my favourite MF DOOM record .   
4   My friend Kevin just finished watching Succession , and won't stop talking about Kieran Culkin 's performance .   
5                                                       Venus Williams has always been overshadowed by her sister .   
6                         Since Queen Elizabeth died , King Charles has been the head of the British Royal Family .   
7                                               

In [39]:
for idx, row in sentences.iterrows():
    print(f"Sentence: {row['sentence']}\n")
    print("spaCy entities:")
    for ent in row['spacy_entities']:
        print(f"  {ent[0]} ({ent[1]})")
    print("Stanza entities:")
    for ent in row['stanza_entities']:
        print(f"  {ent[0]} ({ent[1]})")
    print("-" * 60)

Sentence: If you're visiting Paris , make sure to see the Louvre , as they exhibit the Mona Lisa !

spaCy entities:
  Paris (GPE)
  Louvre (PERSON)
  the Mona Lisa (WORK_OF_ART)
Stanza entities:
  Paris (GPE)
  Louvre (FAC)
  the Mona Lisa (PERSON)
------------------------------------------------------------
Sentence: Amazon , Google and Meta control a huge share of the technology market globally .

spaCy entities:
  Amazon (ORG)
  Google (ORG)
  Meta (ORG)
Stanza entities:
  Amazon (ORG)
  Google (ORG)
  Meta (ORG)
------------------------------------------------------------
Sentence: Did you hear Pharoah Sanders recorded an album with Floating Points ?

spaCy entities:
  Pharoah Sanders (PERSON)
  Floating Points (WORK_OF_ART)
Stanza entities:
  Pharoah Sanders (PERSON)
  Floating Points (WORK_OF_ART)
------------------------------------------------------------
Sentence: Madvillainy is still my favourite MF DOOM record .

spaCy entities:
Stanza entities:
  Madvillainy (PERSON)
------