In [19]:
# import sys

# # Clean install of working NER pipeline (no curated_transformer)
# !{sys.executable} -m pip uninstall -y spacy spacy-transformers spacy-curated-transformers
# !{sys.executable} -m pip install spacy==3.7.2
# !{sys.executable} -m spacy download en_core_web_lg

# Load large English model
import spacy
nlp = spacy.load("en_core_web_lg")

# Test
text = "Pris is awesome. Its the best city. My friend is there. Peter is the worst"
doc = nlp(text)

# Extract subject + entity label
for sent in doc.sents:
    subject = None
    for token in sent:
        if token.dep_ in ('nsubj', 'nsubjpass'):
            subject = token
            break
    if subject:
        ent_label = subject.ent_type_ if subject.ent_type_ else "UNKNOWN"
        print(f"Sentence: '{sent.text.strip()}'")
        print(f"Subject: '{subject}' → Entity: {ent_label}")


Sentence: 'Pris is awesome.'
Subject: 'Pris' → Entity: ORG
Sentence: 'My friend is there.'
Subject: 'friend' → Entity: UNKNOWN
Sentence: 'Peter is the worst'
Subject: 'Peter' → Entity: PERSON


In [20]:
from datasets import load_dataset

ds = load_dataset("tner/ontonotes5")

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 59924/59924 [00:00<00:00, 1085952.98 examples/s]
Generating validation split: 100%|██████████| 8528/8528 [00:00<00:00, 1803328.69 examples/s]
Generating test split: 100%|██████████| 8262/8262 [00:00<00:00, 1659642.70 examples/s]


In [24]:
print(ds.column_names)

{'train': ['tokens', 'tags'], 'validation': ['tokens', 'tags'], 'test': ['tokens', 'tags']}


In [None]:
# import sys

# # Step 1: Fix pip installation into current Python kernel environment
# !{sys.executable} -m pip uninstall -y util
# !{sys.executable} -m pip uninstall -y spacy spacy-transformers
# !{sys.executable} -m pip install -U pip
# !{sys.executable} -m pip install "spacy[transformers]"
# !{sys.executable} -m spacy download en_core_web_trf

[0mDefaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting spacy[transformers]
  Using cached spacy-3.8.3-cp39-cp39-macosx_11_0_arm64.whl.metadata (27 kB)
Collecting spacy_transformers<1.4.0,>=1.1.2 (from spacy[transformers])
  Downloading spacy_transformers-1.3.8-cp39-cp39-macosx_11_0_arm64.whl.metadata (7.0 kB)
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy_transformers<1.4.0,>=1.1.2->spacy[transformers])
  Downloading spacy_alignments-0.9.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (2.7 kB)
Downloading spacy_transformers-1.3.8-cp39-cp39-macosx_11_0_arm64.whl (171 kB)
Using cached spacy-3.8.3-cp39-cp39-macosx_11_0_arm64.whl (6.3 MB)
Downloading spacy_alignments-0.9.1-cp39-cp39-macosx_11_0_arm64.whl (317 kB)
Installing collected packages: spacy-alignments, spacy, spacy_transformers
Successfully installed spacy-3.8.3 spacy-alignments-0.9.1 spacy_transformers-1.3.8
Defau

In [30]:
from datasets import load_dataset
import spacy

# Use a small subset for testing (first 100 examples)
ds = load_dataset("tner/ontonotes5", split="train")

true_positive = 0
false_positive = 0
false_negative = 0
total_evaluated = 0

# For each example in the dataset
for example in ds:
    tokens = example["tokens"]       # e.g. ["People", "start", "their", ...]
    tags = example["tags"]           # e.g. [0, 0, 0, ...] where 4 means PERSON
    
    # Reconstruct sentence (simple join; note: tokenization differences may occur)
    sentence = " ".join(tokens)
    doc = nlp(sentence)
    
    subject_token = None
    # Loop over sentences in the spaCy doc (usually one sentence per example)
    for sent in doc.sents:
        for token in sent:
            if token.dep_ in ('nsubj', 'nsubjpass'):
                subject_token = token
                break
        if subject_token:
            break
    # If no subject was found, skip this example
    if subject_token is None:
        continue

    # Try to align the subject token with the original tokens (using a case-insensitive match)
    try:
        token_index = next(i for i, t in enumerate(tokens) if t.lower() == subject_token.text.lower())
    except StopIteration:
        continue  # if not found, skip this example

    # Ground truth: label 4 means PERSON
    ground_truth_person = tags[token_index] == 4 or (
        token_index > 0 and tags[token_index - 1] == 4
    ) or (
        token_index < len(tags) - 1 and tags[token_index + 1] == 4
    )
    # Prediction: our model's subject gets "PERSON" if spaCy marks it as such; otherwise, we treat it as not PERSON.
    predicted_person = any(
        ent.label_ == "PERSON" and subject_token.idx >= ent.start_char and subject_token.idx < ent.end_char
        for ent in doc.ents
    )

    
    # Count outcomes
    if predicted_person and ground_truth_person:
        true_positive += 1
    elif predicted_person and not ground_truth_person:
        false_positive += 1
    elif not predicted_person and ground_truth_person:
        false_negative += 1

    total_evaluated += 1

# Calculate metrics
precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0
recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

print("Total evaluated examples:", total_evaluated)
print("True Positive:", true_positive)
print("False Positive:", false_positive)
print("False Negative:", false_negative)
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1 Score: {f1:.3f}")


Total evaluated examples: 49595
True Positive: 4109
False Positive: 641
False Negative: 329
Precision: 0.865, Recall: 0.926, F1 Score: 0.894


In [32]:
from datasets import load_dataset
import spacy

# Load a subset of the OnTonotes5 dataset
ds = load_dataset("tner/ontonotes5", split="train")

false_positives = []
false_negatives = []

# Process each example in the dataset
for example in ds:
    tokens = example["tokens"]       # e.g., ["People", "start", "their", "own", "businesses", ...]
    tags = example["tags"]           # e.g., [0, 0, 0, ...] with tag 4 indicating PERSON

    # Reconstruct the sentence
    sentence = " ".join(tokens)
    doc = nlp(sentence)

    subject_token = None
    # Extract the first subject from the spaCy parse
    for sent in doc.sents:
        for token in sent:
            if token.dep_ in ('nsubj', 'nsubjpass'):
                subject_token = token
                break
        if subject_token:
            break

    # If no subject was found, skip this example
    if subject_token is None:
        continue

    # Try to align the spaCy subject token with the dataset tokens (case-insensitive match)
    try:
        token_index = next(i for i, t in enumerate(tokens) if t.lower() == subject_token.text.lower())
    except StopIteration:
        continue

    # Ground truth: tag 4 indicates PERSON
    ground_truth_person = tags[token_index] == 4 or (
        token_index > 0 and tags[token_index - 1] == 4
    ) or (
        token_index < len(tags) - 1 and tags[token_index + 1] == 4
    )
    # Model prediction: if spaCy labels the subject as PERSON, we predict PERSON
    predicted_person = any(
        ent.label_ == "PERSON" and subject_token.idx >= ent.start_char and subject_token.idx < ent.end_char
        for ent in doc.ents
    )

    if predicted_person and not ground_truth_person:
        false_positives.append({
            "sentence": sentence,
            "subject_token": subject_token.text,
            "token_index": token_index,
            "tokens": tokens,
            "predicted": "PERSON",
            "ground_truth": "NOT_PERSON"
        })
    elif (not predicted_person) and ground_truth_person:
        false_negatives.append({
            "sentence": sentence,
            "subject_token": subject_token.text,
            "token_index": token_index,
            "tokens": tokens,
            "predicted": "NOT_PERSON",
            "ground_truth": "PERSON"
        })

# Print some misclassified examples for inspection
print("False Positives (Model predicted PERSON but GT is NOT_PERSON):")
for i, example in enumerate(false_positives[:10]):
    print(f"\nExample {i+1}:")
    print("Sentence       :", example["sentence"])
    print("Tokens         :", example["tokens"])
    print("Subject token  :", example["subject_token"])
    print("Token index    :", example["token_index"])
    print("Predicted label:", example["predicted"])
    print("Ground-truth   :", example["ground_truth"])

print("\n\nFalse Negatives (Model predicted NOT_PERSON but GT is PERSON):")
for i, example in enumerate(false_negatives[:10]):
    print(f"\nExample {i+1}:")
    print("Sentence       :", example["sentence"])
    print("Tokens         :", example["tokens"])
    print("Subject token  :", example["subject_token"])
    print("Token index    :", example["token_index"])
    print("Predicted label:", example["predicted"])
    print("Ground-truth   :", example["ground_truth"])

print("\nTotal false positives:", len(false_positives))
print("Total false negatives :", len(false_negatives))


False Positives (Model predicted PERSON but GT is NOT_PERSON):

Example 1:
Sentence       : John B. Curcio , 55 years old , resigned as chairman of this diesel truck manufacturer , effective upon appointment of a successor .
Tokens         : ['John', 'B.', 'Curcio', ',', '55', 'years', 'old', ',', 'resigned', 'as', 'chairman', 'of', 'this', 'diesel', 'truck', 'manufacturer', ',', 'effective', 'upon', 'appointment', 'of', 'a', 'successor', '.']
Subject token  : Curcio
Token index    : 2
Predicted label: PERSON
Ground-truth   : NOT_PERSON

Example 2:
Sentence       : Richard W. Lock , retired vice president and treasurer of Owens - Illinois Inc. , was named a director of this transportation industry supplier , increasing its board to six members .
Tokens         : ['Richard', 'W.', 'Lock', ',', 'retired', 'vice', 'president', 'and', 'treasurer', 'of', 'Owens', '-', 'Illinois', 'Inc.', ',', 'was', 'named', 'a', 'director', 'of', 'this', 'transportation', 'industry', 'supplier', ',', 'incr

In [33]:
from datasets import load_dataset
import spacy

# Load a subset of the OnTonotes5 dataset
ds = load_dataset("tner/ontonotes5", split="train")

# Load spaCy model
nlp = spacy.load("en_core_web_lg")  # Or whatever model you're using

false_positives = []
false_negatives = []

true_positive = 0
true_negative = 0

# Process each example in the dataset
for example in ds:
    tokens = example["tokens"]       # e.g., ["People", "start", "their", ...]
    tags = example["tags"]           # e.g., [0, 0, 4, ...]

    sentence = " ".join(tokens)
    doc = nlp(sentence)

    # Ground truth: if any tag == 4 (PERSON), then it's a PERSON sentence
    ground_truth_person = 4 in tags

    # Prediction: if any entity in the sentence is labeled PERSON by spaCy
    predicted_person = any(ent.label_ == "PERSON" for ent in doc.ents)

    # Evaluate outcomes
    if predicted_person and not ground_truth_person:
        false_positives.append({
            "sentence": sentence,
            "tokens": tokens,
            "predicted": "PERSON",
            "ground_truth": "NOT_PERSON"
        })
    elif not predicted_person and ground_truth_person:
        false_negatives.append({
            "sentence": sentence,
            "tokens": tokens,
            "predicted": "NOT_PERSON",
            "ground_truth": "PERSON"
        })
    elif predicted_person and ground_truth_person:
        true_positive += 1
    elif not predicted_person and not ground_truth_person:
        true_negative += 1

# Metrics
precision = true_positive / (true_positive + len(false_positives)) if (true_positive + len(false_positives)) > 0 else 0
recall = true_positive / (true_positive + len(false_negatives)) if (true_positive + len(false_negatives)) > 0 else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

# Print misclassifications
print("False Positives (Model predicted PERSON but GT is NOT_PERSON):")
for i, example in enumerate(false_positives[:10]):
    print(f"\nExample {i+1}:")
    print("Sentence       :", example["sentence"])
    print("Tokens         :", example["tokens"])
    print("Predicted label:", example["predicted"])
    print("Ground-truth   :", example["ground_truth"])

print("\n\nFalse Negatives (Model predicted NOT_PERSON but GT is PERSON):")
for i, example in enumerate(false_negatives[:10]):
    print(f"\nExample {i+1}:")
    print("Sentence       :", example["sentence"])
    print("Tokens         :", example["tokens"])
    print("Predicted label:", example["predicted"])
    print("Ground-truth   :", example["ground_truth"])

# Summary
print("\nTotal false positives:", len(false_positives))
print("Total false negatives :", len(false_negatives))
print("True positives        :", true_positive)
print("True negatives        :", true_negative)
print(f"\nPrecision: {precision:.3f}, Recall: {recall:.3f}, F1 Score: {f1:.3f}")




False Positives (Model predicted PERSON but GT is NOT_PERSON):

Example 1:
Sentence       : He is known as the father of the U.S. - grown Granny Smith , a radically different apple that the conventional wisdom once said would never catch on .
Tokens         : ['He', 'is', 'known', 'as', 'the', 'father', 'of', 'the', 'U.S.', '-', 'grown', 'Granny', 'Smith', ',', 'a', 'radically', 'different', 'apple', 'that', 'the', 'conventional', 'wisdom', 'once', 'said', 'would', 'never', 'catch', 'on', '.']
Predicted label: PERSON
Ground-truth   : NOT_PERSON

Example 2:
Sentence       : The scare over Alar , a growth regulator that makes apples redder and crunchier but may be carcinogenic , made consumers shy away from the Delicious , though they were less affected than the McIntosh .
Tokens         : ['The', 'scare', 'over', 'Alar', ',', 'a', 'growth', 'regulator', 'that', 'makes', 'apples', 'redder', 'and', 'crunchier', 'but', 'may', 'be', 'carcinogenic', ',', 'made', 'consumers', 'shy', 'away', '

In [None]:
import spacy

nlp = spacy.load("en_core_web_lg")

# Set of pronouns we consider as PERSONs
PERSON_PRONOUNS = {"you", "he", "she", "they", "him", "her"}

def detect_person_targets(text):
    doc = nlp(text)

    person_tokens = []

    for token in doc:
        # Add pronouns like "you", "he", etc.
        if token.text.lower() in PERSON_PRONOUNS:
            person_tokens.append(token.text)
    
    # Add spaCy named PERSON entities
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            person_tokens.append(ent.text)

    # Remove duplicates and preserve order
    seen = set()
    unique_persons = [p for p in person_tokens if not (p.lower() in seen or seen.add(p.lower()))]

    return {
        "person_tokens": unique_persons,
        "is_directed_towards_someone": len(unique_persons) > 0
    }



{'person_tokens': ['He'], 'is_directed_towards_someone': True}
