In [21]:
!pip install -U spacy scikit-learn
!python -m spacy download en_core_web_sm


Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
Successfully installed scikit-learn-1.8.0


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m75.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [1]:
import spacy

# Load the "en_core_web_sm" pipeline once and reuse it for every sentence.
# The package must already be installed (see the `spacy download` step),
# otherwise spacy.load cannot find it.
nlp = spacy.load("en_core_web_sm")


In [2]:
# Evaluation sentences. texts[i] is annotated by true_entities[i], so the two
# lists must stay the same length and in the same order.
texts = [
    "Barack Obama was the 44th President of the United States.",
    "Apple Inc. is headquartered in Cupertino, California.",
    "Elon Musk founded SpaceX and Tesla.",
    "Google was founded by Larry Page and Sergey Brin.",
    "Microsoft announced new AI features in New York."
]

# Hand-written gold annotations: one list per sentence, each entry an
# (entity text, entity label) pair.
true_entities = [
    [("Barack Obama", "PERSON"), ("United States", "GPE")],
    [("Apple Inc.", "ORG"), ("Cupertino", "GPE"), ("California", "GPE")],
    [("Elon Musk", "PERSON"), ("SpaceX", "ORG"), ("Tesla", "ORG")],
    [("Google", "ORG"), ("Larry Page", "PERSON"), ("Sergey Brin", "PERSON")],
    [("Microsoft", "ORG"), ("New York", "GPE")]
]

# Fail fast on annotation mistakes: the two lists are paired by position, so a
# missing entry or a typo in an entity span would otherwise silently distort
# whatever is computed from them.
assert len(texts) == len(true_entities), (
    "each text needs exactly one list of gold entities"
)
assert all(
    span in sentence
    for sentence, gold in zip(texts, true_entities)
    for span, _label in gold
), "every gold entity span must occur verbatim in its sentence"


In [3]:
# Run the pipeline over every sentence and keep each detected entity as an
# (entity text, entity label) pair, one list per sentence in the order of
# `texts`.
# nlp.pipe processes the texts as a stream (batched internally) and yields the
# Doc objects in input order; it is spaCy's recommended API for more than one
# text. The comprehension also avoids leaving loop variables behind in the
# notebook namespace.
pred_entities = [
    [(ent.text, ent.label_) for ent in doc.ents]
    for doc in nlp.pipe(texts)
]

# Bare last expression: display the predictions for inspection.
pred_entities


[[('Barack Obama', 'PERSON'),
  ('44th', 'ORDINAL'),
  ('the United States', 'GPE')],
 [('Apple Inc.', 'ORG'), ('Cupertino', 'GPE'), ('California', 'GPE')],
 [('Elon Musk', 'PERSON'), ('Tesla', 'NORP')],
 [('Google', 'ORG'), ('Larry Page', 'PERSON'), ('Sergey Brin', 'PERSON')],
 [('Microsoft', 'ORG'), ('AI', 'GPE'), ('New York', 'GPE')]]

In [4]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Flatten the per-sentence entity lists into two parallel label sequences that
# scikit-learn's classification metrics can consume.
#
# Matching is strict: a prediction is a hit only if its (text, label) pair is
# equal to a gold pair of the same sentence.
#   * gold entity that was predicted      -> (label, label)
#   * gold entity that was not predicted  -> (label, "O")
#   * prediction without a gold match     -> ("O", label)
# Consequently a correct span with a wrong label produces two rows: one miss
# for the gold label and one spurious row for the predicted label.
#
# Entities are matched as an ordered multiset rather than via set(): repeated
# mentions inside one sentence are each counted, and the row order follows the
# annotation order instead of depending on string hashing.
y_true = []
y_pred = []

# zip() would silently drop trailing sentences if the lists were misaligned.
if len(true_entities) != len(pred_entities):
    raise ValueError(
        "true_entities and pred_entities must contain one entry per sentence"
    )

for gold_entities, predicted_entities in zip(true_entities, pred_entities):
    # Predictions not yet paired with a gold entity; each may be used once.
    unmatched = list(predicted_entities)

    for entity in gold_entities:
        gold_label = entity[1]
        y_true.append(gold_label)
        if entity in unmatched:
            unmatched.remove(entity)
            y_pred.append(gold_label)
        else:
            y_pred.append("O")

    # Whatever is left was predicted but never annotated.
    for _span, predicted_label in unmatched:
        y_true.append("O")
        y_pred.append(predicted_label)


In [5]:
# Score only real entity labels: "O" is the placeholder for "no entity" and is
# excluded from precision/recall/F1. Sorting makes the label order (and thus
# the computation) reproducible across kernel restarts; a plain set has no
# stable order for strings.
labels = sorted(set(y_true) - {"O"})

# average="weighted": per-label scores averaged with each label's support
# (number of true instances) as weight.
# zero_division=0 states explicitly that a label which is never predicted
# contributes precision 0; this is the value scikit-learn already falls back
# to, but without emitting an UndefinedMetricWarning.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, labels=labels, average="weighted", zero_division=0
)

# Accuracy is the fraction of identical positions in y_true / y_pred. It is
# computed over all rows, including any "O" placeholder rows, so it is not
# restricted to `labels` the way the other three metrics are.
accuracy = accuracy_score(y_true, y_pred)

print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-score : {f1:.4f}")


Accuracy : 0.5882
Precision: 0.8769
Recall   : 0.7692
F1-score : 0.8013
