# Assignment 4 - Named Entity Recognition using the spaCy library

In [None]:
!pip install spacy



In [None]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# 1. Loading the model using the spaCy library

In [None]:
import spacy

# Name of the spaCy pipeline package installed by the download cell.
MODEL_NAME = "en_core_web_sm"

# Shared pipeline object used by every NER cell below.
nlp = spacy.load(MODEL_NAME)

# 2. Applying the model to each sentence in the list

In [None]:
# Short example sentences used to demonstrate the pipeline.
sentences = [
    "Barack Obama was born in Hawaii.",
    "Apple is looking to buy a startup in India.",
    "Elon Musk founded SpaceX in 2002.",
]

# Run the pipeline on each sentence and list every entity with its label.
for text in sentences:
    doc = nlp(text)
    header = f"\nText: {text}"
    print(header)
    for entity in doc.ents:
        print(f"{entity.text} {entity.label_}")


Text: Barack Obama was born in Hawaii.
Barack Obama PERSON
Hawaii GPE

Text: Apple is looking to buy a startup in India.
Apple ORG
India GPE

Text: Elon Musk founded SpaceX in 2002.
Elon Musk PERSON
2002 DATE


# 3. Applying the same model to an extract from a news article

In [None]:
# News-article extract used as NER input. Both paragraphs live in one
# triple-quoted string (including the line break between them) and are passed
# to nlp(...) as a single text.
news = """Donald Trump dislikes being compared to other American presidents. The irritation is almost visceral. Once described as the third-best president after George Washington and Abraham Lincoln, he snapped back that they had not fought “eight, nine wars.” When contrasted with Barack Obama or Joe Biden, the verdict in his mind is even simpler. He is better, full stop. The comparisons, he suggests, are flawed because they fail to grasp what greatness really means.
That reaction itself is revealing. Trump does not see the presidency as a role shaped by inheritance, restraint, or continuity. He sees it as a personal contest, one in which dominance, visibility, and disruption are the metrics of success. To compare him with earlier presidents is therefore not just an academic exercise. It is a way of understanding how radically his leadership style departs from the American tradition."""

In [None]:
# Parse the news extract and print one "<entity text> <label>" line per entity.
doc = nlp(news)

entity_lines = [f"{span.text} {span.label_}" for span in doc.ents]
if entity_lines:
    print("\n".join(entity_lines))

Donald Trump PERSON
American NORP
third ORDINAL
George Washington PERSON
Abraham Lincoln PERSON
eight CARDINAL
nine CARDINAL
Barack Obama PERSON
Joe Biden PERSON
Trump ORG
one CARDINAL
American NORP


# 4. Evaluation Metrics - Accuracy, Precision, Recall and F1-Score

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [None]:
# Hand-annotated gold entities for the news extract, as (text, label) pairs
_gold_people = (
    "Donald Trump",
    "George Washington",
    "Abraham Lincoln",
    "Barack Obama",
    "Joe Biden",
)
true_entities = [(person, "PERSON") for person in _gold_people]
true_entities.append(("American", "NORP"))

In [None]:
# Evaluate on the news extract that `true_entities` annotates. The leftover
# loop variable `text` only holds the last example sentence from section 2,
# which contains none of the gold entities.
doc = nlp(news)

# Token texts grouped by sentence.
sentences = [[token.text for token in sent] for sent in doc.sents]

In [None]:
def create_bio_labels(doc, entity_list):
    """Build one BIO tag per token of ``doc`` from a list of gold entities.

    Args:
        doc: Sequence of tokens; each token must expose a ``.text`` attribute.
        entity_list: Iterable of ``(entity_text, entity_label)`` pairs. The
            entity text is split on whitespace and compared against
            consecutive token texts.

    Returns:
        A list of ``len(doc)`` strings: ``"B-<label>"`` on the first token of
        every match, ``"I-<label>"`` on the remaining tokens of the match and
        ``"O"`` elsewhere. Every occurrence of an entity is tagged; when
        matches overlap, entries later in ``entity_list`` overwrite earlier
        ones.
    """
    labels = ["O"] * len(doc)
    # Collect the token texts once instead of re-slicing the doc per position.
    token_texts = [token.text for token in doc]

    for ent_text, ent_label in entity_list:
        ent_tokens = ent_text.split()
        width = len(ent_tokens)
        if width == 0:
            # An empty/whitespace-only entity would "match" at every position,
            # including one past the last token, and overflow `labels`.
            continue
        for start in range(len(token_texts) - width + 1):
            if token_texts[start:start + width] == ent_tokens:
                labels[start] = f"B-{ent_label}"
                for offset in range(1, width):
                    labels[start + offset] = f"I-{ent_label}"
    return labels

In [None]:
# Gold BIO tags, one per token of the parsed document.
y_true = create_bio_labels(doc=doc, entity_list=true_entities)

In [None]:
# Predicted BIO tags: start from all-"O" and overwrite the tokens covered by
# each entity span the model produced.
y_pred = ["O" for _ in doc]

for span in doc.ents:
    tag = span.label_
    inside_count = span.end - span.start - 1
    y_pred[span.start] = f"B-{tag}"
    y_pred[span.start + 1:span.end] = [f"I-{tag}"] * inside_count

In [None]:
# Keep only tokens made up entirely of alphabetic characters. This drops
# punctuation, and also any token containing digits or other symbols.
alpha_pairs = [
    (gold_tag, pred_tag)
    for token, gold_tag, pred_tag in zip(doc, y_true, y_pred)
    if token.is_alpha
]
final_true = [gold_tag for gold_tag, _ in alpha_pairs]
final_pred = [pred_tag for _, pred_tag in alpha_pairs]

In [None]:
# Score precision/recall/F1 on the entity tags only. With the majority "O" tag
# included, micro-averaged precision, recall and F1 on single-label data are
# all identical to plain accuracy and say nothing about entity quality.
entity_labels = sorted((set(final_true) | set(final_pred)) - {"O"})

# zero_division=0 reports 0 (instead of warning) when no entity tag was
# predicted or none is present in the gold labels.
precision = precision_score(
    final_true, final_pred, labels=entity_labels, average="micro", zero_division=0
)
recall = recall_score(
    final_true, final_pred, labels=entity_labels, average="micro", zero_division=0
)
f1 = f1_score(
    final_true, final_pred, labels=entity_labels, average="micro", zero_division=0
)
# Accuracy stays a per-token score over every kept token, "O" included.
accuracy = accuracy_score(final_true, final_pred)

print("Accuracy :", accuracy)
print("Precision:", precision)
print("Recall   :", recall)
print("F1-score :", f1)

Accuracy : 0.6
Precision: 0.6
Recall   : 0.6
F1-score : 0.6
