# Task 4: Named Entity Recognition (NER) from News Articles

Import

In [12]:
!pip install -q spacy pandas
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_trf

import pandas as pd
import spacy
from spacy import displacy
from IPython.display import display, HTML

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m89.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installati

In [13]:
# Root folder that holds the dataset files. Every path below is derived from
# it, so relocating the dataset only requires changing this one value.
DATA_DIR = '/content/Dataset'

# NOTE(review): metadata_path is not referenced by any cell visible in this
# notebook export; kept so existing users of the name keep working.
metadata_path = f'{DATA_DIR}/metadata'
train_path = f'{DATA_DIR}/train.txt'
test_path = f'{DATA_DIR}/test.txt'
valid_path = f'{DATA_DIR}/valid.txt'

Function: Load CoNLL Data

In [14]:
def load_and_clean_conll(path):
    """Read a column-formatted file and return its sentences as plain strings.

    Each non-blank line contributes its first whitespace-separated field as a
    token. A blank line or a line starting with "-DOCSTART-" closes the
    sentence being built; consecutive boundaries never produce empty
    sentences. Tokens of one sentence are joined with single spaces.

    Args:
        path: Location of the UTF-8 encoded text file to read.

    Returns:
        List of sentence strings in file order.
    """
    sentences = []
    pending_tokens = []
    with open(path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            is_boundary = stripped == "" or stripped.startswith("-DOCSTART-")
            if not is_boundary:
                # A non-empty stripped line always has at least one field;
                # only the first column (the token itself) is kept.
                pending_tokens.append(stripped.split()[0])
                continue
            if pending_tokens:
                sentences.append(" ".join(pending_tokens))
                pending_tokens = []
    # Flush the last sentence when the file does not end with a boundary line.
    if pending_tokens:
        sentences.append(" ".join(pending_tokens))
    return sentences

Load & Combine Data

In [15]:
# Read each split with the shared loader; every split stays available under
# its own name for later cells.
train = load_and_clean_conll(train_path)
test = load_and_clean_conll(test_path)
valid = load_and_clean_conll(valid_path)

# Combine all texts (train first, then test, then valid) into one new list.
all_texts = [*train, *test, *valid]
print(f"Loaded {len(all_texts)} sentences for NER")

Loaded 20744 sentences for NER


Load SpaCy Models

In [16]:
# Load both pipelines once so later cells can reuse them: nlp_small is the
# "en_core_web_sm" package and nlp_trf is the "en_core_web_trf" package.
nlp_small = spacy.load(name="en_core_web_sm")
nlp_trf = spacy.load(name="en_core_web_trf")

Function to Extract & Display Entities

In [17]:
def extract_entities(texts, nlp_model, model_name="Model", batch_size=20, n_display=2):
    """Run an NLP pipeline over ``texts`` and tabulate the entities it finds.

    Args:
        texts: Iterable of strings to process. It is materialised into a list,
            so generators are accepted as well as lists.
        nlp_model: Object exposing ``pipe(texts, batch_size=...)`` that yields
            one doc per text, each with an ``ents`` attribute whose items have
            ``text`` and ``label_`` (e.g. a loaded spaCy pipeline).
        model_name: Label used only in the printed banner.
        batch_size: Batch size forwarded to ``nlp_model.pipe``.
        n_display: Docs among the first ``n_display`` inputs that contain at
            least one entity are rendered with displaCy.

    Returns:
        pandas.DataFrame with one row per input text and two columns:
        ``Text`` (the input, cut to 200 characters plus "..." when longer) and
        ``Entities`` (list of ``(entity_text, label)`` tuples). The first five
        rows are also displayed.
    """
    texts = list(texts)
    records = []
    print(f"\n=== Extracting entities using {model_name} ===\n")
    for i, doc in enumerate(nlp_model.pipe(texts, batch_size=batch_size)):
        ents = [(ent.text, ent.label_) for ent in doc.ents]
        text = texts[i]
        records.append({
            "Text": text[:200] + "..." if len(text) > 200 else text,
            "Entities": ents
        })
        # Display the first few examples with highlighted entities.
        if i < n_display and ents:
            # Ask displaCy for the markup (jupyter=False) and display it once
            # ourselves. With jupyter=True displaCy displays the markup on its
            # own and hands back None, so wrapping that result in HTML() only
            # produced an extra, empty output.
            html = displacy.render(doc, style="ent", jupyter=False)
            # The tex2jax_ignore wrapper keeps MathJax from treating "$" in
            # the text (e.g. money amounts) as math delimiters.
            display(HTML(f'<span class="tex2jax_ignore">{html}</span>'))
    # Explicit columns keep the frame well-formed even when texts is empty.
    df = pd.DataFrame(records, columns=["Text", "Entities"])
    display(df.head(5))
    return df

Extract Entities

In [18]:
# Feed both pipelines the same first-50-sentence sample so their result
# tables line up row by row.
sample_texts = all_texts[:50]
df_small = extract_entities(sample_texts, nlp_small, "en_core_web_sm")
df_trf = extract_entities(sample_texts, nlp_trf, "en_core_web_trf")


=== Extracting entities using en_core_web_sm ===



<IPython.core.display.HTML object>

<IPython.core.display.HTML object>

Unnamed: 0,Text,Entities
0,EU rejects German call to boycott British lamb .,"[(EU, ORG), (German, NORP), (British, NORP)]"
1,Peter Blackburn,"[(Peter Blackburn, PERSON)]"
2,BRUSSELS 1996-08-22,"[(BRUSSELS, GPE), (1996-08-22, DATE)]"
3,The European Commission said on Thursday it di...,"[(The European Commission, ORG), (Thursday, DA..."
4,Germany 's representative to the European Unio...,"[(Germany, GPE), (the European Union 's, ORG),..."



=== Extracting entities using en_core_web_trf ===



<IPython.core.display.HTML object>

<IPython.core.display.HTML object>

Unnamed: 0,Text,Entities
0,EU rejects German call to boycott British lamb .,"[(EU, ORG), (German, NORP), (British, NORP)]"
1,Peter Blackburn,"[(Peter Blackburn, PERSON)]"
2,BRUSSELS 1996-08-22,"[(BRUSSELS, GPE), (1996-08-22, DATE)]"
3,The European Commission said on Thursday it di...,"[(The European Commission, ORG), (Thursday, DA..."
4,Germany 's representative to the European Unio...,"[(Germany, GPE), (the European Union 's, ORG),..."


BONUS: Compare Entity Counts

In [19]:
from collections import Counter

def compare_entity_counts(df1, df2):
    """Print how many entities of each label two result tables contain.

    Both arguments must support ``['Entities']`` lookup that yields, per row,
    an iterable of ``(entity_text, label)`` pairs. The tallies are printed
    under the fixed headings "Small Model" (``df1``) and "Transformer Model"
    (``df2``); nothing is returned.
    """
    def _label_totals(frame):
        # Count labels in order of first appearance across all rows.
        totals = Counter()
        for row_entities in frame['Entities']:
            for _, label in row_entities:
                totals[label] += 1
        return totals

    # Tally both inputs before anything is printed.
    first_totals = _label_totals(df1)
    second_totals = _label_totals(df2)

    print("\n=== Entity Counts Comparison ===")
    print("\nSmall Model:", dict(first_totals))
    print("\nTransformer Model:", dict(second_totals))

# Print the per-label entity totals of the two result tables.
compare_entity_counts(df_small, df_trf)

# Status message; printed unconditionally once the call above has returned.
print("Entities extracted, visualized, and compared successfully.")


=== Entity Counts Comparison ===

Small Model: {'ORG': 22, 'NORP': 20, 'PERSON': 24, 'GPE': 36, 'DATE': 29, 'LOC': 3, 'CARDINAL': 9, 'QUANTITY': 1, 'PERCENT': 5, 'MONEY': 6, 'LANGUAGE': 1, 'TIME': 1, 'ORDINAL': 1}

Transformer Model: {'ORG': 21, 'NORP': 20, 'PERSON': 24, 'GPE': 38, 'DATE': 28, 'LOC': 2, 'CARDINAL': 9, 'QUANTITY': 1, 'PERCENT': 6, 'MONEY': 6, 'WORK_OF_ART': 1, 'TIME': 2}
Entities extracted, visualized, and compared successfully.
