In [1]:
import pandas as pd
import spacy
from spacy import displacy
from pathlib import Path
import re


Step 1: Load the CoNLL-2003 Dataset

In [2]:
def load_dataset(file_path):
    """Read a CoNLL-style file into a list of tagged sentences.

    Every non-blank line is split on whitespace; the first field is used as
    the token and the last field as its tag. A blank line ends the current
    sentence. ``-DOCSTART-`` document-separator lines are treated as sentence
    boundaries and never emitted as tokens.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A list of sentences, each a list of ``(word, tag)`` tuples.
    """
    doc_separator = "-DOCSTART-"
    sentences = []
    sentence = []

    # Iterate over the file handle directly so the whole file is never held
    # in memory as a list of lines.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            splits = line.split()
            # Blank lines and document separators both close the sentence
            # that is currently being collected.
            if not splits or splits[0] == doc_separator:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            # Lines with a single field carry no tag and are ignored.
            if len(splits) >= 2:
                sentence.append((splits[0], splits[-1]))

    # The file may not end with a blank line; flush the trailing sentence.
    if sentence:
        sentences.append(sentence)
    return sentences

# Parse the training split; the relative path is resolved against the current working directory.
train_data=load_dataset("train.txt")

Step 2: Convert Sentences to Plain Text

In [3]:
def to_text(sentences,n=5):
    """Join the tokens of the first ``n`` sentences into plain strings.

    Args:
        sentences: Sequence of sentences, each a sequence of ``(token, tag)`` pairs.
        n: Upper bound of the slice taken from ``sentences``.

    Returns:
        A list with one space-separated string per selected sentence.
    """
    plain_texts = []
    for tagged_sentence in sentences[:n]:
        # Tags are dropped; only the surface tokens are kept.
        words = (word for word, _tag in tagged_sentence)
        plain_texts.append(" ".join(words))
    return plain_texts

In [4]:
# Plain-text versions of the first five parsed sentences; the demo cell below reads from this list.
text=to_text(train_data,n=5)

Step 3: Rule-Based NER (Using Regex)

In [5]:
def rule_based_ner(text):
    """Find entity spans in ``text`` with a small set of regular expressions.

    Three labels are recognised: ``LOC`` (a fixed list of country names),
    ``PERSON`` (two consecutive capitalised words) and ``ORG`` (one or more
    all-uppercase words of at least two letters).

    Args:
        text: The string to scan.

    Returns:
        A list of ``{"start", "end", "label"}`` dicts with character offsets
        into ``text``, sorted by ``start`` and free of overlapping spans.
    """
    # Ordered from most to least specific. When two patterns hit overlapping
    # text (e.g. "United States" also looks like a PERSON, "USA" also looks
    # like an ORG) the earlier label wins, because displaCy renders the text
    # of overlapping spans twice.
    patterns={
        "LOC":r"\b(Germany|France|USA|United States|China)\b",
        "PERSON":r"\b[A-Z][a-z]+\s[A-Z][a-z]+\b",
        # Trailing \b stops the pattern from matching a prefix of a longer
        # word, such as "USA" inside "USAir".
        "ORG":r"\b[A-Z]{2,}(?:\s[A-Z]{2,})*\b",
    }
    matches=[]
    for label,pattern in patterns.items():
        for match in re.finditer(pattern,text):
            start,end=match.span()
            overlaps=any(
                start<kept["end"] and kept["start"]<end
                for kept in matches
            )
            if not overlaps:
                matches.append({"start":start,"end":end,"label":label})
    # Return spans in document order regardless of pattern order.
    matches.sort(key=lambda span:span["start"])
    return matches

In [6]:
from spacy import displacy
from IPython.display import display, HTML

def visualize(text, matches, title="Rule-based NER"):
    """Render pre-computed entity spans over ``text`` with displaCy.

    Args:
        text: The string the spans refer to.
        matches: Iterable of dicts with ``"start"``, ``"end"`` and ``"label"``
            keys (character offsets into ``text``).
        title: Heading shown above the rendered text.
    """
    from spacy import displacy
    from IPython.display import display, HTML

    doc = {
        "text": text,
        "ents": list(matches),
        "title": title
    }
    # jupyter=False makes render() return the markup as a string. With
    # jupyter=True it displays the markup itself and returns None, so wrapping
    # the result in HTML() would show an empty object.
    html = displacy.render(doc, style="ent", manual=True, jupyter=False)
    display(HTML(html))


In [9]:
print("Rule-based NER Example:")
# Only the first prepared sentence is rendered (text[:1]).
for sample in text[:1]:
    # Find the regex-based spans, then draw them over the same string.
    matches = rule_based_ner(sample)
    visualize(sample, matches)

Rule-based NER Example:


<IPython.core.display.HTML object>

Step 4: Model-Based NER (spaCy)

In [None]:
def load_spacy_model(model_name):
    """Load a spaCy pipeline, downloading its package only when it is missing.

    Args:
        model_name: Name of the spaCy pipeline package, e.g. ``"en_core_web_sm"``.

    Returns:
        The loaded pipeline object returned by ``spacy.load``.
    """
    try:
        return spacy.load(model_name)
    except OSError:
        # spacy.load raises OSError when the pipeline package is not installed.
        # Downloading through spacy.cli installs into the interpreter the
        # kernel is running, which a bare `!python` shell call does not
        # guarantee.
        import importlib
        from spacy.cli import download

        download(model_name)
        # Make the freshly installed package importable in this process.
        importlib.invalidate_caches()
        return spacy.load(model_name)


nlp_sm = load_spacy_model("en_core_web_sm")
nlp_trf = load_spacy_model("en_core_web_trf")


Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [38;2;249;38;114m━[0m[38;2;249;38;114m╸[0m[38;5;237m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/457.4 MB[0m [31m78.6 kB/s[0m  [36m1:32:37[0m0m
[0m  Resuming download https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (21.0 MB/457.4 MB)
[2K     [38;2;249;38;114m━━━[0m[38;2;249;38;114m╸[0m[38;5;237m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/457.4 MB[0m [31m20.1 kB/s[0m  [36m5:44:01[0m0m
[0m  Resuming download https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (41.9 MB/457.4 MB)
[2K     [38;2;249;38;114m━━━[0m[38;2;249;38;114m╸[0m[38;5;237m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/457.4 MB[0m [31m8.9 kB/s[0m e

In [None]:
def visualize_model_based(nlp_model, text, model_name="Model"):
    """Run a spaCy pipeline on ``text``, list its entities and render them.

    Args:
        nlp_model: A callable pipeline that turns a string into a document
            exposing ``.ents``.
        text: The string to analyse.
        model_name: Label used in the printed header.
    """
    doc = nlp_model(text)
    print(f" Entities found by {model_name}:")
    for ent in doc.ents:
        print(f"{ent.text} → {ent.label_}")
    # `doc` is a real pipeline result rather than a hand-built dict, so
    # displaCy must extract the entities itself (no manual=True).
    displacy.render(doc, style="ent")


In [None]:
# The prepared plain-text sentences are stored in `text`; `text_samples` is
# not defined in this notebook and raised NameError on a fresh kernel.
sample_text = text[0]

# Run both pipelines on the same sentence so their entities can be compared.
visualize_model_based(nlp_sm, sample_text, model_name="en_core_web_sm")

visualize_model_based(nlp_trf, sample_text, model_name="en_core_web_trf")
