# Objective (Markdown cell)

Goal:
Automatically extract entities such as:

- PERSON

- ORG

- GPE (locations)

- DATE

- MONEY

from unstructured documents.

We will use:

- spaCy

- CPU only

- Pretrained industrial-grade model

# 🔹 Reality of Notebook 04 (NER)
Key points:

1. NER does NOT use cleaned text

- Lowercasing, punctuation removal, or TF-IDF preprocessing breaks entity recognition.

- spaCy models rely on original capitalization and punctuation.

2. Classification preprocessing is separate

- Notebook 03 uses clean_text for TF-IDF → Logistic Regression

- Notebook 04 uses raw content or full_text → spaCy NER

# STEP 1: Imports

In [1]:
import pandas as pd
import spacy
from spacy import displacy
from collections import Counter

# STEP 2: Load spaCy model

In [2]:
# Load the pretrained spaCy pipeline; `nlp` is the callable used for every
# NER pass later in this notebook.
nlp = spacy.load("en_core_web_sm")

# STEP 3: Load raw dataset (NO cleaning)

In [3]:
# Read the BBC news dataset exactly as stored (no text cleaning). The file is
# tab-separated despite its .csv extension, hence sep="\t".
bbc_df = pd.read_csv(
    "../data/bbc-news-data.csv",
    sep="\t",
    engine="python"
)

# combine title + content for richer text

In [4]:
# Join each headline to its article body with ". " so the model sees the
# title as a leading sentence of the same document.
bbc_df["full_text"] = bbc_df["title"].str.cat(bbc_df["content"], sep=". ")

# STEP 4: Run NER on a single document

In [5]:
# Run the pipeline on the first article only, then show every detected
# entity as a (text, label) pair. `doc` is reused by the visualisation cell.
doc = nlp(bbc_df.loc[0, "full_text"])
[(span.text, span.label_) for span in doc.ents]

[('Time Warner', 'ORG'),
 ('Quarterly', 'DATE'),
 ('US', 'GPE'),
 ('TimeWarner', 'ORG'),
 ('76%', 'PERCENT'),
 ('1.13bn', 'MONEY'),
 ('600', 'MONEY'),
 ('the three months to December', 'DATE'),
 ('639', 'MONEY'),
 ('year-earlier', 'DATE'),
 ('Google', 'ORG'),
 ('TimeWarner', 'ORG'),
 ('fourth quarter', 'DATE'),
 ('2%', 'PERCENT'),
 ('11.1bn', 'MONEY'),
 ('10.9bn', 'MONEY'),
 ('one', 'CARDINAL'),
 ('Warner Bros', 'ORG'),
 ('AOL', 'ORG'),
 ('Time Warner', 'ORG'),
 ('Friday', 'DATE'),
 ('8%', 'PERCENT'),
 ('Google', 'ORG'),
 ('AOL', 'ORG'),
 ('464,000', 'CARDINAL'),
 ('the fourth quarter', 'DATE'),
 ('the preceding three quarters', 'DATE'),
 ('AOL', 'ORG'),
 ('8%', 'PERCENT'),
 ('TimeWarner', 'ORG'),
 ('AOL', 'ORG'),
 ('TimeWarner', 'ORG'),
 ('2000', 'DATE'),
 ('2003', 'DATE'),
 ('the US Securities Exchange Commission', 'ORG'),
 ('SEC', 'ORG'),
 ("Time Warner's", 'ORG'),
 ('fourth quarter', 'DATE'),
 ('27%', 'PERCENT'),
 ('284', 'MONEY'),
 ('Alexander and Catwoman', 'ORG'),
 ('year-earlie

# STEP 5: Visualize entities


In [6]:
# Draw the document with its entity spans highlighted, rendered inline in the
# notebook (jupyter=True).
displacy.render(doc, style="ent", jupyter=True)

# STEP 6: Extract entities into structured column

In [7]:
def extract_entities(text):
    """Return the named entities spaCy finds in *text*.

    Args:
        text: Raw, uncleaned document text. Non-string values (for example
            the NaN that pandas produces when a title or content cell is
            missing) are treated as documents with no entities.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples in the order the
        pipeline reports them; an empty list when *text* is not a string.
    """
    # A missing cell reaches this function as float NaN, which the pipeline
    # rejects; return no entities instead of aborting the whole column pass.
    if not isinstance(text, str):
        return []
    parsed = nlp(text)
    return [(ent.text, ent.label_) for ent in parsed.ents]

# Run entity extraction on every article; each cell of the new column holds
# that article's list of (text, label) pairs.
bbc_df["entities"] = bbc_df["full_text"].map(extract_entities)

# STEP 7: Filter key entity types (PERSON, ORG, GPE)

In [8]:

def filter_entities(entities, labels=("PERSON", "ORG", "GPE")):
    """Keep only the entities whose label is one of *labels*.

    Args:
        entities: Iterable of ``(entity_text, entity_label)`` pairs.
        labels: Collection of labels to retain; defaults to people,
            organisations and geopolitical entities.

    Returns:
        A new list with the matching pairs, in their original order.
    """
    kept = []
    for pair in entities:
        # Index 1 of each pair is the entity label.
        if pair[1] in labels:
            kept.append(pair)
    return kept

# Narrow each article's entity list to the default label set of
# filter_entities (PERSON, ORG, GPE).
bbc_df["key_entities"] = bbc_df["entities"].map(filter_entities)

# STEP 8: Frequency of entities

In [9]:
# Flatten the per-article entity lists into one corpus-wide list.
all_entities = []
for ents in bbc_df["key_entities"]:
    all_entities += ents

# Tally identical (text, label) pairs and show the ten most frequent.
entity_counter = Counter()
entity_counter.update(all_entities)
entity_counter.most_common(10)

[(('US', 'GPE'), 1510),
 (('UK', 'GPE'), 1043),
 (('England', 'GPE'), 515),
 (('Labour', 'ORG'), 466),
 (('Britain', 'GPE'), 400),
 (('London', 'GPE'), 367),
 (('France', 'GPE'), 330),
 (('BBC', 'ORG'), 318),
 (('Blair', 'PERSON'), 297),
 (('EU', 'ORG'), 280)]