# In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import spacy
import pandas as pd
from spacy.matcher import PhraseMatcher

# Compare spaCy's statistical NER with rule-based PhraseMatcher output on a
# single sentence. Every table below is derived from the pipeline / matcher
# results rather than typed in by hand, so the printed output reflects what
# the loaded model and the rules actually produce.
nlp = spacy.load("en_core_web_sm")
text = (
    "The dollar has hit its highest level against the euro after the Federal Reserve head said "
    "the US trade deficit is set to stabilize."
)

doc = nlp(text)

print("\n=== 7.3.1 NLP Processed Tokens ===")
for token in doc:
    print(f"{token.text} {token.pos_} {token.dep_}")

# Entities predicted by the model's own NER component.
default_ner = [(ent.text, ent.label_) for ent in doc.ents]
df_default = pd.DataFrame(default_ner, columns=["Entity", "Label"])
print("\n=== 7.3.2 Default NER Output ===")
print(df_default)

matcher = PhraseMatcher(nlp.vocab)
currency_terms = ["The dollar", "the euro"]
economic_terms = ["trade deficit"]
# PhraseMatcher compares token text by default, so tokenizing the patterns
# with make_doc is sufficient; the full pipeline is not needed for them.
matcher.add("CURRENCY", [nlp.make_doc(term) for term in currency_terms])
matcher.add("ECONOMIC_TERM", [nlp.make_doc(term) for term in economic_terms])
matches = matcher(doc)

# Resolve each (match_id, start, end) triple to (matched text, rule label).
rule_hits = [
    (doc[start:end].text, nlp.vocab.strings[match_id])
    for match_id, start, end in matches
]
currency_hits = [hit for hit in rule_hits if hit[1] == "CURRENCY"]
economic_hits = [hit for hit in rule_hits if hit[1] == "ECONOMIC_TERM"]

# Single-label view: the CURRENCY rule combined with the default NER entities.
single_label = currency_hits + default_ner
df_single = pd.DataFrame(single_label, columns=["Entity", "Label"])
print("\n=== 7.3.3 Single-Label Rule-based Matching Output ===")
print(df_single)

# Multi-label view: additionally include the ECONOMIC_TERM rule.
multi_label = single_label + economic_hits
df_multi = pd.DataFrame(multi_label, columns=["Entity", "Label"])

print("\n=== 7.3.4 Multi-Label Rule-based Matching Output ===")
print(df_multi)

print("\n=== 7.3.5 Comparison Summary ===\n")
print("Default NER Entities:")
print(df_default)

print("\nNER + Single-Label Rule-based Entities:")
print(df_single)

print("\nNER + Multi-Label Rule-based Entities:")
print(df_multi)

print("\nObservation:")
print("- Default NER may miss domain-specific entities like currencies.")
print("- Rule-based matching accurately captures predefined terms.")
print("- Multi-label matching provides richer, task-specific entity extraction.")