In [1]:
import re

import pandas as pd
import spacy

# Load the spaCy "en_core_web_sm" pipeline once; the entity helpers defined in
# later cells all call it through this shared module-level `nlp` name.
nlp = spacy.load("en_core_web_sm")


In [2]:
# Toy corpus, one (headline, source) pair per news item. "Apple"/"apple" shows
# up both in company headlines and in agricultural ones.
_news_rows = [
    ("Apple reports strong quarterly earnings", "Reuters"),
    ("Apple harvest expected to be strong this season", "AgriNews"),
    ("Amazon expands cloud infrastructure in India", "Bloomberg"),
    ("Heavy rainfall damages apple orchards in Himachal", "Local Times"),
    ("Microsoft acquires AI startup in Europe", "TechCrunch"),
    ("Apple faces antitrust scrutiny in the EU", "Financial Times"),
]

# Pivot the row-wise pairs into the column-wise dict the DataFrame is built from.
news_data = {
    "headline": [headline for headline, _source in _news_rows],
    "source": [source for _headline, source in _news_rows],
}

df = pd.DataFrame(news_data)
df


Unnamed: 0,headline,source
0,Apple reports strong quarterly earnings,Reuters
1,Apple harvest expected to be strong this season,AgriNews
2,Amazon expands cloud infrastructure in India,Bloomberg
3,Heavy rainfall damages apple orchards in Himachal,Local Times
4,Microsoft acquires AI startup in Europe,TechCrunch
5,Apple faces antitrust scrutiny in the EU,Financial Times


In [3]:
def extract_entities(text):
    """Return every named entity the spaCy pipeline finds in ``text``.

    Each entity is reported as a ``(entity_text, entity_label)`` tuple, in the
    order the entities appear in ``doc.ents``.
    """
    pairs = []
    for entity in nlp(text).ents:
        pairs.append((entity.text, entity.label_))
    return pairs

# Attach the (text, label) entity pairs found in each headline.
df["entities"] = df["headline"].map(extract_entities)
df


Unnamed: 0,headline,source,entities
0,Apple reports strong quarterly earnings,Reuters,"[(Apple, ORG), (quarterly, DATE)]"
1,Apple harvest expected to be strong this season,AgriNews,"[(Apple, ORG)]"
2,Amazon expands cloud infrastructure in India,Bloomberg,"[(Amazon, ORG), (India, GPE)]"
3,Heavy rainfall damages apple orchards in Himachal,Local Times,"[(Himachal, ORG)]"
4,Microsoft acquires AI startup in Europe,TechCrunch,"[(Microsoft, ORG), (AI, ORG), (Europe, LOC)]"
5,Apple faces antitrust scrutiny in the EU,Financial Times,"[(Apple, ORG), (EU, ORG)]"


In [4]:
def extract_org_entities(text):
    """Return the text of each entity in ``text`` that spaCy labels ``ORG``.

    Entities with any other label are skipped; order follows ``doc.ents``.
    """
    org_names = []
    for entity in nlp(text).ents:
        if entity.label_ != "ORG":
            continue
        org_names.append(entity.text)
    return org_names

# Keep only the ORG-labelled entity strings for each headline.
df["org_entities"] = df["headline"].map(extract_org_entities)
df


Unnamed: 0,headline,source,entities,org_entities
0,Apple reports strong quarterly earnings,Reuters,"[(Apple, ORG), (quarterly, DATE)]",[Apple]
1,Apple harvest expected to be strong this season,AgriNews,"[(Apple, ORG)]",[Apple]
2,Amazon expands cloud infrastructure in India,Bloomberg,"[(Amazon, ORG), (India, GPE)]",[Amazon]
3,Heavy rainfall damages apple orchards in Himachal,Local Times,"[(Himachal, ORG)]",[Himachal]
4,Microsoft acquires AI startup in Europe,TechCrunch,"[(Microsoft, ORG), (AI, ORG), (Europe, LOC)]","[Microsoft, AI]"
5,Apple faces antitrust scrutiny in the EU,Financial Times,"[(Apple, ORG), (EU, ORG)]","[Apple, EU]"


In [5]:
# Entity strings that must never be kept as organisations (the sample output
# above shows the model tagging e.g. "AI", "EU" and "Himachal" as ORG).
ORG_BLACKLIST = {"EU", "AI", "India", "Europe", "Himachal"}

# Word stems that mark a headline as agricultural rather than corporate.
# They are matched at the START of a word, so "orchard" also covers "orchards".
NON_FINANCE_CONTEXT = {
    "harvest", "orchard", "rainfall", "crop", "season"
}

def clean_org_entities(text, org_list):
    """Remove false-positive organisations from ``org_list``.

    Two filters are applied:

    * any name listed in ``ORG_BLACKLIST`` is dropped;
    * ``"Apple"`` is dropped when the headline contains a word that starts
      with one of the ``NON_FINANCE_CONTEXT`` stems (the fruit, not the company).

    Parameters
    ----------
    text : str
        The headline the organisations were extracted from.
    org_list : list[str]
        Candidate organisation names for that headline.

    Returns
    -------
    list[str]
        The surviving names, in their original order. ``org_list`` itself is
        not modified.
    """
    # Match stems against whole-word prefixes instead of raw substrings:
    # a plain `stem in text` test lets "crop" fire inside "microprocessor"
    # and would wrongly discard a genuine company headline.
    headline_words = re.findall(r"\w+", text.lower())
    context_stems = tuple(NON_FINANCE_CONTEXT)
    # The headline is the same for every org, so evaluate the context once.
    is_non_finance_headline = any(
        word.startswith(context_stems) for word in headline_words
    )

    cleaned = []
    for org in org_list:
        if org in ORG_BLACKLIST:
            continue

        # Drop Apple when clearly used in non-financial context
        if org == "Apple" and is_non_finance_headline:
            continue

        cleaned.append(org)

    return cleaned

def _clean_row(row):
    """Run ``clean_org_entities`` on one DataFrame row (headline + ORG list)."""
    return clean_org_entities(row["headline"], row["org_entities"])


# Row-wise pass: the cleaner needs two columns of the same row at once.
df["clean_org_entities"] = df.apply(_clean_row, axis="columns")

df


Unnamed: 0,headline,source,entities,org_entities,clean_org_entities
0,Apple reports strong quarterly earnings,Reuters,"[(Apple, ORG), (quarterly, DATE)]",[Apple],[Apple]
1,Apple harvest expected to be strong this season,AgriNews,"[(Apple, ORG)]",[Apple],[]
2,Amazon expands cloud infrastructure in India,Bloomberg,"[(Amazon, ORG), (India, GPE)]",[Amazon],[Amazon]
3,Heavy rainfall damages apple orchards in Himachal,Local Times,"[(Himachal, ORG)]",[Himachal],[]
4,Microsoft acquires AI startup in Europe,TechCrunch,"[(Microsoft, ORG), (AI, ORG), (Europe, LOC)]","[Microsoft, AI]",[Microsoft]
5,Apple faces antitrust scrutiny in the EU,Financial Times,"[(Apple, ORG), (EU, ORG)]","[Apple, EU]",[Apple]


In [6]:
# Company whose coverage the final filter keeps.
TARGET_COMPANY = "Apple"

# Word stems that mark a headline as financial / regulatory news.
# They are matched at the START of a word, so "quarter" also covers "quarterly".
FINANCE_KEYWORDS = {
    "earnings", "revenue", "profit", "stock", "shares",
    "antitrust", "regulator", "ipo", "market", "quarter",
    "guidance", "sales", "lawsuit"
}

def is_target_company_news(text, target_company):
    """Decide whether ``text`` is finance-related news about ``target_company``.

    A headline qualifies only when both conditions hold:

    * it contains a word that starts with one of the ``FINANCE_KEYWORDS``
      stems (case-insensitive), and
    * the spaCy pipeline yields an ``ORG`` entity whose text equals
      ``target_company`` exactly (case-sensitive).

    Parameters
    ----------
    text : str
        Headline to classify.
    target_company : str
        Exact entity text to look for among the ``ORG`` entities.

    Returns
    -------
    bool
        ``True`` when both conditions are met, otherwise ``False``.
    """
    # Match stems against whole-word prefixes instead of raw substrings:
    # a plain `stem in text` test fires on "quarter" inside "headquarters",
    # "ipo" inside "tripod" or "market" inside "supermarket".
    headline_words = re.findall(r"\w+", text.lower())
    finance_stems = tuple(FINANCE_KEYWORDS)
    has_finance_word = any(
        word.startswith(finance_stems) for word in headline_words
    )

    # Cheap lexical test first: skip the NER pass when it cannot change the answer.
    if not has_finance_word:
        return False

    doc = nlp(text)
    return any(
        ent.label_ == "ORG" and ent.text == target_company
        for ent in doc.ents
    )


In [7]:
# Flag each headline; TARGET_COMPANY is forwarded as the second positional
# argument of is_target_company_news.
df["is_target_company_news"] = df["headline"].apply(
    is_target_company_news, args=(TARGET_COMPANY,)
)

df


Unnamed: 0,headline,source,entities,org_entities,clean_org_entities,is_target_company_news
0,Apple reports strong quarterly earnings,Reuters,"[(Apple, ORG), (quarterly, DATE)]",[Apple],[Apple],True
1,Apple harvest expected to be strong this season,AgriNews,"[(Apple, ORG)]",[Apple],[],False
2,Amazon expands cloud infrastructure in India,Bloomberg,"[(Amazon, ORG), (India, GPE)]",[Amazon],[Amazon],False
3,Heavy rainfall damages apple orchards in Himachal,Local Times,"[(Himachal, ORG)]",[Himachal],[],False
4,Microsoft acquires AI startup in Europe,TechCrunch,"[(Microsoft, ORG), (AI, ORG), (Europe, LOC)]","[Microsoft, AI]",[Microsoft],False
5,Apple faces antitrust scrutiny in the EU,Financial Times,"[(Apple, ORG), (EU, ORG)]","[Apple, EU]",[Apple],True


In [8]:
# The flag column holds plain booleans, so it serves directly as the row mask.
filtered_df = df.loc[df["is_target_company_news"]]
filtered_df


Unnamed: 0,headline,source,entities,org_entities,clean_org_entities,is_target_company_news
0,Apple reports strong quarterly earnings,Reuters,"[(Apple, ORG), (quarterly, DATE)]",[Apple],[Apple],True
5,Apple faces antitrust scrutiny in the EU,Financial Times,"[(Apple, ORG), (EU, ORG)]","[Apple, EU]",[Apple],True
