#### Loading the required libararies and the required NER model

In [1]:
import numpy as np
import pandas as pd
import spacy

nlp = spacy.load('en_core_web_sm')
print('Spacy Model loaded successfully')

Spacy Model loaded successfully


In [2]:
sample_texts = [
    "Apple Inc. reports record earnings in Q4",
    "The Amazon rainforest is facing deforestation",
    "Amazon Web Services expands cloud infrastructure",
    "I bought an apple from the grocery store",
    "Tesla opens new factory in Austin, Texas",
    "Visit the Tesla museum in New York"
]

print("Entity Detection Examples:")
print("=" * 60)

for text in sample_texts:
    doc = nlp(text)
    print(f'\nText: {text}')
    
    if doc.ents:
        for ent in doc.ents:
            print(f"  - '{ent.text}' → {ent.label_}")
    else:
        print('No Entities found')

Entity Detection Examples:

Text: Apple Inc. reports record earnings in Q4
  - 'Apple Inc.' → ORG
  - 'Q4' → GPE

Text: The Amazon rainforest is facing deforestation
  - 'Amazon' → ORG

Text: Amazon Web Services expands cloud infrastructure
  - 'Amazon Web Services' → ORG

Text: I bought an apple from the grocery store
No Entities found

Text: Tesla opens new factory in Austin, Texas
  - 'Austin' → GPE
  - 'Texas' → GPE

Text: Visit the Tesla museum in New York
  - 'Tesla' → NORP
  - 'New York' → GPE


#### Creating a custom NER filter to prevent noise from slipping in

In [3]:
def check_entity_is_org(text,company_name):
    for ent in doc.ents:
        if company_name.lower() in ent.text.lower():
            if ent.label == 'ORG':
                return True
            else:
                return False # Tagged as entity but not org
    return False # Not tagged as entity only

test_cases = [
    ("Apple Inc. releases new iPhone", "Apple", True),
    ("I ate an apple for lunch", "Apple", False),
    ("Amazon expands its e-commerce platform", "Amazon", True),
    ("The Amazon river is the longest", "Amazon", False),
]

print("Testing NER Filter:")
print("=" * 60)
for text, company, expected in test_cases:
    result = check_entity_is_org(text, company)
    status = "✅" if result == expected else "❌"
    print(f"{status} '{text}' → {result} (expected {expected})")

Testing NER Filter:
❌ 'Apple Inc. releases new iPhone' → False (expected True)
✅ 'I ate an apple for lunch' → False (expected False)
❌ 'Amazon expands its e-commerce platform' → False (expected True)
✅ 'The Amazon river is the longest' → False (expected False)


#### Cleaning the news dataset

In [4]:
news_data = {
    'headline': [
        'Apple announces new product line',
        'How to make apple pie at home',
        'Amazon stock surges on earnings beat',
        'Amazon rainforest conservation efforts',
        'Tesla unveils new electric vehicle',
        'Visit the Tesla museum in Belgrade',
        'Microsoft launches AI initiative',
        'Google reports strong ad revenue',
        'Trip to New York and visiting Google offices',
        'Apple trees need proper care'
    ],
    'company': ['Apple', 'Apple', 'Amazon', 'Amazon', 'Tesla',
               'Tesla', 'Microsoft', 'Google', 'Google', 'Apple']
}

df = pd.DataFrame(news_data)

df['is_org'] = df.apply(lambda row: check_entity_is_org(row['headline'],row['company']), axis = 1)

df_filtered = df[df['is_org'] == True].copy()
print(f"Filtered dataset: {len(df_filtered)} articles (removed {len(df) - len(df_filtered)} non-ORG)")
print("\nCleaned News for Analysis:")
print(df_filtered[['headline', 'company']])

Filtered dataset: 0 articles (removed 10 non-ORG)

Cleaned News for Analysis:
Empty DataFrame
Columns: [headline, company]
Index: []


#### Refining the filter to improve accuracy

In [5]:
def check_entity_is_org_refined(text, company_name):
    noise_keywords = ['rainforest', 'river', 'tree', 'fruit', 'pie', 'recipe', 'care']

    if any(word in text.lower() for word in noise_keywords):
        return False

    doc = nlp(text)
    for ent in doc.ents:
        if company_name.lower() in ent.text.lower():
            return ent.label_ == 'ORG'

    return False

df['is_org_refined'] = df.apply(lambda row: check_entity_is_org_refined(row['headline'], row['company']),axis=1)

df_final = df[df['is_org_refined'] == True].copy()
print(f"✅ Final Cleaned Dataset: {len(df_final)} articles")
print(df_final[['headline', 'company']])

✅ Final Cleaned Dataset: 5 articles
                                       headline    company
0              Apple announces new product line      Apple
2          Amazon stock surges on earnings beat     Amazon
6              Microsoft launches AI initiative  Microsoft
7              Google reports strong ad revenue     Google
8  Trip to New York and visiting Google offices     Google


In [6]:
df['is_org'] = df.apply(lambda row: check_entity_is_org_refined(row['headline'], row['company']),axis=1)

print("Dataset with NER check:")
print(df)

df_filtered = df[df['is_org'] == True].copy()

print(f"✅ Filtered dataset: {len(df_filtered)} articles (removed {len(df) - len(df_filtered)} non-ORG)")
print("Filtered articles:")
print(df_filtered[['headline', 'company']])

Dataset with NER check:
                                       headline    company  is_org  \
0              Apple announces new product line      Apple    True   
1                 How to make apple pie at home      Apple   False   
2          Amazon stock surges on earnings beat     Amazon    True   
3        Amazon rainforest conservation efforts     Amazon   False   
4            Tesla unveils new electric vehicle      Tesla   False   
5            Visit the Tesla museum in Belgrade      Tesla   False   
6              Microsoft launches AI initiative  Microsoft    True   
7              Google reports strong ad revenue     Google    True   
8  Trip to New York and visiting Google offices     Google    True   
9                  Apple trees need proper care      Apple   False   

   is_org_refined  
0            True  
1           False  
2            True  
3           False  
4           False  
5           False  
6            True  
7            True  
8            True  
9    

#### Using batch processing for larger datasets

In [7]:
def filter_org_articles_batch(df, text_column='headline', company_column='company'):
    results = []
    noise_keywords = ['rainforest', 'river', 'tree', 'fruit', 'pie', 'recipe', 'care']

    docs = nlp.pipe(df[text_column])

    for idx, doc in enumerate(docs):
        headline_text = df.iloc[idx][text_column].lower()
        company = df.iloc[idx][company_column].lower()
        is_org = False

        if any(word in headline_text for word in noise_keywords):
            results.append(False)
            continue

        for ent in doc.ents:
            if company in ent.text.lower() and ent.label_ == 'ORG':
                is_org = True
                break

        results.append(is_org)

    return results

# Test with our dataset
df['is_org_batch'] = filter_org_articles_batch(df)

print("Batch processing results:")
print(df[['headline', 'company', 'is_org_batch']])

Batch processing results:
                                       headline    company  is_org_batch
0              Apple announces new product line      Apple          True
1                 How to make apple pie at home      Apple         False
2          Amazon stock surges on earnings beat     Amazon          True
3        Amazon rainforest conservation efforts     Amazon         False
4            Tesla unveils new electric vehicle      Tesla         False
5            Visit the Tesla museum in Belgrade      Tesla         False
6              Microsoft launches AI initiative  Microsoft          True
7              Google reports strong ad revenue     Google          True
8  Trip to New York and visiting Google offices     Google          True
9                  Apple trees need proper care      Apple         False
