# GDELT Data Validation

This notebook validates the GDELT news articles data ingested for MAG7 companies.

In [171]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
# Load the GDELT articles data.
# `seendate` is parsed into timestamps at read time so later cells can compute
# min/max and a date span on it directly.
DATA_PATH = Path("../data/raw/gdelt_articles.csv")
df = pd.read_csv(DATA_PATH, parse_dates=["seendate"])
print(f"Loaded {len(df):,} rows from {DATA_PATH}")

Loaded 1,400 rows from ../data/raw/gdelt_articles.csv


In [172]:
# Directory holding the raw input file (../data/raw)
RAW_DIR = DATA_PATH.parent
# Sibling directory of the raw folder for cleaned outputs (../data/processed)
PROCESSED_DIR = RAW_DIR.parent / "processed"

In [173]:
# Derive the output name from the input name: <input stem>_clean.csv
# (gdelt_articles.csv -> gdelt_articles_clean.csv)
output_filename = f"{DATA_PATH.stem}_clean.csv"
output_path = PROCESSED_DIR.joinpath(output_filename)

In [174]:
# Remember the size of the raw frame before any cleaning step touches it
original_rows, original_cols = df.shape

In [175]:
# Preview the first few rows of the raw frame to eyeball the column contents
df.head()

Unnamed: 0,query,seendate,url,title,description,language,domain,sourceCountry,socialimage,company,ticker
0,"(""Apple"" OR AAPL) (stock OR shares OR earnings...",2026-01-16 21:00:00+00:00,https://www.cosmeticsandtoiletries.com/cosmeti...,The Longevity and Slow - Aging Movement Gets a...,,English,cosmeticsandtoiletries.com,,https://img.cosmeticsandtoiletries.com/mindful...,Apple,AAPL
1,"(""Apple"" OR AAPL) (stock OR shares OR earnings...",2026-01-16 20:45:00+00:00,https://www.businessinsider.com/apple-losing-g...,Apple Is Losing Its Grip on the World Tech Sup...,,English,businessinsider.com,,https://i.insider.com/696a837aa645d11881878256...,Apple,AAPL
2,"(""Apple"" OR AAPL) (stock OR shares OR earnings...",2026-01-16 20:45:00+00:00,https://finance.yahoo.com/news/asml-soars-abov...,ASML Soars Above $500 Billion Value on TSMC Up...,,English,finance.yahoo.com,,https://s.yimg.com/ny/api/res/1.2/2pyxQMi5YKpX...,Apple,AAPL
3,"(""Apple"" OR AAPL) (stock OR shares OR earnings...",2026-01-16 20:30:00+00:00,https://www.androidpolice.com/the-excellent-ga...,The excellent Galaxy Buds3 Pro deserve way mor...,,English,androidpolice.com,,https://static0.anpoimages.com/wordpress/wp-co...,Apple,AAPL
4,"(""Apple"" OR AAPL) (stock OR shares OR earnings...",2026-01-16 20:00:00+00:00,https://markets.financialcontent.com/stocks/ar...,FinancialContent - The Great Rebalancing : Sma...,,English,markets.financialcontent.com,,https://marketminute.ghost.io/content/images/s...,Apple,AAPL


In [176]:
df.shape  # (number of rows, number of columns) in the raw frame

(1400, 11)

In [177]:
# List all column names in the raw frame
df.columns

Index(['query', 'seendate', 'url', 'title', 'description', 'language',
       'domain', 'sourceCountry', 'socialimage', 'company', 'ticker'],
      dtype='object')

In [178]:
# Check each column's dtype and non-null count to spot columns that are
# missing, partially filled, or not useful
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   query          1400 non-null   object             
 1   seendate       1400 non-null   datetime64[ns, UTC]
 2   url            1400 non-null   object             
 3   title          1400 non-null   object             
 4   description    0 non-null      float64            
 5   language       1400 non-null   object             
 6   domain         1400 non-null   object             
 7   sourceCountry  0 non-null      float64            
 8   socialimage    1200 non-null   object             
 9   company        1400 non-null   object             
 10  ticker         1400 non-null   object             
dtypes: datetime64[ns, UTC](1), float64(2), object(8)
memory usage: 120.4+ KB


In [179]:
df.isna().sum()  # number of missing values in each column

query               0
seendate            0
url                 0
title               0
description      1400
language            0
domain              0
sourceCountry    1400
socialimage       200
company             0
ticker              0
dtype: int64

In [180]:
# Share of missing values per column, as a percentage of all rows

df.isnull().sum().div(len(df)).mul(100)

query              0.000000
seendate           0.000000
url                0.000000
title              0.000000
description      100.000000
language           0.000000
domain             0.000000
sourceCountry    100.000000
socialimage       14.285714
company            0.000000
ticker             0.000000
dtype: float64

*All `description` and `sourceCountry` values are missing, and `socialimage` is missing for about 14% of the rows.*

In [181]:
# `description` and `sourceCountry` are null in every row, so drop them.
# The raw frame `df` is left untouched; cleaning continues on `df_clean`.
df_clean = df.drop(['description', 'sourceCountry'], axis='columns')
print(f"Before: {df.shape}")
print(f"After: {df_clean.shape}")

Before: (1400, 11)
After: (1400, 9)


In [182]:

# Count rows that repeat an earlier row in every remaining column
print(f"Duplicate rows: {df_clean.duplicated().sum()}")

Duplicate rows: 1050


In [183]:
# duplicated() flags every repeat of a URL after its first occurrence
print(f"Duplicate URL: {df_clean['url'].duplicated().sum()}")

# keep=False flags all members of a repeated group, first occurrence included
duplicated_urls = df_clean[df_clean['url'].duplicated(keep=False)]
print(f"\nRows with duplicated urls: {len(duplicated_urls)}")

# Guard the example: .iloc[0] raises IndexError when no URL is repeated
if duplicated_urls.empty:
    print("No duplicated urls to show")
else:
    print("One example of duplicated urls:")
    example_url = duplicated_urls['url'].iloc[0]
    print(df_clean.loc[df_clean['url'] == example_url, ['url', 'company', 'ticker']])

Duplicate URL: 1162

Rows with duplicated urls: 1400
One example of duplicated urls:
                                                   url company ticker
0    https://www.cosmeticsandtoiletries.com/cosmeti...   Apple   AAPL
50   https://www.cosmeticsandtoiletries.com/cosmeti...   Apple   AAPL
100  https://www.cosmeticsandtoiletries.com/cosmeti...   Apple   AAPL
150  https://www.cosmeticsandtoiletries.com/cosmeti...   Apple   AAPL


In [184]:
# De-duplicate on the (url, company) pair, keeping the first occurrence.
# The same URL can still remain under several companies, which is why the
# row count printed below can exceed the number of unique URLs.
is_repeat = df_clean.duplicated(subset=['url', 'company'], keep='first')
df_clean = df_clean[~is_repeat]

print(f"Rows after removing duplicates: {len(df_clean):,}")
print(f"Unique URLs: {df_clean['url'].nunique():,}")


Rows after removing duplicates: 350
Unique URLs: 238


In [185]:
# Time window covered by the remaining articles
seen_dates = df_clean['seendate']
first_seen = seen_dates.min()
last_seen = seen_dates.max()
print(f"Earliest article: {first_seen}")
print(f"Latest article: {last_seen}")

# Span between first and last sighting; .days reports whole days only
date_range = last_seen - first_seen
print(f"Date span: {date_range.days} days")

Earliest article: 2026-01-16 04:45:00+00:00
Latest article: 2026-01-16 21:00:00+00:00
Date span: 0 days


In [186]:
# Check for empty titles: count missing values as well as titles that are
# blank or whitespace-only (isna() alone would not catch the latter)
empty_titles = df_clean['title'].fillna('').astype(str).str.strip().eq('').sum()
print(f"Empty titles: {empty_titles}")

Empty titles: 0


In [187]:
# Titles under 10 characters are unlikely to be real headlines
short_titles = df_clean['title'].str.len().lt(10).sum()
print(f"Very short titles (<10 chars): {short_titles}")

Very short titles (<10 chars): 1


In [188]:
# Only list the offending titles when the count from the previous cell is non-zero
if short_titles > 0:
    print("\nShort titles found:")
    print(df_clean.loc[df_clean['title'].str.len() < 10, 'title'].values)


Short titles found:
['8秒懂 !! ']


In [189]:
# Distribution of article languages before filtering
print("Language in data:")
print(df_clean['language'].value_counts())

# Keep only rows whose language is exactly 'English'
df_clean = df_clean.loc[df_clean['language'].eq('English')]
print(f"\nRows after filtering to English: {len(df_clean)}")


Language in data:
language
English    315
Chinese     12
French       6
Finnish      3
German       3
Arabic       2
Korean       2
Spanish      2
Dutch        2
Polish       1
Swedish      1
Hindi        1
Name: count, dtype: int64

Rows after filtering to English: 315


In [190]:
# Row count per company at this point in the cleaning, most frequent first
print("Articles per company:")
print(df_clean['company'].value_counts())

Articles per company:
company
NVIDIA            49
Meta Platforms    49
Alphabet          47
Apple             46
Microsoft         44
Amazon            42
Tesla             38
Name: count, dtype: int64


In [191]:
# True only if every URL starts with an http(s) scheme.
# na=False makes a missing URL count as invalid instead of yielding NaN.
valid_urls = df_clean['url'].str.startswith(('http://', 'https://'), na=False).all()
print(f"All URLs valid: {valid_urls}")

All URLs valid: True


In [192]:
# Rows whose URL does not start with an http(s) scheme.
# na=False keeps the mask strictly boolean so `~` works even if a URL is missing.
invalid = df_clean[~df_clean['url'].str.startswith(('http://', 'https://'), na=False)]
print(f"Invalid URLs: {len(invalid)}")

Invalid URLs: 0


In [193]:
#Top 10 news sources - check for source bias
print("Top 10 domains:")
print(df_clean['domain'].value_counts().head(10))

Top 10 domains:
domain
markets.financialcontent.com    36
finance.yahoo.com               35
fool.com                        28
insidermonkey.com               18
benzinga.com                     8
cnbc.com                         8
economictimes.indiatimes.com     7
forbes.com                       7
businessinsider.com              6
siliconangle.com                 5
Name: count, dtype: int64


In [194]:
# How many distinct source domains remain?
print(f"\nUnique domains: {df_clean['domain'].nunique()}")


Unique domains: 117


In [195]:
# Some articles might not actually be about the company's stock/business.
# Titles are matched against the keywords below to flag the relevant ones.
financial_keywords = [
    # Stock & Trading
    'stock', 'share', 'shares', 'trading', 'trader', 'nasdaq', 'nyse',
    's&p', 'dow', 'index', 'etf', 'fund', 'hedge',

    # Financial Metrics
    'earnings', 'revenue', 'profit', 'loss', 'margin', 'eps',
    'guidance', 'forecast', 'outlook', 'quarter', 'quarterly',
    'annual', 'fiscal', 'billion', 'million', 'trillion',

    # Market Movement
    'bull', 'bear', 'rally', 'surge', 'soar', 'jump', 'climb',
    'drop', 'fall', 'crash', 'plunge', 'sink', 'tumble', 'volatile',
    'gain', 'rise', 'decline', 'dip',

    # Valuation
    'valuation', 'market cap', 'price target', 'rating', 'upgrade',
    'downgrade', 'buy', 'sell', 'hold', 'overweight', 'underweight',

    # Business Operations
    'ceo', 'cfo', 'executive', 'board', 'investor', 'shareholder',
    'dividend', 'buyback', 'acquisition', 'merger', 'deal', 'partnership',
    'investment', 'ipo', 'stake',

    # Supply Chain & Operations
    'supplier', 'supply chain', 'manufacture', 'production', 'factory',
    'chip', 'semiconductor', 'shortage',

    # Tech-Specific
    'ai', 'artificial intelligence', 'cloud', 'software', 'hardware',
    'iphone', 'android', 'windows', 'azure', 'aws', 'gpu', 'data center',

    # MAG7 Company Names (catches articles about them)
    'apple', 'microsoft', 'google', 'alphabet', 'amazon', 'meta',
    'facebook', 'tesla', 'nvidia', 'aapl', 'msft', 'googl', 'amzn',
    'tsla', 'nvda',

    # Competition & Industry
    'competitor', 'rival', 'industry', 'sector', 'antitrust', 'regulation',
    'ces', 'tech trends', 'conference', 'keynote', 'announcement', 'launch', 'unveil'
]

# One alternation compiled once, instead of building a regex per keyword on
# every call. The \b on both sides keeps whole-word matching, so 'ai' does not
# match inside 'wait'. Re-run this cell after editing `financial_keywords` so
# the compiled pattern picks up the change.
_FINANCIAL_KEYWORD_RE = re.compile(
    r'\b(?:' + '|'.join(re.escape(kw) for kw in financial_keywords) + r')\b'
)


def has_financial_keyword(title):
    """Return True if `title` contains any entry of `financial_keywords` as a whole word.

    Matching is case-insensitive: the title is lower-cased and the keywords
    are stored in lower case.

    Parameters
    ----------
    title : str
        Article title. Non-string values (e.g. NaN or None from a missing
        title) are treated as having no keyword.

    Returns
    -------
    bool
        True when at least one keyword matches, otherwise False.
    """
    if not isinstance(title, str):
        # A missing title cannot be relevant; avoids AttributeError on float NaN
        return False
    return _FINANCIAL_KEYWORD_RE.search(title.lower()) is not None
# Add a boolean helper column marking titles that contain a financial keyword
df_clean = df_clean.assign(is_relevant=df_clean['title'].map(has_financial_keyword))


In [196]:
# is_relevant is a True/False flag, so sum() counts the flagged rows and
# ~ inverts the flag to count the rest
print(f"Articles with financial keywords: {df_clean['is_relevant'].sum()}")
print(f"Potential irrelevant: {(~df_clean['is_relevant']).sum()}")

Articles with financial keywords: 188
Potential irrelevant: 127


In [197]:
# Preview up to 10 titles that matched no keyword, to sanity-check the filter
# before the rows are dropped
print("\nPotentially irrelevant titles:")
print(df_clean[~df_clean['is_relevant']]['title'].head(10).values)


Potentially irrelevant titles:
['The Longevity and Slow - Aging Movement Gets an Ingredient Refresh'
 'The excellent Galaxy Buds3 Pro deserve way more respect than they get'
 '20 Canadian albums we cant wait to hear in 2026'
 'How Hermès keeps its clutches on its own handbags'
 'VCET : David Aronoff / MCJ | Vermont Business Magazine'
 'Pass the Popcorn : Becoming Led Zeppelin'
 '12 Best Ginger Beer Cocktails Youll Want to Make All Year , According to Experts'
 'Alex Goldmark'
 'With The Rip , Ben Affleck and Matt Damon are having fun again'
 'Indigenous recipes from Turtle Island | Vermilion Standard']


In [198]:
# Keep only the rows flagged as relevant
df_clean = df_clean.loc[df_clean['is_relevant']]
print(f"Clean rows after relevance filter: {len(df_clean)}")

Clean rows after relevance filter: 188


In [199]:
# Debug: spot-check the keyword matcher on a few hand-picked titles
test_titles = [
    "20 Canadian albums we cant wait to hear in 2026",
    "Apple stock rises 5%",
    "How Hermès keeps its clutches on its own handbags",
]

# Print each verdict next to the first 50 characters of its title
for title, result in zip(test_titles, map(has_financial_keyword, test_titles)):
    print(f"{result} <- '{title[:50]}'")

False <- '20 Canadian albums we cant wait to hear in 2026'
True <- 'Apple stock rises 5%'
False <- 'How Hermès keeps its clutches on its own handbags'


In [200]:
# Drop the helper flag and the raw query string from the cleaned frame.
# The relevance mask is only re-applied while the flag column still exists, and
# errors='ignore' tolerates already-removed columns, so re-running this cell
# is a no-op instead of raising KeyError.
if 'is_relevant' in df_clean.columns:
    df_clean = df_clean[df_clean['is_relevant']]
df_clean = df_clean.drop(columns=['is_relevant', 'query'], errors='ignore')

In [201]:
# Inspect the first 10 rows of the cleaned frame
df_clean.head(10)

Unnamed: 0,seendate,url,title,language,domain,socialimage,company,ticker
1,2026-01-16 20:45:00+00:00,https://www.businessinsider.com/apple-losing-g...,Apple Is Losing Its Grip on the World Tech Sup...,English,businessinsider.com,https://i.insider.com/696a837aa645d11881878256...,Apple,AAPL
2,2026-01-16 20:45:00+00:00,https://finance.yahoo.com/news/asml-soars-abov...,ASML Soars Above $500 Billion Value on TSMC Up...,English,finance.yahoo.com,https://s.yimg.com/ny/api/res/1.2/2pyxQMi5YKpX...,Apple,AAPL
4,2026-01-16 20:00:00+00:00,https://markets.financialcontent.com/stocks/ar...,FinancialContent - The Great Rebalancing : Sma...,English,markets.financialcontent.com,https://marketminute.ghost.io/content/images/s...,Apple,AAPL
5,2026-01-16 20:00:00+00:00,https://thebull.com.au/us-news/apple-cements-5...,Apple Cements $5 Billion Google Gemini Partner...,English,thebull.com.au,,Apple,AAPL
7,2026-01-16 20:00:00+00:00,https://www.cpapracticeadvisor.com/podcasts/ke...,Key Trends and Tech Observations from CES 2026...,English,cpapracticeadvisor.com,https://www.cpapracticeadvisor.com/wp-content/...,Apple,AAPL
9,2026-01-16 20:00:00+00:00,https://markets.financialcontent.com/stocks/ar...,FinancialContent - The Trillion - Dollar Safet...,English,markets.financialcontent.com,https://marketminute.ghost.io/content/images/s...,Apple,AAPL
10,2026-01-16 20:00:00+00:00,https://www.cnbc.com/2026/01/16/one-of-our-top...,One of our top stocks this week just lost its ...,English,cnbc.com,https://image.cnbcfm.com/api/v1/image/10822069...,Apple,AAPL
12,2026-01-16 19:45:00+00:00,https://finance.yahoo.com/news/fund-manager-pu...,Fund manager pulls plug on popular semiconduct...,English,finance.yahoo.com,https://s.yimg.com/os/en/thestreet_881/ca90ba7...,Apple,AAPL
13,2026-01-16 19:15:00+00:00,https://macdailynews.com/2026/01/16/major-appl...,Major Apple supplier TSMC boosts U . S . footp...,English,macdailynews.com,https://macdailynews.com/wp-content/uploads/20...,Apple,AAPL
14,2026-01-16 19:15:00+00:00,https://www.ktiv.com/2026/01/16/us-taiwan-sign...,"US and Taiwan sign $250B trade deal , cutting ...",English,ktiv.com,https://gray-ktiv-prod.gtv-cdn.com/resizer/v2/...,Apple,AAPL


Sometimes the same story gets reported by multiple outlets with slightly different titles:

In [202]:
from difflib import SequenceMatcher #comparing sequence


In [203]:
def similar(a, b):
    """Return the case-insensitive similarity ratio of two strings.

    Both inputs are lower-cased and compared with difflib's SequenceMatcher
    (no junk filter); the value returned is its ``ratio()``.
    """
    left, right = a.lower(), b.lower()
    matcher = SequenceMatcher(isjunk=None, a=left, b=right)
    return matcher.ratio()

# Find title pairs that are more than 80% similar.
# Every unordered pair is compared once (t2 only ranges over titles after t1),
# and the score is computed a single time per pair and reused.
titles = df_clean['title'].tolist()
near_dupes = []
for i, t1 in enumerate(titles):
    for t2 in titles[i + 1:]:
        score = similar(t1, t2)
        if score > 0.8:
            near_dupes.append((t1, t2, score))

print(f"Found {len(near_dupes)} near duplicate pairs")
# Show the first five pairs, truncating each title to 60 characters
for t1, t2, score in near_dupes[:5]:
    print(f"\n{score:.0%} similar:")
    print(f"  1: {t1[:60]}...")
    print(f"  2: {t2[:60]}...")

Found 171 near duplicate pairs

100% similar:
  1: Apple Is Losing Its Grip on the World Tech Supply Chain...
  2: Apple Is Losing Its Grip on the World Tech Supply Chain...

100% similar:
  1: Apple Is Losing Its Grip on the World Tech Supply Chain...
  2: Apple Is Losing Its Grip on the World Tech Supply Chain...

100% similar:
  1: Apple Is Losing Its Grip on the World Tech Supply Chain...
  2: Apple Is Losing Its Grip on the World Tech Supply Chain...

100% similar:
  1: ASML Soars Above $500 Billion Value on TSMC Upbeat Outlook...
  2: ASML Soars Above $500 Billion Value on TSMC Upbeat Outlook...

100% similar:
  1: ASML Soars Above $500 Billion Value on TSMC Upbeat Outlook...
  2: ASML Soars Above $500 Billion Value on TSMC Upbeat Outlook...


In [204]:
# Check one of the duplicate titles: list every row carrying this exact title
# together with the company and ticker it is attached to
dupe_title = "Apple Is Losing Its Grip on the World Tech Supply Chain"
print(df_clean[df_clean['title'] == dupe_title][['title', 'company', 'ticker']])

                                                 title    company ticker
1    Apple Is Losing Its Grip on the World Tech Sup...      Apple   AAPL
202  Apple Is Losing Its Grip on the World Tech Sup...  Microsoft   MSFT
401  Apple Is Losing Its Grip on the World Tech Sup...     NVIDIA   NVDA
802  Apple Is Losing Its Grip on the World Tech Sup...     Amazon   AMZN


In [205]:
# Save the cleaned frame. The target directory is created first so to_csv does
# not fail when the processed folder does not exist yet.
output_path.parent.mkdir(parents=True, exist_ok=True)
df_clean.to_csv(output_path, index=False)
print(f"Saved {len(df_clean)} rows to {output_path}")

Saved 188 rows to ../data/processed/gdelt_articles_clean.csv


In [206]:
# All cleaning in one cell (near the end of notebook).
# Rebuilds df_clean from the raw frame `df`, so it does not depend on the
# intermediate state left by the step-by-step cells; it needs only `df`,
# `has_financial_keyword` and `output_path`. Each step returns a new frame,
# so `df` itself is never modified.
df_clean = (
    df
    .drop(columns=['description', 'sourceCountry'])
    .drop_duplicates(subset=['url', 'company'], keep='first')
    .loc[lambda frame: frame['language'] == 'English']
    .loc[lambda frame: frame['title'].apply(has_financial_keyword)]
    .drop(columns=['query'])
)

print(f"Final clean data: {len(df_clean)} rows")

# Save immediately after; create the output directory first so to_csv does not
# fail when it does not exist yet
output_path.parent.mkdir(parents=True, exist_ok=True)
df_clean.to_csv(output_path, index=False)
print(f"Saved to {output_path}")

Final clean data: 188 rows
Saved to ../data/processed/gdelt_articles_clean.csv
