# GDELT Data Validation

This notebook validates the GDELT news articles data ingested for MAG7 companies.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
# Load the GDELT articles data
DATA_PATH = Path("../data/raw/gdelt_articles.csv")
df = pd.read_csv(DATA_PATH, parse_dates=["seendate"])
print(f"Loaded {len(df):,} rows from {DATA_PATH}")

Loaded 1,400 rows from ../data/raw/gdelt_articles.csv


In [2]:
RAW_DIR = DATA_PATH.parent                    # ../data/raw
PROCESSED_DIR = DATA_PATH.parent.parent / "processed"  # ../data/processed

In [3]:
# Build output filename from input
output_filename = DATA_PATH.stem + "_clean.csv"  # gdelt_articles_clean.csv
output_path = PROCESSED_DIR / output_filename

In [4]:
original_rows = len(df)
original_cols = len(df.columns)

In [5]:
# Preview first few rows
df.head()

Unnamed: 0,query,seendate,url,title,description,language,domain,sourceCountry,socialimage,company,ticker
0,"(""Apple"" OR AAPL) (stock OR shares OR earnings...",2026-01-13 15:00:00+00:00,https://smartmania.cz/po-letech-zmena-krale-ap...,Po letech změna krále : Apple prodal víc telef...,,Czech,smartmania.cz,,https://smartmania.cz/wp-content/uploads/2025/...,Apple,AAPL
1,"(""Apple"" OR AAPL) (stock OR shares OR earnings...",2026-01-13 15:00:00+00:00,https://www.sej.org/headlines/trump-s-biggest-...,Trump Biggest Inaugural Donor Benefits from Wo...,,English,sej.org,,,Apple,AAPL
2,"(""Apple"" OR AAPL) (stock OR shares OR earnings...",2026-01-13 15:00:00+00:00,https://finance.yahoo.com/news/prediction-spec...,Prediction : This Spectacular Vanguard ETF Wil...,,English,finance.yahoo.com,,https://s.yimg.com/ny/api/res/1.2/vPftkj6hH9si...,Apple,AAPL
3,"(""Apple"" OR AAPL) (stock OR shares OR earnings...",2026-01-13 15:00:00+00:00,https://www.proactiveinvestors.com/companies/n...,S & P and Nasdaq hold as Dow starts lower desp...,,English,proactiveinvestors.com,,https://cdn.proactiveinvestors.com/eyJidWNrZXQ...,Apple,AAPL
4,"(""Apple"" OR AAPL) (stock OR shares OR earnings...",2026-01-13 14:45:00+00:00,http://www.thailandnews.net/news/278802381/aut...,Auto industry bets on AI partnerships after co...,,English,thailandnews.net,,https://image.chitra.live/api/v1/wps/ed80633/b...,Apple,AAPL


In [6]:
df.shape #how many rows and columns we have 

(1400, 11)

In [7]:
#check all column names
df.columns

Index(['query', 'seendate', 'url', 'title', 'description', 'language',
       'domain', 'sourceCountry', 'socialimage', 'company', 'ticker'],
      dtype='object')

In [8]:
#check if any comand is null, missing or useful
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   query          1400 non-null   object             
 1   seendate       1400 non-null   datetime64[ns, UTC]
 2   url            1400 non-null   object             
 3   title          1400 non-null   object             
 4   description    0 non-null      float64            
 5   language       1400 non-null   object             
 6   domain         1400 non-null   object             
 7   sourceCountry  0 non-null      float64            
 8   socialimage    1168 non-null   object             
 9   company        1400 non-null   object             
 10  ticker         1400 non-null   object             
dtypes: datetime64[ns, UTC](1), float64(2), object(8)
memory usage: 120.4+ KB


In [9]:
df.isnull().sum() #count all the missing values

query               0
seendate            0
url                 0
title               0
description      1400
language            0
domain              0
sourceCountry    1400
socialimage       232
company             0
ticker              0
dtype: int64

In [10]:
#check how many percentage of missing values we have based on the whole dataset

(df.isnull().sum()/len(df)) * 100

query              0.000000
seendate           0.000000
url                0.000000
title              0.000000
description      100.000000
language           0.000000
domain             0.000000
sourceCountry    100.000000
socialimage       16.571429
company            0.000000
ticker             0.000000
dtype: float64

*You can see that all description and sourceCountry data are missing, and socialimage missed 14% of the data*

In [11]:
df_clean = df.drop(columns=['description', 'sourceCountry'])
print(f"Before: {df.shape}")
print(f"After: {df_clean.shape}")

Before: (1400, 11)
After: (1400, 9)


In [12]:

print(f"Duplicate rows: {df_clean.duplicated().sum()}")

Duplicate rows: 1050


In [13]:
print(f"Duplicate URL: {df_clean['url'].duplicated().sum()}")

duplicated_urls = df_clean[df_clean['url'].duplicated(keep=False)]
print(f"\nRows with duplicated urls: {len(duplicated_urls)}")

print(f"One example of duplicated urls:")
example_url = duplicated_urls['url'].iloc[0]
print(df_clean[df_clean['url'] == example_url] [['url', 'company', 'ticker']])

Duplicate URL: 1169

Rows with duplicated urls: 1400
One example of duplicated urls:
                                                   url company ticker
0    https://smartmania.cz/po-letech-zmena-krale-ap...   Apple   AAPL
50   https://smartmania.cz/po-letech-zmena-krale-ap...   Apple   AAPL
100  https://smartmania.cz/po-letech-zmena-krale-ap...   Apple   AAPL
150  https://smartmania.cz/po-letech-zmena-krale-ap...   Apple   AAPL


In [14]:
df_clean = df_clean.drop_duplicates(subset=['url', 'company'], keep='first')

print(f"Rows after removing duplicates: {len(df_clean):,}")
print(f"Unique URLs: {df_clean['url'].nunique():,}")


Rows after removing duplicates: 350
Unique URLs: 231


In [15]:
print(f"Earliest article: {df_clean['seendate'].min()}")
print(f"Latest article: {df_clean['seendate'].max()}")

#Calculate the span 
date_range = df_clean['seendate'].max() - df_clean['seendate'].min()
print(f"Date span: {date_range.days} days")

Earliest article: 2026-01-13 03:45:00+00:00
Latest article: 2026-01-13 15:00:00+00:00
Date span: 0 days


In [16]:
#Check for empty or short title
empty_titles = df_clean['title'].isna().sum()
print(f"Empty titles: {empty_titles}")

Empty titles: 0


In [17]:
short_titles = (df_clean['title'].str.len() < 10).sum()
print(f"Very short titles (<10 chars): {short_titles}")

Very short titles (<10 chars): 0


In [18]:
if short_titles > 0:
    print("\nShort titles found:")
    print(df_clean[df_clean['title'].str.len() < 10]['title'].values)

In [19]:
print("Language in data:")
print(df_clean['language'].value_counts())

#keep only english
df_clean = df_clean[df_clean['language'] == 'English']
print(f"\nRows after filtering to English: {len(df_clean)}")


Language in data:
language
English       298
Chinese        16
French          6
German          5
Turkish         4
Spanish         4
Czech           3
Polish          3
Hebrew          2
Portuguese      2
Norwegian       1
Vietnamese      1
Italian         1
Korean          1
Romanian        1
Thai            1
Arabic          1
Name: count, dtype: int64

Rows after filtering to English: 298


In [20]:
print("Articles per company:")
print(df_clean['company'].value_counts())

Articles per company:
company
Meta Platforms    48
Alphabet          46
NVIDIA            43
Amazon            43
Microsoft         41
Tesla             41
Apple             36
Name: count, dtype: int64


In [21]:
valid_urls = df_clean['url'].str.startswith(('http://', 'https://')).all()
print(f"All URLs valid: {valid_urls}")

All URLs valid: True


In [22]:
invalid = df_clean[~df_clean['url'].str.startswith(('http://', 'https://'))]
print(f"Invalid URLs: {len(invalid)}")

Invalid URLs: 0


In [23]:
#Top 10 news sources - check for source bias
print("Top 10 domains:")
print(df_clean['domain'].value_counts().head(10))

Top 10 domains:
domain
finance.yahoo.com     25
forbes.com            15
zerohedge.com         12
insidermonkey.com      9
webpronews.com         8
abcnews.go.com         8
cnbc.com               7
afghanistansun.com     6
cincinnatisun.com      6
thailandnews.net       6
Name: count, dtype: int64


In [24]:
# How many unique sources?
print(f"\nUnique domains: {df_clean['domain'].nunique()}")


Unique domains: 115


In [25]:
#Some articles might not actually be about the company's stock/business:
financial_keywords = [
    # Stock & Trading
    'stock', 'share', 'shares', 'trading', 'trader', 'nasdaq', 'nyse', 
    's&p', 'dow', 'index', 'etf', 'fund', 'hedge',
    
    # Financial Metrics
    'earnings', 'revenue', 'profit', 'loss', 'margin', 'eps', 
    'guidance', 'forecast', 'outlook', 'quarter', 'quarterly',
    'annual', 'fiscal', 'billion', 'million', 'trillion',
    
    # Market Movement
    'bull', 'bear', 'rally', 'surge', 'soar', 'jump', 'climb',
    'drop', 'fall', 'crash', 'plunge', 'sink', 'tumble', 'volatile',
    'gain', 'rise', 'decline', 'dip',
    
    # Valuation
    'valuation', 'market cap', 'price target', 'rating', 'upgrade',
    'downgrade', 'buy', 'sell', 'hold', 'overweight', 'underweight',
    
    # Business Operations  
    'ceo', 'cfo', 'executive', 'board', 'investor', 'shareholder',
    'dividend', 'buyback', 'acquisition', 'merger', 'deal', 'partnership',
    'investment', 'ipo', 'stake',
    
    # Supply Chain & Operations
    'supplier', 'supply chain', 'manufacture', 'production', 'factory',
    'chip', 'semiconductor', 'shortage',
    
    # Tech-Specific
    'ai', 'artificial intelligence', 'cloud', 'software', 'hardware',
    'iphone', 'android', 'windows', 'azure', 'aws', 'gpu', 'data center',
    
    # MAG7 Company Names (catches articles about them)
    'apple', 'microsoft', 'google', 'alphabet', 'amazon', 'meta', 
    'facebook', 'tesla', 'nvidia', 'aapl', 'msft', 'googl', 'amzn', 
    'tsla', 'nvda',
    
    # Competition & Industry
    'competitor', 'rival', 'industry', 'sector', 'antitrust', 'regulation',
    'ces', 'tech trends', 'conference', 'keynote', 'announcement', 'launch', 'unveil'
]
def has_financial_keyword(title):
    title_lower = title.lower()
    for kw in financial_keywords:
        # Use word boundary \b to match whole words only
        if re.search(r'\b' + re.escape(kw) + r'\b', title_lower):
            return True
    return False
df_clean['is_relevant'] = df_clean['title'].apply(has_financial_keyword)


In [26]:
print(f"Articles with financial keywords: {df_clean['is_relevant'].sum()}")
print(f"Potential irrelevant: {(~df_clean['is_relevant']).sum()}")

Articles with financial keywords: 225
Potential irrelevant: 73


In [27]:
# Preview potentially irrelevant articles
print("\nPotentially irrelevant titles:")
print(df_clean[~df_clean['is_relevant']]['title'].head(10).values)


Potentially irrelevant titles:
['Trump Biggest Inaugural Donor Benefits from Worker Safety Weakening'
 'Jim Cramer Says  There a Lot of Value in Nike  '
 'Spotify Halts ICE Recruitment Ads Amid 2025 Backlash and Ethics Debate'
 'T - Mobile Launches Better Value Plan : Unlimited 5G , Streaming Perks for $140'
 'Unicode Unveils Emoji 18 . 0 Draft : Squinting Face , Pickle , and More for 2027'
 'US Market Open : US equity futures are modestly lower , whilst DXY is flat as market awaits US CPI - Newsquawk US Opening News'
 'Here a look at some of the key topics coming up this legislative session'
 'Best website research tools for competitive intelligence and market analysis'
 'US Market Open : US equity futures are modestly lower , whilst DXY is flat as market awaits US CPI - Newsquawk US Opening News'
 '5 Highly Anticipated Games That Could Miss 2026']


In [28]:
#keep only relevant data 
df_clean = df_clean[df_clean['is_relevant'] ==  True]
print(f"Clean rows after relevance filter: {len(df_clean)}")

Clean rows after relevance filter: 225


In [29]:
# Debug: Test the function directly
test_titles = [
    "20 Canadian albums we cant wait to hear in 2026",
    "Apple stock rises 5%",
    "How Hermès keeps its clutches on its own handbags"
]

for title in test_titles:
    result = has_financial_keyword(title)
    print(f"{result} <- '{title[:50]}'")

False <- '20 Canadian albums we cant wait to hear in 2026'
True <- 'Apple stock rises 5%'
False <- 'How Hermès keeps its clutches on its own handbags'


In [30]:
df_clean = df_clean[df_clean['is_relevant']==True]
df_clean = df_clean.drop(columns=['is_relevant', 'query' ])

In [31]:
df_clean.head(10)

Unnamed: 0,seendate,url,title,language,domain,socialimage,company,ticker
2,2026-01-13 15:00:00+00:00,https://finance.yahoo.com/news/prediction-spec...,Prediction : This Spectacular Vanguard ETF Wil...,English,finance.yahoo.com,https://s.yimg.com/ny/api/res/1.2/vPftkj6hH9si...,Apple,AAPL
3,2026-01-13 15:00:00+00:00,https://www.proactiveinvestors.com/companies/n...,S & P and Nasdaq hold as Dow starts lower desp...,English,proactiveinvestors.com,https://cdn.proactiveinvestors.com/eyJidWNrZXQ...,Apple,AAPL
4,2026-01-13 14:45:00+00:00,http://www.thailandnews.net/news/278802381/aut...,Auto industry bets on AI partnerships after co...,English,thailandnews.net,https://image.chitra.live/api/v1/wps/ed80633/b...,Apple,AAPL
9,2026-01-13 14:30:00+00:00,https://edition.cnn.com/2026/01/13/tech/plan-a...,How Amazon plans to catch up to ChatGPT,English,edition.cnn.com,https://media.cnn.com/api/v1/images/stellar/pr...,Apple,AAPL
10,2026-01-13 14:30:00+00:00,https://nypost.com/2026/01/13/business/jpmorga...,JPMorgan profit takes a hit as it builds reser...,English,nypost.com,https://nypost.com/wp-content/uploads/sites/2/...,Apple,AAPL
11,2026-01-13 14:30:00+00:00,https://abcnews.go.com/Technology/wireStory/go...,Google corporate parent joins $4 trillion club...,English,abcnews.go.com,https://i.abcnewsfe.com/a/bb8e059a-24ff-4961-9...,Apple,AAPL
12,2026-01-13 14:30:00+00:00,https://economictimes.indiatimes.com/news/inte...,why JPM stock is rising despite earnings miss ...,English,economictimes.indiatimes.com,"https://img.etimg.com/thumb/msid-126506651,wid...",Apple,AAPL
16,2026-01-13 14:30:00+00:00,https://www.insidermonkey.com/blog/jim-cramer-...,Jim Cramer Calls Alphabet Gemini 3 a Home Run,English,insidermonkey.com,https://d2gr5kl7dt2z3t.cloudfront.net/blog/wp-...,Apple,AAPL
17,2026-01-13 14:15:00+00:00,https://www.webpronews.com/evercore-isi-lifts-...,Evercore ISI Lifts Apple Price Target to $330 ...,English,webpronews.com,https://www.webpronews.com/wp-content/uploads/...,Apple,AAPL
18,2026-01-13 14:15:00+00:00,https://timesofindia.indiatimes.com/education/...,Google has hired tons of people without coll...,English,timesofindia.indiatimes.com,https://static.toiimg.com/thumb/msid-126506318...,Apple,AAPL


Sometimes the same story gets reported by multiple outlets with slightly different titles:

In [32]:
from difflib import SequenceMatcher #comparing sequence


In [33]:
def similar(a,b):
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

#Find titles that >80% similar
titles = df_clean['title'].tolist()
near_dupes=[]
for i, t1 in enumerate(titles):
    for j, t2 in enumerate(titles[i+1:], i+1):
        if similar(t1, t2) > 0.8:
            near_dupes.append((t1,t2,similar(t1,t2)))

print(f"Found {len(near_dupes)} near duplicate pairs")
for t1, t2, score in near_dupes[:5]:
    print(f"\n{score:.0%} similar:")
    print(f"  1: {t1[:60]}...")
    print(f"  2: {t2[:60]}...")

Found 1412 near duplicate pairs

100% similar:
  1: Prediction : This Spectacular Vanguard ETF Will Beat the S &...
  2: Prediction : This Spectacular Vanguard ETF Will Beat the S &...

100% similar:
  1: Prediction : This Spectacular Vanguard ETF Will Beat the S &...
  2: Prediction : This Spectacular Vanguard ETF Will Beat the S &...

100% similar:
  1: Prediction : This Spectacular Vanguard ETF Will Beat the S &...
  2: Prediction : This Spectacular Vanguard ETF Will Beat the S &...

100% similar:
  1: Prediction : This Spectacular Vanguard ETF Will Beat the S &...
  2: Prediction : This Spectacular Vanguard ETF Will Beat the S &...

100% similar:
  1: Prediction : This Spectacular Vanguard ETF Will Beat the S &...
  2: Prediction : This Spectacular Vanguard ETF Will Beat the S &...


In [34]:
# Check one of the duplicate titles
dupe_title = "Apple Is Losing Its Grip on the World Tech Supply Chain"
print(df_clean[df_clean['title'] == dupe_title][['title', 'company', 'ticker']])

Empty DataFrame
Columns: [title, company, ticker]
Index: []


In [35]:
# Save
df_clean.to_csv(output_path, index=False)
print(f"Saved {len(df_clean)} rows to {output_path}")

Saved 225 rows to ../data/processed/gdelt_articles_clean.csv


In [36]:
# All cleaning in one cell (near the end of notebook)
df_clean = df.copy()
df_clean = df_clean.drop(columns=['description', 'sourceCountry'])
df_clean = df_clean.drop_duplicates(subset=['url', 'company'], keep='first')
df_clean = df_clean[df_clean['language'] == 'English']
df_clean = df_clean[df_clean['title'].apply(has_financial_keyword)]
df_clean = df_clean.drop(columns=['query'])

print(f"Final clean data: {len(df_clean)} rows")

# Save immediately after
df_clean.to_csv(output_path, index=False)
print(f"Saved to {output_path}")

Final clean data: 225 rows
Saved to ../data/processed/gdelt_articles_clean.csv
