In [2]:
import pandas as pd 
from pathlib import Path
# Banner line identifying this notebook in the output.
print("GDELT Data Validation")

GDELT Data Validation


In [3]:

# The project root is taken to be the parent of the current working
# directory (the notebook is expected to run from the notebooks folder).
project_root = Path().resolve().parent

# Raw GDELT article export: <project_root>/data/raw/gdelt_articles.csv
data_path = project_root.joinpath("data", "raw", "gdelt_articles.csv")
df = pd.read_csv(data_path)


In [4]:
# Inspect the dtype pandas inferred for each column of the loaded frame.
df.dtypes

query             object
seendate          object
url               object
title             object
description      float64
language          object
domain            object
sourceCountry    float64
socialimage       object
company           object
ticker            object
dtype: object

In [6]:
# (rows, columns) of the loaded frame.
df.shape

(1400, 11)

In [8]:
# Section 1: high-level overview of the loaded articles.
print("1.Overview")
print(f"Total articles: {len(df)}")
print(f"\nColumns: {df.columns.tolist()}")
# Start the frame repr on its own line: when it follows the label on the
# same line, the first header row is shifted and no longer lines up with
# the values printed beneath it.
print(f"First few rows:\n{df.head(10)}")

1.Overview
Total articles: 1400

Columns: ['query', 'seendate', 'url', 'title', 'description', 'language', 'domain', 'sourceCountry', 'socialimage', 'company', 'ticker']
First few rows:                                                query  \
0  ("Apple" OR AAPL) (stock OR shares OR earnings...   
1  ("Apple" OR AAPL) (stock OR shares OR earnings...   
2  ("Apple" OR AAPL) (stock OR shares OR earnings...   
3  ("Apple" OR AAPL) (stock OR shares OR earnings...   
4  ("Apple" OR AAPL) (stock OR shares OR earnings...   
5  ("Apple" OR AAPL) (stock OR shares OR earnings...   
6  ("Apple" OR AAPL) (stock OR shares OR earnings...   
7  ("Apple" OR AAPL) (stock OR shares OR earnings...   
8  ("Apple" OR AAPL) (stock OR shares OR earnings...   
9  ("Apple" OR AAPL) (stock OR shares OR earnings...   

                    seendate  \
0  2026-01-15 04:30:00+00:00   
1  2026-01-15 04:30:00+00:00   
2  2026-01-15 04:30:00+00:00   
3  2026-01-15 04:30:00+00:00   
4  2026-01-15 04:30:00+00:00   
5  20

In [9]:
# List the column labels, then show how many rows each ticker has
# (the value_counts result is the cell's displayed output).
print(df.columns)
df['ticker'].value_counts()

Index(['query', 'seendate', 'url', 'title', 'description', 'language',
       'domain', 'sourceCountry', 'socialimage', 'company', 'ticker'],
      dtype='object')


ticker
AAPL     200
MSFT     200
NVDA     200
GOOGL    200
AMZN     200
META     200
TSLA     200
Name: count, dtype: int64

In [10]:
# Section 2: look for rows whose first column contains an image-file
# extension (jpg/png/gif); missing values are treated as non-matching.
print("\n2. Anomalies:")
print("Sample of potentially corrupted rows:")
first_column = df.iloc[:, 0]
has_image_ext = first_column.str.contains('jpg|png|gif', na=False)
print(df[has_image_ext])


2. Anomalies:
Sample of potentially corrupted rows:
Empty DataFrame
Columns: [query, seendate, url, title, description, language, domain, sourceCountry, socialimage, company, ticker]
Index: []


Understanding the datetime column: our CSV file has a `seendate` column that is stored as object (text). It should be converted to datetime so that we can do arithmetic on it (e.g., compute time spans and gaps between articles).

In [16]:
# `seendate` is read from the CSV as text, so it must be parsed before any
# date arithmetic; subtracting two strings raises TypeError on a fresh
# kernel. utc=True yields timezone-aware UTC timestamps, and re-running
# this cell on an already-converted column is a no-op.
df['seendate'] = pd.to_datetime(df['seendate'], utc=True)

# Show first and last article timestamp, and the span between them.
first_seen = df['seendate'].min()
last_seen = df['seendate'].max()
print(f"First article: {first_seen}")
print(f"Last article: {last_seen}")
print(f"Total time span: {last_seen - first_seen}")


First article: 2026-01-14 14:45:00+00:00
Last article: 2026-01-15 04:30:00+00:00
Total time span: 0 days 13:45:00


In [18]:
# Order the articles by `seendate` and record, for each row, the difference
# between its `seendate` and that of the row immediately before it.
df_sorted = (
    df.sort_values('seendate')
      .assign(time_since_previous=lambda frame: frame['seendate'].diff())
)

In [19]:
# Keep only the rows that arrive more than five minutes after the row
# preceding them in the sorted frame.
gap_threshold = pd.Timedelta(minutes=5)
exceeds_threshold = df_sorted['time_since_previous'] > gap_threshold
gaps = df_sorted.loc[exceeds_threshold]

In [21]:
# Report how many rows ended up in `gaps`.
print(f"Numbers of gaps > 5 min: {len(gaps)}")

Numbers of gaps > 5 min: 40


In [22]:
# Preview the first 10 rows of `gaps`, restricted to the timestamp column
# and the `time_since_previous` column.
print(gaps[['seendate', 'time_since_previous']].head(10))

                      seendate time_since_previous
798  2026-01-14 15:00:00+00:00     0 days 00:15:00
647  2026-01-14 15:30:00+00:00     0 days 00:30:00
746  2026-01-14 16:00:00+00:00     0 days 00:30:00
1089 2026-01-14 16:30:00+00:00     0 days 00:30:00
792  2026-01-14 16:45:00+00:00     0 days 00:15:00
786  2026-01-14 17:15:00+00:00     0 days 00:30:00
1180 2026-01-14 17:30:00+00:00     0 days 00:15:00
734  2026-01-14 17:45:00+00:00     0 days 00:15:00
780  2026-01-14 18:00:00+00:00     0 days 00:15:00
679  2026-01-14 18:15:00+00:00     0 days 00:15:00
