In [1]:
import requests
import pandas as pd
from datetime import datetime, timedelta

# GDELT Base URL for raw CSV files
GDELT_BASE_URL = "http://data.gdeltproject.org/gdeltv2/"

# Columns from GDELT Event Codebook
EVENT_COLUMNS = [
    "GlobalEventID", "Day", "MonthYear", "Year", "FractionDate", 
    "Actor1Code", "IsRootEvent", "EventCode", "EventBaseCode", "EventRootCode", "QuadClass", "GoldsteinScale", 
    "NumMentions", "NumSources", "NumArticles", "AvgTone", "ActionGeo_Type", "ActionGeo_Fullname", "ActionGeo_CountryCode", "ActionGeo_ADM1Code", 
    "ActionGeo_ADM2Code", "ActionGeo_Lat", "ActionGeo_Long", "ActionGeo_FeatureID", 
    "DATEADDED", "SOURCEURL"
]

def generate_past_intervals(hours=10, interval_minutes=15):
    """
    Generate a list of timestamps for the past 'hours' with a given interval (in minutes).
    GDELT timestamps are in UTC and use YYYYMMDDHHMMSS format.
    """
    timestamps = []
    now = datetime.utcnow()
    for i in range(0, hours * 60, interval_minutes):
        timestamp = now - timedelta(minutes=i)
        timestamps.append(timestamp.strftime("%Y%m%d%H%M%S"))
    return timestamps

def fetch_gdelt_data(timestamp):
    """Fetches GDELT data for a specific timestamp."""
    csv_url = f"{GDELT_BASE_URL}export.{timestamp}.csv"
    try:
        data = pd.read_csv(csv_url, sep="\t", names=EVENT_COLUMNS, encoding="latin1", low_memory=False)
        print(f"✅ Fetched data for {timestamp}")
        return data
    except Exception as e:
        print(f"❌ Failed to fetch data for {timestamp}: {e}")
        return None

def main():
    """Fetches GDELT data for the past 10 hours at 15-minute intervals."""
    timestamps = generate_past_intervals(hours=10, interval_minutes=15)
    
    all_data = []
    
    for timestamp in timestamps:
        data = fetch_gdelt_data(timestamp)
        if data is not None:
            all_data.append(data)
    
    if all_data:
        final_df = pd.concat(all_data, ignore_index=True)
        filename = f"gdelt_events_past10hours_{datetime.utcnow().strftime('%Y%m%d_%H%M')}.csv"
        final_df.to_csv(filename, index=False)
        print(f"✅ Data saved as {filename}")
    else:
        print("❌ No data retrieved.")

if __name__ == "__main__":
    main()


  now = datetime.utcnow()


❌ Failed to fetch data for 20250307093434: HTTP Error 404: Not Found
❌ Failed to fetch data for 20250307091934: HTTP Error 404: Not Found
❌ Failed to fetch data for 20250307090434: HTTP Error 404: Not Found
❌ Failed to fetch data for 20250307084934: HTTP Error 404: Not Found
❌ Failed to fetch data for 20250307083434: HTTP Error 404: Not Found
❌ Failed to fetch data for 20250307081934: HTTP Error 404: Not Found
❌ Failed to fetch data for 20250307080434: HTTP Error 404: Not Found
❌ Failed to fetch data for 20250307074934: HTTP Error 404: Not Found
❌ Failed to fetch data for 20250307073434: HTTP Error 404: Not Found
❌ Failed to fetch data for 20250307071934: HTTP Error 404: Not Found
❌ Failed to fetch data for 20250307070434: HTTP Error 404: Not Found
❌ Failed to fetch data for 20250307064934: HTTP Error 404: Not Found
❌ Failed to fetch data for 20250307063434: HTTP Error 404: Not Found
❌ Failed to fetch data for 20250307061934: HTTP Error 404: Not Found
❌ Failed to fetch data for 2025030

In [2]:
import requests
import pandas as pd
from datetime import datetime, timedelta

# GDELT Base URL for raw CSV files
GDELT_BASE_URL = "http://data.gdeltproject.org/gdeltv2/"

# Columns from GDELT Event Codebook
EVENT_COLUMNS = [
    "GlobalEventID", "Day", "MonthYear", "Year", "FractionDate", 
    "Actor1Code", "IsRootEvent", "EventCode", "EventBaseCode", "EventRootCode", "QuadClass", "GoldsteinScale", 
    "NumMentions", "NumSources", "NumArticles", "AvgTone", "ActionGeo_Type", "ActionGeo_Fullname", "ActionGeo_CountryCode", "ActionGeo_ADM1Code", 
    "ActionGeo_ADM2Code", "ActionGeo_Lat", "ActionGeo_Long", "ActionGeo_FeatureID", 
    "DATEADDED", "SOURCEURL"
]

def generate_past_intervals(hours=10):
    """
    Generate timestamps for the past 'hours' at 15-minute intervals
    ensuring format YYYYMMDDHHMM00 (minutes = 00, 15, 30, or 45).
    """
    timestamps = []
    now = datetime.utcnow()
    
    for i in range(hours * 4):  # 4 intervals per hour (every 15 min)
        timestamp = now - timedelta(minutes=i * 15)
        formatted_timestamp = timestamp.strftime("%Y%m%d%H%M")
        formatted_timestamp = formatted_timestamp[:-2] + "00"  # Set seconds to "00"
        timestamps.append(formatted_timestamp)
    
    return timestamps

def fetch_gdelt_data(timestamp):
    """Fetches GDELT data for a specific timestamp and handles errors."""
    csv_url = f"{GDELT_BASE_URL}export.{timestamp}.csv"
    
    try:
        data = pd.read_csv(csv_url, sep="\t", names=EVENT_COLUMNS, encoding="latin1", low_memory=False)
        print(f"✅ Successfully fetched data for {timestamp}")
        return data
    except requests.exceptions.HTTPError as http_err:
        print(f"❌ HTTP Error for {timestamp}: {http_err}")
    except Exception as e:
        print(f"❌ Failed to fetch data for {timestamp}: {e}")
    return None

def main():
    """Fetches GDELT data for the past 10 hours at 15-minute intervals."""
    timestamps = generate_past_intervals(hours=10)
    
    all_data = []
    
    for timestamp in timestamps:
        data = fetch_gdelt_data(timestamp)
        if data is not None and not data.empty:
            all_data.append(data)
    
    if all_data:
        final_df = pd.concat(all_data, ignore_index=True)
        filename = f"gdelt_events_past10hours_{datetime.utcnow().strftime('%Y%m%d_%H%M')}.csv"
        final_df.to_csv(filename, index=False)
        print(f"✅ Data saved as {filename}")
    else:
        print("❌ No data retrieved for any interval.")

if __name__ == "__main__":
    main()


  now = datetime.utcnow()


❌ Failed to fetch data for 202503070900: HTTP Error 404: Not Found
❌ Failed to fetch data for 202503070900: HTTP Error 404: Not Found
❌ Failed to fetch data for 202503070900: HTTP Error 404: Not Found
❌ Failed to fetch data for 202503070800: HTTP Error 404: Not Found
❌ Failed to fetch data for 202503070800: HTTP Error 404: Not Found
❌ Failed to fetch data for 202503070800: HTTP Error 404: Not Found
❌ Failed to fetch data for 202503070800: HTTP Error 404: Not Found
❌ Failed to fetch data for 202503070700: HTTP Error 404: Not Found
❌ Failed to fetch data for 202503070700: HTTP Error 404: Not Found
❌ Failed to fetch data for 202503070700: HTTP Error 404: Not Found
❌ Failed to fetch data for 202503070700: HTTP Error 404: Not Found
❌ Failed to fetch data for 202503070600: HTTP Error 404: Not Found
❌ Failed to fetch data for 202503070600: HTTP Error 404: Not Found
❌ Failed to fetch data for 202503070600: HTTP Error 404: Not Found
❌ Failed to fetch data for 202503070600: HTTP Error 404: Not F

In [3]:
import requests
import pandas as pd
from datetime import datetime, timedelta

# GDELT last update URL
LAST_UPDATE_URL = "http://data.gdeltproject.org/gdeltv2/lastupdate.txt"

# Columns from GDELT Event Codebook
EVENT_COLUMNS = [
    "GlobalEventID", "Day", "MonthYear", "Year", "FractionDate", 
    "Actor1Code", "IsRootEvent", "EventCode", "EventBaseCode", "EventRootCode", "QuadClass", "GoldsteinScale", 
    "NumMentions", "NumSources", "NumArticles", "AvgTone", "ActionGeo_Type", "ActionGeo_Fullname", "ActionGeo_CountryCode", "ActionGeo_ADM1Code", 
    "ActionGeo_ADM2Code", "ActionGeo_Lat", "ActionGeo_Long", "ActionGeo_FeatureID", 
    "DATEADDED", "SOURCEURL"
]

def fetch_available_gdelt_files():
    """
    Fetch the last available GDELT update file URLs.
    Returns a list of URLs from the past 10 hours.
    """
    response = requests.get(LAST_UPDATE_URL)
    
    if response.status_code == 200:
        lines = response.text.strip().split("\n")
        file_urls = []

        for line in lines:
            if "export" in line and ".CSV.zip" in line:
                url = line.split()[-1]  # Extract file URL
                file_urls.append(url)
        
        return file_urls[:40]  # Limit to last 10 hours (each file is ~15 min)
    
    print("❌ Failed to fetch available GDELT update files.")
    return []

def fetch_gdelt_data(file_url):
    """Fetches and extracts GDELT event data from the given URL."""
    try:
        data = pd.read_csv(file_url, sep="\t", names=EVENT_COLUMNS, encoding="latin1", low_memory=False)
        print(f"✅ Successfully fetched data from {file_url}")
        return data
    except Exception as e:
        print(f"❌ Failed to fetch data from {file_url}: {e}")
        return None

def main():
    """Fetches and combines GDELT data from the last 10 hours."""
    file_urls = fetch_available_gdelt_files()
    all_data = []

    for url in file_urls:
        data = fetch_gdelt_data(url)
        if data is not None and not data.empty:
            all_data.append(data)

    if all_data:
        final_df = pd.concat(all_data, ignore_index=True)
        filename = f"gdelt_events_past10hours_{datetime.utcnow().strftime('%Y%m%d_%H%M')}.csv"
        final_df.to_csv(filename, index=False)
        print(f"✅ Data saved as {filename}")
    else:
        print("❌ No data retrieved from any available files.")

if __name__ == "__main__":
    main()


✅ Successfully fetched data from http://data.gdeltproject.org/gdeltv2/20250307094500.export.CSV.zip
✅ Data saved as gdelt_events_past10hours_20250307_0939.csv


  filename = f"gdelt_events_past10hours_{datetime.utcnow().strftime('%Y%m%d_%H%M')}.csv"


In [4]:
import requests
import pandas as pd
from datetime import datetime

# GDELT last update URL
LAST_UPDATE_URL = "http://data.gdeltproject.org/gdeltv2/lastupdate.txt"

# Columns from GDELT Event Codebook
# Columns from GDELT Event Codebook
EVENT_COLUMNS = [
    "GlobalEventID", "Day", "MonthYear", "Year", "FractionDate", 
    "Actor1Code", "IsRootEvent", "EventCode", "EventBaseCode", "EventRootCode", "QuadClass", "GoldsteinScale", 
    "NumMentions", "NumSources", "NumArticles", "AvgTone", "ActionGeo_Type", "ActionGeo_Fullname", "ActionGeo_CountryCode", "ActionGeo_ADM1Code", 
    "ActionGeo_ADM2Code", "ActionGeo_Lat", "ActionGeo_Long", "ActionGeo_FeatureID", 
    "DATEADDED", "SOURCEURL"
]

def fetch_available_gdelt_files():
    """
    Fetch the last available GDELT update file URLs.
    Returns a list of URLs from the past 10 hours.
    """
    response = requests.get(LAST_UPDATE_URL)
    
    if response.status_code == 200:
        lines = response.text.strip().split("\n")
        file_urls = []

        for line in lines:
            if "export" in line and ".CSV.zip" in line:
                url = line.split()[-1]  # Extract file URL
                file_urls.append(url)
        
        return file_urls[:40]  # Limit to last 10 hours (each file is ~15 min)
    
    print("❌ Failed to fetch available GDELT update files.")
    return []

def fetch_gdelt_data(file_url):
    """Fetches and extracts GDELT event data from the given URL, filtering only for Delhi events."""
    try:
        data = pd.read_csv(file_url, sep="\t", names=EVENT_COLUMNS, encoding="latin1", low_memory=False)

        # Filter only for events happening in Delhi
        delhi_data = data[data["ActionGeo_Fullname"].str.contains("Delhi", na=False, case=False)]

        if not delhi_data.empty:
            print(f"✅ Fetched {len(delhi_data)} Delhi events from {file_url}")
            return delhi_data
        else:
            print(f"🔍 No Delhi events in {file_url}")
            return None

    except Exception as e:
        print(f"❌ Failed to fetch data from {file_url}: {e}")
        return None

def main():
    """Fetches and combines GDELT data for Delhi from the last 10 hours."""
    file_urls = fetch_available_gdelt_files()
    all_data = []

    for url in file_urls:
        data = fetch_gdelt_data(url)
        if data is not None and not data.empty:
            all_data.append(data)

    if all_data:
        final_df = pd.concat(all_data, ignore_index=True)
        filename = f"gdelt_delhi_events_past10hours_{datetime.utcnow().strftime('%Y%m%d_%H%M')}.csv"
        final_df.to_csv(filename, index=False)
        print(f"✅ Data saved as {filename}")
    else:
        print("❌ No Delhi events retrieved from any available files.")

if __name__ == "__main__":
    main()


✅ Fetched 6 Delhi events from http://data.gdeltproject.org/gdeltv2/20250307100000.export.CSV.zip
✅ Data saved as gdelt_delhi_events_past10hours_20250307_0955.csv


  filename = f"gdelt_delhi_events_past10hours_{datetime.utcnow().strftime('%Y%m%d_%H%M')}.csv"
