This notebook extracts news sentiment scores from the GDELT database and then constructs a UK-based, housing-related news sentiment index for the period from 2005 to June 2025.

2005

In [None]:

# Extracting data from GDELT 1.0 Events
!pip -q install pandas requests

from google.colab import drive
drive.mount("/content/drive")

import io
import re
import zipfile
import requests
import pandas as pd
import numpy as np

from pathlib import Path
from collections import defaultdict
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# Configuration
YEAR = 2005  # year processed by this cell; also embedded in the output file names
BASE_URL = "http://data.gdeltproject.org/events"  # prefix the ZIP download URLs are built from
CHUNK_SIZE = 200_000  # rows per pandas chunk when streaming the CSV

# Output folder on the mounted Drive; created (with parents) if it does not exist yet
OUT_DIR = Path("/content/drive/MyDrive/msc_project/Gdelt_Events_1.0")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# UK identifiers used in GDELT 1.0 Events
UK_FIPS = "UK"   # compared against the ActionGeo_CountryCode column
UK_ISO3 = "GBR"  # compared against the Actor1/Actor2 CountryCode columns

# Column indices for GDELT 1.0 Events (pre-2013 format; 57 columns total)
# The CSV is read with header=None, so these are 0-based positions, not names.
#  1  = SQLDATE
#  7  = Actor1CountryCode
#  17 = Actor2CountryCode
#  51 = ActionGeo_CountryCode
#  34 = AvgTone
#  28 = EventRootCode
COL_SQLDATE = 1
COL_A1C     = 7
COL_A2C     = 17
COL_ACTC    = 51
COL_TONE    = 34
COL_ROOT    = 28

# Only these columns are parsed from each row
USECOLS = [COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT]
# Dates and codes are read as strings (a code such as "04" keeps its leading zero);
# only the tone is numeric.
DTYPES  = {
    COL_SQLDATE: str,
    COL_A1C: str,
    COL_A2C: str,
    COL_ACTC: str,
    COL_TONE: float,
    COL_ROOT: str
}

# Robust HTTP session
def build_session(user_agent="gdelt-events/1.0"):
    """
    Create a requests.Session that retries failed GET requests.

    Args:
        user_agent: Value sent in the User-Agent header of every request.

    Returns:
        requests.Session whose http:// and https:// adapters use one retry
        policy: total=5, backoff factor 0.5, retrying on HTTP statuses
        429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(
        allowed_methods=["GET"],
        status_forcelist=[429, 500, 502, 503, 504],
        backoff_factor=0.5,
        total=5,
    )

    session = requests.Session()
    session.headers.update({"User-Agent": user_agent})

    # Same policy for both URL schemes, one adapter instance per scheme.
    for scheme in ("http://", "https://"):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return session

# Download yearly ZIP
def fetch_year_zip(year: int, session: requests.Session):
    """
    Download the yearly GDELT 1.0 Events ZIP and open it in memory.

    The direct URL ``{BASE_URL}/{year}.zip`` is tried first. If that request
    fails, returns a non-OK status, or does not yield a readable ZIP, the
    reason is printed and the link is looked up in ``{BASE_URL}/index.html``.

    Args:
        year: Calendar year whose yearly ZIP should be fetched.
        session: Session used for all HTTP requests.

    Returns:
        (url, ZipFile, inner_filename) where ``inner_filename`` is the first
        member listed in the archive.

    Raises:
        RuntimeError: no link to ``{year}.zip`` was found in the index page.
        requests.HTTPError: the URL taken from the index returned an error status.
    """
    from urllib.parse import urljoin  # local import: only this function needs it

    direct_url = f"{BASE_URL}/{year}.zip"

    # direct download; a failure is reported (not silently dropped) and we fall back
    try:
        r = session.get(direct_url, timeout=180)
        if r.ok:
            z = zipfile.ZipFile(io.BytesIO(r.content))
            inner = z.namelist()[0]
            return direct_url, z, inner
        print(f"Direct download of {direct_url} returned HTTP {r.status_code}; trying index.html")
    except Exception as exc:
        print(f"Direct download of {direct_url} failed ({exc!r}); trying index.html")

    # fallback: scrape index.html
    index_url = f"{BASE_URL}/index.html"
    idx_html = session.get(index_url, timeout=120).text

    # [^"]* keeps the match inside a single href attribute, and the optional
    # path prefix accepts absolute, root-relative and bare "2005.zip" links
    # while still rejecting other names that merely end in the year.
    m = re.search(rf'href="((?:[^"]*/)?{year}\.zip)"', idx_html, re.I)

    if not m:
        raise RuntimeError(f"Could not find yearly ZIP for {year} on the GDELT events index.")

    # Resolve whatever form the link takes against the index page location.
    url = urljoin(index_url, m.group(1))

    r = session.get(url, timeout=180)
    r.raise_for_status()

    z = zipfile.ZipFile(io.BytesIO(r.content))
    inner = z.namelist()[0]
    return url, z, inner

#  Monthly aggregation
def _month_start_from_sqldate(series_sqldate: pd.Series) -> pd.Series:
    """Convert SQLDATE (YYYYMMDD) to month-start Timestamp."""
    return (
        pd.to_datetime(series_sqldate, format="%Y%m%d", errors="coerce")
          .dt.to_period("M")
          .dt.to_timestamp(how="start")
    )

def add_monthly_uk_stats(chunk: pd.DataFrame, counts, tone_sum, tone_n):
    """
    Accumulate per-month totals for the UK-related rows of one chunk.

    A row is UK-related when its ActionGeo country equals UK_FIPS or either
    actor country equals UK_ISO3. Rows whose SQLDATE cannot be parsed are
    ignored.

    Args:
        chunk: Rows read from the events file, addressed by the COL_* positions.
        counts: Mapping month -> number of UK rows (updated in place).
        tone_sum: Mapping month -> sum of non-missing AvgTone (updated in place).
        tone_n: Mapping month -> number of non-missing AvgTone values (updated in place).

    Returns:
        None; the three mappings are mutated.
    """
    action_in_uk = chunk[COL_ACTC].eq(UK_FIPS)
    actor_is_uk = chunk[COL_A1C].eq(UK_ISO3) | chunk[COL_A2C].eq(UK_ISO3)
    uk_mask = action_in_uk | actor_is_uk
    if not uk_mask.any():
        return

    uk_rows = chunk.loc[uk_mask, [COL_SQLDATE, COL_TONE]].copy()
    uk_rows["Month"] = _month_start_from_sqldate(uk_rows[COL_SQLDATE])
    uk_rows["AvgTone"] = pd.to_numeric(uk_rows[COL_TONE], errors="coerce")
    uk_rows = uk_rows.dropna(subset=["Month"])
    if uk_rows.empty:
        return

    # Row counts include rows without a usable tone.
    for month, n_rows in uk_rows.groupby("Month").size().items():
        counts[month] += int(n_rows)

    # Tone statistics only use rows with a numeric tone, so the mean can be
    # rebuilt later as tone_sum / tone_n.
    toned = uk_rows[uk_rows["AvgTone"].notna()]
    tone_by_month = toned.groupby("Month")["AvgTone"]
    for month, total in tone_by_month.sum().items():
        tone_sum[month] += float(total)
    for month, n_valid in tone_by_month.size().items():
        tone_n[month] += int(n_valid)

def add_monthly_uk_econ_stats(chunk: pd.DataFrame, counts, tone_sum, tone_n):
    """
    Accumulate UK monthly totals for rows whose EventRootCode starts with '04'.

    The filtered rows are handed to add_monthly_uk_stats, which applies the UK
    test and updates the three mappings in place.

    NOTE(review): the CAMEO codebook labels root code 04 as "Consult"; confirm
    that this prefix is the intended proxy for the "economic" family.
    """
    root_codes = chunk[COL_ROOT].astype(str)
    econ_mask = root_codes.str.startswith("04", na=False)
    if not econ_mask.any():
        return
    add_monthly_uk_stats(chunk.loc[econ_mask], counts, tone_sum, tone_n)

def finalize_monthly(counts, tone_sum, tone_n) -> pd.DataFrame:
    """
    Turn the accumulated monthly totals into a tidy DataFrame.

    Args:
        counts: Mapping month -> number of rows.
        tone_sum: Mapping month -> sum of AvgTone values.
        tone_n: Mapping month -> number of AvgTone values in the sum.

    Returns:
        DataFrame with columns Month, Docs, AvgTone, sorted by Month. AvgTone
        is NaN for a month without any tone value. When ``counts`` is empty
        the result is an empty frame that still has the three columns.
    """
    rows = []
    for m in sorted(counts):
        # .get works for plain dicts as well as defaultdicts and adds no keys
        n_tone = tone_n.get(m, 0)
        avg = (tone_sum.get(m, 0.0) / n_tone) if n_tone else np.nan
        rows.append({"Month": m, "Docs": counts[m], "AvgTone": avg})

    # Explicit columns keep sort_values("Month") valid when there are no rows.
    monthly = pd.DataFrame(rows, columns=["Month", "Docs", "AvgTone"])
    return monthly.sort_values("Month").reset_index(drop=True)

# Running extraction
# Downloads the yearly ZIP, streams its CSV in chunks, accumulates monthly
# UK totals (all events and the '04*' subset) and writes one CSV per series.
print(f"Running GDELT 1.0 Events extraction for YEAR = {YEAR}")
print(f"Saving outputs to: {OUT_DIR}")

s = build_session(user_agent=f"gdelt-events-{YEAR}/1.0")
url, z, inner = fetch_year_zip(YEAR, s)
print(f"✓ Downloaded yearly ZIP: {url}")
print(f"✓ Reading file inside ZIP: {inner}")

# header=None + usecols: columns are addressed by position; chunksize makes
# read_csv return an iterator of DataFrames instead of one large frame.
reader = pd.read_csv(
    z.open(inner),
    sep="\t",
    header=None,
    low_memory=False,
    chunksize=CHUNK_SIZE,
    usecols=USECOLS,
    dtype=DTYPES
)

# Accumulators for all UK-related rows: month -> rows / tone sum / tone count
uk_counts   = defaultdict(int)
uk_tsum     = defaultdict(float)
uk_tn       = defaultdict(int)

# Same accumulators for the EventRootCode '04*' subset
econ_counts = defaultdict(int)
econ_tsum   = defaultdict(float)
econ_tn     = defaultdict(int)

total_rows = 0

for i, chunk in enumerate(reader, start=1):
    total_rows += len(chunk)

    add_monthly_uk_stats(chunk, uk_counts, uk_tsum, uk_tn)
    add_monthly_uk_econ_stats(chunk, econ_counts, econ_tsum, econ_tn)

    # Progress message every 5 chunks
    if i % 5 == 0:
        print(f"  processed ~{total_rows:,} rows so far...")

df_uk   = finalize_monthly(uk_counts, uk_tsum, uk_tn)
df_econ = finalize_monthly(econ_counts, econ_tsum, econ_tn)

# Saving outputs
out_uk   = OUT_DIR / f"events_uk_monthly_{YEAR}.csv"
out_econ = OUT_DIR / f"events_uk_economic_monthly_{YEAR}.csv"

df_uk.to_csv(out_uk, index=False, date_format="%Y-%m-%d")
df_econ.to_csv(out_econ, index=False, date_format="%Y-%m-%d")

print("\n--- Done ---")
print(f"Total rows scanned: ~{total_rows:,}")
print(f"UK monthly rows: {len(df_uk)} | Economic monthly rows: {len(df_econ)}")

# Previews of up to 12 monthly rows per series
print("\nUK-all (monthly):")
display(df_uk.head(12))

print("\nUK-economic (EventRootCode '04*') (monthly):")
display(df_econ.head(12))

print("\nSaved files:")
print(" -", out_uk)
print(" -", out_econ)


Mounted at /content/drive
Running GDELT 1.0 Events extraction for YEAR = 2005
Saving outputs to: /content/drive/MyDrive/msc_project/Gdelt_Events_1.0
✓ Downloaded yearly ZIP: http://data.gdeltproject.org/events/2005.zip
✓ Reading file inside ZIP: 2005.csv
  processed ~1,000,000 rows so far...
  processed ~2,000,000 rows so far...
  processed ~3,000,000 rows so far...

--- Done ---
Total rows scanned: ~3,534,874
UK monthly rows: 12 | Economic monthly rows: 12

UK-all (monthly):


Unnamed: 0,Month,Docs,AvgTone
0,2005-01-01,12223,5.48572
1,2005-02-01,14461,5.838248
2,2005-03-01,12688,5.779465
3,2005-04-01,9765,5.893368
4,2005-05-01,10942,5.626196
5,2005-06-01,14543,5.742366
6,2005-07-01,25623,4.728918
7,2005-08-01,14541,4.783105
8,2005-09-01,14552,5.197001
9,2005-10-01,14015,5.401413



UK-economic (EventRootCode '04*') (monthly):


Unnamed: 0,Month,Docs,AvgTone
0,2005-01-01,3443,5.659953
1,2005-02-01,4481,5.84982
2,2005-03-01,3601,6.084276
3,2005-04-01,2613,6.143649
4,2005-05-01,2777,5.747168
5,2005-06-01,4020,5.632774
6,2005-07-01,6308,5.016168
7,2005-08-01,3363,4.927621
8,2005-09-01,3698,5.500993
9,2005-10-01,3940,5.579145



Saved files:
 - /content/drive/MyDrive/msc_project/Gdelt_Events_1.0/events_uk_monthly_2005.csv
 - /content/drive/MyDrive/msc_project/Gdelt_Events_1.0/events_uk_economic_monthly_2005.csv


2006

In [None]:
# Configuration
YEAR = 2006  # year processed by this cell; also embedded in the output file names
BASE_URL = "http://data.gdeltproject.org/events"  # prefix the monthly ZIP URLs are built from
OUT_DIR = Path("/content/drive/MyDrive/msc_project/Gdelt_Events_1.0")
OUT_DIR.mkdir(parents=True, exist_ok=True)  # create the output folder if it is missing

CHUNK_SIZE = 200_000  # rows per pandas chunk when streaming each CSV

# UK identifiers in GDELT 1.0
UK_FIPS = "UK"     # ActionGeo_CountryCode uses FIPS-2
UK_ISO3 = "GBR"    # Actor1/2CountryCode uses ISO-3

# GDELT 1.0 column positions (0-based; the files are read with header=None)
# Events 1.0 files have 57 columns (0..56). We only read what we need:
#   1  = SQLDATE (YYYYMMDD)
#   7  = Actor1CountryCode (ISO3)
#   17 = Actor2CountryCode (ISO3)
#   51 = ActionGeo_CountryCode (FIPS2)
#   34 = AvgTone
#   28 = EventRootCode
COL_SQLDATE = 1
COL_A1C     = 7
COL_A2C     = 17
COL_ACTC    = 51
COL_TONE    = 34
COL_ROOT    = 28

# Only these columns are parsed; dates and codes stay strings, tone is numeric
USECOLS = [COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT]
DTYPES = {
    COL_SQLDATE: str,
    COL_A1C: str,
    COL_A2C: str,
    COL_ACTC: str,
    COL_TONE: float,
    COL_ROOT: str
}

# HTTP session + URL list
def make_session():
    """
    Build the HTTP session used for the monthly ZIP downloads.

    Returns:
        requests.Session with a fixed User-Agent and one retry policy
        (total=5, backoff factor 0.5, statuses 429/500/502/503/504)
        mounted for both http:// and https:// URLs.
    """
    retry_policy = Retry(
        status_forcelist=[429, 500, 502, 503, 504],
        backoff_factor=0.5,
        total=5,
    )
    session = requests.Session()
    session.headers.update({"User-Agent": "gdelt-events-monthly/1.0"})
    for prefix in ("http://", "https://"):
        session.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return session

def monthly_zip_urls(year: int):
    """
    List the monthly ZIP URLs for one year.

    Every year yields months 01..12, except 2013, which yields 01..03.
    URLs have the form ``{BASE_URL}/{YYYY}{MM}.zip``.
    """
    n_months = 12
    if year == 2013:
        n_months = 3

    urls = []
    for month in range(1, n_months + 1):
        urls.append(f"{BASE_URL}/{year}{month:02d}.zip")
    return urls

def fetch_month_zip(sess, url: str):
    """
    Fetch one monthly ZIP and open it in memory.

    Args:
        sess: Object with a requests-style ``get(url, timeout=...)`` method.
        url: Address of the ZIP file.

    Returns:
        (ZipFile, name of the first archive member), or None when the
        response status is not OK.
    """
    response = sess.get(url, timeout=180)
    if response.ok:
        archive = zipfile.ZipFile(io.BytesIO(response.content))
        return archive, archive.namelist()[0]
    return None

# Aggregation logic
def update_monthly_totals(chunk_df, counts, tone_sum, tone_n):
    """
    Fold the UK-related rows of one chunk into the monthly accumulators.

    A row is UK-related when ActionGeo_CountryCode equals UK_FIPS or either
    actor country code equals UK_ISO3. Rows with an unparseable SQLDATE are
    ignored. For every month seen in the chunk:
      counts[Month]   += number of rows
      tone_sum[Month] += sum of non-missing AvgTone
      tone_n[Month]   += number of non-missing AvgTone values
    """
    uk_mask = chunk_df[COL_ACTC].eq(UK_FIPS)
    for actor_col in (COL_A1C, COL_A2C):
        uk_mask = uk_mask | chunk_df[actor_col].eq(UK_ISO3)
    if not uk_mask.any():
        return

    uk_rows = chunk_df.loc[uk_mask, [COL_SQLDATE, COL_TONE]].copy()
    parsed_dates = pd.to_datetime(uk_rows[COL_SQLDATE], format="%Y%m%d", errors="coerce")
    uk_rows["Month"] = parsed_dates.dt.to_period("M").dt.to_timestamp(how="start")
    uk_rows["AvgTone"] = pd.to_numeric(uk_rows[COL_TONE], errors="coerce")
    uk_rows = uk_rows.dropna(subset=["Month"])
    if uk_rows.empty:
        return

    # Row counts use every dated row; tone statistics only rows with a tone.
    rows_by_month = uk_rows.groupby("Month").size()
    tone_groups = uk_rows[uk_rows["AvgTone"].notna()].groupby("Month")["AvgTone"]
    sum_by_month = tone_groups.sum()
    valid_by_month = tone_groups.size()

    for month, n_rows in rows_by_month.items():
        counts[month] += int(n_rows)
        tone_sum[month] += float(sum_by_month.get(month, 0.0))
        tone_n[month] += int(valid_by_month.get(month, 0))

def update_monthly_totals_economic(chunk_df, counts, tone_sum, tone_n):
    """
    Run update_monthly_totals on the rows whose EventRootCode starts with '04'.

    NOTE(review): the CAMEO codebook labels root code 04 as "Consult"; confirm
    that this prefix is the intended proxy for the "economic" family.
    """
    root_code = chunk_df[COL_ROOT].astype(str)
    econ_mask = root_code.str.startswith("04", na=False)
    if not econ_mask.any():
        return
    update_monthly_totals(chunk_df.loc[econ_mask, :], counts, tone_sum, tone_n)

def finalize_monthly_frame(counts, tone_sum, tone_n, year: int):
    """
    Build a monthly DataFrame for ``year`` with one row per expected month.

    Args:
        counts: Mapping month (Timestamp) -> number of rows.
        tone_sum: Mapping month -> sum of AvgTone values.
        tone_n: Mapping month -> number of AvgTone values in the sum.
        year: Year to report; months from other years are dropped.

    Returns:
        DataFrame with columns Month, Docs, AvgTone covering January to
        December (January to March for 2013). Months without data have NaN
        in Docs and AvgTone, including the case where no month has data.
    """
    # Expected months: 2013 only has monthly files up to March.
    last_month = 3 if year == 2013 else 12
    full_index = pd.date_range(f"{year}-01-01", f"{year}-{last_month:02d}-01", freq="MS")

    rows = []
    for month in sorted(counts.keys()):
        if month.year != year:
            continue
        # .get works for plain dicts as well as defaultdicts and adds no keys
        n_valid = tone_n.get(month, 0)
        avg_tone = (tone_sum.get(month, 0.0) / n_valid) if n_valid else np.nan
        rows.append({"Month": month, "Docs": counts[month], "AvgTone": avg_tone})

    if not rows:
        # No data at all: a frame built from an empty list has no "Month"
        # column to sort or index on, so return the all-missing calendar directly.
        return pd.DataFrame({"Month": full_index, "Docs": np.nan, "AvgTone": np.nan})

    # Ensure we have one row per month even if the month had no hits
    return (
        pd.DataFrame(rows)
        .sort_values("Month")
        .set_index("Month")
        .reindex(full_index)
        .rename_axis("Month")
        .reset_index()
    )

# Running for the Year 2006
# Downloads each monthly ZIP, streams its CSV in chunks and accumulates
# monthly UK totals (all events and the '04*' subset).
sess = make_session()
urls = monthly_zip_urls(YEAR)

print(f"Year {YEAR}: trying {len(urls)} monthly ZIPs...")
# Accumulators: month -> rows / tone sum / tone count
uk_counts, uk_tsum, uk_tn = defaultdict(int), defaultdict(float), defaultdict(int)
ec_counts, ec_tsum, ec_tn = defaultdict(int), defaultdict(float), defaultdict(int)

for url in urls:
    got = fetch_month_zip(sess, url)
    if got is None:
        # Non-OK response: the month is skipped and contributes no rows
        print(f"  • Missing (skip): {url}")
        continue

    z, inner = got
    print(f"  • Processing: {inner}")

    # header=None + usecols: columns are addressed by position;
    # chunksize makes read_csv return an iterator of DataFrames.
    reader = pd.read_csv(
        z.open(inner),
        sep="\t",
        header=None,
        low_memory=False,
        chunksize=CHUNK_SIZE,
        usecols=USECOLS,
        dtype=DTYPES
    )

    # Reset per file, so the progress message counts rows of the current file only
    rows_seen = 0
    for i, chunk in enumerate(reader, start=1):
        rows_seen += len(chunk)

        update_monthly_totals(chunk, uk_counts, uk_tsum, uk_tn)
        update_monthly_totals_economic(chunk, ec_counts, ec_tsum, ec_tn)

        # Progress message every 5 chunks
        if i % 5 == 0:
            print(f"    read ~{rows_seen:,} rows so far...")

# Building outputs
df_uk = finalize_monthly_frame(uk_counts, uk_tsum, uk_tn, YEAR)
df_ec = finalize_monthly_frame(ec_counts, ec_tsum, ec_tn, YEAR)

# Saving to Drive
out_uk = OUT_DIR / f"events_uk_monthly_{YEAR}.csv"
out_ec = OUT_DIR / f"events_uk_economic_monthly_{YEAR}.csv"

df_uk.to_csv(out_uk, index=False, date_format="%Y-%m-%d")
df_ec.to_csv(out_ec, index=False, date_format="%Y-%m-%d")

print("\n Done")
print(f"Saved:\n  {out_uk}\n  {out_ec}")

# Previews of up to 12 monthly rows per series
print("\nPreview (UK all events):")
display(df_uk.head(12))

print("\nPreview (UK economic events only):")
display(df_ec.head(12))


Year 2006: trying 12 monthly ZIPs...
  • Processing: 200601.csv
  • Processing: 200602.csv
  • Processing: 200603.csv
  • Processing: 200604.csv
  • Processing: 200605.csv
  • Processing: 200606.csv
  • Processing: 200607.csv
  • Processing: 200608.csv
  • Processing: 200609.csv
  • Processing: 200610.csv
  • Processing: 200611.csv
  • Processing: 200612.csv

 Done
Saved:
  /content/drive/MyDrive/msc_project/Gdelt_Events_1.0/events_uk_monthly_2006.csv
  /content/drive/MyDrive/msc_project/Gdelt_Events_1.0/events_uk_economic_monthly_2006.csv

Preview (UK all events):


Unnamed: 0,Month,Docs,AvgTone
0,2006-01-01,17312,5.22157
1,2006-02-01,15091,5.228871
2,2006-03-01,28846,5.577158
3,2006-04-01,17763,5.630909
4,2006-05-01,23968,5.630888
5,2006-06-01,27549,5.632453
6,2006-07-01,28312,5.433701
7,2006-08-01,39512,5.021931
8,2006-09-01,37393,5.587386
9,2006-10-01,25470,5.655227



Preview (UK economic events only):


Unnamed: 0,Month,Docs,AvgTone
0,2006-01-01,5670,5.345079
1,2006-02-01,3695,5.678923
2,2006-03-01,8627,5.792521
3,2006-04-01,5070,5.805373
4,2006-05-01,6969,5.884226
5,2006-06-01,7556,5.987236
6,2006-07-01,7818,5.436749
7,2006-08-01,8647,5.173487
8,2006-09-01,9348,5.740292
9,2006-10-01,6987,5.724813


2007

In [None]:
# Configuration
YEAR = 2007  # year processed by this cell; also embedded in the output file names
BASE_URL = "http://data.gdeltproject.org/events"  # prefix the monthly ZIP URLs are built from
OUT_DIR = Path("/content/drive/MyDrive/msc_project/Gdelt_Events_1.0")
OUT_DIR.mkdir(parents=True, exist_ok=True)  # create the output folder if it is missing

CHUNK_SIZE = 200_000  # rows per pandas chunk when streaming each CSV

# UK identifiers in GDELT 1.0
UK_FIPS = "UK"     # ActionGeo_CountryCode uses FIPS-2
UK_ISO3 = "GBR"    # Actor1/2CountryCode uses ISO-3

# GDELT 1.0 column positions (0-based; the files are read with header=None)
# Events 1.0 files have 57 columns (0..56). We only read what we need:
#   1  = SQLDATE (YYYYMMDD)
#   7  = Actor1CountryCode (ISO3)
#   17 = Actor2CountryCode (ISO3)
#   51 = ActionGeo_CountryCode (FIPS2)
#   34 = AvgTone
#   28 = EventRootCode
COL_SQLDATE = 1
COL_A1C     = 7
COL_A2C     = 17
COL_ACTC    = 51
COL_TONE    = 34
COL_ROOT    = 28

# Only these columns are parsed; dates and codes stay strings, tone is numeric
USECOLS = [COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT]
DTYPES = {
    COL_SQLDATE: str,
    COL_A1C: str,
    COL_A2C: str,
    COL_ACTC: str,
    COL_TONE: float,
    COL_ROOT: str
}

# HTTP session + URL list
def make_session():
    """
    Build the HTTP session used for the monthly ZIP downloads.

    Returns:
        requests.Session with a fixed User-Agent and one retry policy
        (total=5, backoff factor 0.5, statuses 429/500/502/503/504)
        mounted for both http:// and https:// URLs.
    """
    retry_policy = Retry(
        status_forcelist=[429, 500, 502, 503, 504],
        backoff_factor=0.5,
        total=5,
    )
    session = requests.Session()
    session.headers.update({"User-Agent": "gdelt-events-monthly/1.0"})
    for prefix in ("http://", "https://"):
        session.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return session

def monthly_zip_urls(year: int):
    """
    List the monthly ZIP URLs for one year.

    Every year yields months 01..12, except 2013, which yields 01..03.
    URLs have the form ``{BASE_URL}/{YYYY}{MM}.zip``.
    """
    n_months = 12
    if year == 2013:
        n_months = 3

    urls = []
    for month in range(1, n_months + 1):
        urls.append(f"{BASE_URL}/{year}{month:02d}.zip")
    return urls

def fetch_month_zip(sess, url: str):
    """
    Fetch one monthly ZIP and open it in memory.

    Args:
        sess: Object with a requests-style ``get(url, timeout=...)`` method.
        url: Address of the ZIP file.

    Returns:
        (ZipFile, name of the first archive member), or None when the
        response status is not OK.
    """
    response = sess.get(url, timeout=180)
    if response.ok:
        archive = zipfile.ZipFile(io.BytesIO(response.content))
        return archive, archive.namelist()[0]
    return None

# Aggregation logic
def update_monthly_totals(chunk_df, counts, tone_sum, tone_n):
    """
    Fold the UK-related rows of one chunk into the monthly accumulators.

    A row is UK-related when ActionGeo_CountryCode equals UK_FIPS or either
    actor country code equals UK_ISO3. Rows with an unparseable SQLDATE are
    ignored. For every month seen in the chunk:
      counts[Month]   += number of rows
      tone_sum[Month] += sum of non-missing AvgTone
      tone_n[Month]   += number of non-missing AvgTone values
    """
    uk_mask = chunk_df[COL_ACTC].eq(UK_FIPS)
    for actor_col in (COL_A1C, COL_A2C):
        uk_mask = uk_mask | chunk_df[actor_col].eq(UK_ISO3)
    if not uk_mask.any():
        return

    uk_rows = chunk_df.loc[uk_mask, [COL_SQLDATE, COL_TONE]].copy()
    parsed_dates = pd.to_datetime(uk_rows[COL_SQLDATE], format="%Y%m%d", errors="coerce")
    uk_rows["Month"] = parsed_dates.dt.to_period("M").dt.to_timestamp(how="start")
    uk_rows["AvgTone"] = pd.to_numeric(uk_rows[COL_TONE], errors="coerce")
    uk_rows = uk_rows.dropna(subset=["Month"])
    if uk_rows.empty:
        return

    # Row counts use every dated row; tone statistics only rows with a tone.
    rows_by_month = uk_rows.groupby("Month").size()
    tone_groups = uk_rows[uk_rows["AvgTone"].notna()].groupby("Month")["AvgTone"]
    sum_by_month = tone_groups.sum()
    valid_by_month = tone_groups.size()

    for month, n_rows in rows_by_month.items():
        counts[month] += int(n_rows)
        tone_sum[month] += float(sum_by_month.get(month, 0.0))
        tone_n[month] += int(valid_by_month.get(month, 0))

def update_monthly_totals_economic(chunk_df, counts, tone_sum, tone_n):
    """
    Run update_monthly_totals on the rows whose EventRootCode starts with '04'.

    NOTE(review): the CAMEO codebook labels root code 04 as "Consult"; confirm
    that this prefix is the intended proxy for the "economic" family.
    """
    root_code = chunk_df[COL_ROOT].astype(str)
    econ_mask = root_code.str.startswith("04", na=False)
    if not econ_mask.any():
        return
    update_monthly_totals(chunk_df.loc[econ_mask, :], counts, tone_sum, tone_n)

def finalize_monthly_frame(counts, tone_sum, tone_n, year: int):
    """
    Build a monthly DataFrame for ``year`` with one row per expected month.

    Args:
        counts: Mapping month (Timestamp) -> number of rows.
        tone_sum: Mapping month -> sum of AvgTone values.
        tone_n: Mapping month -> number of AvgTone values in the sum.
        year: Year to report; months from other years are dropped.

    Returns:
        DataFrame with columns Month, Docs, AvgTone covering January to
        December (January to March for 2013). Months without data have NaN
        in Docs and AvgTone, including the case where no month has data.
    """
    # Expected months: 2013 only has monthly files up to March.
    last_month = 3 if year == 2013 else 12
    full_index = pd.date_range(f"{year}-01-01", f"{year}-{last_month:02d}-01", freq="MS")

    rows = []
    for month in sorted(counts.keys()):
        if month.year != year:
            continue
        # .get works for plain dicts as well as defaultdicts and adds no keys
        n_valid = tone_n.get(month, 0)
        avg_tone = (tone_sum.get(month, 0.0) / n_valid) if n_valid else np.nan
        rows.append({"Month": month, "Docs": counts[month], "AvgTone": avg_tone})

    if not rows:
        # No data at all: a frame built from an empty list has no "Month"
        # column to sort or index on, so return the all-missing calendar directly.
        return pd.DataFrame({"Month": full_index, "Docs": np.nan, "AvgTone": np.nan})

    # Ensure we have one row per month even if the month had no hits
    return (
        pd.DataFrame(rows)
        .sort_values("Month")
        .set_index("Month")
        .reindex(full_index)
        .rename_axis("Month")
        .reset_index()
    )

# Running for the Year 2007
# Downloads each monthly ZIP, streams its CSV in chunks and accumulates
# monthly UK totals (all events and the '04*' subset).
sess = make_session()
urls = monthly_zip_urls(YEAR)

print(f"Year {YEAR}: trying {len(urls)} monthly ZIPs...")
# Accumulators: month -> rows / tone sum / tone count
uk_counts, uk_tsum, uk_tn = defaultdict(int), defaultdict(float), defaultdict(int)
ec_counts, ec_tsum, ec_tn = defaultdict(int), defaultdict(float), defaultdict(int)

for url in urls:
    got = fetch_month_zip(sess, url)
    if got is None:
        # Non-OK response: the month is skipped and contributes no rows
        print(f"  • Missing (skip): {url}")
        continue

    z, inner = got
    print(f"  • Processing: {inner}")

    # header=None + usecols: columns are addressed by position;
    # chunksize makes read_csv return an iterator of DataFrames.
    reader = pd.read_csv(
        z.open(inner),
        sep="\t",
        header=None,
        low_memory=False,
        chunksize=CHUNK_SIZE,
        usecols=USECOLS,
        dtype=DTYPES
    )

    # Reset per file, so the progress message counts rows of the current file only
    rows_seen = 0
    for i, chunk in enumerate(reader, start=1):
        rows_seen += len(chunk)

        update_monthly_totals(chunk, uk_counts, uk_tsum, uk_tn)
        update_monthly_totals_economic(chunk, ec_counts, ec_tsum, ec_tn)

        # Progress message every 5 chunks
        if i % 5 == 0:
            print(f"    read ~{rows_seen:,} rows so far...")

# Building outputs
df_uk = finalize_monthly_frame(uk_counts, uk_tsum, uk_tn, YEAR)
df_ec = finalize_monthly_frame(ec_counts, ec_tsum, ec_tn, YEAR)

# Saving to Drive
out_uk = OUT_DIR / f"events_uk_monthly_{YEAR}.csv"
out_ec = OUT_DIR / f"events_uk_economic_monthly_{YEAR}.csv"

df_uk.to_csv(out_uk, index=False, date_format="%Y-%m-%d")
df_ec.to_csv(out_ec, index=False, date_format="%Y-%m-%d")

print("\n Done")
print(f"Saved:\n  {out_uk}\n  {out_ec}")

# Previews of up to 12 monthly rows per series
print("\nPreview (UK all events):")
display(df_uk.head(12))

print("\nPreview (UK economic events only):")
display(df_ec.head(12))


Year 2007: trying 12 monthly ZIPs...
  • Processing: 200701.csv
  • Processing: 200702.csv
  • Processing: 200703.csv
    read ~903,257 rows so far...
  • Processing: 200704.csv
    read ~1,000,000 rows so far...
  • Processing: 200705.csv
    read ~1,000,000 rows so far...
  • Processing: 200706.csv
    read ~1,000,000 rows so far...
  • Processing: 200707.csv
    read ~960,901 rows so far...
  • Processing: 200708.csv
  • Processing: 200709.csv
    read ~957,387 rows so far...
  • Processing: 200710.csv
    read ~1,000,000 rows so far...
  • Processing: 200711.csv
    read ~1,000,000 rows so far...
  • Processing: 200712.csv
    read ~940,702 rows so far...

 Done
Saved:
  /content/drive/MyDrive/msc_project/Gdelt_Events_1.0/events_uk_monthly_2007.csv
  /content/drive/MyDrive/msc_project/Gdelt_Events_1.0/events_uk_economic_monthly_2007.csv

Preview (UK all events):


Unnamed: 0,Month,Docs,AvgTone
0,2007-01-01,38932,5.650727
1,2007-02-01,42950,5.515068
2,2007-03-01,69455,5.518773
3,2007-04-01,77598,5.511486
4,2007-05-01,86161,5.475945
5,2007-06-01,81884,5.518685
6,2007-07-01,90549,5.063248
7,2007-08-01,43332,5.276237
8,2007-09-01,60090,5.528454
9,2007-10-01,66685,5.700373



Preview (UK economic events only):


Unnamed: 0,Month,Docs,AvgTone
0,2007-01-01,8799,5.725413
1,2007-02-01,10238,5.847031
2,2007-03-01,15814,5.662984
3,2007-04-01,17897,5.716621
4,2007-05-01,23474,5.624303
5,2007-06-01,19309,5.639841
6,2007-07-01,19736,5.467247
7,2007-08-01,9327,5.458635
8,2007-09-01,16102,5.664197
9,2007-10-01,17139,5.768694


2008

In [None]:
# Configuration
YEAR = 2008  # year processed by this cell; also embedded in the output file names
BASE_URL = "http://data.gdeltproject.org/events"  # prefix the monthly ZIP URLs are built from
OUT_DIR = Path("/content/drive/MyDrive/msc_project/Gdelt_Events_1.0")
OUT_DIR.mkdir(parents=True, exist_ok=True)  # create the output folder if it is missing

CHUNK_SIZE = 200_000  # rows per pandas chunk when streaming each CSV

# UK identifiers in GDELT 1.0
UK_FIPS = "UK"     # ActionGeo_CountryCode uses FIPS-2
UK_ISO3 = "GBR"    # Actor1/2CountryCode uses ISO-3

# GDELT 1.0 column positions (0-based; the files are read with header=None)
# Events 1.0 files have 57 columns (0..56). We only read what we need:
#   1  = SQLDATE (YYYYMMDD)
#   7  = Actor1CountryCode (ISO3)
#   17 = Actor2CountryCode (ISO3)
#   51 = ActionGeo_CountryCode (FIPS2)
#   34 = AvgTone
#   28 = EventRootCode
COL_SQLDATE = 1
COL_A1C     = 7
COL_A2C     = 17
COL_ACTC    = 51
COL_TONE    = 34
COL_ROOT    = 28

# Only these columns are parsed; dates and codes stay strings, tone is numeric
USECOLS = [COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT]
DTYPES = {
    COL_SQLDATE: str,
    COL_A1C: str,
    COL_A2C: str,
    COL_ACTC: str,
    COL_TONE: float,
    COL_ROOT: str
}

# HTTP session + URL list
def make_session():
    """
    Build the HTTP session used for the monthly ZIP downloads.

    Returns:
        requests.Session with a fixed User-Agent and one retry policy
        (total=5, backoff factor 0.5, statuses 429/500/502/503/504)
        mounted for both http:// and https:// URLs.
    """
    retry_policy = Retry(
        status_forcelist=[429, 500, 502, 503, 504],
        backoff_factor=0.5,
        total=5,
    )
    session = requests.Session()
    session.headers.update({"User-Agent": "gdelt-events-monthly/1.0"})
    for prefix in ("http://", "https://"):
        session.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return session

def monthly_zip_urls(year: int):
    """
    List the monthly ZIP URLs for one year.

    Every year yields months 01..12, except 2013, which yields 01..03.
    URLs have the form ``{BASE_URL}/{YYYY}{MM}.zip``.
    """
    n_months = 12
    if year == 2013:
        n_months = 3

    urls = []
    for month in range(1, n_months + 1):
        urls.append(f"{BASE_URL}/{year}{month:02d}.zip")
    return urls

def fetch_month_zip(sess, url: str):
    """
    Fetch one monthly ZIP and open it in memory.

    Args:
        sess: Object with a requests-style ``get(url, timeout=...)`` method.
        url: Address of the ZIP file.

    Returns:
        (ZipFile, name of the first archive member), or None when the
        response status is not OK.
    """
    response = sess.get(url, timeout=180)
    if response.ok:
        archive = zipfile.ZipFile(io.BytesIO(response.content))
        return archive, archive.namelist()[0]
    return None

# Aggregation logic
def update_monthly_totals(chunk_df, counts, tone_sum, tone_n):
    """
    Fold the UK-related rows of one chunk into the monthly accumulators.

    A row is UK-related when ActionGeo_CountryCode equals UK_FIPS or either
    actor country code equals UK_ISO3. Rows with an unparseable SQLDATE are
    ignored. For every month seen in the chunk:
      counts[Month]   += number of rows
      tone_sum[Month] += sum of non-missing AvgTone
      tone_n[Month]   += number of non-missing AvgTone values
    """
    uk_mask = chunk_df[COL_ACTC].eq(UK_FIPS)
    for actor_col in (COL_A1C, COL_A2C):
        uk_mask = uk_mask | chunk_df[actor_col].eq(UK_ISO3)
    if not uk_mask.any():
        return

    uk_rows = chunk_df.loc[uk_mask, [COL_SQLDATE, COL_TONE]].copy()
    parsed_dates = pd.to_datetime(uk_rows[COL_SQLDATE], format="%Y%m%d", errors="coerce")
    uk_rows["Month"] = parsed_dates.dt.to_period("M").dt.to_timestamp(how="start")
    uk_rows["AvgTone"] = pd.to_numeric(uk_rows[COL_TONE], errors="coerce")
    uk_rows = uk_rows.dropna(subset=["Month"])
    if uk_rows.empty:
        return

    # Row counts use every dated row; tone statistics only rows with a tone.
    rows_by_month = uk_rows.groupby("Month").size()
    tone_groups = uk_rows[uk_rows["AvgTone"].notna()].groupby("Month")["AvgTone"]
    sum_by_month = tone_groups.sum()
    valid_by_month = tone_groups.size()

    for month, n_rows in rows_by_month.items():
        counts[month] += int(n_rows)
        tone_sum[month] += float(sum_by_month.get(month, 0.0))
        tone_n[month] += int(valid_by_month.get(month, 0))

def update_monthly_totals_economic(chunk_df, counts, tone_sum, tone_n):
    """
    Run update_monthly_totals on the rows whose EventRootCode starts with '04'.

    NOTE(review): the CAMEO codebook labels root code 04 as "Consult"; confirm
    that this prefix is the intended proxy for the "economic" family.
    """
    root_code = chunk_df[COL_ROOT].astype(str)
    econ_mask = root_code.str.startswith("04", na=False)
    if not econ_mask.any():
        return
    update_monthly_totals(chunk_df.loc[econ_mask, :], counts, tone_sum, tone_n)

def finalize_monthly_frame(counts, tone_sum, tone_n, year: int):
    """
    Build a monthly DataFrame for ``year`` with one row per expected month.

    Args:
        counts: Mapping month (Timestamp) -> number of rows.
        tone_sum: Mapping month -> sum of AvgTone values.
        tone_n: Mapping month -> number of AvgTone values in the sum.
        year: Year to report; months from other years are dropped.

    Returns:
        DataFrame with columns Month, Docs, AvgTone covering January to
        December (January to March for 2013). Months without data have NaN
        in Docs and AvgTone, including the case where no month has data.
    """
    # Expected months: 2013 only has monthly files up to March.
    last_month = 3 if year == 2013 else 12
    full_index = pd.date_range(f"{year}-01-01", f"{year}-{last_month:02d}-01", freq="MS")

    rows = []
    for month in sorted(counts.keys()):
        if month.year != year:
            continue
        # .get works for plain dicts as well as defaultdicts and adds no keys
        n_valid = tone_n.get(month, 0)
        avg_tone = (tone_sum.get(month, 0.0) / n_valid) if n_valid else np.nan
        rows.append({"Month": month, "Docs": counts[month], "AvgTone": avg_tone})

    if not rows:
        # No data at all: a frame built from an empty list has no "Month"
        # column to sort or index on, so return the all-missing calendar directly.
        return pd.DataFrame({"Month": full_index, "Docs": np.nan, "AvgTone": np.nan})

    # Ensure we have one row per month even if the month had no hits
    return (
        pd.DataFrame(rows)
        .sort_values("Month")
        .set_index("Month")
        .reindex(full_index)
        .rename_axis("Month")
        .reset_index()
    )

# Running for the Year 2008
# Downloads each monthly ZIP, streams its CSV in chunks and accumulates
# monthly UK totals (all events and the '04*' subset).
sess = make_session()
urls = monthly_zip_urls(YEAR)

print(f"Year {YEAR}: trying {len(urls)} monthly ZIPs...")
# Accumulators: month -> rows / tone sum / tone count
uk_counts, uk_tsum, uk_tn = defaultdict(int), defaultdict(float), defaultdict(int)
ec_counts, ec_tsum, ec_tn = defaultdict(int), defaultdict(float), defaultdict(int)

for url in urls:
    got = fetch_month_zip(sess, url)
    if got is None:
        # Non-OK response: the month is skipped and contributes no rows
        print(f"  • Missing (skip): {url}")
        continue

    z, inner = got
    print(f"  • Processing: {inner}")

    # header=None + usecols: columns are addressed by position;
    # chunksize makes read_csv return an iterator of DataFrames.
    reader = pd.read_csv(
        z.open(inner),
        sep="\t",
        header=None,
        low_memory=False,
        chunksize=CHUNK_SIZE,
        usecols=USECOLS,
        dtype=DTYPES
    )

    # Reset per file, so the progress message counts rows of the current file only
    rows_seen = 0
    for i, chunk in enumerate(reader, start=1):
        rows_seen += len(chunk)

        update_monthly_totals(chunk, uk_counts, uk_tsum, uk_tn)
        update_monthly_totals_economic(chunk, ec_counts, ec_tsum, ec_tn)

        # Progress message every 5 chunks
        if i % 5 == 0:
            print(f"    read ~{rows_seen:,} rows so far...")

# Building outputs
df_uk = finalize_monthly_frame(uk_counts, uk_tsum, uk_tn, YEAR)
df_ec = finalize_monthly_frame(ec_counts, ec_tsum, ec_tn, YEAR)

# Saving to Drive
out_uk = OUT_DIR / f"events_uk_monthly_{YEAR}.csv"
out_ec = OUT_DIR / f"events_uk_economic_monthly_{YEAR}.csv"

df_uk.to_csv(out_uk, index=False, date_format="%Y-%m-%d")
df_ec.to_csv(out_ec, index=False, date_format="%Y-%m-%d")

print("\n Done")
print(f"Saved:\n  {out_uk}\n  {out_ec}")

# Previews of up to 12 monthly rows per series
print("\nPreview (UK all events):")
display(df_uk.head(12))

print("\nPreview (UK economic events only):")
display(df_ec.head(12))


Year 2008: trying 12 monthly ZIPs...
  • Processing: 200801.csv
    read ~1,000,000 rows so far...
  • Processing: 200802.csv
    read ~1,000,000 rows so far...
  • Processing: 200803.csv
    read ~1,000,000 rows so far...
  • Processing: 200804.csv
    read ~1,000,000 rows so far...
  • Processing: 200805.csv
    read ~1,000,000 rows so far...
  • Processing: 200806.csv
    read ~1,000,000 rows so far...
  • Processing: 200807.csv
    read ~1,000,000 rows so far...
  • Processing: 200808.csv
    read ~1,000,000 rows so far...
  • Processing: 200809.csv
    read ~912,791 rows so far...
  • Processing: 200810.csv
    read ~1,000,000 rows so far...
  • Processing: 200811.csv
    read ~1,000,000 rows so far...
  • Processing: 200812.csv
    read ~1,000,000 rows so far...

 Done
Saved:
  /content/drive/MyDrive/msc_project/Gdelt_Events_1.0/events_uk_monthly_2008.csv
  /content/drive/MyDrive/msc_project/Gdelt_Events_1.0/events_uk_economic_monthly_2008.csv

Preview (UK all events):


Unnamed: 0,Month,Docs,AvgTone
0,2008-01-01,75043,5.5519
1,2008-02-01,77358,5.573736
2,2008-03-01,65256,5.635886
3,2008-04-01,67284,5.646853
4,2008-05-01,70249,5.763192
5,2008-06-01,80523,5.596329
6,2008-07-01,85633,5.616669
7,2008-08-01,55390,5.580945
8,2008-09-01,51242,5.878708
9,2008-10-01,75640,5.825013



Preview (UK economic events only):


Unnamed: 0,Month,Docs,AvgTone
0,2008-01-01,19571,5.73971
1,2008-02-01,18318,5.746198
2,2008-03-01,15791,5.833972
3,2008-04-01,17338,5.768374
4,2008-05-01,18000,5.851488
5,2008-06-01,20096,5.610849
6,2008-07-01,24149,5.711628
7,2008-08-01,12356,5.625138
8,2008-09-01,12196,5.892837
9,2008-10-01,18127,5.930972


2009

In [None]:
# Configuration
YEAR = 2009  # year processed by this cell
BASE_URL = "http://data.gdeltproject.org/events"  # prefix the monthly ZIP URLs are built from
OUT_DIR = Path("/content/drive/MyDrive/msc_project/Gdelt_Events_1.0")
OUT_DIR.mkdir(parents=True, exist_ok=True)  # create the output folder if it is missing

CHUNK_SIZE = 200_000  # NOTE(review): presumably the read_csv chunk size, as in the earlier cells

# UK identifiers in GDELT 1.0
UK_FIPS = "UK"     # ActionGeo_CountryCode uses FIPS-2
UK_ISO3 = "GBR"    # Actor1/2CountryCode uses ISO-3

# GDELT 1.0 column positions
# Events 1.0 files have 57 columns (0..56). We only read what we need:
#   1  = SQLDATE (YYYYMMDD)
#   7  = Actor1CountryCode (ISO3)
#   17 = Actor2CountryCode (ISO3)
#   51 = ActionGeo_CountryCode (FIPS2)
#   34 = AvgTone
#   28 = EventRootCode
COL_SQLDATE = 1
COL_A1C     = 7
COL_A2C     = 17
COL_ACTC    = 51
COL_TONE    = 34
COL_ROOT    = 28

# Columns of interest and their dtypes: dates and codes as strings, tone as float
USECOLS = [COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT]
DTYPES = {
    COL_SQLDATE: str,
    COL_A1C: str,
    COL_A2C: str,
    COL_ACTC: str,
    COL_TONE: float,
    COL_ROOT: str
}

# HTTP session + URL list
def make_session():
    """
    Build the HTTP session used for the monthly ZIP downloads.

    Returns:
        requests.Session with a fixed User-Agent and one retry policy
        (total=5, backoff factor 0.5, statuses 429/500/502/503/504)
        mounted for both http:// and https:// URLs.
    """
    retry_policy = Retry(
        status_forcelist=[429, 500, 502, 503, 504],
        backoff_factor=0.5,
        total=5,
    )
    session = requests.Session()
    session.headers.update({"User-Agent": "gdelt-events-monthly/1.0"})
    for prefix in ("http://", "https://"):
        session.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return session

def monthly_zip_urls(year: int):
    """List the monthly ZIP URLs under BASE_URL for `year`.

    Twelve URLs are produced, except for 2013 where only months 1..3 are listed.
    """
    n_months = 12
    if year == 2013:
        n_months = 3
    urls = []
    for month in range(1, n_months + 1):
        urls.append(f"{BASE_URL}/{year}{month:02d}.zip")
    return urls

def fetch_month_zip(sess, url: str):
    """Fetch `url` and open the response body as an in-memory ZIP archive.

    Returns a ``(ZipFile, first_member_name)`` tuple, or ``None`` when the
    response status is not OK (the caller reports that month as missing).
    """
    response = sess.get(url, timeout=180)
    if response.ok:
        archive = zipfile.ZipFile(io.BytesIO(response.content))
        # Only the first member of the archive is used.
        first_member = archive.namelist()[0]
        return archive, first_member
    return None

# Aggregation logic
def update_monthly_totals(chunk_df, counts, tone_sum, tone_n):
    """
    Fold the UK-related rows of one chunk into the running monthly accumulators.

    A row is UK-related when its action country equals UK_FIPS or either actor
    country equals UK_ISO3. Rows whose SQLDATE cannot be parsed are ignored.
    For every month present among the remaining rows:
      counts[Month]   grows by the number of rows
      tone_sum[Month] grows by the sum of the non-missing AvgTone values
      tone_n[Month]   grows by the number of non-missing AvgTone values
    The three mappings are updated in place; nothing is returned.
    """
    uk_mask = chunk_df[COL_ACTC].eq(UK_FIPS)
    for actor_col in (COL_A1C, COL_A2C):
        uk_mask = uk_mask | chunk_df[actor_col].eq(UK_ISO3)
    if not uk_mask.any():
        return

    rows = chunk_df.loc[uk_mask, [COL_SQLDATE, COL_TONE]].copy()
    parsed = pd.to_datetime(rows[COL_SQLDATE], format="%Y%m%d", errors="coerce")
    # Collapse every date to the first day of its month.
    rows["Month"] = parsed.dt.to_period("M").dt.to_timestamp(how="start")
    rows["AvgTone"] = pd.to_numeric(rows[COL_TONE], errors="coerce")
    rows = rows.dropna(subset=["Month"])
    if rows.empty:
        return

    # size = all rows; count = rows with a tone value; sum skips missing tones
    per_month = rows.groupby("Month")["AvgTone"].agg(["size", "count", "sum"])
    for month, n_rows, n_tone, tone_total in per_month.itertuples(name=None):
        counts[month] += int(n_rows)
        tone_sum[month] += float(tone_total)
        tone_n[month] += int(n_tone)

def update_monthly_totals_economic(chunk_df, counts, tone_sum, tone_n):
    """Run update_monthly_totals on the rows whose EventRootCode begins with '04'.

    This notebook treats that subset as its "economic" event family.
    NOTE(review): the CAMEO codebook lists root code 04 as "Consult" - confirm
    that 04 is the intended proxy for economic events.
    """
    root_codes = chunk_df[COL_ROOT].astype(str)
    econ_mask = root_codes.str.startswith("04", na=False)
    if not econ_mask.any():
        return
    update_monthly_totals(chunk_df.loc[econ_mask, :], counts, tone_sum, tone_n)

def finalize_monthly_frame(counts, tone_sum, tone_n, year: int):
    """Turn the monthly accumulators into a tidy frame with one row per month.

    Parameters
    ----------
    counts, tone_sum, tone_n : mapping keyed by month-start Timestamp
        Row count, sum of AvgTone and number of non-missing AvgTone values per
        month. Keys whose year differs from `year` are ignored.
    year : int
        Calendar year to report. The index runs January..December, except for
        2013 where it runs January..March.

    Returns
    -------
    pandas.DataFrame
        Columns ``Month``, ``Docs``, ``AvgTone``. A month with no entry in
        `counts` is kept with NaN in both ``Docs`` and ``AvgTone`` (not 0), so
        gaps stay visible. AvgTone is also NaN when a month has rows but no
        valid tone values.
    """
    months = sorted(m for m in counts if m.year == year)
    docs = [counts[m] for m in months]
    # .get() keeps plain dicts usable and avoids inserting keys into defaultdicts.
    avg_tone = [
        (tone_sum.get(m, 0.0) / tone_n[m]) if tone_n.get(m) else np.nan
        for m in months
    ]

    df_out = pd.DataFrame(
        {"Docs": docs, "AvgTone": avg_tone},
        index=pd.DatetimeIndex(months, name="Month"),
    )
    if df_out.empty:
        # Nothing matched for this year: still return the full calendar, with
        # numeric NaN columns, instead of failing on a column-less frame.
        df_out = df_out.astype("float64")

    last_month = 3 if year == 2013 else 12
    full_index = pd.date_range(f"{year}-01-01", f"{year}-{last_month:02d}-01", freq="MS")

    return df_out.reindex(full_index).rename_axis("Month").reset_index()

# Running for the Year 2009
sess = make_session()
urls = monthly_zip_urls(YEAR)

print(f"Year {YEAR}: trying {len(urls)} monthly ZIPs...")
# Separate accumulators for all UK rows (uk_*) and the '04' root-code subset (ec_*)
uk_counts, uk_tsum, uk_tn = defaultdict(int), defaultdict(float), defaultdict(int)
ec_counts, ec_tsum, ec_tn = defaultdict(int), defaultdict(float), defaultdict(int)

for url in urls:
    got = fetch_month_zip(sess, url)
    if got is None:
        # The month stays in the output as a NaN row (see finalize_monthly_frame)
        print(f"  • Missing (skip): {url}")
        continue

    z, inner = got
    print(f"  • Processing: {inner}")

    # Close the archive and the member stream once the month has been read,
    # rather than leaving them open until the names are rebound.
    with z, z.open(inner) as fh:
        reader = pd.read_csv(
            fh,
            sep="\t",
            header=None,
            low_memory=False,
            chunksize=CHUNK_SIZE,
            usecols=USECOLS,
            dtype=DTYPES
        )

        rows_seen = 0
        for i, chunk in enumerate(reader, start=1):
            rows_seen += len(chunk)

            update_monthly_totals(chunk, uk_counts, uk_tsum, uk_tn)
            update_monthly_totals_economic(chunk, ec_counts, ec_tsum, ec_tn)

            # Progress line every fifth chunk
            if i % 5 == 0:
                print(f"    read ~{rows_seen:,} rows so far...")

# Building outputs
df_uk = finalize_monthly_frame(uk_counts, uk_tsum, uk_tn, YEAR)
df_ec = finalize_monthly_frame(ec_counts, ec_tsum, ec_tn, YEAR)

# Saving to Drive
out_uk = OUT_DIR / f"events_uk_monthly_{YEAR}.csv"
out_ec = OUT_DIR / f"events_uk_economic_monthly_{YEAR}.csv"

df_uk.to_csv(out_uk, index=False, date_format="%Y-%m-%d")
df_ec.to_csv(out_ec, index=False, date_format="%Y-%m-%d")

print("\n Done")
print(f"Saved:\n  {out_uk}\n  {out_ec}")

print("\nPreview (UK all events):")
display(df_uk.head(12))

print("\nPreview (UK economic events only):")
display(df_ec.head(12))


Year 2009: trying 12 monthly ZIPs...
  • Processing: 200901.csv
    read ~1,000,000 rows so far...
    read ~1,971,575 rows so far...
  • Processing: 200902.csv
    read ~1,000,000 rows so far...
  • Processing: 200903.csv
    read ~1,000,000 rows so far...
  • Processing: 200904.csv
    read ~1,000,000 rows so far...
    read ~1,910,335 rows so far...
  • Processing: 200905.csv
    read ~1,000,000 rows so far...
    read ~1,917,261 rows so far...
  • Processing: 200906.csv
    read ~1,000,000 rows so far...
    read ~1,808,376 rows so far...
  • Processing: 200907.csv
    read ~1,000,000 rows so far...
  • Processing: 200908.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
  • Processing: 200909.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
  • Processing: 200910.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
  • Processing: 200911.csv
    read ~1,000,000 rows so far...
    read ~1,893,590 rows so far...
 

Unnamed: 0,Month,Docs,AvgTone
0,2009-01-01,99103,5.599559
1,2009-02-01,99910,5.554991
2,2009-03-01,109427,5.576604
3,2009-04-01,111652,5.625235
4,2009-05-01,98546,5.672658
5,2009-06-01,111020,5.602716
6,2009-07-01,99586,5.550419
7,2009-08-01,129756,5.778115
8,2009-09-01,141935,5.754052
9,2009-10-01,127707,5.807638



Preview (UK economic events only):


Unnamed: 0,Month,Docs,AvgTone
0,2009-01-01,25719,5.592004
1,2009-02-01,27269,5.618076
2,2009-03-01,28277,5.713109
3,2009-04-01,34073,5.855136
4,2009-05-01,26065,5.786144
5,2009-06-01,28319,5.762501
6,2009-07-01,24327,5.827318
7,2009-08-01,31623,5.798037
8,2009-09-01,37955,5.871977
9,2009-10-01,34476,5.862122


2010

In [None]:
# Configuration
YEAR = 2010
BASE_URL = "http://data.gdeltproject.org/events"
CHUNK_SIZE = 200_000  # rows per chunk requested from pd.read_csv

# Output folder for the monthly CSV summaries
OUT_DIR = Path("/content/drive/MyDrive/msc_project/Gdelt_Events_1.0")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# UK identifiers in GDELT 1.0
UK_FIPS = "UK"     # FIPS-2 code, matched against ActionGeo_CountryCode
UK_ISO3 = "GBR"    # ISO-3 code, matched against Actor1/2CountryCode

# GDELT 1.0 column positions (files have 57 columns, numbered 0..56);
# only the six listed here are read.
COL_SQLDATE = 1    # SQLDATE (YYYYMMDD)
COL_A1C = 7        # Actor1CountryCode (ISO3)
COL_A2C = 17       # Actor2CountryCode (ISO3)
COL_ROOT = 28      # EventRootCode
COL_TONE = 34      # AvgTone
COL_ACTC = 51      # ActionGeo_CountryCode (FIPS2)

USECOLS = [COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT]
# Every column is read as text except AvgTone, which is parsed as float.
DTYPES = {col: (float if col == COL_TONE else str) for col in USECOLS}

# HTTP session + URL list
def make_session():
    """Create an HTTP session that retries throttled (429) and 5xx responses."""
    retry_policy = Retry(
        status_forcelist=[429, 500, 502, 503, 504],
        backoff_factor=0.5,
        total=5,
    )
    session = requests.Session()
    session.headers["User-Agent"] = "gdelt-events-monthly/1.0"
    # The same retry policy is attached to both URL schemes.
    for scheme in ("http://", "https://"):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return session

def monthly_zip_urls(year: int):
    """List the monthly ZIP URLs under BASE_URL for `year`.

    Twelve URLs are produced, except for 2013 where only months 1..3 are listed.
    """
    n_months = 12
    if year == 2013:
        n_months = 3
    urls = []
    for month in range(1, n_months + 1):
        urls.append(f"{BASE_URL}/{year}{month:02d}.zip")
    return urls

def fetch_month_zip(sess, url: str):
    """Fetch `url` and open the response body as an in-memory ZIP archive.

    Returns a ``(ZipFile, first_member_name)`` tuple, or ``None`` when the
    response status is not OK (the caller reports that month as missing).
    """
    response = sess.get(url, timeout=180)
    if response.ok:
        archive = zipfile.ZipFile(io.BytesIO(response.content))
        # Only the first member of the archive is used.
        first_member = archive.namelist()[0]
        return archive, first_member
    return None

# Aggregation logic
def update_monthly_totals(chunk_df, counts, tone_sum, tone_n):
    """
    Fold the UK-related rows of one chunk into the running monthly accumulators.

    A row is UK-related when its action country equals UK_FIPS or either actor
    country equals UK_ISO3. Rows whose SQLDATE cannot be parsed are ignored.
    For every month present among the remaining rows:
      counts[Month]   grows by the number of rows
      tone_sum[Month] grows by the sum of the non-missing AvgTone values
      tone_n[Month]   grows by the number of non-missing AvgTone values
    The three mappings are updated in place; nothing is returned.
    """
    uk_mask = chunk_df[COL_ACTC].eq(UK_FIPS)
    for actor_col in (COL_A1C, COL_A2C):
        uk_mask = uk_mask | chunk_df[actor_col].eq(UK_ISO3)
    if not uk_mask.any():
        return

    rows = chunk_df.loc[uk_mask, [COL_SQLDATE, COL_TONE]].copy()
    parsed = pd.to_datetime(rows[COL_SQLDATE], format="%Y%m%d", errors="coerce")
    # Collapse every date to the first day of its month.
    rows["Month"] = parsed.dt.to_period("M").dt.to_timestamp(how="start")
    rows["AvgTone"] = pd.to_numeric(rows[COL_TONE], errors="coerce")
    rows = rows.dropna(subset=["Month"])
    if rows.empty:
        return

    # size = all rows; count = rows with a tone value; sum skips missing tones
    per_month = rows.groupby("Month")["AvgTone"].agg(["size", "count", "sum"])
    for month, n_rows, n_tone, tone_total in per_month.itertuples(name=None):
        counts[month] += int(n_rows)
        tone_sum[month] += float(tone_total)
        tone_n[month] += int(n_tone)

def update_monthly_totals_economic(chunk_df, counts, tone_sum, tone_n):
    """Run update_monthly_totals on the rows whose EventRootCode begins with '04'.

    This notebook treats that subset as its "economic" event family.
    NOTE(review): the CAMEO codebook lists root code 04 as "Consult" - confirm
    that 04 is the intended proxy for economic events.
    """
    root_codes = chunk_df[COL_ROOT].astype(str)
    econ_mask = root_codes.str.startswith("04", na=False)
    if not econ_mask.any():
        return
    update_monthly_totals(chunk_df.loc[econ_mask, :], counts, tone_sum, tone_n)

def finalize_monthly_frame(counts, tone_sum, tone_n, year: int):
    """Turn the monthly accumulators into a tidy frame with one row per month.

    Parameters
    ----------
    counts, tone_sum, tone_n : mapping keyed by month-start Timestamp
        Row count, sum of AvgTone and number of non-missing AvgTone values per
        month. Keys whose year differs from `year` are ignored.
    year : int
        Calendar year to report. The index runs January..December, except for
        2013 where it runs January..March.

    Returns
    -------
    pandas.DataFrame
        Columns ``Month``, ``Docs``, ``AvgTone``. A month with no entry in
        `counts` is kept with NaN in both ``Docs`` and ``AvgTone`` (not 0), so
        gaps stay visible. AvgTone is also NaN when a month has rows but no
        valid tone values.
    """
    months = sorted(m for m in counts if m.year == year)
    docs = [counts[m] for m in months]
    # .get() keeps plain dicts usable and avoids inserting keys into defaultdicts.
    avg_tone = [
        (tone_sum.get(m, 0.0) / tone_n[m]) if tone_n.get(m) else np.nan
        for m in months
    ]

    df_out = pd.DataFrame(
        {"Docs": docs, "AvgTone": avg_tone},
        index=pd.DatetimeIndex(months, name="Month"),
    )
    if df_out.empty:
        # Nothing matched for this year: still return the full calendar, with
        # numeric NaN columns, instead of failing on a column-less frame.
        df_out = df_out.astype("float64")

    last_month = 3 if year == 2013 else 12
    full_index = pd.date_range(f"{year}-01-01", f"{year}-{last_month:02d}-01", freq="MS")

    return df_out.reindex(full_index).rename_axis("Month").reset_index()

# Running for the Year 2010
sess = make_session()
urls = monthly_zip_urls(YEAR)

print(f"Year {YEAR}: trying {len(urls)} monthly ZIPs...")
# Separate accumulators for all UK rows (uk_*) and the '04' root-code subset (ec_*)
uk_counts, uk_tsum, uk_tn = defaultdict(int), defaultdict(float), defaultdict(int)
ec_counts, ec_tsum, ec_tn = defaultdict(int), defaultdict(float), defaultdict(int)

for url in urls:
    got = fetch_month_zip(sess, url)
    if got is None:
        # The month stays in the output as a NaN row (see finalize_monthly_frame)
        print(f"  • Missing (skip): {url}")
        continue

    z, inner = got
    print(f"  • Processing: {inner}")

    # Close the archive and the member stream once the month has been read,
    # rather than leaving them open until the names are rebound.
    with z, z.open(inner) as fh:
        reader = pd.read_csv(
            fh,
            sep="\t",
            header=None,
            low_memory=False,
            chunksize=CHUNK_SIZE,
            usecols=USECOLS,
            dtype=DTYPES
        )

        rows_seen = 0
        for i, chunk in enumerate(reader, start=1):
            rows_seen += len(chunk)

            update_monthly_totals(chunk, uk_counts, uk_tsum, uk_tn)
            update_monthly_totals_economic(chunk, ec_counts, ec_tsum, ec_tn)

            # Progress line every fifth chunk
            if i % 5 == 0:
                print(f"    read ~{rows_seen:,} rows so far...")

# Building outputs
df_uk = finalize_monthly_frame(uk_counts, uk_tsum, uk_tn, YEAR)
df_ec = finalize_monthly_frame(ec_counts, ec_tsum, ec_tn, YEAR)

# Saving to Drive
out_uk = OUT_DIR / f"events_uk_monthly_{YEAR}.csv"
out_ec = OUT_DIR / f"events_uk_economic_monthly_{YEAR}.csv"

df_uk.to_csv(out_uk, index=False, date_format="%Y-%m-%d")
df_ec.to_csv(out_ec, index=False, date_format="%Y-%m-%d")

print("\n Done")
print(f"Saved:\n  {out_uk}\n  {out_ec}")

print("\nPreview (UK all events):")
display(df_uk.head(12))

print("\nPreview (UK economic events only):")
display(df_ec.head(12))


Year 2010: trying 12 monthly ZIPs...
  • Processing: 201001.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
  • Processing: 201002.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
  • Processing: 201003.csv
  • Processing: 201004.csv
    read ~1,000,000 rows so far...
  • Processing: 201005.csv
    read ~1,000,000 rows so far...
  • Processing: 201006.csv
    read ~967,769 rows so far...
  • Processing: 201007.csv
    read ~1,000,000 rows so far...
  • Processing: 201008.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
  • Processing: 201009.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
    read ~2,870,049 rows so far...
  • Processing: 201010.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
  • Processing: 201011.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
  • Processing: 201012.csv
    read ~1,000,000 rows so far...
    read ~2

Unnamed: 0,Month,Docs,AvgTone
0,2010-01-01,145272,5.568387
1,2010-02-01,116537,5.786885
2,2010-03-01,35491,5.740809
3,2010-04-01,62650,5.56008
4,2010-05-01,48977,5.92985
5,2010-06-01,43186,5.709821
6,2010-07-01,66995,5.77858
7,2010-08-01,118341,5.801101
8,2010-09-01,138955,5.950679
9,2010-10-01,110151,5.959748



Preview (UK economic events only):


Unnamed: 0,Month,Docs,AvgTone
0,2010-01-01,40168,5.732365
1,2010-02-01,28770,5.945553
2,2010-03-01,9038,5.97828
3,2010-04-01,17387,5.582283
4,2010-05-01,13015,6.126058
5,2010-06-01,11449,5.913676
6,2010-07-01,16942,6.049803
7,2010-08-01,31830,5.904252
8,2010-09-01,38582,6.037061
9,2010-10-01,28094,6.076266


2011

In [None]:
# Configuration
YEAR = 2011
BASE_URL = "http://data.gdeltproject.org/events"
CHUNK_SIZE = 200_000  # rows per chunk requested from pd.read_csv

# Output folder for the monthly CSV summaries
OUT_DIR = Path("/content/drive/MyDrive/msc_project/Gdelt_Events_1.0")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# UK identifiers in GDELT 1.0
UK_FIPS = "UK"     # FIPS-2 code, matched against ActionGeo_CountryCode
UK_ISO3 = "GBR"    # ISO-3 code, matched against Actor1/2CountryCode

# GDELT 1.0 column positions (files have 57 columns, numbered 0..56);
# only the six listed here are read.
COL_SQLDATE = 1    # SQLDATE (YYYYMMDD)
COL_A1C = 7        # Actor1CountryCode (ISO3)
COL_A2C = 17       # Actor2CountryCode (ISO3)
COL_ROOT = 28      # EventRootCode
COL_TONE = 34      # AvgTone
COL_ACTC = 51      # ActionGeo_CountryCode (FIPS2)

USECOLS = [COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT]
# Every column is read as text except AvgTone, which is parsed as float.
DTYPES = {col: (float if col == COL_TONE else str) for col in USECOLS}

# HTTP session + URL list
def make_session():
    """Create an HTTP session that retries throttled (429) and 5xx responses."""
    retry_policy = Retry(
        status_forcelist=[429, 500, 502, 503, 504],
        backoff_factor=0.5,
        total=5,
    )
    session = requests.Session()
    session.headers["User-Agent"] = "gdelt-events-monthly/1.0"
    # The same retry policy is attached to both URL schemes.
    for scheme in ("http://", "https://"):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return session

def monthly_zip_urls(year: int):
    """List the monthly ZIP URLs under BASE_URL for `year`.

    Twelve URLs are produced, except for 2013 where only months 1..3 are listed.
    """
    n_months = 12
    if year == 2013:
        n_months = 3
    urls = []
    for month in range(1, n_months + 1):
        urls.append(f"{BASE_URL}/{year}{month:02d}.zip")
    return urls

def fetch_month_zip(sess, url: str):
    """Fetch `url` and open the response body as an in-memory ZIP archive.

    Returns a ``(ZipFile, first_member_name)`` tuple, or ``None`` when the
    response status is not OK (the caller reports that month as missing).
    """
    response = sess.get(url, timeout=180)
    if response.ok:
        archive = zipfile.ZipFile(io.BytesIO(response.content))
        # Only the first member of the archive is used.
        first_member = archive.namelist()[0]
        return archive, first_member
    return None

# Aggregation logic
def update_monthly_totals(chunk_df, counts, tone_sum, tone_n):
    """
    Fold the UK-related rows of one chunk into the running monthly accumulators.

    A row is UK-related when its action country equals UK_FIPS or either actor
    country equals UK_ISO3. Rows whose SQLDATE cannot be parsed are ignored.
    For every month present among the remaining rows:
      counts[Month]   grows by the number of rows
      tone_sum[Month] grows by the sum of the non-missing AvgTone values
      tone_n[Month]   grows by the number of non-missing AvgTone values
    The three mappings are updated in place; nothing is returned.
    """
    uk_mask = chunk_df[COL_ACTC].eq(UK_FIPS)
    for actor_col in (COL_A1C, COL_A2C):
        uk_mask = uk_mask | chunk_df[actor_col].eq(UK_ISO3)
    if not uk_mask.any():
        return

    rows = chunk_df.loc[uk_mask, [COL_SQLDATE, COL_TONE]].copy()
    parsed = pd.to_datetime(rows[COL_SQLDATE], format="%Y%m%d", errors="coerce")
    # Collapse every date to the first day of its month.
    rows["Month"] = parsed.dt.to_period("M").dt.to_timestamp(how="start")
    rows["AvgTone"] = pd.to_numeric(rows[COL_TONE], errors="coerce")
    rows = rows.dropna(subset=["Month"])
    if rows.empty:
        return

    # size = all rows; count = rows with a tone value; sum skips missing tones
    per_month = rows.groupby("Month")["AvgTone"].agg(["size", "count", "sum"])
    for month, n_rows, n_tone, tone_total in per_month.itertuples(name=None):
        counts[month] += int(n_rows)
        tone_sum[month] += float(tone_total)
        tone_n[month] += int(n_tone)

def update_monthly_totals_economic(chunk_df, counts, tone_sum, tone_n):
    """Run update_monthly_totals on the rows whose EventRootCode begins with '04'.

    This notebook treats that subset as its "economic" event family.
    NOTE(review): the CAMEO codebook lists root code 04 as "Consult" - confirm
    that 04 is the intended proxy for economic events.
    """
    root_codes = chunk_df[COL_ROOT].astype(str)
    econ_mask = root_codes.str.startswith("04", na=False)
    if not econ_mask.any():
        return
    update_monthly_totals(chunk_df.loc[econ_mask, :], counts, tone_sum, tone_n)

def finalize_monthly_frame(counts, tone_sum, tone_n, year: int):
    """Turn the monthly accumulators into a tidy frame with one row per month.

    Parameters
    ----------
    counts, tone_sum, tone_n : mapping keyed by month-start Timestamp
        Row count, sum of AvgTone and number of non-missing AvgTone values per
        month. Keys whose year differs from `year` are ignored.
    year : int
        Calendar year to report. The index runs January..December, except for
        2013 where it runs January..March.

    Returns
    -------
    pandas.DataFrame
        Columns ``Month``, ``Docs``, ``AvgTone``. A month with no entry in
        `counts` is kept with NaN in both ``Docs`` and ``AvgTone`` (not 0), so
        gaps stay visible. AvgTone is also NaN when a month has rows but no
        valid tone values.
    """
    months = sorted(m for m in counts if m.year == year)
    docs = [counts[m] for m in months]
    # .get() keeps plain dicts usable and avoids inserting keys into defaultdicts.
    avg_tone = [
        (tone_sum.get(m, 0.0) / tone_n[m]) if tone_n.get(m) else np.nan
        for m in months
    ]

    df_out = pd.DataFrame(
        {"Docs": docs, "AvgTone": avg_tone},
        index=pd.DatetimeIndex(months, name="Month"),
    )
    if df_out.empty:
        # Nothing matched for this year: still return the full calendar, with
        # numeric NaN columns, instead of failing on a column-less frame.
        df_out = df_out.astype("float64")

    last_month = 3 if year == 2013 else 12
    full_index = pd.date_range(f"{year}-01-01", f"{year}-{last_month:02d}-01", freq="MS")

    return df_out.reindex(full_index).rename_axis("Month").reset_index()

# Running for the Year 2011
sess = make_session()
urls = monthly_zip_urls(YEAR)

print(f"Year {YEAR}: trying {len(urls)} monthly ZIPs...")
# Separate accumulators for all UK rows (uk_*) and the '04' root-code subset (ec_*)
uk_counts, uk_tsum, uk_tn = defaultdict(int), defaultdict(float), defaultdict(int)
ec_counts, ec_tsum, ec_tn = defaultdict(int), defaultdict(float), defaultdict(int)

for url in urls:
    got = fetch_month_zip(sess, url)
    if got is None:
        # The month stays in the output as a NaN row (see finalize_monthly_frame)
        print(f"  • Missing (skip): {url}")
        continue

    z, inner = got
    print(f"  • Processing: {inner}")

    # Close the archive and the member stream once the month has been read,
    # rather than leaving them open until the names are rebound.
    with z, z.open(inner) as fh:
        reader = pd.read_csv(
            fh,
            sep="\t",
            header=None,
            low_memory=False,
            chunksize=CHUNK_SIZE,
            usecols=USECOLS,
            dtype=DTYPES
        )

        rows_seen = 0
        for i, chunk in enumerate(reader, start=1):
            rows_seen += len(chunk)

            update_monthly_totals(chunk, uk_counts, uk_tsum, uk_tn)
            update_monthly_totals_economic(chunk, ec_counts, ec_tsum, ec_tn)

            # Progress line every fifth chunk
            if i % 5 == 0:
                print(f"    read ~{rows_seen:,} rows so far...")

# Building outputs
df_uk = finalize_monthly_frame(uk_counts, uk_tsum, uk_tn, YEAR)
df_ec = finalize_monthly_frame(ec_counts, ec_tsum, ec_tn, YEAR)

# Saving to Drive
out_uk = OUT_DIR / f"events_uk_monthly_{YEAR}.csv"
out_ec = OUT_DIR / f"events_uk_economic_monthly_{YEAR}.csv"

df_uk.to_csv(out_uk, index=False, date_format="%Y-%m-%d")
df_ec.to_csv(out_ec, index=False, date_format="%Y-%m-%d")

print("\n Done")
print(f"Saved:\n  {out_uk}\n  {out_ec}")

print("\nPreview (UK all events):")
display(df_uk.head(12))

print("\nPreview (UK economic events only):")
display(df_ec.head(12))


Year 2011: trying 12 monthly ZIPs...
  • Processing: 201101.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
  • Processing: 201102.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
  • Processing: 201103.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
    read ~3,000,000 rows so far...
  • Processing: 201104.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
    read ~2,947,005 rows so far...
  • Processing: 201105.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
  • Processing: 201106.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
  • Processing: 201107.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
  • Processing: 201108.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
    read ~2,854,375 rows so far...
  • Processing: 201109.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 

Unnamed: 0,Month,Docs,AvgTone
0,2011-01-01,114382,5.868613
1,2011-02-01,125672,5.734325
2,2011-03-01,161123,5.823057
3,2011-04-01,145731,6.103139
4,2011-05-01,117843,6.005576
5,2011-06-01,114844,5.886537
6,2011-07-01,144327,5.526316
7,2011-08-01,134605,5.457779
8,2011-09-01,126927,5.847363
9,2011-10-01,104376,5.829063



Preview (UK economic events only):


Unnamed: 0,Month,Docs,AvgTone
0,2011-01-01,29172,6.1968
1,2011-02-01,31403,5.81231
2,2011-03-01,41818,5.955905
3,2011-04-01,38924,6.254299
4,2011-05-01,32272,6.265659
5,2011-06-01,28151,6.186302
6,2011-07-01,35622,5.877829
7,2011-08-01,30299,5.700682
8,2011-09-01,31978,6.086146
9,2011-10-01,25826,6.026894


2012

In [None]:
# Configuration
YEAR = 2012
BASE_URL = "http://data.gdeltproject.org/events"
CHUNK_SIZE = 200_000  # rows per chunk requested from pd.read_csv

# Output folder for the monthly CSV summaries
OUT_DIR = Path("/content/drive/MyDrive/msc_project/Gdelt_Events_1.0")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# UK identifiers in GDELT 1.0
UK_FIPS = "UK"     # FIPS-2 code, matched against ActionGeo_CountryCode
UK_ISO3 = "GBR"    # ISO-3 code, matched against Actor1/2CountryCode

# GDELT 1.0 column positions (files have 57 columns, numbered 0..56);
# only the six listed here are read.
COL_SQLDATE = 1    # SQLDATE (YYYYMMDD)
COL_A1C = 7        # Actor1CountryCode (ISO3)
COL_A2C = 17       # Actor2CountryCode (ISO3)
COL_ROOT = 28      # EventRootCode
COL_TONE = 34      # AvgTone
COL_ACTC = 51      # ActionGeo_CountryCode (FIPS2)

USECOLS = [COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT]
# Every column is read as text except AvgTone, which is parsed as float.
DTYPES = {col: (float if col == COL_TONE else str) for col in USECOLS}

# HTTP session + URL list
def make_session():
    """Create an HTTP session that retries throttled (429) and 5xx responses."""
    retry_policy = Retry(
        status_forcelist=[429, 500, 502, 503, 504],
        backoff_factor=0.5,
        total=5,
    )
    session = requests.Session()
    session.headers["User-Agent"] = "gdelt-events-monthly/1.0"
    # The same retry policy is attached to both URL schemes.
    for scheme in ("http://", "https://"):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return session

def monthly_zip_urls(year: int):
    """List the monthly ZIP URLs under BASE_URL for `year`.

    Twelve URLs are produced, except for 2013 where only months 1..3 are listed.
    """
    n_months = 12
    if year == 2013:
        n_months = 3
    urls = []
    for month in range(1, n_months + 1):
        urls.append(f"{BASE_URL}/{year}{month:02d}.zip")
    return urls

def fetch_month_zip(sess, url: str):
    """Fetch `url` and open the response body as an in-memory ZIP archive.

    Returns a ``(ZipFile, first_member_name)`` tuple, or ``None`` when the
    response status is not OK (the caller reports that month as missing).
    """
    response = sess.get(url, timeout=180)
    if response.ok:
        archive = zipfile.ZipFile(io.BytesIO(response.content))
        # Only the first member of the archive is used.
        first_member = archive.namelist()[0]
        return archive, first_member
    return None

# Aggregation logic
def update_monthly_totals(chunk_df, counts, tone_sum, tone_n):
    """
    Fold the UK-related rows of one chunk into the running monthly accumulators.

    A row is UK-related when its action country equals UK_FIPS or either actor
    country equals UK_ISO3. Rows whose SQLDATE cannot be parsed are ignored.
    For every month present among the remaining rows:
      counts[Month]   grows by the number of rows
      tone_sum[Month] grows by the sum of the non-missing AvgTone values
      tone_n[Month]   grows by the number of non-missing AvgTone values
    The three mappings are updated in place; nothing is returned.
    """
    uk_mask = chunk_df[COL_ACTC].eq(UK_FIPS)
    for actor_col in (COL_A1C, COL_A2C):
        uk_mask = uk_mask | chunk_df[actor_col].eq(UK_ISO3)
    if not uk_mask.any():
        return

    rows = chunk_df.loc[uk_mask, [COL_SQLDATE, COL_TONE]].copy()
    parsed = pd.to_datetime(rows[COL_SQLDATE], format="%Y%m%d", errors="coerce")
    # Collapse every date to the first day of its month.
    rows["Month"] = parsed.dt.to_period("M").dt.to_timestamp(how="start")
    rows["AvgTone"] = pd.to_numeric(rows[COL_TONE], errors="coerce")
    rows = rows.dropna(subset=["Month"])
    if rows.empty:
        return

    # size = all rows; count = rows with a tone value; sum skips missing tones
    per_month = rows.groupby("Month")["AvgTone"].agg(["size", "count", "sum"])
    for month, n_rows, n_tone, tone_total in per_month.itertuples(name=None):
        counts[month] += int(n_rows)
        tone_sum[month] += float(tone_total)
        tone_n[month] += int(n_tone)

def update_monthly_totals_economic(chunk_df, counts, tone_sum, tone_n):
    """Run update_monthly_totals on the rows whose EventRootCode begins with '04'.

    This notebook treats that subset as its "economic" event family.
    NOTE(review): the CAMEO codebook lists root code 04 as "Consult" - confirm
    that 04 is the intended proxy for economic events.
    """
    root_codes = chunk_df[COL_ROOT].astype(str)
    econ_mask = root_codes.str.startswith("04", na=False)
    if not econ_mask.any():
        return
    update_monthly_totals(chunk_df.loc[econ_mask, :], counts, tone_sum, tone_n)

def finalize_monthly_frame(counts, tone_sum, tone_n, year: int):
    """Turn the monthly accumulators into a tidy frame with one row per month.

    Parameters
    ----------
    counts, tone_sum, tone_n : mapping keyed by month-start Timestamp
        Row count, sum of AvgTone and number of non-missing AvgTone values per
        month. Keys whose year differs from `year` are ignored.
    year : int
        Calendar year to report. The index runs January..December, except for
        2013 where it runs January..March.

    Returns
    -------
    pandas.DataFrame
        Columns ``Month``, ``Docs``, ``AvgTone``. A month with no entry in
        `counts` is kept with NaN in both ``Docs`` and ``AvgTone`` (not 0), so
        gaps stay visible. AvgTone is also NaN when a month has rows but no
        valid tone values.
    """
    months = sorted(m for m in counts if m.year == year)
    docs = [counts[m] for m in months]
    # .get() keeps plain dicts usable and avoids inserting keys into defaultdicts.
    avg_tone = [
        (tone_sum.get(m, 0.0) / tone_n[m]) if tone_n.get(m) else np.nan
        for m in months
    ]

    df_out = pd.DataFrame(
        {"Docs": docs, "AvgTone": avg_tone},
        index=pd.DatetimeIndex(months, name="Month"),
    )
    if df_out.empty:
        # Nothing matched for this year: still return the full calendar, with
        # numeric NaN columns, instead of failing on a column-less frame.
        df_out = df_out.astype("float64")

    last_month = 3 if year == 2013 else 12
    full_index = pd.date_range(f"{year}-01-01", f"{year}-{last_month:02d}-01", freq="MS")

    return df_out.reindex(full_index).rename_axis("Month").reset_index()

# Running for the Year 2012
sess = make_session()
urls = monthly_zip_urls(YEAR)

print(f"Year {YEAR}: trying {len(urls)} monthly ZIPs...")
# Separate accumulators for all UK rows (uk_*) and the '04' root-code subset (ec_*)
uk_counts, uk_tsum, uk_tn = defaultdict(int), defaultdict(float), defaultdict(int)
ec_counts, ec_tsum, ec_tn = defaultdict(int), defaultdict(float), defaultdict(int)

for url in urls:
    got = fetch_month_zip(sess, url)
    if got is None:
        # The month stays in the output as a NaN row (see finalize_monthly_frame)
        print(f"  • Missing (skip): {url}")
        continue

    z, inner = got
    print(f"  • Processing: {inner}")

    # Close the archive and the member stream once the month has been read,
    # rather than leaving them open until the names are rebound.
    with z, z.open(inner) as fh:
        reader = pd.read_csv(
            fh,
            sep="\t",
            header=None,
            low_memory=False,
            chunksize=CHUNK_SIZE,
            usecols=USECOLS,
            dtype=DTYPES
        )

        rows_seen = 0
        for i, chunk in enumerate(reader, start=1):
            rows_seen += len(chunk)

            update_monthly_totals(chunk, uk_counts, uk_tsum, uk_tn)
            update_monthly_totals_economic(chunk, ec_counts, ec_tsum, ec_tn)

            # Progress line every fifth chunk
            if i % 5 == 0:
                print(f"    read ~{rows_seen:,} rows so far...")

# Building outputs
df_uk = finalize_monthly_frame(uk_counts, uk_tsum, uk_tn, YEAR)
df_ec = finalize_monthly_frame(ec_counts, ec_tsum, ec_tn, YEAR)

# Saving to Drive
out_uk = OUT_DIR / f"events_uk_monthly_{YEAR}.csv"
out_ec = OUT_DIR / f"events_uk_economic_monthly_{YEAR}.csv"

df_uk.to_csv(out_uk, index=False, date_format="%Y-%m-%d")
df_ec.to_csv(out_ec, index=False, date_format="%Y-%m-%d")

print("\n Done")
print(f"Saved:\n  {out_uk}\n  {out_ec}")

print("\nPreview (UK all events):")
display(df_uk.head(12))

print("\nPreview (UK economic events only):")
display(df_ec.head(12))


Year 2012: trying 12 monthly ZIPs...
  • Processing: 201201.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
  • Processing: 201202.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
    read ~3,000,000 rows so far...
  • Processing: 201203.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
    read ~3,000,000 rows so far...
  • Processing: 201204.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
    read ~3,000,000 rows so far...
  • Processing: 201205.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
    read ~3,000,000 rows so far...
  • Processing: 201206.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
    read ~2,824,395 rows so far...
  • Processing: 201207.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
  • Processing: 201208.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
  • Processi

Unnamed: 0,Month,Docs,AvgTone
0,2012-01-01,120132,5.806493
1,2012-02-01,163064,5.708025
2,2012-03-01,156369,5.713234
3,2012-04-01,145284,5.638722
4,2012-05-01,149291,5.821468
5,2012-06-01,144762,6.000214
6,2012-07-01,132718,6.18333
7,2012-08-01,125275,6.125585
8,2012-09-01,123476,6.072576
9,2012-10-01,121303,6.105137



Preview (UK economic events only):


Unnamed: 0,Month,Docs,AvgTone
0,2012-01-01,30652,5.969752
1,2012-02-01,42063,5.961819
2,2012-03-01,40837,6.00281
3,2012-04-01,37448,5.871764
4,2012-05-01,41408,6.056408
5,2012-06-01,42814,6.140011
6,2012-07-01,36333,6.332878
7,2012-08-01,28950,6.418421
8,2012-09-01,30464,6.208529
9,2012-10-01,29704,6.27234


2013

In [None]:
# Configuration
YEAR = 2013
BASE_URL = "http://data.gdeltproject.org/events"
CHUNK_SIZE = 200_000  # rows per chunk requested from pd.read_csv

# Output folder for the monthly CSV summaries
OUT_DIR = Path("/content/drive/MyDrive/msc_project/Gdelt_Events_1.0")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# UK identifiers in GDELT 1.0
UK_FIPS = "UK"     # FIPS-2 code, matched against ActionGeo_CountryCode
UK_ISO3 = "GBR"    # ISO-3 code, matched against Actor1/2CountryCode

# GDELT 1.0 column positions (files have 57 columns, numbered 0..56);
# only the six listed here are read.
COL_SQLDATE = 1    # SQLDATE (YYYYMMDD)
COL_A1C = 7        # Actor1CountryCode (ISO3)
COL_A2C = 17       # Actor2CountryCode (ISO3)
COL_ROOT = 28      # EventRootCode
COL_TONE = 34      # AvgTone
COL_ACTC = 51      # ActionGeo_CountryCode (FIPS2)

USECOLS = [COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT]
# Every column is read as text except AvgTone, which is parsed as float.
DTYPES = {col: (float if col == COL_TONE else str) for col in USECOLS}

# HTTP session + URL list
def make_session():
    """Create an HTTP session that retries intermittent 429 and 5xx responses."""
    retry_policy = Retry(
        status_forcelist=[429, 500, 502, 503, 504],
        backoff_factor=0.5,
        total=5,
    )
    session = requests.Session()
    session.headers["User-Agent"] = "gdelt-events-2013/1.0"
    # The same retry policy is attached to both URL schemes.
    for scheme in ("http://", "https://"):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return session

def fetch_zip(sess, url: str):
    """Fetch `url` and open the response body as an in-memory ZIP archive.

    Returns a ``(ZipFile, first_member_name)`` tuple, or ``None`` when the
    response status is not OK (the caller reports that file as missing).
    """
    response = sess.get(url, timeout=180)
    if response.ok:
        archive = zipfile.ZipFile(io.BytesIO(response.content))
        # Only the first member of the archive is used.
        first_member = archive.namelist()[0]
        return archive, first_member
    return None

# Aggregation logic
def update_monthly_totals(chunk_df, counts, tone_sum, tone_n):
    """Fold the UK-related rows of one chunk into the running monthly accumulators.

    A row is UK-related when its action country equals UK_FIPS or either actor
    country equals UK_ISO3. Rows whose SQLDATE cannot be parsed are ignored.
    Per month, `counts` grows by the number of rows, `tone_sum` by the sum of
    the non-missing AvgTone values and `tone_n` by how many of those there are.
    The three mappings are updated in place; nothing is returned.
    """
    uk_mask = chunk_df[COL_ACTC].eq(UK_FIPS)
    for actor_col in (COL_A1C, COL_A2C):
        uk_mask = uk_mask | chunk_df[actor_col].eq(UK_ISO3)
    if not uk_mask.any():
        return

    rows = chunk_df.loc[uk_mask, [COL_SQLDATE, COL_TONE]].copy()
    parsed = pd.to_datetime(rows[COL_SQLDATE], format="%Y%m%d", errors="coerce")
    # Collapse every date to the first day of its month.
    rows["Month"] = parsed.dt.to_period("M").dt.to_timestamp(how="start")
    rows["AvgTone"] = pd.to_numeric(rows[COL_TONE], errors="coerce")
    rows = rows.dropna(subset=["Month"])
    if rows.empty:
        return

    # size = all rows; count = rows with a tone value; sum skips missing tones
    per_month = rows.groupby("Month")["AvgTone"].agg(["size", "count", "sum"])
    for month, n_rows, n_tone, tone_total in per_month.itertuples(name=None):
        counts[month] += int(n_rows)
        tone_sum[month] += float(tone_total)
        tone_n[month] += int(n_tone)

def update_monthly_totals_economic(chunk_df, counts, tone_sum, tone_n):
    """Feed only rows whose EventRootCode starts with '04' into `update_monthly_totals`.

    The notebook labels this subset "economic".
    NOTE(review): in the CAMEO event coding scheme root code 04 appears to be
    "Consult", not an economic category — confirm the intended code family.
    """
    root_codes = chunk_df[COL_ROOT].astype(str)
    econ_mask = root_codes.str.startswith("04", na=False)
    if not econ_mask.any():
        return
    update_monthly_totals(chunk_df.loc[econ_mask, :], counts, tone_sum, tone_n)

def finalize_monthly_frame(counts, tone_sum, tone_n, year: int):
    """Turn the monthly accumulators into a 12-row DataFrame for `year`.

    Args:
        counts: mapping month-start Timestamp -> number of events.
        tone_sum: mapping month-start Timestamp -> sum of tone values.
        tone_n: mapping month-start Timestamp -> number of tone values.
        year: calendar year to report; months from other years are ignored.

    Returns:
        DataFrame with columns Month, Docs, AvgTone and one row per month of
        `year` (January..December). Months without data hold NaN, including
        the case where no month of `year` has any data at all.
    """
    rows = []
    for month in sorted(counts):
        if month.year != year:
            continue
        # .get avoids inserting new keys into defaultdict accumulators.
        n_tone = tone_n.get(month, 0)
        avg_tone = (tone_sum.get(month, 0.0) / n_tone) if n_tone else np.nan
        rows.append({"Month": month, "Docs": counts[month], "AvgTone": avg_tone})

    # Explicit columns keep the frame well-formed when `rows` is empty;
    # without them the "Month" column would not exist and indexing would fail.
    df_out = pd.DataFrame(rows, columns=["Month", "Docs", "AvgTone"])
    df_out["Month"] = pd.to_datetime(df_out["Month"])

    full_index = pd.date_range(f"{year}-01-01", f"{year}-12-01", freq="MS")
    return (
        df_out.set_index("Month")
        .reindex(full_index)
        .rename_axis("Month")
        .reset_index()
    )

# Running for Year 2013
# Jan–Mar are read from monthly ZIPs ({YEAR}{MM}.zip); Apr–Dec are read from daily
# ZIPs (YYYYMMDD.export.CSV.zip). Both feed the same monthly accumulators.
sess = make_session()

uk_counts, uk_tsum, uk_tn = defaultdict(int), defaultdict(float), defaultdict(int)
ec_counts, ec_tsum, ec_tn = defaultdict(int), defaultdict(float), defaultdict(int)


def _iter_event_chunks(archive, member):
    """Yield CHUNK_SIZE-row DataFrames (USECOLS only) from one member of an open ZipFile.

    The archive and the member handle are closed once the last chunk has been consumed.
    """
    with archive, archive.open(member) as handle:
        yield from pd.read_csv(
            handle,
            sep="\t",
            header=None,
            low_memory=False,
            chunksize=CHUNK_SIZE,
            usecols=USECOLS,
            dtype=DTYPES,
        )


print("Processing 2013 monthly ZIPs (Jan–Mar)...")
for m in [1, 2, 3]:
    url = f"{BASE_URL}/{YEAR}{m:02d}.zip"
    got = fetch_zip(sess, url)
    if got is None:
        print(f"  • Missing (skip): {url}")
        continue

    z, inner = got
    print(f"  • {inner}")

    rows_seen = 0
    for i, chunk in enumerate(_iter_event_chunks(z, inner), start=1):
        rows_seen += len(chunk)
        update_monthly_totals(chunk, uk_counts, uk_tsum, uk_tn)
        update_monthly_totals_economic(chunk, ec_counts, ec_tsum, ec_tn)

        # Progress line every 5 chunks.
        if i % 5 == 0:
            print(f"    read ~{rows_seen:,} rows so far...")

print("\nProcessing 2013 daily ZIPs (Apr–Dec)...")
daily_dates = pd.date_range("2013-04-01", "2013-12-31", freq="D")

# Count skipped days so that gaps in the monthly totals can be traced back to
# missing input files instead of going unnoticed.
missing_days = 0

for d in daily_dates:
    url = f"{BASE_URL}/{d:%Y%m%d}.export.CSV.zip"
    got = fetch_zip(sess, url)
    if got is None:
        missing_days += 1
        continue  # some days can be missing
    z, inner = got

    for chunk in _iter_event_chunks(z, inner):
        update_monthly_totals(chunk, uk_counts, uk_tsum, uk_tn)
        update_monthly_totals_economic(chunk, ec_counts, ec_tsum, ec_tn)

# Saving into drive
df_uk = finalize_monthly_frame(uk_counts, uk_tsum, uk_tn, YEAR)
df_ec = finalize_monthly_frame(ec_counts, ec_tsum, ec_tn, YEAR)

out_uk = OUT_DIR / f"events_uk_monthly_{YEAR}.csv"
out_ec = OUT_DIR / f"events_uk_economic_monthly_{YEAR}.csv"

df_uk.to_csv(out_uk, index=False, date_format="%Y-%m-%d")
df_ec.to_csv(out_ec, index=False, date_format="%Y-%m-%d")

print("\n Done")
print(f"Saved:\n  {out_uk}\n  {out_ec}")
print(f"Missing daily files skipped: {missing_days} of {len(daily_dates)}")

print("\nPreview (UK all events):")
display(df_uk.head(12))

print("\nPreview (UK economic events only):")
display(df_ec.head(12))


Processing 2013 monthly ZIPs (Jan–Mar)...
  • 201301.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
  • 201302.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
  • 201303.csv
    read ~1,000,000 rows so far...
    read ~2,000,000 rows so far...
    read ~2,834,527 rows so far...

Processing 2013 daily ZIPs (Apr–Dec)...

 Done
Saved:
  /content/drive/MyDrive/msc_project/Gdelt_Events_1.0/events_uk_monthly_2013.csv
  /content/drive/MyDrive/msc_project/Gdelt_Events_1.0/events_uk_economic_monthly_2013.csv

Preview (UK all events):


Unnamed: 0,Month,Docs,AvgTone
0,2013-01-01,126048,2.425202
1,2013-02-01,116075,2.565489
2,2013-03-01,125238,2.609551
3,2013-04-01,68333,2.500042
4,2013-05-01,80872,2.296863
5,2013-06-01,131886,2.576806
6,2013-07-01,185674,2.691628
7,2013-08-01,218965,2.447774
8,2013-09-01,217858,2.529328
9,2013-10-01,215743,2.679382



Preview (UK economic events only):


Unnamed: 0,Month,Docs,AvgTone
0,2013-01-01,32178,2.481781
1,2013-02-01,33896,2.601469
2,2013-03-01,31667,2.749431
3,2013-04-01,17039,2.49661
4,2013-05-01,20432,2.407361
5,2013-06-01,37077,2.733151
6,2013-07-01,45337,2.831854
7,2013-08-01,56660,2.543029
8,2013-09-01,59406,2.626215
9,2013-10-01,60936,2.790972


2014

In [None]:
# Configuration
# Year to aggregate; used for the daily date range, the output filenames and the User-Agent.
YEAR = 2014
BASE_URL = "http://data.gdeltproject.org/events"

# Output directory on the mounted Drive; created on first run.
OUT_DIR = Path("/content/drive/MyDrive/msc_project/Gdelt_Events_1.0")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Number of rows per DataFrame chunk handed out by pd.read_csv.
CHUNK_SIZE = 200_000

UK_FIPS = "UK"   # ActionGeo_CountryCode (FIPS-2)
UK_ISO3 = "GBR"  # Actor*CountryCode (ISO-3)


# GDELT 1.0 column positions
# Events 1.0 files have 57 columns (0..56). We read only these:
# NOTE(review): the daily export files read in this cell may carry an extra trailing
# SOURCEURL column (58 columns in total) — the indices below would be unaffected;
# confirm against the GDELT event codebook.
#   1  = SQLDATE (YYYYMMDD)
#   7  = Actor1CountryCode
#   17 = Actor2CountryCode
#   51 = ActionGeo_CountryCode
#   34 = AvgTone
#   28 = EventRootCode
COL_SQLDATE = 1
COL_A1C     = 7
COL_A2C     = 17
COL_ACTC    = 51
COL_TONE    = 34
COL_ROOT    = 28

# Columns actually parsed, and their dtypes (codes/dates stay strings, tone is float).
USECOLS = [COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT]
DTYPES = {
    COL_SQLDATE: str,
    COL_A1C: str,
    COL_A2C: str,
    COL_ACTC: str,
    COL_TONE: float,
    COL_ROOT: str,
}


# HTTP session + URL list
def make_session():
    """Create a requests session that retries on 429/5xx responses.

    Returns:
        requests.Session: session tagged with a User-Agent derived from YEAR,
        whose http:// and https:// adapters retry up to 5 times (backoff
        factor 0.5) on status 429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    session = requests.Session()
    session.headers.update({"User-Agent": f"gdelt-events-{YEAR}/1.0"})
    for scheme in ("http://", "https://"):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return session

def fetch_zip(sess, url: str):
    """Download a ZIP archive and locate its first member.

    Args:
        sess: object exposing a requests-style ``get(url, timeout=...)`` method.
        url: address of the ZIP file to download.

    Returns:
        ``(ZipFile, inner_filename)`` for a usable archive, or ``None`` when the
        response status is not OK, the payload cannot be opened as a ZIP, or the
        archive contains no members. Callers count ``None`` as a missing day.
    """
    r = sess.get(url, timeout=180)
    if not r.ok:
        return None
    try:
        z = zipfile.ZipFile(io.BytesIO(r.content))
    except zipfile.BadZipFile:
        # The body is not a ZIP: report it and skip instead of aborting the yearly run.
        print(f"  ! Not a valid ZIP (skip): {url}")
        return None
    names = z.namelist()
    if not names:
        # An archive without members would otherwise fail with IndexError below.
        print(f"  ! Empty ZIP (skip): {url}")
        return None
    return z, names[0]


# Aggregation logic
def update_monthly_totals(chunk_df, counts, tone_sum, tone_n):
    """Accumulate monthly event counts and tone totals for the UK-related rows of one chunk.

    A row is UK-related when its ActionGeo country code equals UK_FIPS or either
    actor country code equals UK_ISO3. Rows are bucketed by the first day of the
    month of SQLDATE; rows whose date cannot be parsed are dropped.

    Args:
        chunk_df: DataFrame with the USECOLS columns, addressed by column index.
        counts: mapping month -> number of UK-related rows (updated in place).
        tone_sum: mapping month -> sum of non-missing AvgTone values (updated in place).
        tone_n: mapping month -> count of non-missing AvgTone values (updated in place).
    """
    uk_mask = (
        chunk_df[COL_A1C].eq(UK_ISO3)
        | chunk_df[COL_A2C].eq(UK_ISO3)
        | chunk_df[COL_ACTC].eq(UK_FIPS)
    )
    if not uk_mask.any():
        return

    uk_rows = chunk_df.loc[uk_mask, [COL_SQLDATE, COL_TONE]]
    event_dates = pd.to_datetime(uk_rows[COL_SQLDATE], format="%Y%m%d", errors="coerce")
    uk_rows = uk_rows.assign(
        Month=event_dates.dt.to_period("M").dt.to_timestamp(how="start"),
        AvgTone=pd.to_numeric(uk_rows[COL_TONE], errors="coerce"),
    ).dropna(subset=["Month"])
    if uk_rows.empty:
        return

    docs_by_month = uk_rows.groupby("Month").size()

    # Tone statistics only use rows that actually carry a tone value.
    toned = uk_rows.dropna(subset=["AvgTone"]).groupby("Month")["AvgTone"]
    tone_total_by_month = toned.sum()
    tone_count_by_month = toned.size()

    for month, n_docs in docs_by_month.items():
        counts[month] += int(n_docs)
        tone_sum[month] += float(tone_total_by_month.get(month, 0.0))
        tone_n[month] += int(tone_count_by_month.get(month, 0))

def update_monthly_totals_econ(chunk_df, counts, tone_sum, tone_n):
    """Feed only rows whose EventRootCode starts with '04' into `update_monthly_totals`.

    The notebook labels this subset "economic".
    NOTE(review): in the CAMEO event coding scheme root code 04 appears to be
    "Consult", not an economic category — confirm the intended code family.
    """
    root_codes = chunk_df[COL_ROOT].astype(str)
    econ_mask = root_codes.str.startswith("04", na=False)
    if not econ_mask.any():
        return
    update_monthly_totals(chunk_df.loc[econ_mask, :], counts, tone_sum, tone_n)

def finalize_monthly_frame(counts, tone_sum, tone_n, year: int):
    """Turn the monthly accumulators into a 12-row DataFrame for `year`.

    Args:
        counts: mapping month-start Timestamp -> number of events.
        tone_sum: mapping month-start Timestamp -> sum of tone values.
        tone_n: mapping month-start Timestamp -> number of tone values.
        year: calendar year to report; months from other years are ignored.

    Returns:
        DataFrame with columns Month, Docs, AvgTone and one row per month of
        `year` (January..December). Months without data hold NaN, including
        the case where no month of `year` has any data at all.
    """
    rows = []
    for month in sorted(counts):
        if month.year != year:
            continue
        # .get avoids inserting new keys into defaultdict accumulators.
        n_tone = tone_n.get(month, 0)
        avg_tone = (tone_sum.get(month, 0.0) / n_tone) if n_tone else np.nan
        rows.append({"Month": month, "Docs": counts[month], "AvgTone": avg_tone})

    # Explicit columns keep the frame well-formed when `rows` is empty;
    # without them the "Month" column would not exist and indexing would fail.
    df_out = pd.DataFrame(rows, columns=["Month", "Docs", "AvgTone"])
    df_out["Month"] = pd.to_datetime(df_out["Month"])

    full_index = pd.date_range(f"{year}-01-01", f"{year}-12-01", freq="MS")
    return (
        df_out.set_index("Month")
        .reindex(full_index)
        .rename_axis("Month")
        .reset_index()
    )


# Running for the Year 2014
# Downloads one daily export ZIP per calendar day of YEAR, streams it in chunks and
# accumulates monthly totals for all UK-related events and for the '04' subset.

sess = make_session()

# Accumulators keyed by month: event count, sum of tone, number of tone values.
uk_counts, uk_tsum, uk_tn = defaultdict(int), defaultdict(float), defaultdict(int)
ec_counts, ec_tsum, ec_tn = defaultdict(int), defaultdict(float), defaultdict(int)

all_days = pd.date_range(f"{YEAR}-01-01", f"{YEAR}-12-31", freq="D")
missing_days = 0  # days for which fetch_zip returned None

print(f"Downloading and processing daily GDELT files for {YEAR} ({len(all_days)} days)...")

for i, day in enumerate(all_days, start=1):
    url = f"{BASE_URL}/{day:%Y%m%d}.export.CSV.zip"
    got = fetch_zip(sess, url)

    if got is None:
        missing_days += 1
        # Keep the 30-day progress cadence even when the current day is skipped.
        if i % 30 == 0:
            print(f"  {i}/{len(all_days)} days done — missing so far: {missing_days}")
        continue

    z, inner = got

    # Stream the tab-separated member in CHUNK_SIZE-row pieces, parsing only USECOLS.
    reader = pd.read_csv(
        z.open(inner),
        sep="\t",
        header=None,
        low_memory=False,
        chunksize=CHUNK_SIZE,
        usecols=USECOLS,
        dtype=DTYPES,
    )

    # Rows are bucketed by their own SQLDATE, not by the date of the file they came from.
    for chunk in reader:
        update_monthly_totals(chunk, uk_counts, uk_tsum, uk_tn)
        update_monthly_totals_econ(chunk, ec_counts, ec_tsum, ec_tn)

    if i % 30 == 0:
        print(f"  Processed {i}/{len(all_days)} days — missing so far: {missing_days}")


# Saving into drive
# finalize_monthly_frame keeps only months of YEAR and pads to January..December.

df_uk = finalize_monthly_frame(uk_counts, uk_tsum, uk_tn, YEAR)
df_ec = finalize_monthly_frame(ec_counts, ec_tsum, ec_tn, YEAR)

out_uk = OUT_DIR / f"events_uk_monthly_{YEAR}.csv"
out_ec = OUT_DIR / f"events_uk_economic_monthly_{YEAR}.csv"

df_uk.to_csv(out_uk, index=False, date_format="%Y-%m-%d")
df_ec.to_csv(out_ec, index=False, date_format="%Y-%m-%d")

print("\n Done")
print(f"Saved:\n  {out_uk}\n  {out_ec}")
print(f"Missing daily files skipped: {missing_days}")

print("\nPreview (UK all events):")
display(df_uk.head(12))

print("\nPreview (UK economic events only):")
display(df_ec.head(12))


Downloading and processing daily GDELT files for 2014 (365 days)...
  Processed 30/365 days — missing so far: 3
  Processed 60/365 days — missing so far: 3
  Processed 90/365 days — missing so far: 4
  Processed 120/365 days — missing so far: 4
  Processed 150/365 days — missing so far: 4
  Processed 180/365 days — missing so far: 4
  Processed 210/365 days — missing so far: 4
  Processed 240/365 days — missing so far: 4
  Processed 270/365 days — missing so far: 4
  Processed 300/365 days — missing so far: 4
  Processed 330/365 days — missing so far: 4
  Processed 360/365 days — missing so far: 4

 Done
Saved:
  /content/drive/MyDrive/msc_project/Gdelt_Events_1.0/events_uk_monthly_2014.csv
  /content/drive/MyDrive/msc_project/Gdelt_Events_1.0/events_uk_economic_monthly_2014.csv
Missing daily files skipped: 4

Preview (UK all events):


Unnamed: 0,Month,Docs,AvgTone
0,2014-01-01,139963,2.678215
1,2014-02-01,166052,2.698572
2,2014-03-01,178579,2.651045
3,2014-04-01,184551,2.720858
4,2014-05-01,195643,2.677188
5,2014-06-01,191109,2.667295
6,2014-07-01,214956,2.612385
7,2014-08-01,205392,2.56462
8,2014-09-01,278268,2.626609
9,2014-10-01,265410,2.680726



Preview (UK economic events only):


Unnamed: 0,Month,Docs,AvgTone
0,2014-01-01,35665,2.769333
1,2014-02-01,43668,2.859537
2,2014-03-01,49400,2.715
3,2014-04-01,51001,2.88143
4,2014-05-01,49443,2.832798
5,2014-06-01,52893,2.755171
6,2014-07-01,58283,2.700179
7,2014-08-01,54274,2.678771
8,2014-09-01,75431,2.690719
9,2014-10-01,71706,2.814015


2015-January

In [None]:
# Configuration
# Inclusive date window of daily files to process (January 2015).
START_DATE = "2015-01-01"
END_DATE   = "2015-01-31"

BASE_URL = "http://data.gdeltproject.org/events"
# Output directory on the mounted Drive; created on first run.
OUT_DIR  = Path("/content/drive/MyDrive/msc_project/Gdelt_Events_1.0")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Number of rows per DataFrame chunk handed out by pd.read_csv.
CHUNK_SIZE = 200_000

UK_FIPS = "UK"   # ActionGeo_CountryCode (FIPS-2)
UK_ISO3 = "GBR"  # Actor1/Actor2CountryCode (ISO-3)

# GDELT 1.0 event files have 57 columns (0..56). We only need a few:
# NOTE(review): the daily export files may carry an extra trailing SOURCEURL column;
# the indices below would be unaffected — confirm against the GDELT event codebook.
COL_SQLDATE = 1    # YYYYMMDD
COL_A1C     = 7    # Actor1CountryCode
COL_A2C     = 17   # Actor2CountryCode
COL_ACTC    = 51   # ActionGeo_CountryCode
COL_TONE    = 34   # AvgTone
COL_ROOT    = 28   # EventRootCode

# Columns actually parsed, and their dtypes (codes/dates stay strings, tone is float).
USECOLS = [COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT]
DTYPES = {
    COL_SQLDATE: str,
    COL_A1C: str,
    COL_A2C: str,
    COL_ACTC: str,
    COL_TONE: float,
    COL_ROOT: str,
}

# HTTP session + URL list
def make_session():
    """Create a requests session that retries on 429/5xx responses.

    Returns:
        requests.Session: session with a fixed User-Agent whose http:// and
        https:// adapters retry up to 5 times (backoff factor 0.5) on
        status 429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    session = requests.Session()
    session.headers.update({"User-Agent": "gdelt-events-2015-jan/1.0"})
    for scheme in ("http://", "https://"):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return session

def fetch_daily_zip(sess, date: pd.Timestamp):
    """Download the daily Events export ZIP for `date` and locate its first member.

    Args:
        sess: object exposing a requests-style ``get(url, timeout=...)`` method.
        date: day whose ``YYYYMMDD.export.CSV.zip`` file is requested from BASE_URL.

    Returns:
        ``(ZipFile, inner_filename)`` for a usable archive, or ``None`` when the
        response status is not OK, the payload cannot be opened as a ZIP, or the
        archive contains no members. Callers count ``None`` as a missing day.
    """
    url = f"{BASE_URL}/{date:%Y%m%d}.export.CSV.zip"
    r = sess.get(url, timeout=180)
    if not r.ok:
        return None
    try:
        z = zipfile.ZipFile(io.BytesIO(r.content))
    except zipfile.BadZipFile:
        # The body is not a ZIP: report it and skip instead of aborting the run.
        print(f"  ! Not a valid ZIP (skip): {url}")
        return None
    names = z.namelist()
    if not names:
        # An archive without members would otherwise fail with IndexError below.
        print(f"  ! Empty ZIP (skip): {url}")
        return None
    return z, names[0]


# Aggregation logic
def update_monthly_totals(chunk_df, counts, tone_sum, tone_n):
    """Accumulate monthly event counts and tone totals for the UK-related rows of one chunk.

    A row is UK-related when its ActionGeo country code equals UK_FIPS or either
    actor country code equals UK_ISO3. Rows are bucketed by the first day of the
    month of SQLDATE; rows whose date cannot be parsed are dropped.

    Tone is stored as a running sum plus a running count so that the monthly
    mean can be computed once all chunks have been seen.

    Args:
        chunk_df: DataFrame with the USECOLS columns, addressed by column index.
        counts: mapping month -> number of UK-related rows (updated in place).
        tone_sum: mapping month -> sum of non-missing AvgTone values (updated in place).
        tone_n: mapping month -> count of non-missing AvgTone values (updated in place).
    """
    uk_mask = (
        chunk_df[COL_A1C].eq(UK_ISO3)
        | chunk_df[COL_A2C].eq(UK_ISO3)
        | chunk_df[COL_ACTC].eq(UK_FIPS)
    )
    if not uk_mask.any():
        return

    uk_rows = chunk_df.loc[uk_mask, [COL_SQLDATE, COL_TONE]]
    event_dates = pd.to_datetime(uk_rows[COL_SQLDATE], format="%Y%m%d", errors="coerce")
    uk_rows = uk_rows.assign(
        Month=event_dates.dt.to_period("M").dt.to_timestamp(how="start"),
        AvgTone=pd.to_numeric(uk_rows[COL_TONE], errors="coerce"),
    ).dropna(subset=["Month"])
    if uk_rows.empty:
        return

    docs_by_month = uk_rows.groupby("Month").size()

    # Tone statistics only use rows that actually carry a tone value.
    toned = uk_rows.dropna(subset=["AvgTone"]).groupby("Month")["AvgTone"]
    tone_total_by_month = toned.sum()
    tone_count_by_month = toned.size()

    for month, n_docs in docs_by_month.items():
        counts[month] += int(n_docs)
        tone_sum[month] += float(tone_total_by_month.get(month, 0.0))
        tone_n[month] += int(tone_count_by_month.get(month, 0))

def update_monthly_totals_econ(chunk_df, counts, tone_sum, tone_n):
    """Feed only rows whose EventRootCode starts with '04' into `update_monthly_totals`.

    The notebook labels this subset "economic".
    NOTE(review): in the CAMEO event coding scheme root code 04 appears to be
    "Consult", not an economic category — confirm the intended code family.
    """
    root_codes = chunk_df[COL_ROOT].astype(str)
    econ_mask = root_codes.str.startswith("04", na=False)
    if not econ_mask.any():
        return
    update_monthly_totals(chunk_df.loc[econ_mask, :], counts, tone_sum, tone_n)

def finalize_monthly_frame(counts, tone_sum, tone_n, start_date: str, end_date: str):
    """Turn the monthly accumulators into a DataFrame covering the requested window.

    Args:
        counts: mapping month-start Timestamp -> number of events.
        tone_sum: mapping month-start Timestamp -> sum of tone values.
        tone_n: mapping month-start Timestamp -> number of tone values.
        start_date: first day of the window (any string pandas can parse as a date).
        end_date: last day of the window.

    Returns:
        DataFrame with columns Month, Docs, AvgTone and one row for every month
        from the month of `start_date` to the month of `end_date`. Months without
        data hold NaN, including the case where the whole window has no data.
    """
    start_m = pd.to_datetime(start_date).to_period("M").to_timestamp(how="start")
    end_m = pd.to_datetime(end_date).to_period("M").to_timestamp(how="start")

    rows = []
    for m in sorted(counts):
        if (m < start_m) or (m > end_m):
            continue
        # .get avoids inserting new keys into defaultdict accumulators.
        n_tone = tone_n.get(m, 0)
        avg = (tone_sum.get(m, 0.0) / n_tone) if n_tone else np.nan
        rows.append({"Month": m, "Docs": counts[m], "AvgTone": avg})

    # Explicit columns keep the frame well-formed when `rows` is empty;
    # without them the "Month" column would not exist and indexing would fail.
    df_out = pd.DataFrame(rows, columns=["Month", "Docs", "AvgTone"])
    df_out["Month"] = pd.to_datetime(df_out["Month"])

    full_index = pd.date_range(start_m, end_m, freq="MS")
    return (
        df_out.set_index("Month")
        .reindex(full_index)
        .rename_axis("Month")
        .reset_index()
    )


# Running for Jan 2015
# Downloads one daily export ZIP per day in [START_DATE, END_DATE], streams it in
# chunks and accumulates monthly totals for all UK-related events and the '04' subset.
sess = make_session()
days = pd.date_range(START_DATE, END_DATE, freq="D")

# Accumulators keyed by month: event count, sum of tone, number of tone values.
uk_counts, uk_tsum, uk_tn = defaultdict(int), defaultdict(float), defaultdict(int)
ec_counts, ec_tsum, ec_tn = defaultdict(int), defaultdict(float), defaultdict(int)

missing = 0  # days for which fetch_daily_zip returned None
print(f"Processing {len(days)} daily GDELT files ({START_DATE} → {END_DATE})...")

for i, day in enumerate(days, start=1):
    got = fetch_daily_zip(sess, day)

    if got is None:
        missing += 1
        # Keep the 10-day progress cadence even when the current day is skipped.
        if i % 10 == 0:
            print(f"  {i}/{len(days)} days done — missing so far: {missing}")
        continue

    z, inner = got

    # Stream the tab-separated member in CHUNK_SIZE-row pieces, parsing only USECOLS.
    reader = pd.read_csv(
        z.open(inner),
        sep="\t",
        header=None,
        low_memory=False,
        chunksize=CHUNK_SIZE,
        usecols=USECOLS,
        dtype=DTYPES,
    )

    # Rows are bucketed by their own SQLDATE, not by the date of the file they came from.
    for chunk in reader:
        update_monthly_totals(chunk, uk_counts, uk_tsum, uk_tn)
        update_monthly_totals_econ(chunk, ec_counts, ec_tsum, ec_tn)

    if i % 10 == 0:
        print(f"  Processed {i}/{len(days)} days — missing so far: {missing}")


# Saving into drive
# finalize_monthly_frame keeps only months inside the window, so events dated
# outside January 2015 do not appear in the output.

df_uk   = finalize_monthly_frame(uk_counts, uk_tsum, uk_tn, START_DATE, END_DATE)
df_econ = finalize_monthly_frame(ec_counts, ec_tsum, ec_tn, START_DATE, END_DATE)

out_suffix = "2015_Jan"
out_uk   = OUT_DIR / f"events_uk_monthly_{out_suffix}.csv"
out_econ = OUT_DIR / f"events_uk_economic_monthly_{out_suffix}.csv"

df_uk.to_csv(out_uk, index=False, date_format="%Y-%m-%d")
df_econ.to_csv(out_econ, index=False, date_format="%Y-%m-%d")

print("\n Done")
print("Saved:")
print(" ", out_uk)
print(" ", out_econ)
print(f"Missing daily files skipped: {missing}")

print("\nPreview (UK all events):")
display(df_uk)

print("\nPreview (UK economic events only):")
display(df_econ)


Processing 31 daily GDELT files (2015-01-01 → 2015-01-31)...
  Processed 10/31 days — missing so far: 0
  Processed 20/31 days — missing so far: 0
  Processed 30/31 days — missing so far: 0

 Done
Saved:
  /content/drive/MyDrive/msc_project/Gdelt_Events_1.0/events_uk_monthly_2015_Jan.csv
  /content/drive/MyDrive/msc_project/Gdelt_Events_1.0/events_uk_economic_monthly_2015_Jan.csv
Missing daily files skipped: 0

Preview (UK all events):


Unnamed: 0,Month,Docs,AvgTone
0,2015-01-01,219962,2.650666



Preview (UK economic events only):


Unnamed: 0,Month,Docs,AvgTone
0,2015-01-01,60775,2.742955


2015 Feb–Dec (GDELT 2.0 GKG)

In [None]:
!pip -q install pandas requests
import io, re, zipfile, math, time, json, logging
from pathlib import Path
from datetime import datetime
from collections import defaultdict

import numpy as np
import pandas as pd
import requests
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# Only show urllib3 log records at ERROR level or above.
logging.getLogger("urllib3").setLevel(logging.ERROR)

# Configuration
YEAR = 2015

# Controls
HOURLY_ONLY = True        # keep only files whose timestamp has minute field "00" (see build_gkg_urls)
PROGRESS_EVERY = 100      # print a progress line every N files
CHECKPOINT_EVERY = 200    # write CSV + state.json every N files
MAX_FILES = None          # optional cap on the number of files handled in one run

# Filters
# UK_REGEX is matched against V2Locations, THEME_PATTERN against V2Themes; both are
# compiled case-insensitively in the run section.
# NOTE(review): THEME_PATTERN matches substrings, so short stems such as RENT or HOUS
# also match inside longer tokens (e.g. "PARENT", "WAREHOUSE") — confirm this is intended.
UK_REGEX = r'#(?:GBR|GB|UK|United Kingdom|England|Scotland|Wales|Northern Ireland)#'
THEME_PATTERN = r'(?:HOUS|MORTGAG|REMORTGAG|RENT|TENAN|REAL[ _]?ESTATE|PROPERTY|HOME[ _]?PRIC|HOUSE[ _]?PRIC|RIGHTMOVE|ZOOPLA|LANDLORD|BUY[ -]?TO[ -]?LET)'

# GKG v2.1 columns (0-based indices)
# 1 = DATE, 8 = V2Themes, 10 = V2Locations, 15 = V2Tone
DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I = 1, 8, 10, 15

# Index file listing every GDELT v2 file; build_gkg_urls filters it down to GKG files.
MASTER_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"

# Where we write outputs for this year
# NOTE(review): unlike the Events cells, the CSV and the resume checkpoint are written
# under /content rather than the mounted Drive — confirm this location survives a
# runtime reset, otherwise the checkpoint cannot be resumed.
OUT_DIR = Path(f"/content/Gdelt_run_{YEAR}")
OUT_DIR.mkdir(parents=True, exist_ok=True)
YEARLY_CSV = OUT_DIR / f"Gdelt_uk_housing_monthly_{YEAR}.csv"
STATE_PATH = OUT_DIR / "state.json"

# Time window per year
def year_bounds(y: int):
    """Return the (start, end) timestamp strings, as YYYYMMDDHHMMSS, for year `y`.

    The window is the full calendar year, with two exceptions:
      - 2015 starts at 2015-02-18 (GKG v2 starts on that date);
      - 2025 ends at 2025-06-30.
    """
    first_stamp = "20150218000000" if y == 2015 else f"{y}0101000000"
    last_stamp = "20250630235959" if y == 2025 else f"{y}1231235959"
    return first_stamp, last_stamp

# Inclusive timestamp window for YEAR; build_gkg_urls keeps only files inside it.
START_TS, END_TS = year_bounds(YEAR)

# HTTP session + URL list
def make_session():
    """Create a requests session that retries on 429/5xx responses.

    Returns:
        requests.Session: session with a fixed User-Agent whose http:// and
        https:// adapters retry up to 5 times (backoff factor 0.5) on
        status 429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    session = requests.Session()
    session.headers.update({"User-Agent": "gdelt-colab/1.0"})
    for scheme in ("http://", "https://"):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return session

def http_get(url: str, sess: requests.Session, timeout: int = 90):
    """GET `url` through `sess` and return the response.

    An https:// URL is rewritten to http:// before the request. Raises the
    session's HTTP error (via ``raise_for_status``) on an error status.
    """
    https_prefix = "https://"
    # Force HTTP to avoid occasional SSL hostname issues
    target = ("http://" + url[len(https_prefix):]) if url.startswith(https_prefix) else url
    response = sess.get(target, timeout=timeout)
    response.raise_for_status()
    return response

# Building list of GKG URLs
def build_gkg_urls(sess: requests.Session):
    """Return the sorted list of GKG file URLs to process.

    Downloads the master file list and keeps entries that:
      - consist of exactly three whitespace-separated fields, the third being the URL;
      - end in /gdeltv2/<14-digit timestamp>.gkg.csv.zip;
      - have a timestamp within [START_TS, END_TS];
      - when HOURLY_ONLY is set, have minute field "00" (characters 10-11 of the
        YYYYMMDDHHMMSS timestamp), i.e. the file stamped on the hour.

    Every returned URL uses the http:// scheme.
    """
    listing = http_get(MASTER_URL, sess=sess, timeout=120).text
    gkg_name = re.compile(r"/gdeltv2/(\d{14})\.gkg\.csv\.zip$")

    selected = []
    for entry in listing.splitlines():
        fields = entry.split()
        if len(fields) != 3:
            continue

        link = fields[2]
        hit = gkg_name.search(link)
        if hit is None:
            continue

        stamp = hit.group(1)
        in_window = START_TS <= stamp <= END_TS
        on_the_hour = stamp[10:12] == "00"
        if in_window and (on_the_hour or not HOURLY_ONLY):
            selected.append(link.replace("https://", "http://"))

    selected.sort()
    return selected

# Parsing
def month_from_dateint(dateint: str) -> str:
    """Convert a YYYYMMDDHHMMSS value to a "YYYY-MM" month key.

    Raises:
        ValueError: if the value does not parse with the format "%Y%m%d%H%M%S".
    """
    parsed = datetime.strptime(str(dateint), "%Y%m%d%H%M%S")
    return "{:04d}-{:02d}".format(parsed.year, parsed.month)

def doc_tone(v2tone: str) -> float:
    """Extract the document tone from a V2Tone value.

    V2Tone is a comma-separated string such as
    "tone,positive,negative,polarity,activityDensity,..."; only the first
    field is used. Returns NaN when that field cannot be converted to a float.
    """
    try:
        leading_field, _, _ = str(v2tone).partition(",")
        return float(leading_field)
    except Exception:
        return math.nan

def process_gkg_frame(df: pd.DataFrame, counts, tone_sum, tone_n, uk_re, theme_re) -> int:
    """Add the UK + theme rows of one GKG file to the monthly aggregates.

    Args:
        df: frame holding the DATE, V2Themes, V2Locations and V2Tone columns,
            addressed by column index.
        counts: mapping "YYYY-MM" -> number of kept rows (updated in place).
        tone_sum: mapping "YYYY-MM" -> sum of non-NaN document tones (updated in place).
        tone_n: mapping "YYYY-MM" -> number of non-NaN document tones (updated in place).
        uk_re: pattern searched in V2Locations.
        theme_re: pattern searched in V2Themes.

    Returns:
        Number of rows that matched both patterns.
    """
    mentions_uk = df[V2LOCS_I].astype(str).str.contains(uk_re, na=False, regex=True)
    on_theme = df[V2THEMES_I].astype(str).str.contains(theme_re, na=False, regex=True)

    matched = df.loc[mentions_uk & on_theme, [DATE_I, V2TONE_I]]
    if matched.empty:
        return 0

    # Both conversions run before any aggregate is touched, so a malformed date
    # raises without leaving the accumulators half-updated.
    month_keys = matched[DATE_I].astype(str).apply(month_from_dateint)
    tone_values = matched[V2TONE_I].apply(doc_tone)

    for month_key, tone in zip(month_keys, tone_values):
        counts[month_key] += 1
        if math.isnan(tone):
            continue
        tone_sum[month_key] += tone
        tone_n[month_key] += 1

    return len(matched)

# Saving checkpoint + CSV
def write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n):
    """Write the monthly CSV and the resume checkpoint for the current year.

    Both files are first written next to their final location under a ".tmp"
    name and then renamed over the target, so an interruption during a write
    cannot leave a truncated CSV or an unreadable state.json behind.

    Args:
        processed_count: number of files processed so far.
        processed_urls_tail: URLs processed so far; only the last 20000 are stored.
        counts: mapping "YYYY-MM" -> number of kept documents.
        tone_sum: mapping "YYYY-MM" -> sum of document tones.
        tone_n: mapping "YYYY-MM" -> number of document tones.
    """
    rows = []
    for m in sorted(counts.keys()):
        avg = (tone_sum[m] / tone_n[m]) if tone_n[m] else None
        rows.append({"month": f"{m}-01", "docs": int(counts[m]), "avg_tone": avg})

    monthly = pd.DataFrame(rows, columns=["month", "docs", "avg_tone"]).sort_values("month")
    csv_tmp = YEARLY_CSV.with_name(YEARLY_CSV.name + ".tmp")
    monthly.to_csv(csv_tmp, index=False)
    csv_tmp.replace(YEARLY_CSV)

    state_obj = {
        "processed_count": processed_count,
        "processed_urls_tail": processed_urls_tail[-20000:],
        "counts": dict(counts),
        "tone_sum": dict(tone_sum),
        "tone_n": dict(tone_n),
    }
    state_tmp = STATE_PATH.with_name(STATE_PATH.name + ".tmp")
    with open(state_tmp, "w") as f:
        json.dump(state_obj, f)
    # Rename only after the JSON is fully written and the file is closed.
    state_tmp.replace(STATE_PATH)

    print(f"Saved → {YEARLY_CSV}")

# Running for Year 2015
sess = make_session()
urls_all = build_gkg_urls(sess)
print(f"[{YEAR}] planned files: {len(urls_all):,}")

# Resume state if it exists
if STATE_PATH.exists():
    with open(STATE_PATH) as state_file:
        state = json.load(state_file)
    processed_count = state.get("processed_count", 0)
    processed_urls_tail = state.get("processed_urls_tail", [])
    counts = defaultdict(int, state.get("counts", {}))
    tone_sum = defaultdict(float, state.get("tone_sum", {}))
    tone_n = defaultdict(int, state.get("tone_n", {}))
    print(f"Resuming from checkpoint: {processed_count} files already processed.")
else:
    processed_count = 0
    processed_urls_tail = []
    counts, tone_sum, tone_n = defaultdict(int), defaultdict(float), defaultdict(int)

# Skip every URL already recorded in the checkpoint.
already = set(processed_urls_tail)
todo = [u for u in urls_all if u not in already]

if MAX_FILES is not None:
    todo = todo[:MAX_FILES]

print(f"[{YEAR}] remaining this run: {len(todo):,}")

uk_re = re.compile(UK_REGEX, re.I)
theme_re = re.compile(THEME_PATTERN, re.I)

t0 = time.time()
docs_kept = 0
failed_urls = []  # (url, error) for files that could not be downloaded or parsed

for i, url in enumerate(todo, 1):
    try:
        r = http_get(url, sess=sess, timeout=90)
        z = zipfile.ZipFile(io.BytesIO(r.content))
        inner = z.namelist()[0]

        df = pd.read_csv(
            z.open(inner),
            sep="\t",
            header=None,
            quoting=3,
            low_memory=False,
            usecols=[DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I],
            dtype={DATE_I: str, V2THEMES_I: str, V2LOCS_I: str, V2TONE_I: str},
        )

        docs_kept += process_gkg_frame(df, counts, tone_sum, tone_n, uk_re, theme_re)

        processed_count += 1
        processed_urls_tail.append(url)
        if len(processed_urls_tail) > 20000:
            processed_urls_tail = processed_urls_tail[-20000:]

    except Exception as exc:
        # Best effort: one bad file must not abort a multi-hour run, but the loss
        # has to be visible. The URL is not added to the checkpoint, so a later
        # run will try it again.
        failed_urls.append((url, repr(exc)))
        if len(failed_urls) <= 10:
            print(f"[{YEAR}] skipped {url}: {exc!r}")

    if i % PROGRESS_EVERY == 0:
        elapsed = max(1e-6, time.time() - t0)
        rate = i / elapsed
        eta_min = (len(todo) - i) / max(1e-6, rate) / 60
        print(f"[{YEAR}] {i}/{len(todo)} | {rate:.2f} files/s | ETA ≈ {eta_min:.1f} min | docs_kept={docs_kept:,}")

    if i % CHECKPOINT_EVERY == 0:
        write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)

# Final write
write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)
print(f"[{YEAR}] Done. This run: {len(todo):,} files | Total processed: {processed_count:,} | Docs kept: {docs_kept:,}")
print(f"[{YEAR}] Files skipped because of errors this run: {len(failed_urls):,}")

# Preview of the year output

df_year = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")

# Pad to January..December so months without data show up as NaN rows.
full = (
    df_year.set_index("month")
    .reindex(pd.date_range(f"{YEAR}-01-01", f"{YEAR}-12-01", freq="MS"))
    .rename_axis("month")
    .reset_index()
)

pd.set_option("display.float_format", lambda x: f"{x:,.3f}")
print(
    full.rename(columns={"month": "Month", "docs": "Docs", "avg_tone": "AvgTone"})
        .to_string(index=False)
)

# Stitching this year into a running master CSV

BASE_DIR = Path("/content")

year_file   = BASE_DIR / f"Gdelt_run_{YEAR}/Gdelt_uk_housing_monthly_{YEAR}.csv"
master_file = BASE_DIR / "Gdelt_uk_housing_monthly_STITCHING.csv"

y = pd.read_csv(year_file, parse_dates=["month"]).sort_values("month")

if master_file.exists():
    m = pd.read_csv(master_file, parse_dates=["month"])
    # On a re-run, rows of this year replace the rows already in the master file.
    m = (
        pd.concat([m, y], ignore_index=True)
        .drop_duplicates(subset="month", keep="last")
        .sort_values("month")
        .reset_index(drop=True)
    )
else:
    m = y

m.to_csv(master_file, index=False)
print("Saved master:", master_file, "| shape:", m.shape)
print(m.tail(12).to_string(index=False))


[2015] planned files: 7,536
[2015] remaining this run: 7,536
[2015] 100/7536 | 0.55 files/s | ETA ≈ 223.9 min | docs_kept=330
[2015] 200/7536 | 0.60 files/s | ETA ≈ 202.8 min | docs_kept=1,324
Saved → /content/Gdelt_run_2015/Gdelt_uk_housing_monthly_2015.csv
[2015] 300/7536 | 0.65 files/s | ETA ≈ 185.3 min | docs_kept=2,462
[2015] 400/7536 | 0.65 files/s | ETA ≈ 184.0 min | docs_kept=4,263
Saved → /content/Gdelt_run_2015/Gdelt_uk_housing_monthly_2015.csv
[2015] 500/7536 | 0.65 files/s | ETA ≈ 179.9 min | docs_kept=5,824
[2015] 600/7536 | 0.66 files/s | ETA ≈ 175.1 min | docs_kept=7,363
Saved → /content/Gdelt_run_2015/Gdelt_uk_housing_monthly_2015.csv
[2015] 700/7536 | 0.65 files/s | ETA ≈ 174.5 min | docs_kept=9,621
[2015] 800/7536 | 0.66 files/s | ETA ≈ 169.1 min | docs_kept=11,127
Saved → /content/Gdelt_run_2015/Gdelt_uk_housing_monthly_2015.csv
[2015] 900/7536 | 0.66 files/s | ETA ≈ 168.5 min | docs_kept=13,340
[2015] 1000/7536 | 0.66 files/s | ETA ≈ 165.1 min | docs_kept=15,314
Sav

2016

In [None]:
# Configuration
YEAR = 2016
# Controls
HOURLY_ONLY = True        # keep only files whose timestamp has minute field "00" (see build_gkg_urls)
PROGRESS_EVERY = 100      # print a progress line every N files
CHECKPOINT_EVERY = 200    # write CSV + state.json every N files
MAX_FILES = None          # optional cap on the number of files handled in one run

# Filters
# UK_REGEX is matched against V2Locations, THEME_PATTERN against V2Themes; both are
# compiled case-insensitively in the run section.
# NOTE(review): THEME_PATTERN matches substrings, so short stems such as RENT or HOUS
# also match inside longer tokens (e.g. "PARENT", "WAREHOUSE") — confirm this is intended.
UK_REGEX = r'#(?:GBR|GB|UK|United Kingdom|England|Scotland|Wales|Northern Ireland)#'
THEME_PATTERN = r'(?:HOUS|MORTGAG|REMORTGAG|RENT|TENAN|REAL[ _]?ESTATE|PROPERTY|HOME[ _]?PRIC|HOUSE[ _]?PRIC|RIGHTMOVE|ZOOPLA|LANDLORD|BUY[ -]?TO[ -]?LET)'

# GKG v2.1 columns (0-based indices)
# 1 = DATE, 8 = V2Themes, 10 = V2Locations, 15 = V2Tone
DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I = 1, 8, 10, 15

# Index file listing every GDELT v2 file; build_gkg_urls filters it down to GKG files.
MASTER_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"

# Output location
# NOTE(review): the CSV and the resume checkpoint are written under /content rather
# than the mounted Drive — confirm this location survives a runtime reset.
OUT_DIR = Path(f"/content/Gdelt_run_{YEAR}")
OUT_DIR.mkdir(parents=True, exist_ok=True)

YEARLY_CSV = OUT_DIR / f"Gdelt_uk_housing_monthly_{YEAR}.csv"
STATE_PATH = OUT_DIR / "state.json"

# Full 2016 time window
# Inclusive YYYYMMDDHHMMSS bounds; build_gkg_urls keeps only files inside them.
START_TS, END_TS = "20160101000000", "20161231235959"

# HTTP session and URL lists
def make_session():
    """Create a requests session that retries on 429/5xx responses.

    Returns:
        requests.Session: session with a fixed User-Agent whose http:// and
        https:// adapters retry up to 5 times (backoff factor 0.5) on
        status 429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    session = requests.Session()
    session.headers.update({"User-Agent": "gdelt-colab/1.0"})
    for scheme in ("http://", "https://"):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return session

def http_get(url: str, sess: requests.Session, timeout: int = 90):
    """GET `url` through `sess` and return the response.

    An https:// URL is rewritten to http:// before the request. Raises the
    session's HTTP error (via ``raise_for_status``) on an error status.
    """
    https_prefix = "https://"
    # force HTTP to avoid Colab SSL hostname issues (same as 2015)
    target = ("http://" + url[len(https_prefix):]) if url.startswith(https_prefix) else url
    response = sess.get(target, timeout=timeout)
    response.raise_for_status()
    return response

# Building list of GKG URLs
def build_gkg_urls(sess: requests.Session):
    """Return the sorted list of GKG file URLs to process.

    Downloads masterfilelist.txt and keeps entries that:
      - consist of exactly three whitespace-separated fields, the third being the URL;
      - end in /gdeltv2/<14-digit timestamp>.gkg.csv.zip;
      - have a timestamp within [START_TS, END_TS];
      - when HOURLY_ONLY is set, have minute field "00" (characters 10-11 of the
        YYYYMMDDHHMMSS timestamp), i.e. the file stamped on the hour.

    Every returned URL uses the http:// scheme.
    """
    listing = http_get(MASTER_URL, sess=sess, timeout=120).text
    gkg_name = re.compile(r"/gdeltv2/(\d{14})\.gkg\.csv\.zip$")

    selected = []
    for entry in listing.splitlines():
        fields = entry.split()
        if len(fields) != 3:
            continue

        link = fields[2]
        hit = gkg_name.search(link)
        if hit is None:
            continue

        stamp = hit.group(1)
        in_window = START_TS <= stamp <= END_TS
        on_the_hour = stamp[10:12] == "00"
        if in_window and (on_the_hour or not HOURLY_ONLY):
            selected.append(link.replace("https://", "http://"))

    selected.sort()
    return selected

# Parsing
def month_from_dateint(dateint: str) -> str:
    """Convert a YYYYMMDDHHMMSS value to a "YYYY-MM" month key.

    Raises:
        ValueError: if the value does not parse with the format "%Y%m%d%H%M%S".
    """
    parsed = datetime.strptime(str(dateint), "%Y%m%d%H%M%S")
    return "{:04d}-{:02d}".format(parsed.year, parsed.month)

def doc_tone(v2tone: str) -> float:
    """Extract the document tone, the first comma-separated field of V2Tone.

    Returns NaN when that field cannot be converted to a float.
    """
    # V2Tone: "tone,positive,negative,..."
    try:
        leading_field, _, _ = str(v2tone).partition(",")
        return float(leading_field)
    except Exception:
        return math.nan

def process_gkg_frame(df: pd.DataFrame, counts, tone_sum, tone_n, uk_re, theme_re) -> int:
    """Add the UK + housing-theme rows of one GKG file to the monthly aggregates.

    Args:
        df: frame holding the DATE, V2Themes, V2Locations and V2Tone columns,
            addressed by column index.
        counts: mapping "YYYY-MM" -> number of kept rows (updated in place).
        tone_sum: mapping "YYYY-MM" -> sum of non-NaN document tones (updated in place).
        tone_n: mapping "YYYY-MM" -> number of non-NaN document tones (updated in place).
        uk_re: pattern searched in V2Locations.
        theme_re: pattern searched in V2Themes.

    Returns:
        Number of rows that matched both patterns.
    """
    mentions_uk = df[V2LOCS_I].astype(str).str.contains(uk_re, na=False, regex=True)
    on_theme = df[V2THEMES_I].astype(str).str.contains(theme_re, na=False, regex=True)

    matched = df.loc[mentions_uk & on_theme, [DATE_I, V2TONE_I]]
    if matched.empty:
        return 0

    # Both conversions run before any aggregate is touched, so a malformed date
    # raises without leaving the accumulators half-updated.
    month_keys = matched[DATE_I].astype(str).apply(month_from_dateint)
    tone_values = matched[V2TONE_I].apply(doc_tone)

    for month_key, tone in zip(month_keys, tone_values):
        counts[month_key] += 1
        if math.isnan(tone):
            continue
        tone_sum[month_key] += tone
        tone_n[month_key] += 1

    return len(matched)

# Saving checkpoint + yearly CSV
def write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n):
    """
    Write the monthly CSV (YEARLY_CSV) and the resume checkpoint (STATE_PATH).

    The CSV has one row per key of counts, with the key turned into a
    "<key>-01" date string, the document count, and the mean tone (None when
    no tone was recorded for that month). The checkpoint keeps the processed
    counter, the last 20000 processed URLs and the three aggregate mappings.
    """
    def _month_row(key):
        n_with_tone = tone_n[key]
        mean_tone = (tone_sum[key] / tone_n[key]) if n_with_tone else None
        return {"month": f"{key}-01", "docs": int(counts[key]), "avg_tone": mean_tone}

    records = [_month_row(key) for key in sorted(counts.keys())]

    frame = pd.DataFrame(records, columns=["month", "docs", "avg_tone"])
    frame.sort_values("month").to_csv(YEARLY_CSV, index=False)

    checkpoint = dict(
        processed_count=processed_count,
        processed_urls_tail=processed_urls_tail[-20000:],
        counts=dict(counts),
        tone_sum=dict(tone_sum),
        tone_n=dict(tone_n),
    )
    with open(STATE_PATH, "w") as handle:
        json.dump(checkpoint, handle)

    print(f"Saved → {YEARLY_CSV}")

# Run for Year 2016
sess = make_session()
urls_all = build_gkg_urls(sess)
print(f"[{YEAR}] planned files: {len(urls_all):,}")

# Restore the running aggregates from the previous checkpoint, if there is one.
if STATE_PATH.exists():
    with open(STATE_PATH) as f:  # context manager so the handle is closed
        state = json.load(f)
    processed_count = state.get("processed_count", 0)
    processed_urls_tail = state.get("processed_urls_tail", [])
    counts = defaultdict(int, state.get("counts", {}))
    tone_sum = defaultdict(float, state.get("tone_sum", {}))
    tone_n = defaultdict(int, state.get("tone_n", {}))
    print(f"Resuming from checkpoint: {processed_count} files already processed.")
else:
    processed_count = 0
    processed_urls_tail = []
    counts, tone_sum, tone_n = defaultdict(int), defaultdict(float), defaultdict(int)

# URLs recorded in the checkpoint are skipped; everything else is attempted.
already = set(processed_urls_tail)
todo = [u for u in urls_all if u not in already]
if MAX_FILES is not None:
    todo = todo[:MAX_FILES]

print(f"[{YEAR}] remaining this run: {len(todo):,}")

uk_re = re.compile(UK_REGEX, re.I)
theme_re = re.compile(THEME_PATTERN, re.I)

t0 = time.time()
docs_kept = 0
failed_urls = []      # archives that could not be downloaded or parsed in this run
FAILURES_SHOWN = 10   # cap on per-file error lines so an outage does not flood the output

for i, url in enumerate(todo, 1):
    try:
        r = http_get(url, sess=sess, timeout=90)
        with zipfile.ZipFile(io.BytesIO(r.content)) as z:
            inner = z.namelist()[0]
            with z.open(inner) as fh:
                df = pd.read_csv(
                    fh,
                    sep="\t",
                    header=None,
                    quoting=3,
                    low_memory=False,
                    usecols=[DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I],
                    dtype={DATE_I: str, V2THEMES_I: str, V2LOCS_I: str, V2TONE_I: str},
                )

        docs_kept += process_gkg_frame(df, counts, tone_sum, tone_n, uk_re, theme_re)

        processed_count += 1
        processed_urls_tail.append(url)
        if len(processed_urls_tail) > 20000:
            processed_urls_tail = processed_urls_tail[-20000:]

    except Exception as exc:
        # Best effort: one bad archive must not stop the run, but it must not
        # disappear silently either. The URL is not added to
        # processed_urls_tail, so the next run attempts it again.
        failed_urls.append(url)
        if len(failed_urls) <= FAILURES_SHOWN:
            print(f"[{YEAR}] FAILED {url}: {type(exc).__name__}: {exc}")

    if i % PROGRESS_EVERY == 0:
        elapsed = max(1e-6, time.time() - t0)
        rate = i / elapsed
        eta_min = (len(todo) - i) / max(1e-6, rate) / 60
        print(f"[{YEAR}] {i}/{len(todo)} | {rate:.2f} files/s | ETA ≈ {eta_min:.1f} min | docs_kept={docs_kept:,}")

    if i % CHECKPOINT_EVERY == 0:
        write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)

# Final write
write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)
print(f"[{YEAR}] Done. This run: {len(todo):,} files | Total processed: {processed_count:,} | Docs kept: {docs_kept:,}")
if failed_urls:
    print(f"[{YEAR}] {len(failed_urls):,} of {len(todo):,} files failed this run and are not in the totals; re-run the cell to retry them.")

# Stitch into master file
BASE_DIR = Path("/content")
year_file = BASE_DIR / f"Gdelt_run_{YEAR}/Gdelt_uk_housing_monthly_{YEAR}.csv"
master_file = BASE_DIR / "Gdelt_uk_housing_monthly_STITCHING.csv"

y = pd.read_csv(year_file, parse_dates=["month"]).sort_values("month")

if master_file.exists():
    m = pd.read_csv(master_file, parse_dates=["month"])
    # The yearly rows are concatenated last, so keep="last" lets them replace
    # any rows already stored for the same month.
    m = (
        pd.concat([m, y], ignore_index=True)
        .drop_duplicates(subset="month", keep="last")
        .sort_values("month")
        .reset_index(drop=True)
    )
else:
    m = y

m.to_csv(master_file, index=False)
print("Saved master:", master_file, "| shape:", m.shape)
print(m.tail(12).to_string(index=False))


[2016] planned files: 8,774
Resuming from checkpoint: 8154 files already processed.
[2016] remaining this run: 620
[2016] 100/620 | 0.44 files/s | ETA ≈ 19.7 min | docs_kept=2,569
[2016] 200/620 | 0.48 files/s | ETA ≈ 14.7 min | docs_kept=5,621
Saved → /content/Gdelt_run_2016/Gdelt_uk_housing_monthly_2016.csv
[2016] 300/620 | 0.49 files/s | ETA ≈ 11.0 min | docs_kept=9,429
[2016] 400/620 | 0.49 files/s | ETA ≈ 7.5 min | docs_kept=13,403
Saved → /content/Gdelt_run_2016/Gdelt_uk_housing_monthly_2016.csv
[2016] 500/620 | 0.51 files/s | ETA ≈ 3.9 min | docs_kept=16,257
[2016] 600/620 | 0.52 files/s | ETA ≈ 0.6 min | docs_kept=20,372
Saved → /content/Gdelt_run_2016/Gdelt_uk_housing_monthly_2016.csv
Saved → /content/Gdelt_run_2016/Gdelt_uk_housing_monthly_2016.csv
[2016] Done. This run: 620 files | Total processed: 8,723 | Docs kept: 20,948
Saved master: /content/Gdelt_uk_housing_monthly_STITCHING.csv | shape: (23, 3)
     month  docs  avg_tone
2016-01-01 22027    -1.389
2016-02-01 20296    

In [None]:
# Mount Google Drive at /content/drive so the following cells can reach it.
from google.colab import drive
drive.mount("/content/drive")


Mounted at /content/drive


In [None]:
# Show what is currently in /content (the run folders and the stitched CSV are looked up there below).
import os
print(os.listdir("/content"))


['.config', 'Gdelt_run_2017', 'Gdelt_run_2016', 'Gdelt_run_2015', 'Gdelt_uk_housing_monthly_STITCHING.csv', 'drive', 'sample_data']


In [None]:
from pathlib import Path

# Destination folder on Drive; created together with any missing parents,
# and left untouched when it already exists.
DRIVE_BASE = Path("/content/drive/MyDrive/msc_project/GDELT")
DRIVE_BASE.mkdir(parents=True, exist_ok=True)

print("Drive base exists at:", DRIVE_BASE)


Drive base exists at: /content/drive/MyDrive/msc_project/GDELT


In [None]:
# Copying the folders to drive
import shutil
from pathlib import Path

SRC_BASE = Path("/content")
DST_BASE = Path("/content/drive/MyDrive/msc_project/GDELT")
DST_BASE.mkdir(parents=True, exist_ok=True)

items_to_copy = [
    "Gdelt_run_2015",
    "Gdelt_run_2016",
    "Gdelt_run_2017",
    "Gdelt_uk_housing_monthly_STITCHING.csv",
]

for name in items_to_copy:
    src, dst = SRC_BASE / name, DST_BASE / name

    if src.is_dir():
        # Replace any earlier copy of the folder: remove it, then copy afresh.
        if dst.exists():
            shutil.rmtree(dst)
        shutil.copytree(src, dst)
        print(f"Folder copied: {name}")
        continue

    if src.is_file():
        shutil.copy2(src, dst)
        print(f"File copied: {name}")
        continue

    print(f"Not found: {name}")


Folder copied: Gdelt_run_2015
Folder copied: Gdelt_run_2016
Folder copied: Gdelt_run_2017
File copied: Gdelt_uk_housing_monthly_STITCHING.csv


In [None]:
from pathlib import Path
import shutil

SRC = Path("/content")
DST = Path("/content/drive/MyDrive/msc_project/GDELT")

# Copy a run folder only when it exists locally and is not on Drive yet;
# nothing is printed for folders that are skipped.
for year in (2015, 2016):
    src = SRC / f"Gdelt_run_{year}"
    dst = DST / f"Gdelt_run_{year}"
    if not src.exists() or dst.exists():
        continue
    shutil.copytree(src, dst)
    print(f"Copied {src} → {dst}")


In [None]:
import pandas as pd
from pathlib import Path

# Rebuild the stitched monthly file from the per-year CSVs stored on Drive.
BASE = Path("/content/drive/MyDrive/msc_project/GDELT")

years = list(range(2015, 2026))
frames = []

for y in years:
    p = BASE / f"Gdelt_run_{y}" / f"Gdelt_uk_housing_monthly_{y}.csv"
    if p.exists():
        df = pd.read_csv(p)
        # normalizing column names (rename ignores names that are not present)
        df = df.rename(columns={"Month": "month", "Docs": "docs", "AvgTone": "avg_tone"})

        # Snap every date to the first day of its month; unparseable values become NaT.
        df["month"] = pd.to_datetime(df["month"], errors="coerce").dt.to_period("M").dt.to_timestamp(how="start")
        df["docs"] = pd.to_numeric(df["docs"], errors="coerce")
        df["avg_tone"] = pd.to_numeric(df["avg_tone"], errors="coerce")

        frames.append(df[["month", "docs", "avg_tone"]].copy())
        print(f"Loaded {y}: {len(df)} rows")
    else:
        print(f"Missing year file: {p.name}")

if not frames:
    raise ValueError("No yearly files found. Check BASE path and folder names.")

# De-duplicate BEFORE sorting. The frames are concatenated in year order, so
# keep="last" deterministically prefers the row from the later year file.
# Sorting first would leave the winner to the default, non-stable sort.
# This is also the order used by the per-year stitching code.
master = (
    pd.concat(frames, ignore_index=True)
      .dropna(subset=["month"])
      .drop_duplicates(subset=["month"], keep="last")
      .sort_values("month")
      .reset_index(drop=True)
)

out_path = BASE / "Gdelt_uk_housing_monthly_STITCHING.csv"
master.to_csv(out_path, index=False)

print("\nRebuilt master:", out_path)
print("Date range:", master["month"].min(), "→", master["month"].max())
print("Rows:", len(master), "| Unique months:", master["month"].nunique())

Loaded 2015: 11 rows
Loaded 2016: 12 rows
Missing year file: Gdelt_uk_housing_monthly_2017.csv
Loaded 2018: 12 rows
Loaded 2019: 1 rows
Missing year file: Gdelt_uk_housing_monthly_2020.csv
Missing year file: Gdelt_uk_housing_monthly_2021.csv
Missing year file: Gdelt_uk_housing_monthly_2022.csv
Missing year file: Gdelt_uk_housing_monthly_2023.csv
Missing year file: Gdelt_uk_housing_monthly_2024.csv
Missing year file: Gdelt_uk_housing_monthly_2025.csv

Rebuilt master: /content/drive/MyDrive/msc_project/GDELT/Gdelt_uk_housing_monthly_STITCHING.csv
Date range: 2015-02-01 00:00:00 → 2019-01-01 00:00:00
Rows: 36 | Unique months: 36


2017

In [None]:
!pip -q install pandas requests

# Mount Google Drive; every output path of this cell lives under /content/drive.
from google.colab import drive
drive.mount("/content/drive")

import io, re, zipfile, math, time, json, logging
from pathlib import Path
from datetime import datetime
from collections import defaultdict

import pandas as pd
import requests
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# Only show urllib3 log records at ERROR level or above.
logging.getLogger("urllib3").setLevel(logging.ERROR)

# Configuration
YEAR = 2017
HOURLY_ONLY      = True   # build_gkg_urls keeps only stamps with "00" at positions 10-11
PROGRESS_EVERY   = 100    # print a progress line every N files
CHECKPOINT_EVERY = 200    # write the yearly CSV and state.json every N files
MAX_FILES        = None   # optional cap on files per run; None means no cap

# Both patterns are compiled with re.I and applied with str.contains, i.e. as
# unanchored substring searches.
# NOTE(review): THEME_PATTERN therefore also matches inside longer tokens
# (e.g. "RENT" inside "PARENT" or "CURRENT", "PROPERTY" inside
# "INTELLECTUAL_PROPERTY") — confirm this breadth is intended.
# "REMORTGAG" is already covered by "MORTGAG".
UK_REGEX = r'#(?:GBR|GB|UK|United Kingdom|England|Scotland|Wales|Northern Ireland)#'
THEME_PATTERN = r'(?:HOUS|MORTGAG|REMORTGAG|RENT|TENAN|REAL[ _]?ESTATE|PROPERTY|HOME[ _]?PRIC|HOUSE[ _]?PRIC|RIGHTMOVE|ZOOPLA|LANDLORD|BUY[ -]?TO[ -]?LET)'

# GKG v2.1 columns (0-based indices)
# 1 = DATE, 8 = V2Themes, 10 = V2Locations, 15 = V2Tone
DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I = 1, 8, 10, 15

MASTER_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"
# Inclusive window; compared as strings against the 14-digit stamp in each file name.
START_TS, END_TS = "20170101000000", "20171231235959"

# Drive locations (consistent with your new folder name: GDELT)
BASE_DIR = Path("/content/drive/MyDrive/msc_project/GDELT")
BASE_DIR.mkdir(parents=True, exist_ok=True)

OUT_DIR = BASE_DIR / f"Gdelt_run_{YEAR}"
OUT_DIR.mkdir(parents=True, exist_ok=True)

YEARLY_CSV  = OUT_DIR / f"Gdelt_uk_housing_monthly_{YEAR}.csv"   # monthly aggregates for YEAR
STATE_PATH  = OUT_DIR / "state.json"                             # resume checkpoint
MASTER_FILE = BASE_DIR / "Gdelt_uk_housing_monthly_STITCHING.csv"  # all years stitched together

print("Outputs will be saved to:", OUT_DIR)

# HTTP and URL lists
def make_session():
    """
    Create a requests.Session with a custom User-Agent and retrying adapters.

    The same Retry policy (total=5, backoff_factor=0.5, retry on HTTP
    429/500/502/503/504) is mounted for both http:// and https://.
    """
    policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504]
    )
    session = requests.Session()
    session.headers.update({"User-Agent": "gdelt-colab/1.0"})
    for scheme in ("http://", "https://"):
        session.mount(scheme, HTTPAdapter(max_retries=policy))
    return session

def http_get(url: str, sess: requests.Session, timeout: int = 90):
    """GET url through sess and return the response; raises via raise_for_status() on an error status."""
    # Force HTTP to reduce Colab SSL issues
    secure = "https://"
    target = "http://" + url[len(secure):] if url.startswith(secure) else url
    response = sess.get(target, timeout=timeout)
    response.raise_for_status()
    return response

def build_gkg_urls(sess: requests.Session):
    """
    Fetch MASTER_URL and return the sorted GKG archive URLs selected for this run.

    Kept are third fields of three-field lines that end in
    ``/gdeltv2/<14 digits>.gkg.csv.zip``, whose stamp lies in
    [START_TS, END_TS] and, if HOURLY_ONLY is truthy, has "00" at positions
    10-11. Every kept URL has "https://" replaced by "http://".
    """
    name_re = re.compile(r"/gdeltv2/(\d{14})\.gkg\.csv\.zip$")
    body = http_get(MASTER_URL, sess, timeout=120).text

    picked = []
    for entry in body.splitlines():
        cols = entry.split()
        found = name_re.search(cols[2]) if len(cols) == 3 else None
        if found is None:
            continue

        stamp = found.group(1)
        if stamp < START_TS or stamp > END_TS:
            continue
        if HOURLY_ONLY and stamp[10:12] != "00":
            continue

        picked.append(cols[2].replace("https://", "http://"))

    return sorted(picked)

# Parsing + aggregation
def month_from_dateint(dateint: str) -> str:
    """Parse a YYYYMMDDHHMMSS stamp and return its month key."""
    stamp = datetime.strptime(str(dateint), "%Y%m%d%H%M%S")
    year, month = stamp.year, stamp.month
    return "%04d-%02d" % (year, month)  # YYYY-MM

def doc_tone(v2tone: str) -> float:
    """Return the leading field of a comma-separated V2Tone value as a float; NaN on any parsing error."""
    # V2Tone looks like: "tone,positive,negative,polarity,..."
    try:
        leading = str(v2tone).split(",", 1)[0]
        return float(leading)
    except Exception:
        return math.nan

def process_gkg_frame(df: pd.DataFrame, counts, tone_sum, tone_n, uk_re, theme_re) -> int:
    """
    Update the monthly aggregates with the rows of df that pass both filters.

    A row passes when column V2LOCS_I matches uk_re AND column V2THEMES_I
    matches theme_re. counts, tone_sum and tone_n are modified in place;
    rows whose tone is NaN add to counts only.

    Returns the number of rows that passed.
    """
    keep = (
        df[V2LOCS_I].astype(str).str.contains(uk_re, na=False, regex=True)
        & df[V2THEMES_I].astype(str).str.contains(theme_re, na=False, regex=True)
    )
    picked = df.loc[keep, [DATE_I, V2TONE_I]]
    if picked.empty:
        return 0

    month_keys = picked[DATE_I].astype(str).apply(month_from_dateint)
    tone_vals = picked[V2TONE_I].apply(doc_tone)

    n_rows = 0
    for key, tone in zip(month_keys, tone_vals):
        n_rows += 1
        counts[key] += 1
        if math.isnan(tone):
            continue
        tone_sum[key] += tone
        tone_n[key] += 1

    return n_rows

def write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n):
    """
    Persist the current aggregates: the yearly CSV and the resume checkpoint.

    The CSV always has one row per calendar month of YEAR (January to
    December); months without data get docs=0 and avg_tone=None. The
    checkpoint JSON stores the processed-file counter, the last 20000
    processed URLs and the three aggregate mappings.
    """
    # "YYYY-MM" keys for the twelve month starts of YEAR
    month_keys = list(
        pd.date_range(f"{YEAR}-01-01", f"{YEAR}-12-01", freq="MS").strftime("%Y-%m")
    )

    records = []
    for key in month_keys:
        n_docs = int(counts.get(key, 0))
        n_with_tone = int(tone_n.get(key, 0))
        if n_with_tone:
            mean_tone = tone_sum.get(key, 0.0) / n_with_tone
        else:
            mean_tone = None
        records.append({"month": f"{key}-01", "docs": n_docs, "avg_tone": mean_tone})

    pd.DataFrame(records).sort_values("month").to_csv(YEARLY_CSV, index=False)

    checkpoint = dict(
        processed_count=int(processed_count),
        processed_urls_tail=processed_urls_tail[-20000:],
        counts=dict(counts),
        tone_sum=dict(tone_sum),
        tone_n=dict(tone_n),
    )
    with open(STATE_PATH, "w") as handle:
        json.dump(checkpoint, handle)

    print("Saved yearly output to:", YEARLY_CSV)


# Main execution
sess = make_session()
urls_all = build_gkg_urls(sess)
print(f"{YEAR}: {len(urls_all):,} files planned")

# Restore the running aggregates from the previous checkpoint, if there is one.
if STATE_PATH.exists():
    with open(STATE_PATH) as f:  # context manager so the handle is closed
        state = json.load(f)
    processed_count = int(state.get("processed_count", 0))
    processed_urls_tail = state.get("processed_urls_tail", [])
    counts   = defaultdict(int,   state.get("counts", {}))
    tone_sum = defaultdict(float, state.get("tone_sum", {}))
    tone_n   = defaultdict(int,   state.get("tone_n", {}))
    print(f"Resuming from checkpoint: {processed_count:,} files already processed.")
else:
    processed_count = 0
    processed_urls_tail = []
    counts, tone_sum, tone_n = defaultdict(int), defaultdict(float), defaultdict(int)

# URLs recorded in the checkpoint are skipped; everything else is attempted.
already = set(processed_urls_tail)
todo = [u for u in urls_all if u not in already]

if MAX_FILES is not None:
    todo = todo[:MAX_FILES]

print(f"{YEAR}: remaining this run: {len(todo):,} files")

uk_re = re.compile(UK_REGEX, re.I)
theme_re = re.compile(THEME_PATTERN, re.I)

t0 = time.time()
docs_kept = 0
failed_urls = []      # archives that could not be downloaded or parsed in this run
FAILURES_SHOWN = 10   # cap on per-file error lines so an outage does not flood the output

for i, url in enumerate(todo, 1):
    try:
        r = http_get(url, sess, timeout=90)
        with zipfile.ZipFile(io.BytesIO(r.content)) as z:
            inner = z.namelist()[0]
            with z.open(inner) as fh:
                df = pd.read_csv(
                    fh,
                    sep="\t",
                    header=None,
                    quoting=3,
                    low_memory=False,
                    usecols=[DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I],
                    dtype={DATE_I: str, V2THEMES_I: str, V2LOCS_I: str, V2TONE_I: str},
                )

        docs_kept += process_gkg_frame(df, counts, tone_sum, tone_n, uk_re, theme_re)
        processed_count += 1
        processed_urls_tail.append(url)

        if len(processed_urls_tail) > 20000:
            processed_urls_tail = processed_urls_tail[-20000:]

    except Exception as exc:
        # Best effort: one bad archive must not stop the run, but it must not
        # disappear silently either. The URL is not added to
        # processed_urls_tail, so the next run attempts it again.
        failed_urls.append(url)
        if len(failed_urls) <= FAILURES_SHOWN:
            print(f"{YEAR}: FAILED {url}: {type(exc).__name__}: {exc}")

    if i % PROGRESS_EVERY == 0:
        elapsed = max(1e-6, time.time() - t0)
        rate = i / elapsed
        eta_min = (len(todo) - i) / max(1e-6, rate) / 60
        print(f"{YEAR}: {i}/{len(todo)} files processed | approx ETA {eta_min:.1f} minutes | docs kept {docs_kept:,}")

    if i % CHECKPOINT_EVERY == 0:
        write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)

# Final write
write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)
print(f"{YEAR} completed. Total documents retained: {docs_kept:,}")
if failed_urls:
    print(f"{YEAR}: {len(failed_urls):,} of {len(todo):,} files failed this run and are not in the totals; re-run the cell to retry them.")

# Appending to master stitched file
y = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")

if MASTER_FILE.exists():
    m = pd.read_csv(MASTER_FILE, parse_dates=["month"])
    # The yearly rows are concatenated last, so keep="last" lets them replace
    # any rows already stored for the same month.
    m = (
        pd.concat([m, y], ignore_index=True)
        .drop_duplicates(subset="month", keep="last")
        .sort_values("month")
        .reset_index(drop=True)
    )
else:
    m = y

m.to_csv(MASTER_FILE, index=False)
print("Master dataset updated at:", MASTER_FILE)
print("Master last 12 rows:")
print(m.tail(12).to_string(index=False))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Outputs will be saved to: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2017
2017: 8,503 files planned
2017: remaining this run: 8,503 files
2017: 100/8503 files processed | approx ETA 326.0 minutes | docs kept 3,422
2017: 200/8503 files processed | approx ETA 285.9 minutes | docs kept 7,183
Saved yearly output to: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2017/Gdelt_uk_housing_monthly_2017.csv
2017: 300/8503 files processed | approx ETA 286.3 minutes | docs kept 12,752
2017: 400/8503 files processed | approx ETA 269.3 minutes | docs kept 17,049
Saved yearly output to: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2017/Gdelt_uk_housing_monthly_2017.csv
2017: 500/8503 files processed | approx ETA 276.3 minutes | docs kept 24,366
2017: 600/8503 files processed | approx ETA 273.7 minutes | docs kept 31,267
Saved yearly output to: /content/driv

2018

In [None]:
from google.colab import drive
drive.mount("/content/drive")

# Import everything this cell uses, so it also runs on a fresh kernel instead
# of relying on names left behind by another year's cell.
import io, re, zipfile, math, time, json
from pathlib import Path
from datetime import datetime
from collections import defaultdict

import pandas as pd
import requests
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# Configuration
YEAR = 2018

HOURLY_ONLY      = True   # build_gkg_urls keeps only stamps with "00" at positions 10-11
PROGRESS_EVERY   = 100    # print a progress line every N files
CHECKPOINT_EVERY = 200    # write the yearly CSV and state.json every N files
MAX_FILES        = None   # optional cap on files per run; None means no cap

# Both patterns are compiled with re.I and used as unanchored substring searches.
UK_REGEX = r'#(?:GBR|GB|UK|United Kingdom|England|Scotland|Wales|Northern Ireland)#'
THEME_PATTERN = r'(?:HOUS|MORTGAG|REMORTGAG|RENT|TENAN|REAL[ _]?ESTATE|PROPERTY|HOME[ _]?PRIC|HOUSE[ _]?PRIC|RIGHTMOVE|ZOOPLA|LANDLORD|BUY[ -]?TO[ -]?LET)'

# 0-based column positions read from each GKG file
DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I = 1, 8, 10, 15
MASTER_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"

# Inclusive window; compared as strings against the 14-digit stamp in each file name.
START_TS, END_TS = "20180101000000", "20181231235959"

# Output paths
BASE_DIR = Path("/content/drive/MyDrive/msc_project/GDELT")
BASE_DIR.mkdir(parents=True, exist_ok=True)

OUT_DIR = BASE_DIR / f"Gdelt_run_{YEAR}"
OUT_DIR.mkdir(parents=True, exist_ok=True)

YEARLY_CSV = OUT_DIR / f"Gdelt_uk_housing_monthly_{YEAR}.csv"
STATE_PATH = OUT_DIR / "state.json"
MASTER_FILE = BASE_DIR / "Gdelt_uk_housing_monthly_STITCHING.csv"

print("Outputs will be saved to:", OUT_DIR)


# HTTP and URL lists
def make_session():
    """Return a requests.Session with the "gdelt-colab/1.0" User-Agent and one shared Retry policy on both schemes."""
    session = requests.Session()
    session.headers.update({"User-Agent": "gdelt-colab/1.0"})

    retry_kwargs = {
        "total": 5,
        "backoff_factor": 0.5,
        "status_forcelist": [429, 500, 502, 503, 504],
    }
    policy = Retry(**retry_kwargs)

    session.mount("http://", HTTPAdapter(max_retries=policy))
    session.mount("https://", HTTPAdapter(max_retries=policy))
    return session

def http_get(url, sess, timeout=90):
    """GET url via sess, downgrading an https:// URL to http:// first; raises via raise_for_status() on an error status."""
    secure = "https://"
    target = "http://" + url[len(secure):] if url.startswith(secure) else url
    response = sess.get(target, timeout=timeout)
    response.raise_for_status()
    return response

def build_gkg_urls(sess):
    """
    Fetch MASTER_URL and return the sorted GKG archive URLs selected for this run.

    Kept are third fields of three-field lines that end in
    ``/gdeltv2/<14 digits>.gkg.csv.zip``, whose stamp lies in
    [START_TS, END_TS] and, if HOURLY_ONLY is truthy, has "00" at positions
    10-11. Every kept URL has "https://" replaced by "http://".
    """
    listing = http_get(MASTER_URL, sess, timeout=120).text
    selected = []

    for raw in listing.splitlines():
        fields = raw.split()
        if len(fields) == 3:
            link = fields[2]
            hit = re.search(r"/gdeltv2/(\d{14})\.gkg\.csv\.zip$", link)
            if hit:
                stamp = hit.group(1)
                in_window = START_TS <= stamp <= END_TS
                if in_window and (not HOURLY_ONLY or stamp[10:12] == "00"):
                    selected.append(link.replace("https://", "http://"))

    selected.sort()
    return selected

# Parsing and aggregation
def month_from_dateint(dateint):
    """Return the "YYYY-MM" key of a YYYYMMDDHHMMSS stamp (ValueError if it does not parse)."""
    parsed = datetime.strptime(str(dateint), "%Y%m%d%H%M%S")
    return "{:04d}-{:02d}".format(parsed.year, parsed.month)

def doc_tone(v2tone):
    """Return the first comma-separated field of v2tone as a float, or NaN when it cannot be parsed."""
    try:
        first_field, _, _ = str(v2tone).partition(",")
        return float(first_field)
    except Exception:
        return math.nan

def process_gkg_frame(df, counts, tone_sum, tone_n, uk_re, theme_re):
    """
    Fold one GKG frame into the running monthly aggregates.

    A row is retained when column V2LOCS_I matches uk_re and column V2THEMES_I
    matches theme_re. For each retained row counts[month] is incremented, and
    when its tone is not NaN tone_sum[month] and tone_n[month] are updated as
    well. The three mappings are mutated in place.

    Returns the number of retained rows.
    """
    loc_hit = df[V2LOCS_I].astype(str).str.contains(uk_re, na=False, regex=True)
    theme_hit = df[V2THEMES_I].astype(str).str.contains(theme_re, na=False, regex=True)

    hits = df.loc[loc_hit & theme_hit, [DATE_I, V2TONE_I]]
    if hits.empty:
        return 0

    month_keys = hits[DATE_I].astype(str).apply(month_from_dateint)
    tone_vals = hits[V2TONE_I].apply(doc_tone)

    for key, tone in zip(month_keys, tone_vals):
        counts[key] += 1
        if math.isnan(tone):
            continue
        tone_sum[key] += tone
        tone_n[key] += 1

    # Every retained row went through the loop exactly once.
    return len(hits)

# Saving yearly output and checkpoint
def write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n):
    """
    Write the monthly CSV (YEARLY_CSV) and the resume checkpoint (STATE_PATH).

    The CSV has one row per key of counts: "<key>-01", the document count and
    the mean tone (None when no tone was recorded for that month). The
    checkpoint keeps the processed counter, the last 20000 processed URLs and
    the three aggregate mappings.
    """
    rows = []
    for m in sorted(counts.keys()):
        avg = (tone_sum[m] / tone_n[m]) if tone_n[m] else None
        rows.append({"month": f"{m}-01", "docs": counts[m], "avg_tone": avg})

    # Name the columns explicitly. With no rows yet (nothing matched, or every
    # download so far failed) DataFrame(rows) would have no "month" column and
    # sort_values("month") would raise KeyError, aborting the run at a checkpoint.
    monthly = pd.DataFrame(rows, columns=["month", "docs", "avg_tone"])
    monthly.sort_values("month").to_csv(YEARLY_CSV, index=False)

    with open(STATE_PATH, "w") as f:
        json.dump({
            "processed_count": processed_count,
            "processed_urls_tail": processed_urls_tail[-20000:],
            "counts": dict(counts),
            "tone_sum": dict(tone_sum),
            "tone_n": dict(tone_n)
        }, f)

    print("Saved yearly output:", YEARLY_CSV)

# Main execution
sess = make_session()
urls_all = build_gkg_urls(sess)
print(f"{YEAR}: {len(urls_all):,} files planned")

# Restore the running aggregates from the previous checkpoint, if there is one.
if STATE_PATH.exists():
    with open(STATE_PATH) as f:  # context manager so the handle is closed
        state = json.load(f)
    processed_count = state.get("processed_count", 0)
    processed_urls_tail = state.get("processed_urls_tail", [])
    counts   = defaultdict(int, state.get("counts", {}))
    tone_sum = defaultdict(float, state.get("tone_sum", {}))
    tone_n   = defaultdict(int, state.get("tone_n", {}))
    print("Resuming from checkpoint")
else:
    processed_count = 0
    processed_urls_tail = []
    counts, tone_sum, tone_n = defaultdict(int), defaultdict(float), defaultdict(int)

# Build the lookup set once; building it inside the comprehension re-creates
# it for every URL.
already = set(processed_urls_tail)
todo = [u for u in urls_all if u not in already]
if MAX_FILES is not None:
    todo = todo[:MAX_FILES]

uk_re = re.compile(UK_REGEX, re.I)
theme_re = re.compile(THEME_PATTERN, re.I)

t0 = time.time()
docs_kept = 0
failed_urls = []      # archives that could not be downloaded or parsed in this run
FAILURES_SHOWN = 10   # cap on per-file error lines so an outage does not flood the output

for i, url in enumerate(todo, 1):
    try:
        with zipfile.ZipFile(io.BytesIO(http_get(url, sess).content)) as z:
            name = z.namelist()[0]
            with z.open(name) as fh:
                df = pd.read_csv(
                    fh,
                    sep="\t",
                    header=None,
                    quoting=3,
                    low_memory=False,
                    usecols=[DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I],
                    # read everything as text, as the 2016/2017 cells do
                    dtype={DATE_I: str, V2THEMES_I: str, V2LOCS_I: str, V2TONE_I: str},
                )

        docs_kept += process_gkg_frame(df, counts, tone_sum, tone_n, uk_re, theme_re)
        processed_count += 1
        processed_urls_tail.append(url)

    except Exception as exc:
        # Best effort: one bad archive must not stop the run, but it must not
        # disappear silently either. The URL is not added to
        # processed_urls_tail, so the next run attempts it again.
        failed_urls.append(url)
        if len(failed_urls) <= FAILURES_SHOWN:
            print(f"{YEAR}: FAILED {url}: {type(exc).__name__}: {exc}")

    if i % PROGRESS_EVERY == 0:
        rate = i / max(1e-6, time.time() - t0)
        eta = (len(todo) - i) / rate / 60
        print(f"{YEAR}: {i}/{len(todo)} files processed | ETA ≈ {eta:.1f} minutes")

    if i % CHECKPOINT_EVERY == 0:
        write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)

write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)
print(f"{YEAR} completed. Documents retained: {docs_kept:,}")
if failed_urls:
    print(f"{YEAR}: {len(failed_urls):,} of {len(todo):,} files failed this run and are not in the totals; re-run the cell to retry them.")

# Appending to master stitched file
y = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")
if MASTER_FILE.exists():
    m = pd.read_csv(MASTER_FILE, parse_dates=["month"])
    # The yearly rows are concatenated last, so keep="last" lets them replace
    # any rows already stored for the same month.
    m = (
        pd.concat([m, y], ignore_index=True)
        .drop_duplicates(subset="month", keep="last")
        .sort_values("month")
        .reset_index(drop=True)
    )
else:
    m = y

m.to_csv(MASTER_FILE, index=False)
print("Master dataset updated:", MASTER_FILE)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Outputs will be saved to: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2018
2018: 8,490 files planned
2018: 100/8490 files processed | ETA ≈ 315.6 minutes
2018: 200/8490 files processed | ETA ≈ 267.4 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2018/Gdelt_uk_housing_monthly_2018.csv
2018: 300/8490 files processed | ETA ≈ 259.3 minutes
2018: 400/8490 files processed | ETA ≈ 254.7 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2018/Gdelt_uk_housing_monthly_2018.csv
2018: 500/8490 files processed | ETA ≈ 244.6 minutes
2018: 600/8490 files processed | ETA ≈ 242.6 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2018/Gdelt_uk_housing_monthly_2018.csv
2018: 700/8490 files processed | ETA ≈ 239.7 minutes
2018: 800/8490 files processed | ETA ≈ 232.4 minutes
Saved 

2019

In [None]:
!pip -q install pandas requests

# Mount Google Drive; every output path of this cell lives under /content/drive.
from google.colab import drive
drive.mount("/content/drive")

import io, re, zipfile, math, time, json, logging
from pathlib import Path
from datetime import datetime
from collections import defaultdict

import pandas as pd
import requests
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# Only show urllib3 log records at ERROR level or above.
logging.getLogger("urllib3").setLevel(logging.ERROR)

# Configuration
YEAR = 2019

HOURLY_ONLY      = True   # build_gkg_urls keeps only stamps with "00" at positions 10-11
PROGRESS_EVERY   = 100    # print a progress line every N files
CHECKPOINT_EVERY = 200    # write the yearly CSV and state.json every N files
MAX_FILES        = None   # optional cap on files per run; None means no cap

# Both patterns are compiled with re.I and applied with str.contains, i.e. as
# unanchored substring searches.
# NOTE(review): THEME_PATTERN therefore also matches inside longer tokens
# (e.g. "RENT" inside "PARENT" or "CURRENT", "PROPERTY" inside
# "INTELLECTUAL_PROPERTY") — confirm this breadth is intended.
UK_REGEX = r'#(?:GBR|GB|UK|United Kingdom|England|Scotland|Wales|Northern Ireland)#'
THEME_PATTERN = r'(?:HOUS|MORTGAG|REMORTGAG|RENT|TENAN|REAL[ _]?ESTATE|PROPERTY|HOME[ _]?PRIC|HOUSE[ _]?PRIC|RIGHTMOVE|ZOOPLA|LANDLORD|BUY[ -]?TO[ -]?LET)'

# 0-based column positions read from each GKG file
DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I = 1, 8, 10, 15
MASTER_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"

# Inclusive window; compared as strings against the 14-digit stamp in each file name.
START_TS, END_TS = "20190101000000", "20191231235959"

# Output paths
BASE_DIR = Path("/content/drive/MyDrive/msc_project/GDELT")
BASE_DIR.mkdir(parents=True, exist_ok=True)

OUT_DIR = BASE_DIR / f"Gdelt_run_{YEAR}"
OUT_DIR.mkdir(parents=True, exist_ok=True)

YEARLY_CSV = OUT_DIR / f"Gdelt_uk_housing_monthly_{YEAR}.csv"   # monthly aggregates for YEAR
STATE_PATH = OUT_DIR / "state.json"                             # resume checkpoint
MASTER_FILE = BASE_DIR / "Gdelt_uk_housing_monthly_STITCHING.csv"  # all years stitched together

print("Outputs will be saved to:", OUT_DIR)


# HTTP and URL lists
def make_session():
    """
    Create a requests.Session with a custom User-Agent and retrying adapters.

    The same Retry policy (total=5, backoff_factor=0.5, retry on HTTP
    429/500/502/503/504) is mounted for both http:// and https://.
    """
    policy = Retry(total=5, backoff_factor=0.5,
                   status_forcelist=[429, 500, 502, 503, 504])
    session = requests.Session()
    session.headers.update({"User-Agent": "gdelt-colab/1.0"})
    for scheme in ("http://", "https://"):
        session.mount(scheme, HTTPAdapter(max_retries=policy))
    return session

def http_get(url, sess, timeout=90):
    """Fetch url with sess (https:// is rewritten to http://) and return the response after raise_for_status()."""
    if url.startswith("https://"):
        # keep everything after the scheme and prepend the plain-HTTP one
        url = "".join(("http://", url[len("https://"):]))
    reply = sess.get(url, timeout=timeout)
    reply.raise_for_status()
    return reply

def build_gkg_urls(sess):
    """
    Fetch MASTER_URL and return the sorted GKG archive URLs selected for this run.

    Kept are third fields of three-field lines that end in
    ``/gdeltv2/<14 digits>.gkg.csv.zip``, whose stamp lies in
    [START_TS, END_TS] and, if HOURLY_ONLY is truthy, has "00" at positions
    10-11. Every kept URL has "https://" replaced by "http://".
    """
    name_re = re.compile(r"/gdeltv2/(\d{14})\.gkg\.csv\.zip$")
    body = http_get(MASTER_URL, sess, timeout=120).text

    picked = []
    for entry in body.splitlines():
        cols = entry.split()
        found = name_re.search(cols[2]) if len(cols) == 3 else None
        if found is None:
            continue

        stamp = found.group(1)
        if stamp < START_TS or stamp > END_TS:
            continue
        if HOURLY_ONLY and stamp[10:12] != "00":
            continue

        picked.append(cols[2].replace("https://", "http://"))

    return sorted(picked)

# Parsing and aggregation
def month_from_dateint(dateint):
    """Parse a YYYYMMDDHHMMSS stamp and return its "YYYY-MM" month key."""
    stamp = datetime.strptime(str(dateint), "%Y%m%d%H%M%S")
    year, month = stamp.year, stamp.month
    return "%04d-%02d" % (year, month)

def doc_tone(v2tone):
    """Return the leading field of a comma-separated tone value as a float; NaN on any parsing error."""
    try:
        leading = str(v2tone).split(",", 1)[0]
        return float(leading)
    except Exception:
        return math.nan

def process_gkg_frame(df, counts, tone_sum, tone_n, uk_re, theme_re):
    """
    Update the monthly aggregates with the rows of df that pass both filters.

    A row passes when column V2LOCS_I matches uk_re AND column V2THEMES_I
    matches theme_re. counts, tone_sum and tone_n are modified in place;
    rows whose tone is NaN add to counts only.

    Returns the number of rows that passed.
    """
    keep = (
        df[V2LOCS_I].astype(str).str.contains(uk_re, na=False, regex=True)
        & df[V2THEMES_I].astype(str).str.contains(theme_re, na=False, regex=True)
    )
    picked = df.loc[keep, [DATE_I, V2TONE_I]]
    if picked.empty:
        return 0

    month_keys = picked[DATE_I].astype(str).apply(month_from_dateint)
    tone_vals = picked[V2TONE_I].apply(doc_tone)

    n_rows = 0
    for key, tone in zip(month_keys, tone_vals):
        n_rows += 1
        counts[key] += 1
        if math.isnan(tone):
            continue
        tone_sum[key] += tone
        tone_n[key] += 1

    return n_rows

# Saving yearly output and checkpoint
def write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n):
    """
    Write the monthly CSV (YEARLY_CSV) and the resume checkpoint (STATE_PATH).

    The CSV has one row per key of counts: "<key>-01", the document count and
    the mean tone (None when no tone was recorded for that month). The
    checkpoint keeps the processed counter, the last 20000 processed URLs and
    the three aggregate mappings.
    """
    rows = []
    for m in sorted(counts.keys()):
        avg = (tone_sum[m] / tone_n[m]) if tone_n[m] else None
        rows.append({"month": f"{m}-01", "docs": counts[m], "avg_tone": avg})

    # Name the columns explicitly. With no rows yet (nothing matched, or every
    # download so far failed) DataFrame(rows) would have no "month" column and
    # sort_values("month") would raise KeyError, aborting the run at a checkpoint.
    monthly = pd.DataFrame(rows, columns=["month", "docs", "avg_tone"])
    monthly.sort_values("month").to_csv(YEARLY_CSV, index=False)

    with open(STATE_PATH, "w") as f:
        json.dump({
            "processed_count": processed_count,
            "processed_urls_tail": processed_urls_tail[-20000:],
            "counts": dict(counts),
            "tone_sum": dict(tone_sum),
            "tone_n": dict(tone_n)
        }, f)

    print("Saved yearly output:", YEARLY_CSV)

# Main execution
sess = make_session()
urls_all = build_gkg_urls(sess)
print(f"{YEAR}: {len(urls_all):,} files planned")

# Restore the running aggregates from the previous checkpoint, if there is one.
if STATE_PATH.exists():
    with open(STATE_PATH) as f:  # context manager so the handle is closed
        state = json.load(f)
    processed_count = state.get("processed_count", 0)
    processed_urls_tail = state.get("processed_urls_tail", [])
    counts   = defaultdict(int, state.get("counts", {}))
    tone_sum = defaultdict(float, state.get("tone_sum", {}))
    tone_n   = defaultdict(int, state.get("tone_n", {}))
    print("Resuming from checkpoint")
else:
    processed_count = 0
    processed_urls_tail = []
    counts, tone_sum, tone_n = defaultdict(int), defaultdict(float), defaultdict(int)

# Build the lookup set once; building it inside the comprehension re-creates
# it for every URL.
already = set(processed_urls_tail)
todo = [u for u in urls_all if u not in already]
if MAX_FILES is not None:
    todo = todo[:MAX_FILES]

uk_re = re.compile(UK_REGEX, re.I)
theme_re = re.compile(THEME_PATTERN, re.I)

t0 = time.time()
docs_kept = 0
failed_urls = []      # archives that could not be downloaded or parsed in this run
FAILURES_SHOWN = 10   # cap on per-file error lines so an outage does not flood the output

for i, url in enumerate(todo, 1):
    try:
        with zipfile.ZipFile(io.BytesIO(http_get(url, sess).content)) as z:
            name = z.namelist()[0]
            with z.open(name) as fh:
                df = pd.read_csv(
                    fh,
                    sep="\t",
                    header=None,
                    quoting=3,
                    low_memory=False,
                    usecols=[DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I],
                    # read everything as text, as the 2016/2017 cells do
                    dtype={DATE_I: str, V2THEMES_I: str, V2LOCS_I: str, V2TONE_I: str},
                )

        docs_kept += process_gkg_frame(df, counts, tone_sum, tone_n, uk_re, theme_re)
        processed_count += 1
        processed_urls_tail.append(url)

    except Exception as exc:
        # Best effort: one bad archive must not stop the run, but it must not
        # disappear silently either. The URL is not added to
        # processed_urls_tail, so the next run attempts it again.
        failed_urls.append(url)
        if len(failed_urls) <= FAILURES_SHOWN:
            print(f"{YEAR}: FAILED {url}: {type(exc).__name__}: {exc}")

    if i % PROGRESS_EVERY == 0:
        rate = i / max(1e-6, time.time() - t0)
        eta = (len(todo) - i) / rate / 60
        print(f"{YEAR}: {i}/{len(todo)} files processed | ETA ≈ {eta:.1f} minutes")

    if i % CHECKPOINT_EVERY == 0:
        write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)

write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)
print(f"{YEAR} completed. Documents retained: {docs_kept:,}")
if failed_urls:
    print(f"{YEAR}: {len(failed_urls):,} of {len(todo):,} files failed this run and are not in the totals; re-run the cell to retry them.")

# Appending to master stitched file
y = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")
if MASTER_FILE.exists():
    m = pd.read_csv(MASTER_FILE, parse_dates=["month"])
    # The yearly rows are concatenated last, so keep="last" lets them replace
    # any rows already stored for the same month.
    m = (
        pd.concat([m, y], ignore_index=True)
        .drop_duplicates(subset="month", keep="last")
        .sort_values("month")
        .reset_index(drop=True)
    )
else:
    m = y

m.to_csv(MASTER_FILE, index=False)
print("Master dataset updated:", MASTER_FILE)

Mounted at /content/drive
Outputs will be saved to: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2019
2019: 8,757 files planned
Resuming from checkpoint
2019: 100/4369 files processed | ETA ≈ 44.4 minutes
2019: 200/4369 files processed | ETA ≈ 40.0 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2019/Gdelt_uk_housing_monthly_2019.csv
2019: 300/4369 files processed | ETA ≈ 43.5 minutes
2019: 400/4369 files processed | ETA ≈ 47.8 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2019/Gdelt_uk_housing_monthly_2019.csv
2019: 500/4369 files processed | ETA ≈ 47.2 minutes
2019: 600/4369 files processed | ETA ≈ 47.9 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2019/Gdelt_uk_housing_monthly_2019.csv
2019: 700/4369 files processed | ETA ≈ 47.8 minutes
2019: 800/4369 files processed | ETA ≈ 46.6 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2019/Gdelt_uk_housin

2020

In [None]:
from google.colab import drive
drive.mount("/content/drive")

# Configuration
# NOTE(review): this cell uses Path, re, requests, Retry, HTTPAdapter, json, math,
# time, datetime, zipfile, io, pd and defaultdict without importing them here;
# they must already be in the kernel from an earlier cell.
YEAR = 2020

HOURLY_ONLY      = True   # keep only files whose timestamp minute field is "00"
PROGRESS_EVERY   = 100    # print a progress line every N files
CHECKPOINT_EVERY = 200    # rewrite the yearly CSV and state.json every N files
MAX_FILES        = None   # optional cap on files handled in one run (None = no cap)

# Matched case-insensitively against the V2LOCS_I column of each file
UK_REGEX = r'#(?:GBR|GB|UK|United Kingdom|England|Scotland|Wales|Northern Ireland)#'
# Matched case-insensitively against the V2THEMES_I column of each file.
# NOTE(review): the alternatives are unanchored substrings, so "RENT" also matches
# inside "PARENT"/"CURRENT" and "HOUS" inside "WAREHOUSE"; "REMORTGAG" and
# "HOUSE[ _]?PRIC" are already covered by "MORTGAG" and "HOUS". Confirm this
# breadth is intended before relying on the document counts.
THEME_PATTERN = r'(?:HOUS|MORTGAG|REMORTGAG|RENT|TENAN|REAL[ _]?ESTATE|PROPERTY|HOME[ _]?PRIC|HOUSE[ _]?PRIC|RIGHTMOVE|ZOOPLA|LANDLORD|BUY[ -]?TO[ -]?LET)'

# Zero-based positions of the columns read from each header-less GKG file
DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I = 1, 8, 10, 15
MASTER_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"

# Derived from YEAR so the window cannot drift from the year label when this cell is copied
START_TS, END_TS = f"{YEAR}0101000000", f"{YEAR}1231235959"

# Output paths
BASE_DIR = Path("/content/drive/MyDrive/msc_project/GDELT")
BASE_DIR.mkdir(parents=True, exist_ok=True)

OUT_DIR = BASE_DIR / f"Gdelt_run_{YEAR}"
OUT_DIR.mkdir(parents=True, exist_ok=True)

YEARLY_CSV = OUT_DIR / f"Gdelt_uk_housing_monthly_{YEAR}.csv"   # per-year monthly aggregates
STATE_PATH = OUT_DIR / "state.json"                              # resume checkpoint for this year
MASTER_FILE = BASE_DIR / "Gdelt_uk_housing_monthly_STITCHING.csv"  # cumulative file shared by all years

print("Outputs will be saved to:", OUT_DIR)


# HTTP and URL lists
def make_session():
    """Create a requests session with a fixed User-Agent and automatic retries.

    Both the http:// and https:// prefixes get an adapter that retries up to
    5 times (backoff factor 0.5) on status 429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
    )

    session = requests.Session()
    session.headers.update({"User-Agent": "gdelt-colab/1.0"})
    for scheme in ("http://", "https://"):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return session

def http_get(url, sess, timeout=90):
    """GET `url` through `sess` and return the response.

    A leading "https://" is rewritten to "http://" before the request.
    Raises whatever `raise_for_status()` raises on an error status.
    """
    secure_prefix = "https://"
    target = url
    if target.startswith(secure_prefix):
        target = "http://" + target[len(secure_prefix):]

    response = sess.get(target, timeout=timeout)
    response.raise_for_status()
    return response

def build_gkg_urls(sess):
    """Return the sorted list of GKG zip URLs to process for the configured window.

    Downloads the listing at MASTER_URL and keeps the third field of every
    three-field line whose URL ends in /gdeltv2/<14 digits>.gkg.csv.zip, whose
    timestamp lies in [START_TS, END_TS], and (when HOURLY_ONLY is set) whose
    minute field is "00". Any "https://" in a kept URL is rewritten to "http://".
    """
    gkg_name = re.compile(r"/gdeltv2/(\d{14})\.gkg\.csv\.zip$")
    listing = http_get(MASTER_URL, sess, timeout=120).text

    selected = []
    for fields in map(str.split, listing.splitlines()):
        if len(fields) != 3:
            continue

        link = fields[2]
        hit = gkg_name.search(link)
        if hit is None:
            continue

        stamp = hit.group(1)
        in_window = START_TS <= stamp <= END_TS
        on_the_hour = stamp[10:12] == "00"  # characters 10-11 are the minutes
        if in_window and (on_the_hour or not HOURLY_ONLY):
            selected.append(link.replace("https://", "http://"))

    selected.sort()
    return selected

# Parsing and aggregation
def month_from_dateint(dateint):
    """Return the "YYYY-MM" month key for a YYYYMMDDHHMMSS timestamp.

    The value is stringified and parsed with "%Y%m%d%H%M%S"; a value that does
    not parse with that format raises ValueError.
    """
    stamp = datetime.strptime(str(dateint), "%Y%m%d%H%M%S")
    return "%04d-%02d" % (stamp.year, stamp.month)

def doc_tone(v2tone):
    """Return the first comma-separated field of `v2tone` as a float.

    Returns NaN when that field cannot be converted to a float.
    """
    try:
        leading_field, _, _ = str(v2tone).partition(",")
        return float(leading_field)
    except Exception:
        return math.nan

def process_gkg_frame(df, counts, tone_sum, tone_n, uk_re, theme_re):
    """Add the rows of `df` that match both regexes to the monthly accumulators.

    A row is kept when column V2LOCS_I matches `uk_re` and column V2THEMES_I
    matches `theme_re`. Each kept row increments `counts` for its month; rows
    whose tone is not NaN also add to `tone_sum` and increment `tone_n`.
    The three mappings are updated in place.

    Returns the number of rows kept.
    """
    location_text = df[V2LOCS_I].astype(str)
    theme_text = df[V2THEMES_I].astype(str)
    is_uk = location_text.str.contains(uk_re, na=False, regex=True)
    is_on_theme = theme_text.str.contains(theme_re, na=False, regex=True)

    matched = df.loc[is_uk & is_on_theme, [DATE_I, V2TONE_I]]
    if matched.empty:
        return 0

    month_keys = matched[DATE_I].astype(str).apply(month_from_dateint)
    tone_values = matched[V2TONE_I].apply(doc_tone)

    # Accumulate row by row so sums are added in file order
    for month_key, tone in zip(month_keys, tone_values):
        counts[month_key] += 1
        if math.isnan(tone):
            continue
        tone_sum[month_key] += tone
        tone_n[month_key] += 1

    return len(matched)

# Saving yearly output and checkpoint
def write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n):
    """Write the yearly monthly CSV (YEARLY_CSV) and the resume checkpoint (STATE_PATH).

    Parameters
    ----------
    processed_count : number of files processed so far (stored in the checkpoint).
    processed_urls_tail : list of processed URLs; only the last 20000 are stored.
    counts, tone_sum, tone_n : per-month accumulators keyed by "YYYY-MM".

    The CSV has one row per month in `counts` with columns month ("YYYY-MM-01"),
    docs and avg_tone (tone_sum / tone_n, empty when tone_n is 0).
    """
    rows = [
        {
            "month": f"{m}-01",
            "docs": counts[m],
            "avg_tone": (tone_sum[m] / tone_n[m]) if tone_n[m] else None,
        }
        for m in sorted(counts)
    ]

    # Explicit columns keep the sort and the CSV header valid when nothing has
    # matched yet; an empty frame without them raises KeyError on "month".
    monthly = pd.DataFrame(rows, columns=["month", "docs", "avg_tone"])
    monthly.sort_values("month").to_csv(YEARLY_CSV, index=False)

    state = {
        "processed_count": processed_count,
        "processed_urls_tail": processed_urls_tail[-20000:],
        "counts": dict(counts),
        "tone_sum": dict(tone_sum),
        "tone_n": dict(tone_n)
    }
    # Write to a sibling temp file and rename, so an interrupted run cannot leave
    # a truncated state.json that would break the next resume.
    tmp_path = STATE_PATH.with_name(STATE_PATH.name + ".tmp")
    with open(tmp_path, "w") as f:
        json.dump(state, f)
    tmp_path.replace(STATE_PATH)

    print("Saved yearly output:", YEARLY_CSV)

# Main execution
sess = make_session()
urls_all = build_gkg_urls(sess)
print(f"{YEAR}: {len(urls_all):,} files planned")

# Restore the accumulators from the checkpoint if a previous run left one behind
if STATE_PATH.exists():
    with open(STATE_PATH) as f:  # context manager so the handle is closed
        state = json.load(f)
    processed_count = state.get("processed_count", 0)
    processed_urls_tail = state.get("processed_urls_tail", [])
    counts   = defaultdict(int, state.get("counts", {}))
    tone_sum = defaultdict(float, state.get("tone_sum", {}))
    tone_n   = defaultdict(int, state.get("tone_n", {}))
    print("Resuming from checkpoint")
else:
    processed_count = 0
    processed_urls_tail = []
    counts, tone_sum, tone_n = defaultdict(int), defaultdict(float), defaultdict(int)

# Build the lookup set once; rebuilding it for every URL made this filter quadratic
done_urls = set(processed_urls_tail)
todo = [u for u in urls_all if u not in done_urls]
if MAX_FILES is not None:
    todo = todo[:MAX_FILES]

uk_re = re.compile(UK_REGEX, re.I)
theme_re = re.compile(THEME_PATTERN, re.I)

t0 = time.time()
docs_kept = 0
failed_urls = []          # files skipped in this run because of an error
FAILURE_LOG_LIMIT = 20    # print details for the first N failures only

for i, url in enumerate(todo, 1):
    try:
        payload = io.BytesIO(http_get(url, sess).content)
        # Close the archive and its member as soon as the frame is loaded
        with zipfile.ZipFile(payload) as z:
            name = z.namelist()[0]
            with z.open(name) as member:
                df = pd.read_csv(
                    member,
                    sep="\t",
                    header=None,
                    quoting=3,
                    low_memory=False,
                    usecols=[DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I]
                )

        docs_kept += process_gkg_frame(df, counts, tone_sum, tone_n, uk_re, theme_re)
        processed_count += 1
        processed_urls_tail.append(url)

    except Exception as exc:
        # Still best-effort (one bad file must not abort the run), but record the
        # skip so gaps in the data are visible. Failed URLs are not added to
        # processed_urls_tail, so re-running the cell retries them.
        failed_urls.append(url)
        if len(failed_urls) <= FAILURE_LOG_LIMIT:
            print(f"{YEAR}: skipped {url} ({type(exc).__name__}: {exc})")

    if i % PROGRESS_EVERY == 0:
        rate = i / max(1e-6, time.time() - t0)
        eta = (len(todo) - i) / rate / 60
        print(f"{YEAR}: {i}/{len(todo)} files processed | ETA ≈ {eta:.1f} minutes")

    if i % CHECKPOINT_EVERY == 0:
        write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)

write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)
print(f"{YEAR} completed. Documents retained: {docs_kept:,}")
if failed_urls:
    print(f"{YEAR}: {len(failed_urls):,} files failed and were skipped; re-run this cell to retry them")

# Appending to master stitched file
y = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")

# Default to this year's rows alone; replaced below when a stitched file already exists
m = y
if MASTER_FILE.exists():
    stitched_so_far = pd.read_csv(MASTER_FILE, parse_dates=["month"])
    merged = pd.concat([stitched_so_far, y], ignore_index=True)
    # keep="last": rows from this year's run override older rows for the same month
    merged = merged.drop_duplicates(subset="month", keep="last")
    m = merged.sort_values("month").reset_index(drop=True)

m.to_csv(MASTER_FILE, index=False)
print("Master dataset updated:", MASTER_FILE)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Outputs will be saved to: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2020
2020: 8,122 files planned
2020: 100/8122 files processed | ETA ≈ 84.8 minutes
2020: 200/8122 files processed | ETA ≈ 93.4 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2020/Gdelt_uk_housing_monthly_2020.csv
2020: 300/8122 files processed | ETA ≈ 92.6 minutes
2020: 400/8122 files processed | ETA ≈ 97.9 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2020/Gdelt_uk_housing_monthly_2020.csv
2020: 500/8122 files processed | ETA ≈ 95.0 minutes
2020: 600/8122 files processed | ETA ≈ 96.1 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2020/Gdelt_uk_housing_monthly_2020.csv
2020: 700/8122 files processed | ETA ≈ 95.7 minutes
2020: 800/8122 files processed | ETA ≈ 93.3 minutes
Saved yearly o

2021

In [None]:
from google.colab import drive
drive.mount("/content/drive")

# Configuration
# NOTE(review): this cell uses Path, re, requests, Retry, HTTPAdapter, json, math,
# time, datetime, zipfile, io, pd and defaultdict without importing them here;
# they must already be in the kernel from an earlier cell.
YEAR = 2021

HOURLY_ONLY      = True   # keep only files whose timestamp minute field is "00"
PROGRESS_EVERY   = 100    # print a progress line every N files
CHECKPOINT_EVERY = 200    # rewrite the yearly CSV and state.json every N files
MAX_FILES        = None   # optional cap on files handled in one run (None = no cap)

# Matched case-insensitively against the V2LOCS_I column of each file
UK_REGEX = r'#(?:GBR|GB|UK|United Kingdom|England|Scotland|Wales|Northern Ireland)#'
# Matched case-insensitively against the V2THEMES_I column of each file.
# NOTE(review): the alternatives are unanchored substrings, so "RENT" also matches
# inside "PARENT"/"CURRENT" and "HOUS" inside "WAREHOUSE"; "REMORTGAG" and
# "HOUSE[ _]?PRIC" are already covered by "MORTGAG" and "HOUS". Confirm this
# breadth is intended before relying on the document counts.
THEME_PATTERN = r'(?:HOUS|MORTGAG|REMORTGAG|RENT|TENAN|REAL[ _]?ESTATE|PROPERTY|HOME[ _]?PRIC|HOUSE[ _]?PRIC|RIGHTMOVE|ZOOPLA|LANDLORD|BUY[ -]?TO[ -]?LET)'

# Zero-based positions of the columns read from each header-less GKG file
DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I = 1, 8, 10, 15
MASTER_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"

# Derived from YEAR so the window cannot drift from the year label when this cell is copied
START_TS, END_TS = f"{YEAR}0101000000", f"{YEAR}1231235959"

# Output paths
BASE_DIR = Path("/content/drive/MyDrive/msc_project/GDELT")
BASE_DIR.mkdir(parents=True, exist_ok=True)

OUT_DIR = BASE_DIR / f"Gdelt_run_{YEAR}"
OUT_DIR.mkdir(parents=True, exist_ok=True)

YEARLY_CSV = OUT_DIR / f"Gdelt_uk_housing_monthly_{YEAR}.csv"   # per-year monthly aggregates
STATE_PATH = OUT_DIR / "state.json"                              # resume checkpoint for this year
MASTER_FILE = BASE_DIR / "Gdelt_uk_housing_monthly_STITCHING.csv"  # cumulative file shared by all years

print("Outputs will be saved to:", OUT_DIR)


# HTTP and URL lists
def make_session():
    """Create a requests session with a fixed User-Agent and automatic retries.

    Both the http:// and https:// prefixes get an adapter that retries up to
    5 times (backoff factor 0.5) on status 429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
    )

    session = requests.Session()
    session.headers.update({"User-Agent": "gdelt-colab/1.0"})
    for scheme in ("http://", "https://"):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return session

def http_get(url, sess, timeout=90):
    """GET `url` through `sess` and return the response.

    A leading "https://" is rewritten to "http://" before the request.
    Raises whatever `raise_for_status()` raises on an error status.
    """
    secure_prefix = "https://"
    target = url
    if target.startswith(secure_prefix):
        target = "http://" + target[len(secure_prefix):]

    response = sess.get(target, timeout=timeout)
    response.raise_for_status()
    return response

def build_gkg_urls(sess):
    """Return the sorted list of GKG zip URLs to process for the configured window.

    Downloads the listing at MASTER_URL and keeps the third field of every
    three-field line whose URL ends in /gdeltv2/<14 digits>.gkg.csv.zip, whose
    timestamp lies in [START_TS, END_TS], and (when HOURLY_ONLY is set) whose
    minute field is "00". Any "https://" in a kept URL is rewritten to "http://".
    """
    gkg_name = re.compile(r"/gdeltv2/(\d{14})\.gkg\.csv\.zip$")
    listing = http_get(MASTER_URL, sess, timeout=120).text

    selected = []
    for fields in map(str.split, listing.splitlines()):
        if len(fields) != 3:
            continue

        link = fields[2]
        hit = gkg_name.search(link)
        if hit is None:
            continue

        stamp = hit.group(1)
        in_window = START_TS <= stamp <= END_TS
        on_the_hour = stamp[10:12] == "00"  # characters 10-11 are the minutes
        if in_window and (on_the_hour or not HOURLY_ONLY):
            selected.append(link.replace("https://", "http://"))

    selected.sort()
    return selected

# Parsing and aggregation
def month_from_dateint(dateint):
    """Return the "YYYY-MM" month key for a YYYYMMDDHHMMSS timestamp.

    The value is stringified and parsed with "%Y%m%d%H%M%S"; a value that does
    not parse with that format raises ValueError.
    """
    stamp = datetime.strptime(str(dateint), "%Y%m%d%H%M%S")
    return "%04d-%02d" % (stamp.year, stamp.month)

def doc_tone(v2tone):
    """Return the first comma-separated field of `v2tone` as a float.

    Returns NaN when that field cannot be converted to a float.
    """
    try:
        leading_field, _, _ = str(v2tone).partition(",")
        return float(leading_field)
    except Exception:
        return math.nan

def process_gkg_frame(df, counts, tone_sum, tone_n, uk_re, theme_re):
    """Add the rows of `df` that match both regexes to the monthly accumulators.

    A row is kept when column V2LOCS_I matches `uk_re` and column V2THEMES_I
    matches `theme_re`. Each kept row increments `counts` for its month; rows
    whose tone is not NaN also add to `tone_sum` and increment `tone_n`.
    The three mappings are updated in place.

    Returns the number of rows kept.
    """
    location_text = df[V2LOCS_I].astype(str)
    theme_text = df[V2THEMES_I].astype(str)
    is_uk = location_text.str.contains(uk_re, na=False, regex=True)
    is_on_theme = theme_text.str.contains(theme_re, na=False, regex=True)

    matched = df.loc[is_uk & is_on_theme, [DATE_I, V2TONE_I]]
    if matched.empty:
        return 0

    month_keys = matched[DATE_I].astype(str).apply(month_from_dateint)
    tone_values = matched[V2TONE_I].apply(doc_tone)

    # Accumulate row by row so sums are added in file order
    for month_key, tone in zip(month_keys, tone_values):
        counts[month_key] += 1
        if math.isnan(tone):
            continue
        tone_sum[month_key] += tone
        tone_n[month_key] += 1

    return len(matched)

# Saving yearly output and checkpoint
def write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n):
    """Write the yearly monthly CSV (YEARLY_CSV) and the resume checkpoint (STATE_PATH).

    Parameters
    ----------
    processed_count : number of files processed so far (stored in the checkpoint).
    processed_urls_tail : list of processed URLs; only the last 20000 are stored.
    counts, tone_sum, tone_n : per-month accumulators keyed by "YYYY-MM".

    The CSV has one row per month in `counts` with columns month ("YYYY-MM-01"),
    docs and avg_tone (tone_sum / tone_n, empty when tone_n is 0).
    """
    rows = [
        {
            "month": f"{m}-01",
            "docs": counts[m],
            "avg_tone": (tone_sum[m] / tone_n[m]) if tone_n[m] else None,
        }
        for m in sorted(counts)
    ]

    # Explicit columns keep the sort and the CSV header valid when nothing has
    # matched yet; an empty frame without them raises KeyError on "month".
    monthly = pd.DataFrame(rows, columns=["month", "docs", "avg_tone"])
    monthly.sort_values("month").to_csv(YEARLY_CSV, index=False)

    state = {
        "processed_count": processed_count,
        "processed_urls_tail": processed_urls_tail[-20000:],
        "counts": dict(counts),
        "tone_sum": dict(tone_sum),
        "tone_n": dict(tone_n)
    }
    # Write to a sibling temp file and rename, so an interrupted run cannot leave
    # a truncated state.json that would break the next resume.
    tmp_path = STATE_PATH.with_name(STATE_PATH.name + ".tmp")
    with open(tmp_path, "w") as f:
        json.dump(state, f)
    tmp_path.replace(STATE_PATH)

    print("Saved yearly output:", YEARLY_CSV)

# Main execution
sess = make_session()
urls_all = build_gkg_urls(sess)
print(f"{YEAR}: {len(urls_all):,} files planned")

# Restore the accumulators from the checkpoint if a previous run left one behind
if STATE_PATH.exists():
    with open(STATE_PATH) as f:  # context manager so the handle is closed
        state = json.load(f)
    processed_count = state.get("processed_count", 0)
    processed_urls_tail = state.get("processed_urls_tail", [])
    counts   = defaultdict(int, state.get("counts", {}))
    tone_sum = defaultdict(float, state.get("tone_sum", {}))
    tone_n   = defaultdict(int, state.get("tone_n", {}))
    print("Resuming from checkpoint")
else:
    processed_count = 0
    processed_urls_tail = []
    counts, tone_sum, tone_n = defaultdict(int), defaultdict(float), defaultdict(int)

# Build the lookup set once; rebuilding it for every URL made this filter quadratic
done_urls = set(processed_urls_tail)
todo = [u for u in urls_all if u not in done_urls]
if MAX_FILES is not None:
    todo = todo[:MAX_FILES]

uk_re = re.compile(UK_REGEX, re.I)
theme_re = re.compile(THEME_PATTERN, re.I)

t0 = time.time()
docs_kept = 0
failed_urls = []          # files skipped in this run because of an error
FAILURE_LOG_LIMIT = 20    # print details for the first N failures only

for i, url in enumerate(todo, 1):
    try:
        payload = io.BytesIO(http_get(url, sess).content)
        # Close the archive and its member as soon as the frame is loaded
        with zipfile.ZipFile(payload) as z:
            name = z.namelist()[0]
            with z.open(name) as member:
                df = pd.read_csv(
                    member,
                    sep="\t",
                    header=None,
                    quoting=3,
                    low_memory=False,
                    usecols=[DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I]
                )

        docs_kept += process_gkg_frame(df, counts, tone_sum, tone_n, uk_re, theme_re)
        processed_count += 1
        processed_urls_tail.append(url)

    except Exception as exc:
        # Still best-effort (one bad file must not abort the run), but record the
        # skip so gaps in the data are visible. Failed URLs are not added to
        # processed_urls_tail, so re-running the cell retries them.
        failed_urls.append(url)
        if len(failed_urls) <= FAILURE_LOG_LIMIT:
            print(f"{YEAR}: skipped {url} ({type(exc).__name__}: {exc})")

    if i % PROGRESS_EVERY == 0:
        rate = i / max(1e-6, time.time() - t0)
        eta = (len(todo) - i) / rate / 60
        print(f"{YEAR}: {i}/{len(todo)} files processed | ETA ≈ {eta:.1f} minutes")

    if i % CHECKPOINT_EVERY == 0:
        write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)

write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)
print(f"{YEAR} completed. Documents retained: {docs_kept:,}")
if failed_urls:
    print(f"{YEAR}: {len(failed_urls):,} files failed and were skipped; re-run this cell to retry them")

# Appending to master stitched file
y = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")

# Default to this year's rows alone; replaced below when a stitched file already exists
m = y
if MASTER_FILE.exists():
    stitched_so_far = pd.read_csv(MASTER_FILE, parse_dates=["month"])
    merged = pd.concat([stitched_so_far, y], ignore_index=True)
    # keep="last": rows from this year's run override older rows for the same month
    merged = merged.drop_duplicates(subset="month", keep="last")
    m = merged.sort_values("month").reset_index(drop=True)

m.to_csv(MASTER_FILE, index=False)
print("Master dataset updated:", MASTER_FILE)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Outputs will be saved to: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2021
2021: 8,603 files planned
2021: 100/8603 files processed | ETA ≈ 71.3 minutes
2021: 200/8603 files processed | ETA ≈ 94.4 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2021/Gdelt_uk_housing_monthly_2021.csv
2021: 300/8603 files processed | ETA ≈ 92.9 minutes
2021: 400/8603 files processed | ETA ≈ 94.3 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2021/Gdelt_uk_housing_monthly_2021.csv
2021: 500/8603 files processed | ETA ≈ 95.0 minutes
2021: 600/8603 files processed | ETA ≈ 92.7 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2021/Gdelt_uk_housing_monthly_2021.csv
2021: 700/8603 files processed | ETA ≈ 94.5 minutes
2021: 800/8603 files processed | ETA ≈ 92.2 minutes
Saved yearly o

2022

In [None]:
from google.colab import drive
drive.mount("/content/drive")

# Configuration
# NOTE(review): this cell uses Path, re, requests, Retry, HTTPAdapter, json, math,
# time, datetime, zipfile, io, pd and defaultdict without importing them here;
# they must already be in the kernel from an earlier cell.
YEAR = 2022

HOURLY_ONLY      = True   # keep only files whose timestamp minute field is "00"
PROGRESS_EVERY   = 100    # print a progress line every N files
CHECKPOINT_EVERY = 200    # rewrite the yearly CSV and state.json every N files
MAX_FILES        = None   # optional cap on files handled in one run (None = no cap)

# Matched case-insensitively against the V2LOCS_I column of each file
UK_REGEX = r'#(?:GBR|GB|UK|United Kingdom|England|Scotland|Wales|Northern Ireland)#'
# Matched case-insensitively against the V2THEMES_I column of each file.
# NOTE(review): the alternatives are unanchored substrings, so "RENT" also matches
# inside "PARENT"/"CURRENT" and "HOUS" inside "WAREHOUSE"; "REMORTGAG" and
# "HOUSE[ _]?PRIC" are already covered by "MORTGAG" and "HOUS". Confirm this
# breadth is intended before relying on the document counts.
THEME_PATTERN = r'(?:HOUS|MORTGAG|REMORTGAG|RENT|TENAN|REAL[ _]?ESTATE|PROPERTY|HOME[ _]?PRIC|HOUSE[ _]?PRIC|RIGHTMOVE|ZOOPLA|LANDLORD|BUY[ -]?TO[ -]?LET)'

# Zero-based positions of the columns read from each header-less GKG file
DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I = 1, 8, 10, 15
MASTER_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"

# Derived from YEAR so the window cannot drift from the year label when this cell is copied
START_TS, END_TS = f"{YEAR}0101000000", f"{YEAR}1231235959"

# Output paths
BASE_DIR = Path("/content/drive/MyDrive/msc_project/GDELT")
BASE_DIR.mkdir(parents=True, exist_ok=True)

OUT_DIR = BASE_DIR / f"Gdelt_run_{YEAR}"
OUT_DIR.mkdir(parents=True, exist_ok=True)

YEARLY_CSV = OUT_DIR / f"Gdelt_uk_housing_monthly_{YEAR}.csv"   # per-year monthly aggregates
STATE_PATH = OUT_DIR / "state.json"                              # resume checkpoint for this year
MASTER_FILE = BASE_DIR / "Gdelt_uk_housing_monthly_STITCHING.csv"  # cumulative file shared by all years

print("Outputs will be saved to:", OUT_DIR)


# HTTP and URL lists
def make_session():
    """Create a requests session with a fixed User-Agent and automatic retries.

    Both the http:// and https:// prefixes get an adapter that retries up to
    5 times (backoff factor 0.5) on status 429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
    )

    session = requests.Session()
    session.headers.update({"User-Agent": "gdelt-colab/1.0"})
    for scheme in ("http://", "https://"):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return session

def http_get(url, sess, timeout=90):
    """GET `url` through `sess` and return the response.

    A leading "https://" is rewritten to "http://" before the request.
    Raises whatever `raise_for_status()` raises on an error status.
    """
    secure_prefix = "https://"
    target = url
    if target.startswith(secure_prefix):
        target = "http://" + target[len(secure_prefix):]

    response = sess.get(target, timeout=timeout)
    response.raise_for_status()
    return response

def build_gkg_urls(sess):
    """Return the sorted list of GKG zip URLs to process for the configured window.

    Downloads the listing at MASTER_URL and keeps the third field of every
    three-field line whose URL ends in /gdeltv2/<14 digits>.gkg.csv.zip, whose
    timestamp lies in [START_TS, END_TS], and (when HOURLY_ONLY is set) whose
    minute field is "00". Any "https://" in a kept URL is rewritten to "http://".
    """
    gkg_name = re.compile(r"/gdeltv2/(\d{14})\.gkg\.csv\.zip$")
    listing = http_get(MASTER_URL, sess, timeout=120).text

    selected = []
    for fields in map(str.split, listing.splitlines()):
        if len(fields) != 3:
            continue

        link = fields[2]
        hit = gkg_name.search(link)
        if hit is None:
            continue

        stamp = hit.group(1)
        in_window = START_TS <= stamp <= END_TS
        on_the_hour = stamp[10:12] == "00"  # characters 10-11 are the minutes
        if in_window and (on_the_hour or not HOURLY_ONLY):
            selected.append(link.replace("https://", "http://"))

    selected.sort()
    return selected

# Parsing and aggregation
def month_from_dateint(dateint):
    """Return the "YYYY-MM" month key for a YYYYMMDDHHMMSS timestamp.

    The value is stringified and parsed with "%Y%m%d%H%M%S"; a value that does
    not parse with that format raises ValueError.
    """
    stamp = datetime.strptime(str(dateint), "%Y%m%d%H%M%S")
    return "%04d-%02d" % (stamp.year, stamp.month)

def doc_tone(v2tone):
    """Return the first comma-separated field of `v2tone` as a float.

    Returns NaN when that field cannot be converted to a float.
    """
    try:
        leading_field, _, _ = str(v2tone).partition(",")
        return float(leading_field)
    except Exception:
        return math.nan

def process_gkg_frame(df, counts, tone_sum, tone_n, uk_re, theme_re):
    """Add the rows of `df` that match both regexes to the monthly accumulators.

    A row is kept when column V2LOCS_I matches `uk_re` and column V2THEMES_I
    matches `theme_re`. Each kept row increments `counts` for its month; rows
    whose tone is not NaN also add to `tone_sum` and increment `tone_n`.
    The three mappings are updated in place.

    Returns the number of rows kept.
    """
    location_text = df[V2LOCS_I].astype(str)
    theme_text = df[V2THEMES_I].astype(str)
    is_uk = location_text.str.contains(uk_re, na=False, regex=True)
    is_on_theme = theme_text.str.contains(theme_re, na=False, regex=True)

    matched = df.loc[is_uk & is_on_theme, [DATE_I, V2TONE_I]]
    if matched.empty:
        return 0

    month_keys = matched[DATE_I].astype(str).apply(month_from_dateint)
    tone_values = matched[V2TONE_I].apply(doc_tone)

    # Accumulate row by row so sums are added in file order
    for month_key, tone in zip(month_keys, tone_values):
        counts[month_key] += 1
        if math.isnan(tone):
            continue
        tone_sum[month_key] += tone
        tone_n[month_key] += 1

    return len(matched)

# Saving yearly output and checkpoint
def write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n):
    """Write the yearly monthly CSV (YEARLY_CSV) and the resume checkpoint (STATE_PATH).

    Parameters
    ----------
    processed_count : number of files processed so far (stored in the checkpoint).
    processed_urls_tail : list of processed URLs; only the last 20000 are stored.
    counts, tone_sum, tone_n : per-month accumulators keyed by "YYYY-MM".

    The CSV has one row per month in `counts` with columns month ("YYYY-MM-01"),
    docs and avg_tone (tone_sum / tone_n, empty when tone_n is 0).
    """
    rows = [
        {
            "month": f"{m}-01",
            "docs": counts[m],
            "avg_tone": (tone_sum[m] / tone_n[m]) if tone_n[m] else None,
        }
        for m in sorted(counts)
    ]

    # Explicit columns keep the sort and the CSV header valid when nothing has
    # matched yet; an empty frame without them raises KeyError on "month".
    monthly = pd.DataFrame(rows, columns=["month", "docs", "avg_tone"])
    monthly.sort_values("month").to_csv(YEARLY_CSV, index=False)

    state = {
        "processed_count": processed_count,
        "processed_urls_tail": processed_urls_tail[-20000:],
        "counts": dict(counts),
        "tone_sum": dict(tone_sum),
        "tone_n": dict(tone_n)
    }
    # Write to a sibling temp file and rename, so an interrupted run cannot leave
    # a truncated state.json that would break the next resume.
    tmp_path = STATE_PATH.with_name(STATE_PATH.name + ".tmp")
    with open(tmp_path, "w") as f:
        json.dump(state, f)
    tmp_path.replace(STATE_PATH)

    print("Saved yearly output:", YEARLY_CSV)

# Main execution
sess = make_session()
urls_all = build_gkg_urls(sess)
print(f"{YEAR}: {len(urls_all):,} files planned")

# Restore the accumulators from the checkpoint if a previous run left one behind
if STATE_PATH.exists():
    with open(STATE_PATH) as f:  # context manager so the handle is closed
        state = json.load(f)
    processed_count = state.get("processed_count", 0)
    processed_urls_tail = state.get("processed_urls_tail", [])
    counts   = defaultdict(int, state.get("counts", {}))
    tone_sum = defaultdict(float, state.get("tone_sum", {}))
    tone_n   = defaultdict(int, state.get("tone_n", {}))
    print("Resuming from checkpoint")
else:
    processed_count = 0
    processed_urls_tail = []
    counts, tone_sum, tone_n = defaultdict(int), defaultdict(float), defaultdict(int)

# Build the lookup set once; rebuilding it for every URL made this filter quadratic
done_urls = set(processed_urls_tail)
todo = [u for u in urls_all if u not in done_urls]
if MAX_FILES is not None:
    todo = todo[:MAX_FILES]

uk_re = re.compile(UK_REGEX, re.I)
theme_re = re.compile(THEME_PATTERN, re.I)

t0 = time.time()
docs_kept = 0
failed_urls = []          # files skipped in this run because of an error
FAILURE_LOG_LIMIT = 20    # print details for the first N failures only

for i, url in enumerate(todo, 1):
    try:
        payload = io.BytesIO(http_get(url, sess).content)
        # Close the archive and its member as soon as the frame is loaded
        with zipfile.ZipFile(payload) as z:
            name = z.namelist()[0]
            with z.open(name) as member:
                df = pd.read_csv(
                    member,
                    sep="\t",
                    header=None,
                    quoting=3,
                    low_memory=False,
                    usecols=[DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I]
                )

        docs_kept += process_gkg_frame(df, counts, tone_sum, tone_n, uk_re, theme_re)
        processed_count += 1
        processed_urls_tail.append(url)

    except Exception as exc:
        # Still best-effort (one bad file must not abort the run), but record the
        # skip so gaps in the data are visible. Failed URLs are not added to
        # processed_urls_tail, so re-running the cell retries them.
        failed_urls.append(url)
        if len(failed_urls) <= FAILURE_LOG_LIMIT:
            print(f"{YEAR}: skipped {url} ({type(exc).__name__}: {exc})")

    if i % PROGRESS_EVERY == 0:
        rate = i / max(1e-6, time.time() - t0)
        eta = (len(todo) - i) / rate / 60
        print(f"{YEAR}: {i}/{len(todo)} files processed | ETA ≈ {eta:.1f} minutes")

    if i % CHECKPOINT_EVERY == 0:
        write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)

write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)
print(f"{YEAR} completed. Documents retained: {docs_kept:,}")
if failed_urls:
    print(f"{YEAR}: {len(failed_urls):,} files failed and were skipped; re-run this cell to retry them")

# Appending to master stitched file
y = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")

# Default to this year's rows alone; replaced below when a stitched file already exists
m = y
if MASTER_FILE.exists():
    stitched_so_far = pd.read_csv(MASTER_FILE, parse_dates=["month"])
    merged = pd.concat([stitched_so_far, y], ignore_index=True)
    # keep="last": rows from this year's run override older rows for the same month
    merged = merged.drop_duplicates(subset="month", keep="last")
    m = merged.sort_values("month").reset_index(drop=True)

m.to_csv(MASTER_FILE, index=False)
print("Master dataset updated:", MASTER_FILE)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Outputs will be saved to: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2022
2022: 8,759 files planned
2022: 100/8759 files processed | ETA ≈ 64.0 minutes
2022: 200/8759 files processed | ETA ≈ 72.2 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2022/Gdelt_uk_housing_monthly_2022.csv
2022: 300/8759 files processed | ETA ≈ 74.1 minutes
2022: 400/8759 files processed | ETA ≈ 71.5 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2022/Gdelt_uk_housing_monthly_2022.csv
2022: 500/8759 files processed | ETA ≈ 73.7 minutes
2022: 600/8759 files processed | ETA ≈ 71.3 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2022/Gdelt_uk_housing_monthly_2022.csv
2022: 700/8759 files processed | ETA ≈ 70.2 minutes
2022: 800/8759 files processed | ETA ≈ 69.4 minutes
Saved yearly o

2023

In [None]:
from google.colab import drive
drive.mount("/content/drive")

# Configuration
# NOTE(review): this cell uses Path, re, requests, Retry, HTTPAdapter, json, math,
# time, datetime, zipfile, io, pd and defaultdict without importing them here;
# they must already be in the kernel from an earlier cell.
YEAR = 2023

HOURLY_ONLY      = True   # keep only files whose timestamp minute field is "00"
PROGRESS_EVERY   = 100    # print a progress line every N files
CHECKPOINT_EVERY = 200    # rewrite the yearly CSV and state.json every N files
MAX_FILES        = None   # optional cap on files handled in one run (None = no cap)

# Matched case-insensitively against the V2LOCS_I column of each file
UK_REGEX = r'#(?:GBR|GB|UK|United Kingdom|England|Scotland|Wales|Northern Ireland)#'
# Matched case-insensitively against the V2THEMES_I column of each file.
# NOTE(review): the alternatives are unanchored substrings, so "RENT" also matches
# inside "PARENT"/"CURRENT" and "HOUS" inside "WAREHOUSE"; "REMORTGAG" and
# "HOUSE[ _]?PRIC" are already covered by "MORTGAG" and "HOUS". Confirm this
# breadth is intended before relying on the document counts.
THEME_PATTERN = r'(?:HOUS|MORTGAG|REMORTGAG|RENT|TENAN|REAL[ _]?ESTATE|PROPERTY|HOME[ _]?PRIC|HOUSE[ _]?PRIC|RIGHTMOVE|ZOOPLA|LANDLORD|BUY[ -]?TO[ -]?LET)'

# Zero-based positions of the columns read from each header-less GKG file
DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I = 1, 8, 10, 15
MASTER_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"

# Derived from YEAR so the window cannot drift from the year label when this cell is copied
START_TS, END_TS = f"{YEAR}0101000000", f"{YEAR}1231235959"

# Output paths
BASE_DIR = Path("/content/drive/MyDrive/msc_project/GDELT")
BASE_DIR.mkdir(parents=True, exist_ok=True)

OUT_DIR = BASE_DIR / f"Gdelt_run_{YEAR}"
OUT_DIR.mkdir(parents=True, exist_ok=True)

YEARLY_CSV = OUT_DIR / f"Gdelt_uk_housing_monthly_{YEAR}.csv"   # per-year monthly aggregates
STATE_PATH = OUT_DIR / "state.json"                              # resume checkpoint for this year
MASTER_FILE = BASE_DIR / "Gdelt_uk_housing_monthly_STITCHING.csv"  # cumulative file shared by all years

print("Outputs will be saved to:", OUT_DIR)


# HTTP and URL lists
def make_session():
    """Create a requests session with a fixed User-Agent and automatic retries.

    Both the http:// and https:// prefixes get an adapter that retries up to
    5 times (backoff factor 0.5) on status 429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
    )

    session = requests.Session()
    session.headers.update({"User-Agent": "gdelt-colab/1.0"})
    for scheme in ("http://", "https://"):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return session

def http_get(url, sess, timeout=90):
    """GET `url` through `sess` and return the response.

    A leading "https://" is rewritten to "http://" before the request.
    Raises whatever `raise_for_status()` raises on an error status.
    """
    secure_prefix = "https://"
    target = url
    if target.startswith(secure_prefix):
        target = "http://" + target[len(secure_prefix):]

    response = sess.get(target, timeout=timeout)
    response.raise_for_status()
    return response

def build_gkg_urls(sess):
    """Return the sorted list of GKG zip URLs to process for the configured window.

    Downloads the listing at MASTER_URL and keeps the third field of every
    three-field line whose URL ends in /gdeltv2/<14 digits>.gkg.csv.zip, whose
    timestamp lies in [START_TS, END_TS], and (when HOURLY_ONLY is set) whose
    minute field is "00". Any "https://" in a kept URL is rewritten to "http://".
    """
    gkg_name = re.compile(r"/gdeltv2/(\d{14})\.gkg\.csv\.zip$")
    listing = http_get(MASTER_URL, sess, timeout=120).text

    selected = []
    for fields in map(str.split, listing.splitlines()):
        if len(fields) != 3:
            continue

        link = fields[2]
        hit = gkg_name.search(link)
        if hit is None:
            continue

        stamp = hit.group(1)
        in_window = START_TS <= stamp <= END_TS
        on_the_hour = stamp[10:12] == "00"  # characters 10-11 are the minutes
        if in_window and (on_the_hour or not HOURLY_ONLY):
            selected.append(link.replace("https://", "http://"))

    selected.sort()
    return selected

# Parsing and aggregation
def month_from_dateint(dateint):
    """Return the "YYYY-MM" month key for a YYYYMMDDHHMMSS timestamp.

    The value is stringified and parsed with "%Y%m%d%H%M%S"; a value that does
    not parse with that format raises ValueError.
    """
    stamp = datetime.strptime(str(dateint), "%Y%m%d%H%M%S")
    return "%04d-%02d" % (stamp.year, stamp.month)

def doc_tone(v2tone):
    """Return the first comma-separated field of `v2tone` as a float.

    Returns NaN when that field cannot be converted to a float.
    """
    try:
        leading_field, _, _ = str(v2tone).partition(",")
        return float(leading_field)
    except Exception:
        return math.nan

def process_gkg_frame(df, counts, tone_sum, tone_n, uk_re, theme_re):
    """Add the rows of `df` that match both regexes to the monthly accumulators.

    A row is kept when column V2LOCS_I matches `uk_re` and column V2THEMES_I
    matches `theme_re`. Each kept row increments `counts` for its month; rows
    whose tone is not NaN also add to `tone_sum` and increment `tone_n`.
    The three mappings are updated in place.

    Returns the number of rows kept.
    """
    location_text = df[V2LOCS_I].astype(str)
    theme_text = df[V2THEMES_I].astype(str)
    is_uk = location_text.str.contains(uk_re, na=False, regex=True)
    is_on_theme = theme_text.str.contains(theme_re, na=False, regex=True)

    matched = df.loc[is_uk & is_on_theme, [DATE_I, V2TONE_I]]
    if matched.empty:
        return 0

    month_keys = matched[DATE_I].astype(str).apply(month_from_dateint)
    tone_values = matched[V2TONE_I].apply(doc_tone)

    # Accumulate row by row so sums are added in file order
    for month_key, tone in zip(month_keys, tone_values):
        counts[month_key] += 1
        if math.isnan(tone):
            continue
        tone_sum[month_key] += tone
        tone_n[month_key] += 1

    return len(matched)

# Saving yearly output and checkpoint
def write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n):
    """Write the yearly monthly CSV (YEARLY_CSV) and the resume checkpoint (STATE_PATH).

    Parameters
    ----------
    processed_count : number of files processed so far (stored in the checkpoint).
    processed_urls_tail : list of processed URLs; only the last 20000 are stored.
    counts, tone_sum, tone_n : per-month accumulators keyed by "YYYY-MM".

    The CSV has one row per month in `counts` with columns month ("YYYY-MM-01"),
    docs and avg_tone (tone_sum / tone_n, empty when tone_n is 0).
    """
    rows = [
        {
            "month": f"{m}-01",
            "docs": counts[m],
            "avg_tone": (tone_sum[m] / tone_n[m]) if tone_n[m] else None,
        }
        for m in sorted(counts)
    ]

    # Explicit columns keep the sort and the CSV header valid when nothing has
    # matched yet; an empty frame without them raises KeyError on "month".
    monthly = pd.DataFrame(rows, columns=["month", "docs", "avg_tone"])
    monthly.sort_values("month").to_csv(YEARLY_CSV, index=False)

    state = {
        "processed_count": processed_count,
        "processed_urls_tail": processed_urls_tail[-20000:],
        "counts": dict(counts),
        "tone_sum": dict(tone_sum),
        "tone_n": dict(tone_n)
    }
    # Write to a sibling temp file and rename, so an interrupted run cannot leave
    # a truncated state.json that would break the next resume.
    tmp_path = STATE_PATH.with_name(STATE_PATH.name + ".tmp")
    with open(tmp_path, "w") as f:
        json.dump(state, f)
    tmp_path.replace(STATE_PATH)

    print("Saved yearly output:", YEARLY_CSV)

# Main execution
sess = make_session()
urls_all = build_gkg_urls(sess)
print(f"{YEAR}: {len(urls_all):,} files planned")

# Restore the accumulators from the checkpoint if a previous run left one behind
if STATE_PATH.exists():
    with open(STATE_PATH) as f:  # context manager so the handle is closed
        state = json.load(f)
    processed_count = state.get("processed_count", 0)
    processed_urls_tail = state.get("processed_urls_tail", [])
    counts   = defaultdict(int, state.get("counts", {}))
    tone_sum = defaultdict(float, state.get("tone_sum", {}))
    tone_n   = defaultdict(int, state.get("tone_n", {}))
    print("Resuming from checkpoint")
else:
    processed_count = 0
    processed_urls_tail = []
    counts, tone_sum, tone_n = defaultdict(int), defaultdict(float), defaultdict(int)

# Build the lookup set once; rebuilding it for every URL made this filter quadratic
done_urls = set(processed_urls_tail)
todo = [u for u in urls_all if u not in done_urls]
if MAX_FILES is not None:
    todo = todo[:MAX_FILES]

uk_re = re.compile(UK_REGEX, re.I)
theme_re = re.compile(THEME_PATTERN, re.I)

t0 = time.time()
docs_kept = 0
failed_urls = []          # files skipped in this run because of an error
FAILURE_LOG_LIMIT = 20    # print details for the first N failures only

for i, url in enumerate(todo, 1):
    try:
        payload = io.BytesIO(http_get(url, sess).content)
        # Close the archive and its member as soon as the frame is loaded
        with zipfile.ZipFile(payload) as z:
            name = z.namelist()[0]
            with z.open(name) as member:
                df = pd.read_csv(
                    member,
                    sep="\t",
                    header=None,
                    quoting=3,
                    low_memory=False,
                    usecols=[DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I]
                )

        docs_kept += process_gkg_frame(df, counts, tone_sum, tone_n, uk_re, theme_re)
        processed_count += 1
        processed_urls_tail.append(url)

    except Exception as exc:
        # Still best-effort (one bad file must not abort the run), but record the
        # skip so gaps in the data are visible. Failed URLs are not added to
        # processed_urls_tail, so re-running the cell retries them.
        failed_urls.append(url)
        if len(failed_urls) <= FAILURE_LOG_LIMIT:
            print(f"{YEAR}: skipped {url} ({type(exc).__name__}: {exc})")

    if i % PROGRESS_EVERY == 0:
        rate = i / max(1e-6, time.time() - t0)
        eta = (len(todo) - i) / rate / 60
        print(f"{YEAR}: {i}/{len(todo)} files processed | ETA ≈ {eta:.1f} minutes")

    if i % CHECKPOINT_EVERY == 0:
        write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)

write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)
print(f"{YEAR} completed. Documents retained: {docs_kept:,}")
if failed_urls:
    print(f"{YEAR}: {len(failed_urls):,} files failed and were skipped; re-run this cell to retry them")

# Appending to master stitched file
y = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")

# Default to this year's rows alone; replaced below when a stitched file already exists
m = y
if MASTER_FILE.exists():
    stitched_so_far = pd.read_csv(MASTER_FILE, parse_dates=["month"])
    merged = pd.concat([stitched_so_far, y], ignore_index=True)
    # keep="last": rows from this year's run override older rows for the same month
    merged = merged.drop_duplicates(subset="month", keep="last")
    m = merged.sort_values("month").reset_index(drop=True)

m.to_csv(MASTER_FILE, index=False)
print("Master dataset updated:", MASTER_FILE)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Outputs will be saved to: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2023
2023: 8,723 files planned
2023: 100/8723 files processed | ETA ≈ 59.6 minutes
2023: 200/8723 files processed | ETA ≈ 59.2 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2023/Gdelt_uk_housing_monthly_2023.csv
2023: 300/8723 files processed | ETA ≈ 63.3 minutes
2023: 400/8723 files processed | ETA ≈ 60.5 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2023/Gdelt_uk_housing_monthly_2023.csv
2023: 500/8723 files processed | ETA ≈ 63.7 minutes
2023: 600/8723 files processed | ETA ≈ 63.5 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2023/Gdelt_uk_housing_monthly_2023.csv
2023: 700/8723 files processed | ETA ≈ 62.2 minutes
2023: 800/8723 files processed | ETA ≈ 63.6 minutes
Saved yearly o

2024

In [None]:
from google.colab import drive
drive.mount("/content/drive")

# Configuration
# NOTE(review): this cell uses Path, re, requests, Retry, HTTPAdapter, json, math,
# time, datetime, zipfile, io, pd and defaultdict without importing them here;
# they must already be in the kernel from an earlier cell.
YEAR = 2024

HOURLY_ONLY      = True   # keep only files whose timestamp minute field is "00"
PROGRESS_EVERY   = 100    # print a progress line every N files
CHECKPOINT_EVERY = 200    # rewrite the yearly CSV and state.json every N files
MAX_FILES        = None   # optional cap on files handled in one run (None = no cap)

# Matched case-insensitively against the V2LOCS_I column of each file
UK_REGEX = r'#(?:GBR|GB|UK|United Kingdom|England|Scotland|Wales|Northern Ireland)#'
# Matched case-insensitively against the V2THEMES_I column of each file.
# NOTE(review): the alternatives are unanchored substrings, so "RENT" also matches
# inside "PARENT"/"CURRENT" and "HOUS" inside "WAREHOUSE"; "REMORTGAG" and
# "HOUSE[ _]?PRIC" are already covered by "MORTGAG" and "HOUS". Confirm this
# breadth is intended before relying on the document counts.
THEME_PATTERN = r'(?:HOUS|MORTGAG|REMORTGAG|RENT|TENAN|REAL[ _]?ESTATE|PROPERTY|HOME[ _]?PRIC|HOUSE[ _]?PRIC|RIGHTMOVE|ZOOPLA|LANDLORD|BUY[ -]?TO[ -]?LET)'

# Zero-based positions of the columns read from each header-less GKG file
DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I = 1, 8, 10, 15
MASTER_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"

# Derived from YEAR so the window cannot drift from the year label when this cell is copied
START_TS, END_TS = f"{YEAR}0101000000", f"{YEAR}1231235959"

# Output paths
BASE_DIR = Path("/content/drive/MyDrive/msc_project/GDELT")
BASE_DIR.mkdir(parents=True, exist_ok=True)

OUT_DIR = BASE_DIR / f"Gdelt_run_{YEAR}"
OUT_DIR.mkdir(parents=True, exist_ok=True)

YEARLY_CSV = OUT_DIR / f"Gdelt_uk_housing_monthly_{YEAR}.csv"   # per-year monthly aggregates
STATE_PATH = OUT_DIR / "state.json"                              # resume checkpoint for this year
MASTER_FILE = BASE_DIR / "Gdelt_uk_housing_monthly_STITCHING.csv"  # cumulative file shared by all years

print("Outputs will be saved to:", OUT_DIR)


# HTTP and URL lists
def make_session():
    """Create a requests session with a fixed User-Agent and automatic retries.

    Both the http:// and https:// prefixes get an adapter that retries up to
    5 times (backoff factor 0.5) on status 429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
    )

    session = requests.Session()
    session.headers.update({"User-Agent": "gdelt-colab/1.0"})
    for scheme in ("http://", "https://"):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return session

def http_get(url, sess, timeout=90):
    """GET `url` through `sess` and return the response.

    A leading "https://" is rewritten to "http://" before the request.
    Raises whatever `raise_for_status()` raises on an error status.
    """
    secure_prefix = "https://"
    target = url
    if target.startswith(secure_prefix):
        target = "http://" + target[len(secure_prefix):]

    response = sess.get(target, timeout=timeout)
    response.raise_for_status()
    return response

def build_gkg_urls(sess):
    """Return the sorted list of GKG zip URLs to process for the configured window.

    Downloads the listing at MASTER_URL and keeps the third field of every
    three-field line whose URL ends in /gdeltv2/<14 digits>.gkg.csv.zip, whose
    timestamp lies in [START_TS, END_TS], and (when HOURLY_ONLY is set) whose
    minute field is "00". Any "https://" in a kept URL is rewritten to "http://".
    """
    gkg_name = re.compile(r"/gdeltv2/(\d{14})\.gkg\.csv\.zip$")
    listing = http_get(MASTER_URL, sess, timeout=120).text

    selected = []
    for fields in map(str.split, listing.splitlines()):
        if len(fields) != 3:
            continue

        link = fields[2]
        hit = gkg_name.search(link)
        if hit is None:
            continue

        stamp = hit.group(1)
        in_window = START_TS <= stamp <= END_TS
        on_the_hour = stamp[10:12] == "00"  # characters 10-11 are the minutes
        if in_window and (on_the_hour or not HOURLY_ONLY):
            selected.append(link.replace("https://", "http://"))

    selected.sort()
    return selected

# Parsing and aggregation
def month_from_dateint(dateint):
    """Return the "YYYY-MM" month key for a YYYYMMDDHHMMSS timestamp.

    The value is stringified and parsed with "%Y%m%d%H%M%S"; a value that does
    not parse with that format raises ValueError.
    """
    stamp = datetime.strptime(str(dateint), "%Y%m%d%H%M%S")
    return "%04d-%02d" % (stamp.year, stamp.month)

def doc_tone(v2tone):
    """Return the first comma-separated field of `v2tone` as a float.

    Returns NaN when that field cannot be converted to a float.
    """
    try:
        leading_field, _, _ = str(v2tone).partition(",")
        return float(leading_field)
    except Exception:
        return math.nan

def process_gkg_frame(df, counts, tone_sum, tone_n, uk_re, theme_re):
    """Add the rows of `df` that match both regexes to the monthly accumulators.

    A row is kept when column V2LOCS_I matches `uk_re` and column V2THEMES_I
    matches `theme_re`. Each kept row increments `counts` for its month; rows
    whose tone is not NaN also add to `tone_sum` and increment `tone_n`.
    The three mappings are updated in place.

    Returns the number of rows kept.
    """
    location_text = df[V2LOCS_I].astype(str)
    theme_text = df[V2THEMES_I].astype(str)
    is_uk = location_text.str.contains(uk_re, na=False, regex=True)
    is_on_theme = theme_text.str.contains(theme_re, na=False, regex=True)

    matched = df.loc[is_uk & is_on_theme, [DATE_I, V2TONE_I]]
    if matched.empty:
        return 0

    month_keys = matched[DATE_I].astype(str).apply(month_from_dateint)
    tone_values = matched[V2TONE_I].apply(doc_tone)

    # Accumulate row by row so sums are added in file order
    for month_key, tone in zip(month_keys, tone_values):
        counts[month_key] += 1
        if math.isnan(tone):
            continue
        tone_sum[month_key] += tone
        tone_n[month_key] += 1

    return len(matched)

# Saving yearly output and checkpoint
def write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n):
    """Write the yearly monthly-aggregate CSV and the resume checkpoint.

    Parameters
    ----------
    processed_count : number of files processed so far (stored in the state).
    processed_urls_tail : list of processed URLs; only the last 20000 entries
        are stored in the state file.
    counts, tone_sum, tone_n : mappings keyed by ``"YYYY-MM"`` holding the
        document count, the sum of tones and the number of tones per month.

    Side effects
    ------------
    Overwrites ``YEARLY_CSV`` with one row per month (``month``, ``docs``,
    ``avg_tone``) and overwrites ``STATE_PATH`` with a JSON checkpoint.
    """
    rows = []
    for m in sorted(counts.keys()):
        avg = (tone_sum[m] / tone_n[m]) if tone_n[m] else None
        rows.append({"month": f"{m}-01", "docs": counts[m], "avg_tone": avg})

    # Name the columns explicitly: with no matched documents yet, ``rows`` is
    # empty and a column-less frame would make sort_values("month") raise
    # KeyError, aborting the run at the first checkpoint.
    monthly = pd.DataFrame(rows, columns=["month", "docs", "avg_tone"])
    monthly.sort_values("month").to_csv(YEARLY_CSV, index=False)

    with open(STATE_PATH, "w") as f:
        json.dump({
            "processed_count": processed_count,
            "processed_urls_tail": processed_urls_tail[-20000:],
            "counts": dict(counts),
            "tone_sum": dict(tone_sum),
            "tone_n": dict(tone_n)
        }, f)

    print("Saved yearly output:", YEARLY_CSV)

# Main execution: plan the file list, resume from the checkpoint if one
# exists, then download and aggregate each remaining GKG file.
sess = make_session()
urls_all = build_gkg_urls(sess)
print(f"{YEAR}: {len(urls_all):,} files planned")

if STATE_PATH.exists():
    # Context manager so the checkpoint file handle is closed after reading.
    with open(STATE_PATH) as state_file:
        state = json.load(state_file)
    processed_count = state.get("processed_count", 0)
    processed_urls_tail = state.get("processed_urls_tail", [])
    counts   = defaultdict(int, state.get("counts", {}))
    tone_sum = defaultdict(float, state.get("tone_sum", {}))
    tone_n   = defaultdict(int, state.get("tone_n", {}))
    print("Resuming from checkpoint")
else:
    processed_count = 0
    processed_urls_tail = []
    counts, tone_sum, tone_n = defaultdict(int), defaultdict(float), defaultdict(int)

# Build the lookup set once instead of once per planned URL.
already_processed = set(processed_urls_tail)
todo = [u for u in urls_all if u not in already_processed]
if MAX_FILES is not None:
    todo = todo[:MAX_FILES]

uk_re = re.compile(UK_REGEX, re.I)
theme_re = re.compile(THEME_PATTERN, re.I)

t0 = time.time()
docs_kept = 0
failed_urls = []  # files that could not be downloaded or parsed in this run

for i, url in enumerate(todo, 1):
    try:
        payload = http_get(url, sess).content
        # Close the archive and its member as soon as the frame is loaded.
        with zipfile.ZipFile(io.BytesIO(payload)) as z:
            name = z.namelist()[0]
            with z.open(name) as member:
                df = pd.read_csv(
                    member,
                    sep="\t",
                    header=None,
                    quoting=3,
                    low_memory=False,
                    usecols=[DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I]
                )

        docs_kept += process_gkg_frame(df, counts, tone_sum, tone_n, uk_re, theme_re)
        processed_count += 1
        processed_urls_tail.append(url)

    except Exception as exc:
        # Still best-effort (one bad file must not stop the run), but the
        # failure is reported so missing hours are visible in the output.
        failed_urls.append(url)
        print(f"{YEAR}: skipped {url} ({type(exc).__name__}: {exc})")

    if i % PROGRESS_EVERY == 0:
        rate = i / max(1e-6, time.time() - t0)
        eta = (len(todo) - i) / rate / 60
        print(f"{YEAR}: {i}/{len(todo)} files processed | ETA ≈ {eta:.1f} minutes")

    if i % CHECKPOINT_EVERY == 0:
        write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)

write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)
print(f"{YEAR} completed. Documents retained: {docs_kept:,}")
if failed_urls:
    # Failed URLs are not recorded as processed, so a re-run plans them again.
    print(f"{YEAR}: {len(failed_urls):,} of {len(todo):,} files failed and were skipped")

# Fold this year's monthly rows into the stitched master file. When a month
# already exists in the master, the freshly computed row replaces it.
y = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")
m = y
if MASTER_FILE.exists():
    previous_master = pd.read_csv(MASTER_FILE, parse_dates=["month"])
    stacked = pd.concat([previous_master, y], ignore_index=True)
    deduplicated = stacked.drop_duplicates(subset="month", keep="last")
    m = deduplicated.sort_values("month").reset_index(drop=True)

m.to_csv(MASTER_FILE, index=False)
print("Master dataset updated:", MASTER_FILE)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Outputs will be saved to: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2024
2024: 8,784 files planned
2024: 100/8784 files processed | ETA ≈ 94.1 minutes
2024: 200/8784 files processed | ETA ≈ 90.0 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2024/Gdelt_uk_housing_monthly_2024.csv
2024: 300/8784 files processed | ETA ≈ 96.1 minutes
2024: 400/8784 files processed | ETA ≈ 92.4 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2024/Gdelt_uk_housing_monthly_2024.csv
2024: 500/8784 files processed | ETA ≈ 90.3 minutes
2024: 600/8784 files processed | ETA ≈ 92.5 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2024/Gdelt_uk_housing_monthly_2024.csv
2024: 700/8784 files processed | ETA ≈ 90.4 minutes
2024: 800/8784 files processed | ETA ≈ 91.4 minutes
Saved yearly o

2025

In [None]:
from google.colab import drive
drive.mount("/content/drive")

# Configuration
# YEAR labels the output folder, the yearly CSV name and the log messages.
YEAR = 2025

# HOURLY_ONLY: when true, build_gkg_urls keeps only files whose 14-digit
#   timestamp has "00" at positions 10-11.
# PROGRESS_EVERY / CHECKPOINT_EVERY: number of files between progress lines
#   and between checkpoint writes in the main loop.
# MAX_FILES: optional cap on how many pending files one run handles
#   (None = no cap).
HOURLY_ONLY      = True
PROGRESS_EVERY   = 100
CHECKPOINT_EVERY = 200
MAX_FILES        = None

# Both patterns are compiled with re.I and applied with an unanchored
# str.contains search (locations column and themes column respectively).
# UK_REGEX only matches a listed token when it sits between two '#' characters.
# NOTE(review): THEME_PATTERN stems are plain substrings, so short ones such
# as RENT or HOUS also hit longer tokens that merely contain them (e.g.
# PARENT, WAREHOUSE) — confirm this breadth is intended.
UK_REGEX = r'#(?:GBR|GB|UK|United Kingdom|England|Scotland|Wales|Northern Ireland)#'
THEME_PATTERN = r'(?:HOUS|MORTGAG|REMORTGAG|RENT|TENAN|REAL[ _]?ESTATE|PROPERTY|HOME[ _]?PRIC|HOUSE[ _]?PRIC|RIGHTMOVE|ZOOPLA|LANDLORD|BUY[ -]?TO[ -]?LET)'

# Zero-based column positions in the header-less, tab-separated GKG files
# (they are passed to read_csv as usecols with header=None).
DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I = 1, 8, 10, 15
MASTER_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"

# Inclusive bounds, compared as strings against the 14-digit timestamp taken
# from each file name.
START_TS, END_TS = "20250101000000", "20250630235959"

# Output paths
BASE_DIR = Path("/content/drive/MyDrive/msc_project/GDELT")
BASE_DIR.mkdir(parents=True, exist_ok=True)

# Per-year run folder holding the yearly CSV and the resume checkpoint.
OUT_DIR = BASE_DIR / f"Gdelt_run_{YEAR}"
OUT_DIR.mkdir(parents=True, exist_ok=True)

YEARLY_CSV = OUT_DIR / f"Gdelt_uk_housing_monthly_{YEAR}.csv"
STATE_PATH = OUT_DIR / "state.json"
# Master file that each yearly run appends its months to.
MASTER_FILE = BASE_DIR / "Gdelt_uk_housing_monthly_STITCHING.csv"

print("Outputs will be saved to:", OUT_DIR)


# HTTP and URL lists
def make_session():
    """Create the ``requests.Session`` used for all GDELT downloads.

    The session sends the ``gdelt-colab/1.0`` User-Agent and mounts a
    separate retrying adapter for each of the ``http://`` and ``https://``
    prefixes (up to 5 retries, backoff factor 0.5, retrying on status
    429/500/502/503/504).
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
    )

    session = requests.Session()
    session.headers.update({"User-Agent": "gdelt-colab/1.0"})
    for scheme in ("http://", "https://"):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return session

def http_get(url, sess, timeout=90):
    """Fetch ``url`` through ``sess`` over plain HTTP and return the response.

    A leading ``https://`` scheme is rewritten to ``http://`` before the
    request is issued. ``raise_for_status()`` is invoked on the response
    before it is handed back to the caller.
    """
    secure_prefix = "https://"
    target = url
    if target.startswith(secure_prefix):
        target = "http://" + target[len(secure_prefix):]

    response = sess.get(target, timeout=timeout)
    response.raise_for_status()
    return response

def build_gkg_urls(sess):
    """Return the sorted list of GKG archive URLs selected for this run.

    The master file list at ``MASTER_URL`` is downloaded and read line by
    line. A line is used only if it has exactly three whitespace-separated
    fields and its third field ends in ``/gdeltv2/<14 digits>.gkg.csv.zip``.
    The 14-digit stamp must lie within ``START_TS``..``END_TS`` (inclusive,
    compared as strings) and, when ``HOURLY_ONLY`` is set, characters 10-11
    of the stamp must be ``"00"``. Returned URLs use the ``http://`` scheme.
    """
    gkg_name = re.compile(r"/gdeltv2/(\d{14})\.gkg\.csv\.zip$")
    listing = http_get(MASTER_URL, sess, timeout=120).text

    def _select(row):
        # Returns the rewritten URL for a wanted row, otherwise None.
        fields = row.split()
        if len(fields) != 3:
            return None
        link = fields[2]
        hit = gkg_name.search(link)
        if not hit:
            return None
        stamp = hit.group(1)
        if not (START_TS <= stamp <= END_TS):
            return None
        if HOURLY_ONLY and stamp[10:12] != "00":
            return None
        return link.replace("https://", "http://")

    candidates = (_select(row) for row in listing.splitlines())
    return sorted(link for link in candidates if link is not None)

# Parsing and aggregation
def month_from_dateint(dateint):
    """Convert a ``YYYYMMDDHHMMSS`` value into a ``"YYYY-MM"`` month key.

    The argument is stringified and parsed strictly with
    ``%Y%m%d%H%M%S``; a value that does not match raises ``ValueError``.
    """
    parsed = datetime.strptime(str(dateint), "%Y%m%d%H%M%S")
    return "{:04d}-{:02d}".format(parsed.year, parsed.month)

def doc_tone(v2tone):
    """Return the first comma-separated field of ``v2tone`` as a float.

    Any failure while stringifying, splitting or converting yields
    ``math.nan`` instead of raising.
    """
    try:
        first_field, _, _ = str(v2tone).partition(",")
        value = float(first_field)
    except Exception:
        value = math.nan
    return value

def process_gkg_frame(df, counts, tone_sum, tone_n, uk_re, theme_re):
    """Fold one GKG frame into the running monthly accumulators.

    Rows are kept when column ``V2LOCS_I`` matches ``uk_re`` and column
    ``V2THEMES_I`` matches ``theme_re`` (both columns are cast to ``str``
    first). For every kept row the month key derived from column ``DATE_I``
    gets ``counts`` incremented; ``tone_sum`` and ``tone_n`` are updated only
    when the tone parsed from column ``V2TONE_I`` is not NaN.

    The three mappings are mutated in place. Returns the number of kept rows.
    """
    locations_text = df[V2LOCS_I].astype(str)
    is_uk = locations_text.str.contains(uk_re, na=False, regex=True)
    themes_text = df[V2THEMES_I].astype(str)
    is_theme = themes_text.str.contains(theme_re, na=False, regex=True)

    matched = df.loc[is_uk & is_theme, [DATE_I, V2TONE_I]]
    if matched.empty:
        return 0

    # Both derived series are computed up front, before any accumulator is
    # touched, so a parsing error cannot leave the totals half-updated.
    month_keys = matched[DATE_I].astype(str).apply(month_from_dateint)
    tone_values = matched[V2TONE_I].apply(doc_tone)

    for month_key, tone in zip(month_keys, tone_values):
        counts[month_key] += 1
        if math.isnan(tone):
            continue
        tone_sum[month_key] += tone
        tone_n[month_key] += 1

    return len(matched)

# Saving yearly output and checkpoint
def write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n):
    """Write the yearly monthly-aggregate CSV and the resume checkpoint.

    Parameters
    ----------
    processed_count : number of files processed so far (stored in the state).
    processed_urls_tail : list of processed URLs; only the last 20000 entries
        are stored in the state file.
    counts, tone_sum, tone_n : mappings keyed by ``"YYYY-MM"`` holding the
        document count, the sum of tones and the number of tones per month.

    Side effects
    ------------
    Overwrites ``YEARLY_CSV`` with one row per month (``month``, ``docs``,
    ``avg_tone``) and overwrites ``STATE_PATH`` with a JSON checkpoint.
    """
    rows = []
    for m in sorted(counts.keys()):
        avg = (tone_sum[m] / tone_n[m]) if tone_n[m] else None
        rows.append({"month": f"{m}-01", "docs": counts[m], "avg_tone": avg})

    # Name the columns explicitly: with no matched documents yet, ``rows`` is
    # empty and a column-less frame would make sort_values("month") raise
    # KeyError, aborting the run at the first checkpoint.
    monthly = pd.DataFrame(rows, columns=["month", "docs", "avg_tone"])
    monthly.sort_values("month").to_csv(YEARLY_CSV, index=False)

    with open(STATE_PATH, "w") as f:
        json.dump({
            "processed_count": processed_count,
            "processed_urls_tail": processed_urls_tail[-20000:],
            "counts": dict(counts),
            "tone_sum": dict(tone_sum),
            "tone_n": dict(tone_n)
        }, f)

    print("Saved yearly output:", YEARLY_CSV)

# Main execution: plan the file list, resume from the checkpoint if one
# exists, then download and aggregate each remaining GKG file.
sess = make_session()
urls_all = build_gkg_urls(sess)
print(f"{YEAR}: {len(urls_all):,} files planned")

if STATE_PATH.exists():
    # Context manager so the checkpoint file handle is closed after reading.
    with open(STATE_PATH) as state_file:
        state = json.load(state_file)
    processed_count = state.get("processed_count", 0)
    processed_urls_tail = state.get("processed_urls_tail", [])
    counts   = defaultdict(int, state.get("counts", {}))
    tone_sum = defaultdict(float, state.get("tone_sum", {}))
    tone_n   = defaultdict(int, state.get("tone_n", {}))
    print("Resuming from checkpoint")
else:
    processed_count = 0
    processed_urls_tail = []
    counts, tone_sum, tone_n = defaultdict(int), defaultdict(float), defaultdict(int)

# Build the lookup set once instead of once per planned URL.
already_processed = set(processed_urls_tail)
todo = [u for u in urls_all if u not in already_processed]
if MAX_FILES is not None:
    todo = todo[:MAX_FILES]

uk_re = re.compile(UK_REGEX, re.I)
theme_re = re.compile(THEME_PATTERN, re.I)

t0 = time.time()
docs_kept = 0
failed_urls = []  # files that could not be downloaded or parsed in this run

for i, url in enumerate(todo, 1):
    try:
        payload = http_get(url, sess).content
        # Close the archive and its member as soon as the frame is loaded.
        with zipfile.ZipFile(io.BytesIO(payload)) as z:
            name = z.namelist()[0]
            with z.open(name) as member:
                df = pd.read_csv(
                    member,
                    sep="\t",
                    header=None,
                    quoting=3,
                    low_memory=False,
                    usecols=[DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I]
                )

        docs_kept += process_gkg_frame(df, counts, tone_sum, tone_n, uk_re, theme_re)
        processed_count += 1
        processed_urls_tail.append(url)

    except Exception as exc:
        # Still best-effort (one bad file must not stop the run), but the
        # failure is reported so missing hours are visible in the output.
        failed_urls.append(url)
        print(f"{YEAR}: skipped {url} ({type(exc).__name__}: {exc})")

    if i % PROGRESS_EVERY == 0:
        rate = i / max(1e-6, time.time() - t0)
        eta = (len(todo) - i) / rate / 60
        print(f"{YEAR}: {i}/{len(todo)} files processed | ETA ≈ {eta:.1f} minutes")

    if i % CHECKPOINT_EVERY == 0:
        write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)

write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)
print(f"{YEAR} completed. Documents retained: {docs_kept:,}")
if failed_urls:
    # Failed URLs are not recorded as processed, so a re-run plans them again.
    print(f"{YEAR}: {len(failed_urls):,} of {len(todo):,} files failed and were skipped")

# Fold this year's monthly rows into the stitched master file. When a month
# already exists in the master, the freshly computed row replaces it.
y = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")
m = y
if MASTER_FILE.exists():
    previous_master = pd.read_csv(MASTER_FILE, parse_dates=["month"])
    stacked = pd.concat([previous_master, y], ignore_index=True)
    deduplicated = stacked.drop_duplicates(subset="month", keep="last")
    m = deduplicated.sort_values("month").reset_index(drop=True)

m.to_csv(MASTER_FILE, index=False)
print("Master dataset updated:", MASTER_FILE)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Outputs will be saved to: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2025
2025: 3,950 files planned
2025: 100/3950 files processed | ETA ≈ 34.4 minutes
2025: 200/3950 files processed | ETA ≈ 36.6 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2025/Gdelt_uk_housing_monthly_2025.csv
2025: 300/3950 files processed | ETA ≈ 35.1 minutes
2025: 400/3950 files processed | ETA ≈ 37.6 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2025/Gdelt_uk_housing_monthly_2025.csv
2025: 500/3950 files processed | ETA ≈ 35.9 minutes
2025: 600/3950 files processed | ETA ≈ 35.7 minutes
Saved yearly output: /content/drive/MyDrive/msc_project/GDELT/Gdelt_run_2025/Gdelt_uk_housing_monthly_2025.csv
2025: 700/3950 files processed | ETA ≈ 34.8 minutes
2025: 800/3950 files processed | ETA ≈ 33.6 minutes
Saved yearly o

Merging the yearly Events 1.0 outputs (2005 to Jan 2015)

In [None]:
from pathlib import Path
# Folder on the mounted Drive that holds the GDELT 1.0 monthly output files.
# The merged files produced further down are written back into this folder.
BASE_DIR = "/content/drive/MyDrive/msc_project/Gdelt_Events_1.0"
BASE = Path(BASE_DIR)

# Stop immediately with a clear message if the folder is not present.
if not BASE.exists():
    raise FileNotFoundError(f"Folder not found: {BASE}. Check Drive path.")

print("Reading monthly output files from:", BASE)

# Loading all CSVs matching a pattern and returning a clean monthly series
def load_monthly_series(pattern: str, label: str, start="2005-01-01", end="2015-01-01", base=None) -> pd.DataFrame:
    """
    Load all CSVs matching ``pattern`` and return one clean monthly series
    with the columns Month, Docs, AvgTone.

    Parameters
    ----------
    pattern : glob pattern evaluated inside the base folder.
    label   : name used in the progress messages.
    start, end : inclusive window (anything ``pd.Timestamp`` accepts).
    base    : folder to search; defaults to the module-level ``BASE``.

    Processing
    ----------
    * Column names are matched case-insensitively (``month``, ``docs``,
      ``avgtone``/``avg_tone``) and renamed to the canonical spelling.
    * Files without a Month column are skipped. Files that have a Month
      column but neither Docs nor AvgTone are skipped as well: they carry no
      values, and would otherwise add all-NaN rows that can displace real
      data during de-duplication. The ``*_WIDE.csv`` file this notebook saves
      into the same folder is such a file and matches the All-UK pattern.
    * Frames are concatenated in sorted file order, sorted by Month with a
      stable sort, and de-duplicated keeping the last row per Month, so the
      file that sorts last wins deterministically.

    Note: the merged ``*_2005_to_2015Jan.csv`` files saved by this notebook
    also match the input patterns on a re-run; they carry the same
    Month/Docs/AvgTone columns and are therefore still read.
    """
    folder = Path(BASE if base is None else base)
    files = sorted(folder.glob(pattern))

    if not files:
        print(f"{label}: no files found for pattern '{pattern}'")
        return pd.DataFrame(columns=["Month", "Docs", "AvgTone"])

    canonical = {"month": "Month", "docs": "Docs", "avgtone": "AvgTone", "avg_tone": "AvgTone"}

    frames = []
    for p in files:
        df = pd.read_csv(p)

        # Standardise likely variants of the key columns
        rename_map = {}
        for c in df.columns:
            key = str(c).strip().lower()
            if key in canonical:
                rename_map[c] = canonical[key]
        df = df.rename(columns=rename_map)

        # Only keep what is needed
        needed = [c for c in ["Month", "Docs", "AvgTone"] if c in df.columns]
        if "Month" not in needed:
            continue
        if len(needed) == 1:
            # Month only, no values: nothing to merge from this file.
            continue

        df = df[needed].copy()

        # Enforce types
        df["Month"] = pd.to_datetime(df["Month"], errors="coerce")
        for value_col in ("Docs", "AvgTone"):
            if value_col in df.columns:
                df[value_col] = pd.to_numeric(df[value_col], errors="coerce")

        df = df.dropna(subset=["Month"])
        frames.append(df)

    if not frames:
        print(f"{label}: files found, but none contained a usable 'Month' column.")
        return pd.DataFrame(columns=["Month", "Docs", "AvgTone"])

    out = (
        pd.concat(frames, ignore_index=True)
          # mergesort is stable, which makes keep="last" well defined
          .sort_values("Month", kind="mergesort")
          .drop_duplicates(subset="Month", keep="last")
          .reset_index(drop=True)
    )

    # Clipping to the requested window (inclusive at both ends)
    start = pd.Timestamp(start)
    end   = pd.Timestamp(end)
    out = out[(out["Month"] >= start) & (out["Month"] <= end)].reset_index(drop=True)

    # Report the number of files that actually contributed rows.
    print(f"{label}: merged {len(frames)} files → {len(out)} monthly rows")
    return out

# Load the two Events 1.0 series (all UK events, and the economic subset).
all_df = load_monthly_series("events_uk_monthly_*.csv", label="All-UK")
econ_df = load_monthly_series("events_uk_economic_monthly_*.csv", label="Economic-only")

# Write each merged series next to its inputs.
out_all = BASE / "events_uk_monthly_2005_to_2015Jan.csv"
out_econ = BASE / "events_uk_economic_monthly_2005_to_2015Jan.csv"

for series_frame, series_path in ((all_df, out_all), (econ_df, out_econ)):
    series_frame.to_csv(series_path, index=False, date_format="%Y-%m-%d")

for series_path in (out_all, out_econ):
    print("Saved:", series_path)

# Side-by-side (wide) table: one row per month, one column group per series.
all_renamed = all_df.rename(columns={"Docs": "Docs_All", "AvgTone": "AvgTone_All"})
econ_renamed = econ_df.rename(columns={"Docs": "Docs_Econ", "AvgTone": "AvgTone_Econ"})
wide = (
    pd.merge(all_renamed, econ_renamed, on="Month", how="outer")
      .sort_values("Month")
      .reset_index(drop=True)
)

out_wide = BASE / "events_uk_monthly_2005_to_2015Jan_WIDE.csv"
wide.to_csv(out_wide, index=False, date_format="%Y-%m-%d")
print("Saved:", out_wide)

# Quick look at the end of the merged table and its size.
print("\nPreview (last 6 rows):")
print(wide.tail(6).to_string(index=False))
print("\nRows:", len(wide), "| Months:", wide["Month"].nunique())


Reading monthly output files from: /content/drive/MyDrive/msc_project/Gdelt_Events_1.0
All-UK: merged 11 files → 121 monthly rows
Economic-only: merged 11 files → 121 monthly rows
Saved: /content/drive/MyDrive/msc_project/Gdelt_Events_1.0/events_uk_monthly_2005_to_2015Jan.csv
Saved: /content/drive/MyDrive/msc_project/Gdelt_Events_1.0/events_uk_economic_monthly_2005_to_2015Jan.csv
Saved: /content/drive/MyDrive/msc_project/Gdelt_Events_1.0/events_uk_monthly_2005_to_2015Jan_WIDE.csv

Preview (last 6 rows):
     Month  Docs_All  AvgTone_All  Docs_Econ  AvgTone_Econ
2014-08-01    205392     2.564620      54274      2.678771
2014-09-01    278268     2.626609      75431      2.690719
2014-10-01    265410     2.680726      71706      2.814015
2014-11-01    243234     2.656559      64486      2.790142
2014-12-01    198960     2.724392      53564      2.859509
2015-01-01    219962     2.650666      60775      2.742955

Rows: 121 | Months: 121


Merging the Events 1.0 series (Jan 2005 – Jan 2015) with the GKG series (Feb 2015 – June 2025)

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from google.colab import drive

drive.mount("/content/drive")

# Paths
# Inputs: the merged Events 1.0 economic-only series and the GKG master file
# that the yearly GKG cells append to.
EVENTS_PATH = Path("/content/drive/MyDrive/msc_project/Gdelt_Events_1.0/events_uk_economic_monthly_2005_to_2015Jan.csv")
GKG_MASTER  = Path("/content/drive/MyDrive/msc_project/GDELT/Gdelt_uk_housing_monthly_STITCHING.csv")

# Folder that receives the wide and stitched outputs of this cell.
OUT_DIR = Path("/content/drive/MyDrive/msc_project/GDELT")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Study window
# Months are represented by their first day, so END = 2025-06-01 makes
# June 2025 the last month in the window.
START = pd.Timestamp("2005-01-01")
END   = pd.Timestamp("2025-06-01")

# Switch-over point of the stitch: months up to and including EVENTS_LAST are
# taken from the Events series, months from GKG_FIRST onwards from GKG.
EVENTS_LAST = pd.Timestamp("2015-01-01")
GKG_FIRST   = pd.Timestamp("2015-02-01")

def month_start(x) -> pd.Series:
    """Parse dates and anchor each one to the first day of its month.

    Values that cannot be parsed become ``NaT``. ``x`` must be a Series,
    because the ``.dt`` accessor is used on the parsed result.
    """
    # ``infer_datetime_format`` is deprecated and only triggers a UserWarning
    # on current pandas, so it is no longer passed.
    dt = pd.to_datetime(x, errors="coerce")
    return dt.dt.to_period("M").dt.to_timestamp(how="start")

def standardise_monthly(df: pd.DataFrame, month_col: str) -> pd.DataFrame:
    """
    Return a copy of ``df`` reduced to the canonical monthly layout
    (Month, Docs, AvgTone), one row per month, sorted by Month.

    Column names are stripped of surrounding whitespace and matched
    case-insensitively against month / docs / avg_tone / avgtone. If
    ``month_col`` names a column that is still present after that step, it is
    renamed to Month. Raises ``ValueError`` when no Month column results.
    Duplicate months keep the last row; rows with an unparseable month are
    dropped.
    """
    canonical = {
        "month": "Month",
        "docs": "Docs",
        "avg_tone": "AvgTone",
        "avgtone": "AvgTone",
    }

    tidy = df.copy()
    tidy.columns = [str(name).strip() for name in tidy.columns]
    tidy = tidy.rename(
        columns={
            name: canonical[name.lower()]
            for name in tidy.columns
            if name.lower() in canonical
        }
    )

    if month_col != "Month" and month_col in tidy.columns:
        tidy = tidy.rename(columns={month_col: "Month"})

    if "Month" not in tidy.columns:
        raise ValueError(f"Expected a month/date column. Found columns: {tidy.columns.tolist()}")

    # Type coercion: month-start timestamps and numeric value columns.
    tidy["Month"] = month_start(tidy["Month"])
    for value_col in ("Docs", "AvgTone"):
        if value_col in tidy.columns:
            tidy[value_col] = pd.to_numeric(tidy[value_col], errors="coerce")

    present = [name for name in ("Month", "Docs", "AvgTone") if name in tidy.columns]
    return (
        tidy[present]
        .dropna(subset=["Month"])
        .sort_values("Month")
        .drop_duplicates(subset="Month", keep="last")
        .reset_index(drop=True)
    )

def clip_months(df: pd.DataFrame, start: pd.Timestamp, end: pd.Timestamp) -> pd.DataFrame:
    """Keep the rows whose Month lies in ``start``..``end`` (both inclusive).

    The result carries a fresh 0-based index.
    """
    in_window = df["Month"].between(start, end)
    return df.loc[in_window].reset_index(drop=True)

# Events 1.0 (economic-only) monthly series
if not EVENTS_PATH.exists():
    raise FileNotFoundError(f"Events file not found: {EVENTS_PATH}")

events_raw = pd.read_csv(EVENTS_PATH)
# Standardise, then restrict to the Events part of the stitch (2005-01 .. 2015-01).
events = clip_months(
    standardise_monthly(events_raw, month_col="Month"),
    START,
    EVENTS_LAST,
)

# GKG housing monthly series
if not GKG_MASTER.exists():
    raise FileNotFoundError(f"GKG master file not found: {GKG_MASTER}")

gkg_raw = pd.read_csv(GKG_MASTER)
gkg_month_col = "month" if "month" in gkg_raw.columns else "Month"
# Standardise, then restrict to the GKG part of the stitch (2015-02 .. 2025-06).
gkg = clip_months(
    standardise_monthly(gkg_raw, month_col=gkg_month_col),
    GKG_FIRST,
    END,
)

# WIDE dataset: one row for every month of the study window, with the Events
# and GKG columns side by side (NaN where a source has no value).
full_months = pd.date_range(START, END, freq="MS")

events_w = (
    events.rename(columns={"Docs": "Docs_Events_Econ", "AvgTone": "AvgTone_Events_Econ"})
          .set_index("Month")
)
gkg_w = (
    gkg.rename(columns={"Docs": "Docs_GKG", "AvgTone": "AvgTone_GKG"})
       .set_index("Month")
)

month_frame = pd.DataFrame(index=full_months)
with_events = month_frame.join(events_w[["Docs_Events_Econ", "AvgTone_Events_Econ"]], how="left")
with_both = with_events.join(gkg_w[["Docs_GKG", "AvgTone_GKG"]], how="left")
wide = with_both.rename_axis("Month").reset_index()

# STITCHED single series: Events values up to EVENTS_LAST, GKG values after.
in_events_era = wide["Month"] <= EVENTS_LAST
stitch_plan = (
    ("AvgTone_Stitched", "AvgTone_Events_Econ", "AvgTone_GKG"),
    ("Docs_Stitched", "Docs_Events_Econ", "Docs_GKG"),
)
for stitched_col, events_col, gkg_col in stitch_plan:
    wide[stitched_col] = np.where(in_events_era, wide[events_col], wide[gkg_col])

wide["Source"] = np.where(
    in_events_era,
    "GDELT_Events_1_0_Economic",
    "GDELT_GKG_2_1_Housing"
)

stitched = wide[["Month", "AvgTone_Stitched", "Docs_Stitched", "Source"]].copy()

# Write both tables to the output folder.
out_wide = OUT_DIR / "gdelt_tone_2005_2025_wide_ECONxGKG.csv"
out_st   = OUT_DIR / "gdelt_tone_2005_2025_stitched_ECONxGKG.csv"

for table, destination in ((wide, out_wide), (stitched, out_st)):
    table.to_csv(destination, index=False, date_format="%Y-%m-%d")

print("Files saved")
print("Wide (both sources):", out_wide)
print("Stitched (single series):", out_st)

# Report the month range and row count contributed by each source.
print("\nCoverage check:")
for tag, source_frame in (("Events months:", events), ("GKG months   :", gkg)):
    print(tag, source_frame["Month"].min(), "→", source_frame["Month"].max(), "| rows:", len(source_frame))

print("\nPreview (last 6 rows of stitched):")
print(stitched.tail(6).to_string(index=False))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Files saved
Wide (both sources): /content/drive/MyDrive/msc_project/GDELT/gdelt_tone_2005_2025_wide_ECONxGKG.csv
Stitched (single series): /content/drive/MyDrive/msc_project/GDELT/gdelt_tone_2005_2025_stitched_ECONxGKG.csv

Coverage check:
Events months: 2005-01-01 00:00:00 → 2015-01-01 00:00:00 | rows: 121
GKG months   : 2015-02-01 00:00:00 → 2025-06-01 00:00:00 | rows: 125

Preview (last 6 rows of stitched):
     Month  AvgTone_Stitched  Docs_Stitched                Source
2025-01-01         -1.096796        14180.0 GDELT_GKG_2_1_Housing
2025-02-01         -0.884768        16683.0 GDELT_GKG_2_1_Housing
2025-03-01         -0.769141        17780.0 GDELT_GKG_2_1_Housing
2025-04-01         -1.112180        14861.0 GDELT_GKG_2_1_Housing
2025-05-01         -0.360499        18253.0 GDELT_GKG_2_1_Housing
2025-06-01         -0.826562         8278.0 GDELT_GKG_2_1_Hou

  dt = pd.to_datetime(x, errors="coerce", infer_datetime_format=True)
  dt = pd.to_datetime(x, errors="coerce", infer_datetime_format=True)
