In [None]:
# ======================= GDELT 1.0 EVENTS → 2005 monthly UK AvgTone =======================
# Outputs (saved to Drive: /content/drive/MyDrive/msc_project/gdelt_events_1p0):
#   - events_uk_monthly_2005.csv
#   - events_uk_economic_monthly_2005.csv

!pip -q install pandas requests

from google.colab import drive
drive.mount("/content/drive")

import io, zipfile, requests, pandas as pd, numpy as np, math, re
from pathlib import Path
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from collections import defaultdict

# ---------- CONFIG ----------
# Year to aggregate; change if needed.
YEAR = 2005
# Base URL of the GDELT Events download area (plain HTTP avoids SSL hostname issues).
BASE = "http://data.gdeltproject.org/events"

# Google Drive folder that receives the output CSVs; created if it does not exist.
OUT_DIR = Path("/content/drive/MyDrive/msc_project/gdelt_events_1p0")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Country codes used to recognise UK-related rows.
UK_FIPS = "UK"    # ActionGeo_* country code (FIPS-2)
UK_ISO3 = "GBR"   # Actor*CountryCode (ISO-3/CAMEO)
# Number of rows handed to pandas per read_csv chunk.
CHUNK = 200_000

# Events 1.0 column positions (pre-2013 format; 57 columns total: 0..56)
COL_SQLDATE = 1   # SQLDATE
COL_A1C = 7       # Actor1CountryCode
COL_A2C = 17      # Actor2CountryCode
COL_ROOT = 28     # EventRootCode
COL_TONE = 34     # AvgTone
COL_ACTC = 51     # ActionGeo_CountryCode

# ---------- helpers ----------
def session():
    """Build a fresh requests.Session with a custom User-Agent and retrying adapters.

    One Retry policy (5 retries, backoff factor 0.5, retry on HTTP
    429/500/502/503/504) is attached through a separate HTTPAdapter for each of
    the http:// and https:// prefixes.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    sess = requests.Session()
    sess.headers.update({"User-Agent": "gdelt-events-2005/1.0"})
    for prefix in ("http://", "https://"):
        sess.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return sess

def fetch_year_zip(year: int):
    """Download the yearly Events ZIP for `year` and open it in memory.

    The direct URL ``{BASE}/{year}.zip`` is tried first. If that request is not
    successful, or its payload cannot be opened, the events index page is
    scanned for a link to ``{year}.zip`` and that file is fetched instead.

    Args:
        year: Four-digit year of the yearly archive.

    Returns:
        Tuple ``(url, zip_file, inner_name)``: the URL that was used, the opened
        ``zipfile.ZipFile`` (the caller is responsible for closing it) and the
        name of its first member.

    Raises:
        RuntimeError: the index page has no link to ``{year}.zip``.
        requests.HTTPError: the fallback download returned an error status.
    """
    from urllib.parse import urljoin

    def _open_zip(payload):
        # Wrap the downloaded bytes and report the first member's name.
        archive = zipfile.ZipFile(io.BytesIO(payload))
        return archive, archive.namelist()[0]

    # One session for all requests; closed once the payload is held in memory.
    with session() as http:
        url_direct = f"{BASE}/{year}.zip"
        try:
            r = http.get(url_direct, timeout=180)
            if r.ok:
                z, inner = _open_zip(r.content)
                return url_direct, z, inner
            print(f"Direct download returned HTTP {r.status_code} for {url_direct}; trying the index page.")
        except Exception as exc:
            # Still best-effort, but say why the direct route was abandoned.
            print(f"Direct download failed for {url_direct}: {exc!r}; trying the index page.")

        # Fallback via /events/index.html
        index_url = f"{BASE}/index.html"
        idx_html = http.get(index_url, timeout=120).text
        # [^"]* keeps the match inside a single href value; the optional path
        # prefix also accepts a bare relative link such as href="2005.zip".
        m = re.search(rf'href="((?:[^"]*/)?{year}\.zip)"', idx_html, re.I)
        if not m:
            raise RuntimeError(f"Could not find a yearly ZIP for {year} on the events index.")
        # urljoin resolves absolute, root-relative and page-relative links alike.
        url = urljoin(index_url, m.group(1))
        r = http.get(url, timeout=180)
        r.raise_for_status()
        z, inner = _open_zip(r.content)
        return url, z, inner

def add_chunk(df, counts, tone_sum, tone_n):
    """Fold one chunk's UK-related rows into the running monthly accumulators.

    A row is UK-related when its ActionGeo country equals UK_FIPS or either
    actor country equals UK_ISO3. Rows whose SQLDATE does not parse as YYYYMMDD
    are dropped. For every month start (YYYY-MM-01) present in the chunk,
    `counts` gains the number of rows, `tone_sum` the sum of the non-missing
    AvgTone values and `tone_n` the number of non-missing AvgTone values.
    The three mappings are updated in place; nothing is returned.
    """
    is_uk = df[COL_ACTC].eq(UK_FIPS) | df[COL_A1C].eq(UK_ISO3) | df[COL_A2C].eq(UK_ISO3)
    if not is_uk.any():
        return

    uk_rows = df.loc[is_uk]
    # Calendar month start for each row; unparseable dates become NaT.
    month_start = (
        pd.to_datetime(uk_rows[COL_SQLDATE], format="%Y%m%d", errors="coerce")
          .dt.to_period("M")
          .dt.to_timestamp()
    )
    tone = pd.to_numeric(uk_rows[COL_TONE], errors="coerce")

    has_date = month_start.notna()
    if not has_date.any():
        return

    # One pass per chunk: row count, tone sum (NaN skipped) and non-NaN tone count.
    per_month = (
        tone[has_date]
        .groupby(month_start[has_date])
        .agg(Docs="size", ToneSum="sum", ToneN="count")
    )
    for month, n_rows, tone_total, n_tone in per_month.itertuples(name=None):
        counts[month] += int(n_rows)
        tone_sum[month] += float(tone_total)
        tone_n[month] += int(n_tone)

def add_chunk_econ(df, counts, tone_sum, tone_n):
    """Aggregate only the rows whose EventRootCode begins with '04'.

    The matching rows are forwarded to add_chunk, which applies the UK filter
    and updates the three accumulators in place. Nothing happens when no row
    matches.

    NOTE(review): the file labels this subset "economic", but the CAMEO
    codebook appears to list root code 04 as "Consult" — confirm the intended code.
    """
    root_codes = df[COL_ROOT].astype(str)
    is_selected = root_codes.str.startswith("04", na=False)
    if not is_selected.any():
        return
    add_chunk(df.loc[is_selected, :], counts, tone_sum, tone_n)

def finalize(counts, tone_sum, tone_n):
    """Turn the monthly accumulators into a tidy DataFrame sorted by month.

    Args:
        counts: mapping month-start -> number of rows.
        tone_sum: mapping month-start -> sum of non-missing AvgTone values.
        tone_n: mapping month-start -> number of non-missing AvgTone values.

    Returns:
        DataFrame with columns ``Month``, ``Docs`` and ``AvgTone``; ``AvgTone``
        is ``tone_sum / tone_n`` or NaN when a month has no tone values. When
        `counts` is empty an empty frame with those three columns is returned
        (previously this raised KeyError in ``sort_values``).
    """
    rows = []
    for m in sorted(counts):
        n_tone = tone_n.get(m, 0)
        avg = (tone_sum.get(m, 0.0) / n_tone) if n_tone else np.nan
        rows.append({"Month": m, "Docs": counts[m], "AvgTone": avg})
    # Explicit columns keep "Month" present even when there are no rows.
    df = pd.DataFrame(rows, columns=["Month", "Docs", "AvgTone"])
    return df.sort_values("Month").reset_index(drop=True)

# ---------- run ----------
# Download the yearly archive, stream its CSV in chunks, aggregate the UK and
# UK-"economic" monthly series, then write both to OUT_DIR and print them.
url, z, inner = fetch_year_zip(YEAR)
print(f"Fetched: {url} | Inner: {inner}")

uk_counts,   uk_tsum,   uk_tn   = defaultdict(int), defaultdict(float), defaultdict(int)
econ_counts, econ_tsum, econ_tn = defaultdict(int), defaultdict(float), defaultdict(int)

total_rows = 0
# Close the archive and its member stream once every chunk has been consumed;
# previously both stayed open for the lifetime of the kernel.
with z, z.open(inner) as csv_stream:
    reader = pd.read_csv(
        csv_stream,
        sep="\t", header=None, low_memory=False, chunksize=CHUNK,
        usecols=[COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT],
        dtype={COL_SQLDATE: str, COL_A1C: str, COL_A2C: str, COL_ACTC: str, COL_TONE: float, COL_ROOT: str}
    )
    for i, chunk in enumerate(reader, 1):
        total_rows += len(chunk)
        add_chunk(chunk,      uk_counts,   uk_tsum,   uk_tn)
        add_chunk_econ(chunk, econ_counts, econ_tsum, econ_tn)
        if i % 5 == 0:  # progress line every five chunks
            print(f"Processed ~{total_rows:,} rows…")

df_uk   = finalize(uk_counts,   uk_tsum,   uk_tn)
df_econ = finalize(econ_counts, econ_tsum, econ_tn)

# Save to Drive
out_uk   = OUT_DIR / f"events_uk_monthly_{YEAR}.csv"
out_econ = OUT_DIR / f"events_uk_economic_monthly_{YEAR}.csv"
df_uk.to_csv(out_uk, index=False, date_format="%Y-%m-%d")
df_econ.to_csv(out_econ, index=False, date_format="%Y-%m-%d")

print("\n--- Done ---")
print("UK-all monthly:")
print(df_uk.to_string(index=False))
print("\nUK-economic monthly (EventRootCode '04*'):")
print(df_econ.to_string(index=False))
print(f"\nWrote:\n  {out_uk}\n  {out_econ}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Fetched: http://data.gdeltproject.org/events/2005.zip | Inner: 2005.csv
Processed ~1,000,000 rows…
Processed ~2,000,000 rows…
Processed ~3,000,000 rows…

--- Done ---
UK-all monthly:
     Month  Docs  AvgTone
2005-01-01 12223 5.485720
2005-02-01 14461 5.838248
2005-03-01 12688 5.779465
2005-04-01  9765 5.893368
2005-05-01 10942 5.626196
2005-06-01 14543 5.742366
2005-07-01 25623 4.728918
2005-08-01 14541 4.783105
2005-09-01 14552 5.197001
2005-10-01 14015 5.401413
2005-11-01 14822 5.419917
2005-12-01 12740 5.520385

UK-economic monthly (EventRootCode '04*'):
     Month  Docs  AvgTone
2005-01-01  3443 5.659953
2005-02-01  4481 5.849820
2005-03-01  3601 6.084276
2005-04-01  2613 6.143649
2005-05-01  2777 5.747168
2005-06-01  4020 5.632774
2005-07-01  6308 5.016168
2005-08-01  3363 4.927621
2005-09-01  3698 5.500993
2005-10-01  3940 5.579145
2005-11-01  4564 5.7

2006

In [None]:
# ======================= GDELT 1.0 EVENTS → 2006 monthly UK AvgTone =======================
# Saves to Google Drive: /content/drive/MyDrive/msc_project/gdelt_events_1p0
# Outputs:
#   - events_uk_monthly_2006.csv
#   - events_uk_economic_monthly_2006.csv
# Notes: 2006 files are monthly ZIPs like 200601.zip, 200602.zip, ... (through 201303.zip).

!pip -q install pandas requests

from google.colab import drive
drive.mount("/content/drive")

import io, zipfile, requests, pandas as pd, numpy as np, math, re
from pathlib import Path
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from collections import defaultdict

# ---------- CONFIG ----------
# Year to aggregate.
YEAR = 2006
# Base URL of the GDELT Events download area; monthly archives are named
# <YYYYMM>.zip beneath it (e.g. 200601.zip, 200602.zip, ...).
BASE = "http://data.gdeltproject.org/events"

# Google Drive folder that receives the output CSVs; created if it does not exist.
OUT_DIR = Path("/content/drive/MyDrive/msc_project/gdelt_events_1p0")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Country codes used to recognise UK-related rows.
UK_FIPS = "UK"    # ActionGeo_* uses FIPS-2
UK_ISO3 = "GBR"   # Actor*CountryCode uses ISO-3
# Number of rows handed to pandas per read_csv chunk.
CHUNK = 200_000

# Column positions in Events 1.0 (57 cols total: 0..56)
COL_SQLDATE = 1   # SQLDATE
COL_A1C = 7       # Actor1CountryCode
COL_A2C = 17      # Actor2CountryCode
COL_ROOT = 28     # EventRootCode
COL_TONE = 34     # AvgTone
COL_ACTC = 51     # ActionGeo_CountryCode

# ---------- helpers ----------
def session():
    """Build a fresh requests.Session with a custom User-Agent and retrying adapters.

    One Retry policy (5 retries, backoff factor 0.5, retry on HTTP
    429/500/502/503/504) is attached through a separate HTTPAdapter for each of
    the http:// and https:// prefixes.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    sess = requests.Session()
    sess.headers.update({"User-Agent": "gdelt-events-monthly/1.0"})
    for prefix in ("http://", "https://"):
        sess.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return sess

def monthly_urls(year: int):
    """List the monthly archive URLs for `year` in calendar order.

    2013 yields January to March only; any other year yields all twelve months.
    Each URL has the form ``{BASE}/{YYYYMM}.zip``.
    """
    month_count = 12
    if year == 2013:
        month_count = 3
    urls = []
    for month in range(1, month_count + 1):
        urls.append(f"{BASE}/{year}{month:02d}.zip")
    return urls

def stream_month(url: str):
    """Download one monthly ZIP into memory and open it.

    Args:
        url: Full URL of a monthly archive.

    Returns:
        ``(ZipFile, inner_csv_name)`` where the name is the archive's first
        member (e.g. '200601.csv'), or ``None`` when the server answers 404.

    Raises:
        requests.HTTPError: for any other unsuccessful status. Previously every
            non-OK response returned None, so a server error was reported as
            "not found" and the month silently went missing.
    """
    # The body is fully downloaded by get(), so the session can be closed at once.
    with session() as http:
        r = http.get(url, timeout=180)
    if r.status_code == 404:
        return None
    r.raise_for_status()
    z = zipfile.ZipFile(io.BytesIO(r.content))
    inner = z.namelist()[0]  # e.g., '200601.csv'
    return z, inner

def add_chunk(df, counts, tone_sum, tone_n):
    """Fold one chunk's UK-related rows into the running monthly accumulators.

    A row is UK-related when its ActionGeo country equals UK_FIPS or either
    actor country equals UK_ISO3. Rows whose SQLDATE does not parse as YYYYMMDD
    are dropped. For every month start (YYYY-MM-01) present in the chunk,
    `counts` gains the number of rows, `tone_sum` the sum of the non-missing
    AvgTone values and `tone_n` the number of non-missing AvgTone values.
    The three mappings are updated in place; nothing is returned.
    """
    is_uk = df[COL_ACTC].eq(UK_FIPS) | df[COL_A1C].eq(UK_ISO3) | df[COL_A2C].eq(UK_ISO3)
    if not is_uk.any():
        return

    uk_rows = df.loc[is_uk]
    # Calendar month start for each row; unparseable dates become NaT.
    month_start = (
        pd.to_datetime(uk_rows[COL_SQLDATE], format="%Y%m%d", errors="coerce")
          .dt.to_period("M")
          .dt.to_timestamp()
    )
    tone = pd.to_numeric(uk_rows[COL_TONE], errors="coerce")

    has_date = month_start.notna()
    if not has_date.any():
        return

    # One pass per chunk: row count, tone sum (NaN skipped) and non-NaN tone count.
    per_month = (
        tone[has_date]
        .groupby(month_start[has_date])
        .agg(Docs="size", ToneSum="sum", ToneN="count")
    )
    for month, n_rows, tone_total, n_tone in per_month.itertuples(name=None):
        counts[month] += int(n_rows)
        tone_sum[month] += float(tone_total)
        tone_n[month] += int(n_tone)

def add_chunk_econ(df, counts, tone_sum, tone_n):
    """Aggregate only the rows whose EventRootCode begins with '04'.

    The matching rows are forwarded to add_chunk, which applies the UK filter
    and updates the three accumulators in place. Nothing happens when no row
    matches.

    NOTE(review): the file labels this subset "economic", but the CAMEO
    codebook appears to list root code 04 as "Consult" — confirm the intended code.
    """
    root_codes = df[COL_ROOT].astype(str)
    is_selected = root_codes.str.startswith("04", na=False)
    if not is_selected.any():
        return
    add_chunk(df.loc[is_selected, :], counts, tone_sum, tone_n)

def finalize(counts, tone_sum, tone_n, year:int):
    """Turn the monthly accumulators into one row per calendar month of `year`.

    Args:
        counts: mapping month-start -> number of rows.
        tone_sum: mapping month-start -> sum of non-missing AvgTone values.
        tone_n: mapping month-start -> number of non-missing AvgTone values.
        year: only months of this year are kept.

    Returns:
        DataFrame with columns ``Month``, ``Docs`` and ``AvgTone``, reindexed to
        every month start of the year (January–March for 2013, otherwise
        January–December). Months without data carry NaN. An empty `counts`
        now yields an all-NaN frame instead of raising KeyError.
    """
    rows = []
    for m in sorted(counts):
        if m.year != year:
            continue
        n_tone = tone_n.get(m, 0)
        avg = (tone_sum.get(m, 0.0) / n_tone) if n_tone else np.nan
        rows.append({"Month": m, "Docs": counts[m], "AvgTone": avg})
    # Explicit columns keep "Month" present even when no month qualified.
    df = pd.DataFrame(rows, columns=["Month", "Docs", "AvgTone"])
    df["Month"] = pd.to_datetime(df["Month"])  # datetime dtype even for an empty frame
    # reindex to full year months so any missing appear explicitly
    last_month = f"{year}-03-01" if year == 2013 else f"{year}-12-01"
    allm = pd.date_range(f"{year}-01-01", last_month, freq="MS")
    return (df.set_index("Month")
              .reindex(allm)
              .rename_axis("Month")
              .reset_index())

# ---------- run ----------
# Download each monthly archive, stream its CSV in chunks, aggregate the UK and
# UK-"economic" monthly series, then write both to OUT_DIR and print them.
urls = monthly_urls(YEAR)
print(f"{YEAR}: attempting {len(urls)} monthly files")
uk_counts, uk_tsum, uk_tn = defaultdict(int), defaultdict(float), defaultdict(int)
ec_counts, ec_tsum, ec_tn = defaultdict(int), defaultdict(float), defaultdict(int)

for url in urls:
    got = stream_month(url)
    if got is None:
        print("SKIP (not found):", url)
        continue
    z, inner = got
    print("Processing:", inner)
    # Close the in-memory archive and its member stream as soon as the month
    # has been aggregated; previously neither was ever closed.
    with z, z.open(inner) as csv_stream:
        reader = pd.read_csv(
            csv_stream,
            sep="\t", header=None, low_memory=False, chunksize=CHUNK,
            usecols=[COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT],
            dtype={COL_SQLDATE:str, COL_A1C:str, COL_A2C:str, COL_ACTC:str, COL_TONE:float, COL_ROOT:str}
        )
        rows = 0
        for i, chunk in enumerate(reader, 1):
            rows += len(chunk)
            add_chunk(chunk,   uk_counts, uk_tsum, uk_tn)
            add_chunk_econ(chunk, ec_counts, ec_tsum, ec_tn)
            if i % 5 == 0:  # progress line every five chunks
                print(f"  read ~{rows:,} rows…")

# finalize
df_uk   = finalize(uk_counts, uk_tsum, uk_tn, YEAR)
df_econ = finalize(ec_counts, ec_tsum, ec_tn, YEAR)

# save
out_uk   = OUT_DIR / f"events_uk_monthly_{YEAR}.csv"
out_econ = OUT_DIR / f"events_uk_economic_monthly_{YEAR}.csv"
df_uk.to_csv(out_uk, index=False, date_format="%Y-%m-%d")
df_econ.to_csv(out_econ, index=False, date_format="%Y-%m-%d")

print("\n--- Done ---")
print("UK-all monthly:")
print(df_uk.to_string(index=False))
print("\nUK-economic monthly (EventRootCode '04*'):")
print(df_econ.to_string(index=False))
print(f"\nWrote:\n  {out_uk}\n  {out_econ}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
2006: attempting 12 monthly files
Processing: 200601.csv
Processing: 200602.csv
Processing: 200603.csv
Processing: 200604.csv
Processing: 200605.csv
Processing: 200606.csv
Processing: 200607.csv
Processing: 200608.csv
Processing: 200609.csv
Processing: 200610.csv
Processing: 200611.csv
Processing: 200612.csv

--- Done ---
UK-all monthly:
     Month  Docs  AvgTone
2006-01-01 17312 5.221570
2006-02-01 15091 5.228871
2006-03-01 28846 5.577158
2006-04-01 17763 5.630909
2006-05-01 23968 5.630888
2006-06-01 27549 5.632453
2006-07-01 28312 5.433701
2006-08-01 39512 5.021931
2006-09-01 37393 5.587386
2006-10-01 25470 5.655227
2006-11-01 42019 5.496018
2006-12-01 40233 5.241839

UK-economic monthly (EventRootCode '04*'):
     Month  Docs  AvgTone
2006-01-01  5670 5.345079
2006-02-01  3695 5.678923
2006-03-01  8627 5.792521
2006-04-01  5070 5.805373
2006-05-01  6969 5.

2007

In [None]:
!pip -q install pandas requests

from google.colab import drive
drive.mount("/content/drive")

import io, zipfile, requests, pandas as pd, numpy as np, math, re
from pathlib import Path
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from collections import defaultdict

# ---------- CONFIG ----------
# Year to aggregate.
YEAR = 2007
# Base URL of the GDELT Events download area; monthly archives are named
# <YYYYMM>.zip beneath it (e.g. 200701.zip, 200702.zip, ...).
BASE = "http://data.gdeltproject.org/events"

# Google Drive folder that receives the output CSVs; created if it does not exist.
OUT_DIR = Path("/content/drive/MyDrive/msc_project/gdelt_events_1p0")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Country codes used to recognise UK-related rows.
UK_FIPS = "UK"    # ActionGeo_* uses FIPS-2
UK_ISO3 = "GBR"   # Actor*CountryCode uses ISO-3
# Number of rows handed to pandas per read_csv chunk.
CHUNK = 200_000

# Column positions in Events 1.0 (57 cols total: 0..56)
COL_SQLDATE = 1   # SQLDATE
COL_A1C = 7       # Actor1CountryCode
COL_A2C = 17      # Actor2CountryCode
COL_ROOT = 28     # EventRootCode
COL_TONE = 34     # AvgTone
COL_ACTC = 51     # ActionGeo_CountryCode

# ---------- helpers ----------
def session():
    """Build a fresh requests.Session with a custom User-Agent and retrying adapters.

    One Retry policy (5 retries, backoff factor 0.5, retry on HTTP
    429/500/502/503/504) is attached through a separate HTTPAdapter for each of
    the http:// and https:// prefixes.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    sess = requests.Session()
    sess.headers.update({"User-Agent": "gdelt-events-monthly/1.0"})
    for prefix in ("http://", "https://"):
        sess.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return sess

def monthly_urls(year: int):
    """List the monthly archive URLs for `year` in calendar order.

    2013 yields January to March only; any other year yields all twelve months.
    Each URL has the form ``{BASE}/{YYYYMM}.zip``.
    """
    month_count = 12
    if year == 2013:
        month_count = 3
    urls = []
    for month in range(1, month_count + 1):
        urls.append(f"{BASE}/{year}{month:02d}.zip")
    return urls

def stream_month(url: str):
    """Download one monthly ZIP into memory and open it.

    Args:
        url: Full URL of a monthly archive.

    Returns:
        ``(ZipFile, inner_csv_name)`` where the name is the archive's first
        member (e.g. '200601.csv'), or ``None`` when the server answers 404.

    Raises:
        requests.HTTPError: for any other unsuccessful status. Previously every
            non-OK response returned None, so a server error was reported as
            "not found" and the month silently went missing.
    """
    # The body is fully downloaded by get(), so the session can be closed at once.
    with session() as http:
        r = http.get(url, timeout=180)
    if r.status_code == 404:
        return None
    r.raise_for_status()
    z = zipfile.ZipFile(io.BytesIO(r.content))
    inner = z.namelist()[0]  # e.g., '200601.csv'
    return z, inner

def add_chunk(df, counts, tone_sum, tone_n):
    """Fold one chunk's UK-related rows into the running monthly accumulators.

    A row is UK-related when its ActionGeo country equals UK_FIPS or either
    actor country equals UK_ISO3. Rows whose SQLDATE does not parse as YYYYMMDD
    are dropped. For every month start (YYYY-MM-01) present in the chunk,
    `counts` gains the number of rows, `tone_sum` the sum of the non-missing
    AvgTone values and `tone_n` the number of non-missing AvgTone values.
    The three mappings are updated in place; nothing is returned.
    """
    is_uk = df[COL_ACTC].eq(UK_FIPS) | df[COL_A1C].eq(UK_ISO3) | df[COL_A2C].eq(UK_ISO3)
    if not is_uk.any():
        return

    uk_rows = df.loc[is_uk]
    # Calendar month start for each row; unparseable dates become NaT.
    month_start = (
        pd.to_datetime(uk_rows[COL_SQLDATE], format="%Y%m%d", errors="coerce")
          .dt.to_period("M")
          .dt.to_timestamp()
    )
    tone = pd.to_numeric(uk_rows[COL_TONE], errors="coerce")

    has_date = month_start.notna()
    if not has_date.any():
        return

    # One pass per chunk: row count, tone sum (NaN skipped) and non-NaN tone count.
    per_month = (
        tone[has_date]
        .groupby(month_start[has_date])
        .agg(Docs="size", ToneSum="sum", ToneN="count")
    )
    for month, n_rows, tone_total, n_tone in per_month.itertuples(name=None):
        counts[month] += int(n_rows)
        tone_sum[month] += float(tone_total)
        tone_n[month] += int(n_tone)

def add_chunk_econ(df, counts, tone_sum, tone_n):
    """Aggregate only the rows whose EventRootCode begins with '04'.

    The matching rows are forwarded to add_chunk, which applies the UK filter
    and updates the three accumulators in place. Nothing happens when no row
    matches.

    NOTE(review): the file labels this subset "economic", but the CAMEO
    codebook appears to list root code 04 as "Consult" — confirm the intended code.
    """
    root_codes = df[COL_ROOT].astype(str)
    is_selected = root_codes.str.startswith("04", na=False)
    if not is_selected.any():
        return
    add_chunk(df.loc[is_selected, :], counts, tone_sum, tone_n)

def finalize(counts, tone_sum, tone_n, year:int):
    """Turn the monthly accumulators into one row per calendar month of `year`.

    Args:
        counts: mapping month-start -> number of rows.
        tone_sum: mapping month-start -> sum of non-missing AvgTone values.
        tone_n: mapping month-start -> number of non-missing AvgTone values.
        year: only months of this year are kept.

    Returns:
        DataFrame with columns ``Month``, ``Docs`` and ``AvgTone``, reindexed to
        every month start of the year (January–March for 2013, otherwise
        January–December). Months without data carry NaN. An empty `counts`
        now yields an all-NaN frame instead of raising KeyError.
    """
    rows = []
    for m in sorted(counts):
        if m.year != year:
            continue
        n_tone = tone_n.get(m, 0)
        avg = (tone_sum.get(m, 0.0) / n_tone) if n_tone else np.nan
        rows.append({"Month": m, "Docs": counts[m], "AvgTone": avg})
    # Explicit columns keep "Month" present even when no month qualified.
    df = pd.DataFrame(rows, columns=["Month", "Docs", "AvgTone"])
    df["Month"] = pd.to_datetime(df["Month"])  # datetime dtype even for an empty frame
    # reindex to full year months so any missing appear explicitly
    last_month = f"{year}-03-01" if year == 2013 else f"{year}-12-01"
    allm = pd.date_range(f"{year}-01-01", last_month, freq="MS")
    return (df.set_index("Month")
              .reindex(allm)
              .rename_axis("Month")
              .reset_index())

# ---------- run ----------
# Download each monthly archive, stream its CSV in chunks, aggregate the UK and
# UK-"economic" monthly series, then write both to OUT_DIR and print them.
urls = monthly_urls(YEAR)
print(f"{YEAR}: attempting {len(urls)} monthly files")
uk_counts, uk_tsum, uk_tn = defaultdict(int), defaultdict(float), defaultdict(int)
ec_counts, ec_tsum, ec_tn = defaultdict(int), defaultdict(float), defaultdict(int)

for url in urls:
    got = stream_month(url)
    if got is None:
        print("SKIP (not found):", url)
        continue
    z, inner = got
    print("Processing:", inner)
    # Close the in-memory archive and its member stream as soon as the month
    # has been aggregated; previously neither was ever closed.
    with z, z.open(inner) as csv_stream:
        reader = pd.read_csv(
            csv_stream,
            sep="\t", header=None, low_memory=False, chunksize=CHUNK,
            usecols=[COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT],
            dtype={COL_SQLDATE:str, COL_A1C:str, COL_A2C:str, COL_ACTC:str, COL_TONE:float, COL_ROOT:str}
        )
        rows = 0
        for i, chunk in enumerate(reader, 1):
            rows += len(chunk)
            add_chunk(chunk,   uk_counts, uk_tsum, uk_tn)
            add_chunk_econ(chunk, ec_counts, ec_tsum, ec_tn)
            if i % 5 == 0:  # progress line every five chunks
                print(f"  read ~{rows:,} rows…")

# finalize
df_uk   = finalize(uk_counts, uk_tsum, uk_tn, YEAR)
df_econ = finalize(ec_counts, ec_tsum, ec_tn, YEAR)

# save
out_uk   = OUT_DIR / f"events_uk_monthly_{YEAR}.csv"
out_econ = OUT_DIR / f"events_uk_economic_monthly_{YEAR}.csv"
df_uk.to_csv(out_uk, index=False, date_format="%Y-%m-%d")
df_econ.to_csv(out_econ, index=False, date_format="%Y-%m-%d")

print("\n--- Done ---")
print("UK-all monthly:")
print(df_uk.to_string(index=False))
print("\nUK-economic monthly (EventRootCode '04*'):")
print(df_econ.to_string(index=False))
print(f"\nWrote:\n  {out_uk}\n  {out_econ}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
2007: attempting 12 monthly files
Processing: 200701.csv
Processing: 200702.csv
Processing: 200703.csv
  read ~903,257 rows…
Processing: 200704.csv
  read ~1,000,000 rows…
Processing: 200705.csv
  read ~1,000,000 rows…
Processing: 200706.csv
  read ~1,000,000 rows…
Processing: 200707.csv
  read ~960,901 rows…
Processing: 200708.csv
Processing: 200709.csv
  read ~957,387 rows…
Processing: 200710.csv
  read ~1,000,000 rows…
Processing: 200711.csv
  read ~1,000,000 rows…
Processing: 200712.csv
  read ~940,702 rows…

--- Done ---
UK-all monthly:
     Month  Docs  AvgTone
2007-01-01 38932 5.650727
2007-02-01 42950 5.515068
2007-03-01 69455 5.518773
2007-04-01 77598 5.511486
2007-05-01 86161 5.475945
2007-06-01 81884 5.518685
2007-07-01 90549 5.063248
2007-08-01 43332 5.276237
2007-09-01 60090 5.528454
2007-10-01 66685 5.700373
2007-11-01 67144 5.483070
2007-12-01 

2008

In [None]:
!pip -q install pandas requests

from google.colab import drive
drive.mount("/content/drive")

import io, zipfile, requests, pandas as pd, numpy as np, math, re
from pathlib import Path
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from collections import defaultdict

# ---------- CONFIG ----------
# Year to aggregate.
YEAR = 2008
# Base URL of the GDELT Events download area; monthly archives are named
# <YYYYMM>.zip beneath it (e.g. 200801.zip, 200802.zip, ...).
BASE = "http://data.gdeltproject.org/events"

# Google Drive folder that receives the output CSVs; created if it does not exist.
OUT_DIR = Path("/content/drive/MyDrive/msc_project/gdelt_events_1p0")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Country codes used to recognise UK-related rows.
UK_FIPS = "UK"    # ActionGeo_* uses FIPS-2
UK_ISO3 = "GBR"   # Actor*CountryCode uses ISO-3
# Number of rows handed to pandas per read_csv chunk.
CHUNK = 200_000

# Column positions in Events 1.0 (57 cols total: 0..56)
COL_SQLDATE = 1   # SQLDATE
COL_A1C = 7       # Actor1CountryCode
COL_A2C = 17      # Actor2CountryCode
COL_ROOT = 28     # EventRootCode
COL_TONE = 34     # AvgTone
COL_ACTC = 51     # ActionGeo_CountryCode

# ---------- helpers ----------
def session():
    """Build a fresh requests.Session with a custom User-Agent and retrying adapters.

    One Retry policy (5 retries, backoff factor 0.5, retry on HTTP
    429/500/502/503/504) is attached through a separate HTTPAdapter for each of
    the http:// and https:// prefixes.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    sess = requests.Session()
    sess.headers.update({"User-Agent": "gdelt-events-monthly/1.0"})
    for prefix in ("http://", "https://"):
        sess.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return sess

def monthly_urls(year: int):
    """List the monthly archive URLs for `year` in calendar order.

    2013 yields January to March only; any other year yields all twelve months.
    Each URL has the form ``{BASE}/{YYYYMM}.zip``.
    """
    month_count = 12
    if year == 2013:
        month_count = 3
    urls = []
    for month in range(1, month_count + 1):
        urls.append(f"{BASE}/{year}{month:02d}.zip")
    return urls

def stream_month(url: str):
    """Download one monthly ZIP into memory and open it.

    Args:
        url: Full URL of a monthly archive.

    Returns:
        ``(ZipFile, inner_csv_name)`` where the name is the archive's first
        member (e.g. '200601.csv'), or ``None`` when the server answers 404.

    Raises:
        requests.HTTPError: for any other unsuccessful status. Previously every
            non-OK response returned None, so a server error was reported as
            "not found" and the month silently went missing.
    """
    # The body is fully downloaded by get(), so the session can be closed at once.
    with session() as http:
        r = http.get(url, timeout=180)
    if r.status_code == 404:
        return None
    r.raise_for_status()
    z = zipfile.ZipFile(io.BytesIO(r.content))
    inner = z.namelist()[0]  # e.g., '200601.csv'
    return z, inner

def add_chunk(df, counts, tone_sum, tone_n):
    """Fold one chunk's UK-related rows into the running monthly accumulators.

    A row is UK-related when its ActionGeo country equals UK_FIPS or either
    actor country equals UK_ISO3. Rows whose SQLDATE does not parse as YYYYMMDD
    are dropped. For every month start (YYYY-MM-01) present in the chunk,
    `counts` gains the number of rows, `tone_sum` the sum of the non-missing
    AvgTone values and `tone_n` the number of non-missing AvgTone values.
    The three mappings are updated in place; nothing is returned.
    """
    is_uk = df[COL_ACTC].eq(UK_FIPS) | df[COL_A1C].eq(UK_ISO3) | df[COL_A2C].eq(UK_ISO3)
    if not is_uk.any():
        return

    uk_rows = df.loc[is_uk]
    # Calendar month start for each row; unparseable dates become NaT.
    month_start = (
        pd.to_datetime(uk_rows[COL_SQLDATE], format="%Y%m%d", errors="coerce")
          .dt.to_period("M")
          .dt.to_timestamp()
    )
    tone = pd.to_numeric(uk_rows[COL_TONE], errors="coerce")

    has_date = month_start.notna()
    if not has_date.any():
        return

    # One pass per chunk: row count, tone sum (NaN skipped) and non-NaN tone count.
    per_month = (
        tone[has_date]
        .groupby(month_start[has_date])
        .agg(Docs="size", ToneSum="sum", ToneN="count")
    )
    for month, n_rows, tone_total, n_tone in per_month.itertuples(name=None):
        counts[month] += int(n_rows)
        tone_sum[month] += float(tone_total)
        tone_n[month] += int(n_tone)

def add_chunk_econ(df, counts, tone_sum, tone_n):
    """Aggregate only the rows whose EventRootCode begins with '04'.

    The matching rows are forwarded to add_chunk, which applies the UK filter
    and updates the three accumulators in place. Nothing happens when no row
    matches.

    NOTE(review): the file labels this subset "economic", but the CAMEO
    codebook appears to list root code 04 as "Consult" — confirm the intended code.
    """
    root_codes = df[COL_ROOT].astype(str)
    is_selected = root_codes.str.startswith("04", na=False)
    if not is_selected.any():
        return
    add_chunk(df.loc[is_selected, :], counts, tone_sum, tone_n)

def finalize(counts, tone_sum, tone_n, year:int):
    """Turn the monthly accumulators into one row per calendar month of `year`.

    Args:
        counts: mapping month-start -> number of rows.
        tone_sum: mapping month-start -> sum of non-missing AvgTone values.
        tone_n: mapping month-start -> number of non-missing AvgTone values.
        year: only months of this year are kept.

    Returns:
        DataFrame with columns ``Month``, ``Docs`` and ``AvgTone``, reindexed to
        every month start of the year (January–March for 2013, otherwise
        January–December). Months without data carry NaN. An empty `counts`
        now yields an all-NaN frame instead of raising KeyError.
    """
    rows = []
    for m in sorted(counts):
        if m.year != year:
            continue
        n_tone = tone_n.get(m, 0)
        avg = (tone_sum.get(m, 0.0) / n_tone) if n_tone else np.nan
        rows.append({"Month": m, "Docs": counts[m], "AvgTone": avg})
    # Explicit columns keep "Month" present even when no month qualified.
    df = pd.DataFrame(rows, columns=["Month", "Docs", "AvgTone"])
    df["Month"] = pd.to_datetime(df["Month"])  # datetime dtype even for an empty frame
    # reindex to full year months so any missing appear explicitly
    last_month = f"{year}-03-01" if year == 2013 else f"{year}-12-01"
    allm = pd.date_range(f"{year}-01-01", last_month, freq="MS")
    return (df.set_index("Month")
              .reindex(allm)
              .rename_axis("Month")
              .reset_index())

# ---------- run ----------
# Download each monthly archive, stream its CSV in chunks, aggregate the UK and
# UK-"economic" monthly series, then write both to OUT_DIR and print them.
urls = monthly_urls(YEAR)
print(f"{YEAR}: attempting {len(urls)} monthly files")
uk_counts, uk_tsum, uk_tn = defaultdict(int), defaultdict(float), defaultdict(int)
ec_counts, ec_tsum, ec_tn = defaultdict(int), defaultdict(float), defaultdict(int)

for url in urls:
    got = stream_month(url)
    if got is None:
        print("SKIP (not found):", url)
        continue
    z, inner = got
    print("Processing:", inner)
    # Close the in-memory archive and its member stream as soon as the month
    # has been aggregated; previously neither was ever closed.
    with z, z.open(inner) as csv_stream:
        reader = pd.read_csv(
            csv_stream,
            sep="\t", header=None, low_memory=False, chunksize=CHUNK,
            usecols=[COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT],
            dtype={COL_SQLDATE:str, COL_A1C:str, COL_A2C:str, COL_ACTC:str, COL_TONE:float, COL_ROOT:str}
        )
        rows = 0
        for i, chunk in enumerate(reader, 1):
            rows += len(chunk)
            add_chunk(chunk,   uk_counts, uk_tsum, uk_tn)
            add_chunk_econ(chunk, ec_counts, ec_tsum, ec_tn)
            if i % 5 == 0:  # progress line every five chunks
                print(f"  read ~{rows:,} rows…")

# finalize
df_uk   = finalize(uk_counts, uk_tsum, uk_tn, YEAR)
df_econ = finalize(ec_counts, ec_tsum, ec_tn, YEAR)

# save
out_uk   = OUT_DIR / f"events_uk_monthly_{YEAR}.csv"
out_econ = OUT_DIR / f"events_uk_economic_monthly_{YEAR}.csv"
df_uk.to_csv(out_uk, index=False, date_format="%Y-%m-%d")
df_econ.to_csv(out_econ, index=False, date_format="%Y-%m-%d")

print("\n--- Done ---")
print("UK-all monthly:")
print(df_uk.to_string(index=False))
print("\nUK-economic monthly (EventRootCode '04*'):")
print(df_econ.to_string(index=False))
print(f"\nWrote:\n  {out_uk}\n  {out_econ}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
2008: attempting 12 monthly files
Processing: 200801.csv
  read ~1,000,000 rows…
Processing: 200802.csv
  read ~1,000,000 rows…
Processing: 200803.csv
  read ~1,000,000 rows…
Processing: 200804.csv
  read ~1,000,000 rows…
Processing: 200805.csv
  read ~1,000,000 rows…
Processing: 200806.csv
  read ~1,000,000 rows…
Processing: 200807.csv
  read ~1,000,000 rows…
Processing: 200808.csv
  read ~1,000,000 rows…
Processing: 200809.csv
  read ~912,791 rows…
Processing: 200810.csv
  read ~1,000,000 rows…
Processing: 200811.csv
  read ~1,000,000 rows…
Processing: 200812.csv
  read ~1,000,000 rows…

--- Done ---
UK-all monthly:
     Month  Docs  AvgTone
2008-01-01 75043 5.551900
2008-02-01 77358 5.573736
2008-03-01 65256 5.635886
2008-04-01 67284 5.646853
2008-05-01 70249 5.763192
2008-06-01 80523 5.596329
2008-07-01 85633 5.616669
2008-08-01 55390 5.580945
2008-09-01 

2009

In [None]:
!pip -q install pandas requests

from google.colab import drive
drive.mount("/content/drive")

import io, zipfile, requests, pandas as pd, numpy as np, math, re
from pathlib import Path
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from collections import defaultdict

# ---------- CONFIG ----------
# Year to aggregate.
YEAR = 2009
# Base URL of the GDELT Events download area; monthly archives are named
# <YYYYMM>.zip beneath it (e.g. 200901.zip, 200902.zip, ...).
BASE = "http://data.gdeltproject.org/events"

# Google Drive folder that receives the output CSVs; created if it does not exist.
OUT_DIR = Path("/content/drive/MyDrive/msc_project/gdelt_events_1p0")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Country codes used to recognise UK-related rows.
UK_FIPS = "UK"    # ActionGeo_* uses FIPS-2
UK_ISO3 = "GBR"   # Actor*CountryCode uses ISO-3
# Number of rows handed to pandas per read_csv chunk.
CHUNK = 200_000

# Column positions in Events 1.0 (57 cols total: 0..56)
COL_SQLDATE = 1   # SQLDATE
COL_A1C = 7       # Actor1CountryCode
COL_A2C = 17      # Actor2CountryCode
COL_ROOT = 28     # EventRootCode
COL_TONE = 34     # AvgTone
COL_ACTC = 51     # ActionGeo_CountryCode

# ---------- helpers ----------
def session():
    """Build a fresh requests.Session with a custom User-Agent and retrying adapters.

    One Retry policy (5 retries, backoff factor 0.5, retry on HTTP
    429/500/502/503/504) is attached through a separate HTTPAdapter for each of
    the http:// and https:// prefixes.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    sess = requests.Session()
    sess.headers.update({"User-Agent": "gdelt-events-monthly/1.0"})
    for prefix in ("http://", "https://"):
        sess.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return sess

def monthly_urls(year: int):
    """List the monthly archive URLs for `year` in calendar order.

    2013 yields January to March only; any other year yields all twelve months.
    Each URL has the form ``{BASE}/{YYYYMM}.zip``.
    """
    month_count = 12
    if year == 2013:
        month_count = 3
    urls = []
    for month in range(1, month_count + 1):
        urls.append(f"{BASE}/{year}{month:02d}.zip")
    return urls

def stream_month(url: str):
    """Download one monthly ZIP into memory and open it.

    Args:
        url: Full URL of a monthly archive.

    Returns:
        ``(ZipFile, inner_csv_name)`` where the name is the archive's first
        member (e.g. '200601.csv'), or ``None`` when the server answers 404.

    Raises:
        requests.HTTPError: for any other unsuccessful status. Previously every
            non-OK response returned None, so a server error was reported as
            "not found" and the month silently went missing.
    """
    # The body is fully downloaded by get(), so the session can be closed at once.
    with session() as http:
        r = http.get(url, timeout=180)
    if r.status_code == 404:
        return None
    r.raise_for_status()
    z = zipfile.ZipFile(io.BytesIO(r.content))
    inner = z.namelist()[0]  # e.g., '200601.csv'
    return z, inner

def add_chunk(df, counts, tone_sum, tone_n):
    """Fold one chunk's UK-related rows into the running monthly accumulators.

    A row is UK-related when its ActionGeo country equals UK_FIPS or either
    actor country equals UK_ISO3. Rows whose SQLDATE does not parse as YYYYMMDD
    are dropped. For every month start (YYYY-MM-01) present in the chunk,
    `counts` gains the number of rows, `tone_sum` the sum of the non-missing
    AvgTone values and `tone_n` the number of non-missing AvgTone values.
    The three mappings are updated in place; nothing is returned.
    """
    is_uk = df[COL_ACTC].eq(UK_FIPS) | df[COL_A1C].eq(UK_ISO3) | df[COL_A2C].eq(UK_ISO3)
    if not is_uk.any():
        return

    uk_rows = df.loc[is_uk]
    # Calendar month start for each row; unparseable dates become NaT.
    month_start = (
        pd.to_datetime(uk_rows[COL_SQLDATE], format="%Y%m%d", errors="coerce")
          .dt.to_period("M")
          .dt.to_timestamp()
    )
    tone = pd.to_numeric(uk_rows[COL_TONE], errors="coerce")

    has_date = month_start.notna()
    if not has_date.any():
        return

    # One pass per chunk: row count, tone sum (NaN skipped) and non-NaN tone count.
    per_month = (
        tone[has_date]
        .groupby(month_start[has_date])
        .agg(Docs="size", ToneSum="sum", ToneN="count")
    )
    for month, n_rows, tone_total, n_tone in per_month.itertuples(name=None):
        counts[month] += int(n_rows)
        tone_sum[month] += float(tone_total)
        tone_n[month] += int(n_tone)

def add_chunk_econ(df, counts, tone_sum, tone_n):
    """Aggregate only the rows whose EventRootCode begins with '04'.

    The matching rows are forwarded to add_chunk, which applies the UK filter
    and updates the three accumulators in place. Nothing happens when no row
    matches.

    NOTE(review): the file labels this subset "economic", but the CAMEO
    codebook appears to list root code 04 as "Consult" — confirm the intended code.
    """
    root_codes = df[COL_ROOT].astype(str)
    is_selected = root_codes.str.startswith("04", na=False)
    if not is_selected.any():
        return
    add_chunk(df.loc[is_selected, :], counts, tone_sum, tone_n)

def finalize(counts, tone_sum, tone_n, year:int):
    """Turn the accumulated monthly totals into one row per month of ``year``.

    Args:
        counts: mapping of month-start Timestamp to number of rows.
        tone_sum: mapping of month to the sum of non-missing tone values.
        tone_n: mapping of month to the number of non-missing tone values.
        year: months belonging to any other year are discarded.

    Returns:
        DataFrame with columns Month, Docs, AvgTone covering January to
        December (January to March when ``year`` is 2013). Months without
        data are present with NaN values; AvgTone is also NaN when a month
        has rows but no usable tone.
    """
    rows = []
    for m in sorted(counts):
        if m.year != year:
            continue
        n_tone = tone_n.get(m, 0)
        avg = (tone_sum.get(m, 0.0) / n_tone) if n_tone else np.nan
        rows.append({"Month": m, "Docs": counts[m], "AvgTone": avg})
    # Naming the columns up front keeps "Month" present even when no month
    # matched; an empty column-less frame would raise KeyError below.
    df = pd.DataFrame(rows, columns=["Month", "Docs", "AvgTone"])
    # reindex to the expected months so any missing appear explicitly
    last_month = f"{year}-03-01" if year == 2013 else f"{year}-12-01"
    allm = pd.date_range(f"{year}-01-01", last_month, freq="MS")
    return (df.set_index("Month")
              .reindex(allm)
              .rename_axis("Month")
              .reset_index())

# ---------- run ----------
# Download each monthly archive of YEAR, stream it in CHUNK-sized pieces through
# the UK-all and UK-economic accumulators, then write one CSV per series.
urls = monthly_urls(YEAR)
print(f"{YEAR}: attempting {len(urls)} monthly files")
uk_counts, uk_tsum, uk_tn = defaultdict(int), defaultdict(float), defaultdict(int)
ec_counts, ec_tsum, ec_tn = defaultdict(int), defaultdict(float), defaultdict(int)

for url in urls:
    got = stream_month(url)
    if got is None:
        print("SKIP (not found):", url)
        continue
    z, inner = got
    print("Processing:", inner)
    # Close the in-memory archive and its member stream as soon as the month
    # has been consumed instead of leaving both handles open.
    with z, z.open(inner) as fh:
        reader = pd.read_csv(
            fh,
            sep="\t", header=None, low_memory=False, chunksize=CHUNK,
            usecols=[COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT],
            dtype={COL_SQLDATE:str, COL_A1C:str, COL_A2C:str, COL_ACTC:str, COL_TONE:float, COL_ROOT:str}
        )
        rows = 0
        for i, chunk in enumerate(reader, 1):
            rows += len(chunk)
            add_chunk(chunk,   uk_counts, uk_tsum, uk_tn)
            add_chunk_econ(chunk, ec_counts, ec_tsum, ec_tn)
            if i % 5 == 0:  # progress line every fifth chunk
                print(f"  read ~{rows:,} rows…")

# finalize
df_uk   = finalize(uk_counts, uk_tsum, uk_tn, YEAR)
df_econ = finalize(ec_counts, ec_tsum, ec_tn, YEAR)

# save
out_uk   = OUT_DIR / f"events_uk_monthly_{YEAR}.csv"
out_econ = OUT_DIR / f"events_uk_economic_monthly_{YEAR}.csv"
df_uk.to_csv(out_uk, index=False, date_format="%Y-%m-%d")
df_econ.to_csv(out_econ, index=False, date_format="%Y-%m-%d")

print("\n--- Done ---")
print("UK-all monthly:")
print(df_uk.to_string(index=False))
print("\nUK-economic monthly (EventRootCode '04*'):")
print(df_econ.to_string(index=False))
print(f"\nWrote:\n  {out_uk}\n  {out_econ}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
2009: attempting 12 monthly files
Processing: 200901.csv
  read ~1,000,000 rows…
  read ~1,971,575 rows…
Processing: 200902.csv
  read ~1,000,000 rows…
Processing: 200903.csv
  read ~1,000,000 rows…
Processing: 200904.csv
  read ~1,000,000 rows…
  read ~1,910,335 rows…
Processing: 200905.csv
  read ~1,000,000 rows…
  read ~1,917,261 rows…
Processing: 200906.csv
  read ~1,000,000 rows…
  read ~1,808,376 rows…
Processing: 200907.csv
  read ~1,000,000 rows…
Processing: 200908.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
Processing: 200909.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
Processing: 200910.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
Processing: 200911.csv
  read ~1,000,000 rows…
  read ~1,893,590 rows…
Processing: 200912.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…

--- Done ---
UK-all monthly:
     Month   Docs  AvgTone


2010

In [None]:
!pip -q install pandas requests

from google.colab import drive
drive.mount("/content/drive")

import io, zipfile, requests, pandas as pd, numpy as np, math, re
from pathlib import Path
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from collections import defaultdict

# ---------- CONFIG ----------
YEAR = 2010   # calendar year processed by this cell; also used in the output file names
BASE = "http://data.gdeltproject.org/events"   # monthly archives are requested as {BASE}/YYYYMM.zip (see monthly_urls)
OUT_DIR = Path("/content/drive/MyDrive/msc_project/gdelt_events_1p0")  # output folder on the mounted Drive
OUT_DIR.mkdir(parents=True, exist_ok=True)  # create it (and its parents) if missing

UK_FIPS = "UK"    # value matched against the ActionGeo country column (FIPS-2)
UK_ISO3 = "GBR"   # value matched against the Actor1/Actor2 country columns (ISO-3)
CHUNK = 200_000   # rows per read_csv chunk

# Zero-based column positions in Events 1.0 (57 cols total: 0..56). The files
# are read with header=None, so these integers are also the DataFrame labels.
# 1=SQLDATE, 7=Actor1CountryCode, 17=Actor2CountryCode, 51=ActionGeo_CountryCode,
# 34=AvgTone, 28=EventRootCode
COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT = 1, 7, 17, 51, 34, 28

# ---------- helpers ----------
def session():
    """Build a requests.Session with a custom User-Agent and retrying adapters.

    Both the http:// and https:// prefixes get an adapter that retries up to
    five times (backoff factor 0.5) on status 429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(total=5, backoff_factor=0.5, status_forcelist=[429,500,502,503,504])
    sess = requests.Session()
    sess.headers.update({"User-Agent": "gdelt-events-monthly/1.0"})
    for scheme in ("http://", "https://"):
        sess.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return sess

def monthly_urls(year: int):
    """Return the monthly archive URLs for ``year`` in calendar order.

    Every year yields twelve ``{BASE}/YYYYMM.zip`` entries except 2013,
    which yields only January through March.
    """
    month_count = 12
    if year == 2013:
        month_count = 3
    urls = []
    for month in range(1, month_count + 1):
        urls.append(f"{BASE}/{year}{month:02d}.zip")
    return urls

def stream_month(url: str):
    """Download one monthly ZIP into memory and open it.

    Returns:
        ``(ZipFile, inner_name)`` where ``inner_name`` is the first member
        of the archive (e.g. '200601.csv'), or ``None`` when the response
        status is not OK (any non-OK status, not only 404).

    The whole archive is buffered in memory; the caller owns the returned
    ZipFile.
    """
    # session() builds a new Session on every call, so close it once the
    # body has been read instead of leaving its connection pool open.
    with session() as s:
        r = s.get(url, timeout=180)
        if not r.ok:
            return None
        payload = r.content
    z = zipfile.ZipFile(io.BytesIO(payload))
    inner = z.namelist()[0]  # e.g., '200601.csv'
    return z, inner

def add_chunk(df, counts, tone_sum, tone_n):
    """Fold the UK-related rows of one chunk into the running monthly totals.

    A row is UK-related when its ActionGeo country equals UK_FIPS or either
    actor country equals UK_ISO3. Rows whose SQLDATE does not parse as
    YYYYMMDD are ignored. The three mappings are updated in place, keyed by
    month-start Timestamp: ``counts`` receives the row count, ``tone_sum``
    the sum of non-missing tones and ``tone_n`` the number of non-missing
    tones.
    """
    is_uk = df[COL_ACTC].eq(UK_FIPS)
    for actor_col in (COL_A1C, COL_A2C):
        is_uk = is_uk | df[actor_col].eq(UK_ISO3)
    if not is_uk.any():
        return

    picked = df.loc[is_uk, [COL_SQLDATE, COL_TONE]]
    parsed = pd.to_datetime(picked[COL_SQLDATE], format="%Y%m%d", errors="coerce")
    frame = pd.DataFrame({
        "Month": parsed.dt.to_period("M").dt.to_timestamp(),  # month start
        "AvgTone": pd.to_numeric(picked[COL_TONE], errors="coerce"),
    })
    frame = frame[frame["Month"].notna()]
    if frame.empty:
        return

    # One pass per month: "size" counts every row, "sum"/"count" skip missing tones.
    monthly = frame.groupby("Month")["AvgTone"].agg(Docs="size", ToneSum="sum", ToneN="count")
    for rec in monthly.itertuples():
        month = rec.Index
        counts[month] += int(rec.Docs)
        tone_sum[month] += float(rec.ToneSum)
        tone_n[month] += int(rec.ToneN)

def add_chunk_econ(df, counts, tone_sum, tone_n):
    """Accumulate only the rows whose EventRootCode text starts with '04'.

    The selected rows are handed to add_chunk, so the UK filter and the
    monthly bookkeeping are applied there. Nothing happens when no row
    matches.

    NOTE(review): the CAMEO codebook lists root code 04 as "Consult";
    confirm that this is the intended definition of the "economic" subset.
    """
    root_text = df[COL_ROOT].astype(str)
    is_econ = root_text.str.startswith("04", na=False)
    if not is_econ.any():
        return
    add_chunk(df.loc[is_econ, :], counts, tone_sum, tone_n)

def finalize(counts, tone_sum, tone_n, year:int):
    """Turn the accumulated monthly totals into one row per month of ``year``.

    Args:
        counts: mapping of month-start Timestamp to number of rows.
        tone_sum: mapping of month to the sum of non-missing tone values.
        tone_n: mapping of month to the number of non-missing tone values.
        year: months belonging to any other year are discarded.

    Returns:
        DataFrame with columns Month, Docs, AvgTone covering January to
        December (January to March when ``year`` is 2013). Months without
        data are present with NaN values; AvgTone is also NaN when a month
        has rows but no usable tone.
    """
    rows = []
    for m in sorted(counts):
        if m.year != year:
            continue
        n_tone = tone_n.get(m, 0)
        avg = (tone_sum.get(m, 0.0) / n_tone) if n_tone else np.nan
        rows.append({"Month": m, "Docs": counts[m], "AvgTone": avg})
    # Naming the columns up front keeps "Month" present even when no month
    # matched; an empty column-less frame would raise KeyError below.
    df = pd.DataFrame(rows, columns=["Month", "Docs", "AvgTone"])
    # reindex to the expected months so any missing appear explicitly
    last_month = f"{year}-03-01" if year == 2013 else f"{year}-12-01"
    allm = pd.date_range(f"{year}-01-01", last_month, freq="MS")
    return (df.set_index("Month")
              .reindex(allm)
              .rename_axis("Month")
              .reset_index())

# ---------- run ----------
# Download each monthly archive of YEAR, stream it in CHUNK-sized pieces through
# the UK-all and UK-economic accumulators, then write one CSV per series.
urls = monthly_urls(YEAR)
print(f"{YEAR}: attempting {len(urls)} monthly files")
uk_counts, uk_tsum, uk_tn = defaultdict(int), defaultdict(float), defaultdict(int)
ec_counts, ec_tsum, ec_tn = defaultdict(int), defaultdict(float), defaultdict(int)

for url in urls:
    got = stream_month(url)
    if got is None:
        print("SKIP (not found):", url)
        continue
    z, inner = got
    print("Processing:", inner)
    # Close the in-memory archive and its member stream as soon as the month
    # has been consumed instead of leaving both handles open.
    with z, z.open(inner) as fh:
        reader = pd.read_csv(
            fh,
            sep="\t", header=None, low_memory=False, chunksize=CHUNK,
            usecols=[COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT],
            dtype={COL_SQLDATE:str, COL_A1C:str, COL_A2C:str, COL_ACTC:str, COL_TONE:float, COL_ROOT:str}
        )
        rows = 0
        for i, chunk in enumerate(reader, 1):
            rows += len(chunk)
            add_chunk(chunk,   uk_counts, uk_tsum, uk_tn)
            add_chunk_econ(chunk, ec_counts, ec_tsum, ec_tn)
            if i % 5 == 0:  # progress line every fifth chunk
                print(f"  read ~{rows:,} rows…")

# finalize
df_uk   = finalize(uk_counts, uk_tsum, uk_tn, YEAR)
df_econ = finalize(ec_counts, ec_tsum, ec_tn, YEAR)

# save
out_uk   = OUT_DIR / f"events_uk_monthly_{YEAR}.csv"
out_econ = OUT_DIR / f"events_uk_economic_monthly_{YEAR}.csv"
df_uk.to_csv(out_uk, index=False, date_format="%Y-%m-%d")
df_econ.to_csv(out_econ, index=False, date_format="%Y-%m-%d")

print("\n--- Done ---")
print("UK-all monthly:")
print(df_uk.to_string(index=False))
print("\nUK-economic monthly (EventRootCode '04*'):")
print(df_econ.to_string(index=False))
print(f"\nWrote:\n  {out_uk}\n  {out_econ}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
2010: attempting 12 monthly files
Processing: 201001.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
Processing: 201002.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
Processing: 201003.csv
Processing: 201004.csv
  read ~1,000,000 rows…
Processing: 201005.csv
  read ~1,000,000 rows…
Processing: 201006.csv
  read ~967,769 rows…
Processing: 201007.csv
  read ~1,000,000 rows…
Processing: 201008.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
Processing: 201009.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
  read ~2,870,049 rows…
Processing: 201010.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
Processing: 201011.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
Processing: 201012.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…

--- Done ---
UK-all monthly:
     Month   Docs  AvgTone
2010-01-01 145272 5.568387
2010-02-01 116537 5.786

2011

In [None]:
!pip -q install pandas requests

from google.colab import drive
drive.mount("/content/drive")

import io, zipfile, requests, pandas as pd, numpy as np, math, re
from pathlib import Path
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from collections import defaultdict

# ---------- CONFIG ----------
YEAR = 2011   # calendar year processed by this cell; also used in the output file names
BASE = "http://data.gdeltproject.org/events"   # monthly archives are requested as {BASE}/YYYYMM.zip (see monthly_urls)
OUT_DIR = Path("/content/drive/MyDrive/msc_project/gdelt_events_1p0")  # output folder on the mounted Drive
OUT_DIR.mkdir(parents=True, exist_ok=True)  # create it (and its parents) if missing

UK_FIPS = "UK"    # value matched against the ActionGeo country column (FIPS-2)
UK_ISO3 = "GBR"   # value matched against the Actor1/Actor2 country columns (ISO-3)
CHUNK = 200_000   # rows per read_csv chunk

# Zero-based column positions in Events 1.0 (57 cols total: 0..56). The files
# are read with header=None, so these integers are also the DataFrame labels.
# 1=SQLDATE, 7=Actor1CountryCode, 17=Actor2CountryCode, 51=ActionGeo_CountryCode,
# 34=AvgTone, 28=EventRootCode
COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT = 1, 7, 17, 51, 34, 28

# ---------- helpers ----------
def session():
    """Build a requests.Session with a custom User-Agent and retrying adapters.

    Both the http:// and https:// prefixes get an adapter that retries up to
    five times (backoff factor 0.5) on status 429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(total=5, backoff_factor=0.5, status_forcelist=[429,500,502,503,504])
    sess = requests.Session()
    sess.headers.update({"User-Agent": "gdelt-events-monthly/1.0"})
    for scheme in ("http://", "https://"):
        sess.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return sess

def monthly_urls(year: int):
    """Return the monthly archive URLs for ``year`` in calendar order.

    Every year yields twelve ``{BASE}/YYYYMM.zip`` entries except 2013,
    which yields only January through March.
    """
    month_count = 12
    if year == 2013:
        month_count = 3
    urls = []
    for month in range(1, month_count + 1):
        urls.append(f"{BASE}/{year}{month:02d}.zip")
    return urls

def stream_month(url: str):
    """Download one monthly ZIP into memory and open it.

    Returns:
        ``(ZipFile, inner_name)`` where ``inner_name`` is the first member
        of the archive (e.g. '200601.csv'), or ``None`` when the response
        status is not OK (any non-OK status, not only 404).

    The whole archive is buffered in memory; the caller owns the returned
    ZipFile.
    """
    # session() builds a new Session on every call, so close it once the
    # body has been read instead of leaving its connection pool open.
    with session() as s:
        r = s.get(url, timeout=180)
        if not r.ok:
            return None
        payload = r.content
    z = zipfile.ZipFile(io.BytesIO(payload))
    inner = z.namelist()[0]  # e.g., '200601.csv'
    return z, inner

def add_chunk(df, counts, tone_sum, tone_n):
    """Fold the UK-related rows of one chunk into the running monthly totals.

    A row is UK-related when its ActionGeo country equals UK_FIPS or either
    actor country equals UK_ISO3. Rows whose SQLDATE does not parse as
    YYYYMMDD are ignored. The three mappings are updated in place, keyed by
    month-start Timestamp: ``counts`` receives the row count, ``tone_sum``
    the sum of non-missing tones and ``tone_n`` the number of non-missing
    tones.
    """
    is_uk = df[COL_ACTC].eq(UK_FIPS)
    for actor_col in (COL_A1C, COL_A2C):
        is_uk = is_uk | df[actor_col].eq(UK_ISO3)
    if not is_uk.any():
        return

    picked = df.loc[is_uk, [COL_SQLDATE, COL_TONE]]
    parsed = pd.to_datetime(picked[COL_SQLDATE], format="%Y%m%d", errors="coerce")
    frame = pd.DataFrame({
        "Month": parsed.dt.to_period("M").dt.to_timestamp(),  # month start
        "AvgTone": pd.to_numeric(picked[COL_TONE], errors="coerce"),
    })
    frame = frame[frame["Month"].notna()]
    if frame.empty:
        return

    # One pass per month: "size" counts every row, "sum"/"count" skip missing tones.
    monthly = frame.groupby("Month")["AvgTone"].agg(Docs="size", ToneSum="sum", ToneN="count")
    for rec in monthly.itertuples():
        month = rec.Index
        counts[month] += int(rec.Docs)
        tone_sum[month] += float(rec.ToneSum)
        tone_n[month] += int(rec.ToneN)

def add_chunk_econ(df, counts, tone_sum, tone_n):
    """Accumulate only the rows whose EventRootCode text starts with '04'.

    The selected rows are handed to add_chunk, so the UK filter and the
    monthly bookkeeping are applied there. Nothing happens when no row
    matches.

    NOTE(review): the CAMEO codebook lists root code 04 as "Consult";
    confirm that this is the intended definition of the "economic" subset.
    """
    root_text = df[COL_ROOT].astype(str)
    is_econ = root_text.str.startswith("04", na=False)
    if not is_econ.any():
        return
    add_chunk(df.loc[is_econ, :], counts, tone_sum, tone_n)

def finalize(counts, tone_sum, tone_n, year:int):
    """Turn the accumulated monthly totals into one row per month of ``year``.

    Args:
        counts: mapping of month-start Timestamp to number of rows.
        tone_sum: mapping of month to the sum of non-missing tone values.
        tone_n: mapping of month to the number of non-missing tone values.
        year: months belonging to any other year are discarded.

    Returns:
        DataFrame with columns Month, Docs, AvgTone covering January to
        December (January to March when ``year`` is 2013). Months without
        data are present with NaN values; AvgTone is also NaN when a month
        has rows but no usable tone.
    """
    rows = []
    for m in sorted(counts):
        if m.year != year:
            continue
        n_tone = tone_n.get(m, 0)
        avg = (tone_sum.get(m, 0.0) / n_tone) if n_tone else np.nan
        rows.append({"Month": m, "Docs": counts[m], "AvgTone": avg})
    # Naming the columns up front keeps "Month" present even when no month
    # matched; an empty column-less frame would raise KeyError below.
    df = pd.DataFrame(rows, columns=["Month", "Docs", "AvgTone"])
    # reindex to the expected months so any missing appear explicitly
    last_month = f"{year}-03-01" if year == 2013 else f"{year}-12-01"
    allm = pd.date_range(f"{year}-01-01", last_month, freq="MS")
    return (df.set_index("Month")
              .reindex(allm)
              .rename_axis("Month")
              .reset_index())

# ---------- run ----------
# Download each monthly archive of YEAR, stream it in CHUNK-sized pieces through
# the UK-all and UK-economic accumulators, then write one CSV per series.
urls = monthly_urls(YEAR)
print(f"{YEAR}: attempting {len(urls)} monthly files")
uk_counts, uk_tsum, uk_tn = defaultdict(int), defaultdict(float), defaultdict(int)
ec_counts, ec_tsum, ec_tn = defaultdict(int), defaultdict(float), defaultdict(int)

for url in urls:
    got = stream_month(url)
    if got is None:
        print("SKIP (not found):", url)
        continue
    z, inner = got
    print("Processing:", inner)
    # Close the in-memory archive and its member stream as soon as the month
    # has been consumed instead of leaving both handles open.
    with z, z.open(inner) as fh:
        reader = pd.read_csv(
            fh,
            sep="\t", header=None, low_memory=False, chunksize=CHUNK,
            usecols=[COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT],
            dtype={COL_SQLDATE:str, COL_A1C:str, COL_A2C:str, COL_ACTC:str, COL_TONE:float, COL_ROOT:str}
        )
        rows = 0
        for i, chunk in enumerate(reader, 1):
            rows += len(chunk)
            add_chunk(chunk,   uk_counts, uk_tsum, uk_tn)
            add_chunk_econ(chunk, ec_counts, ec_tsum, ec_tn)
            if i % 5 == 0:  # progress line every fifth chunk
                print(f"  read ~{rows:,} rows…")

# finalize
df_uk   = finalize(uk_counts, uk_tsum, uk_tn, YEAR)
df_econ = finalize(ec_counts, ec_tsum, ec_tn, YEAR)

# save
out_uk   = OUT_DIR / f"events_uk_monthly_{YEAR}.csv"
out_econ = OUT_DIR / f"events_uk_economic_monthly_{YEAR}.csv"
df_uk.to_csv(out_uk, index=False, date_format="%Y-%m-%d")
df_econ.to_csv(out_econ, index=False, date_format="%Y-%m-%d")

print("\n--- Done ---")
print("UK-all monthly:")
print(df_uk.to_string(index=False))
print("\nUK-economic monthly (EventRootCode '04*'):")
print(df_econ.to_string(index=False))
print(f"\nWrote:\n  {out_uk}\n  {out_econ}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
2011: attempting 12 monthly files
Processing: 201101.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
Processing: 201102.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
Processing: 201103.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
  read ~3,000,000 rows…
Processing: 201104.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
  read ~2,947,005 rows…
Processing: 201105.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
Processing: 201106.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
Processing: 201107.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
Processing: 201108.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
  read ~2,854,375 rows…
Processing: 201109.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
  read ~2,819,542 rows…
Processing: 201110.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
Processing: 201111.csv
  read ~

2012

In [None]:
!pip -q install pandas requests

from google.colab import drive
drive.mount("/content/drive")

import io, zipfile, requests, pandas as pd, numpy as np, math, re
from pathlib import Path
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from collections import defaultdict

# ---------- CONFIG ----------
YEAR = 2012   # calendar year processed by this cell; also used in the output file names
BASE = "http://data.gdeltproject.org/events"   # monthly archives are requested as {BASE}/YYYYMM.zip (see monthly_urls)
OUT_DIR = Path("/content/drive/MyDrive/msc_project/gdelt_events_1p0")  # output folder on the mounted Drive
OUT_DIR.mkdir(parents=True, exist_ok=True)  # create it (and its parents) if missing

UK_FIPS = "UK"    # value matched against the ActionGeo country column (FIPS-2)
UK_ISO3 = "GBR"   # value matched against the Actor1/Actor2 country columns (ISO-3)
CHUNK = 200_000   # rows per read_csv chunk

# Zero-based column positions in Events 1.0 (57 cols total: 0..56). The files
# are read with header=None, so these integers are also the DataFrame labels.
# 1=SQLDATE, 7=Actor1CountryCode, 17=Actor2CountryCode, 51=ActionGeo_CountryCode,
# 34=AvgTone, 28=EventRootCode
COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT = 1, 7, 17, 51, 34, 28

# ---------- helpers ----------
def session():
    """Build a requests.Session with a custom User-Agent and retrying adapters.

    Both the http:// and https:// prefixes get an adapter that retries up to
    five times (backoff factor 0.5) on status 429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(total=5, backoff_factor=0.5, status_forcelist=[429,500,502,503,504])
    sess = requests.Session()
    sess.headers.update({"User-Agent": "gdelt-events-monthly/1.0"})
    for scheme in ("http://", "https://"):
        sess.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return sess

def monthly_urls(year: int):
    """Return the monthly archive URLs for ``year`` in calendar order.

    Every year yields twelve ``{BASE}/YYYYMM.zip`` entries except 2013,
    which yields only January through March.
    """
    month_count = 12
    if year == 2013:
        month_count = 3
    urls = []
    for month in range(1, month_count + 1):
        urls.append(f"{BASE}/{year}{month:02d}.zip")
    return urls

def stream_month(url: str):
    """Download one monthly ZIP into memory and open it.

    Returns:
        ``(ZipFile, inner_name)`` where ``inner_name`` is the first member
        of the archive (e.g. '200601.csv'), or ``None`` when the response
        status is not OK (any non-OK status, not only 404).

    The whole archive is buffered in memory; the caller owns the returned
    ZipFile.
    """
    # session() builds a new Session on every call, so close it once the
    # body has been read instead of leaving its connection pool open.
    with session() as s:
        r = s.get(url, timeout=180)
        if not r.ok:
            return None
        payload = r.content
    z = zipfile.ZipFile(io.BytesIO(payload))
    inner = z.namelist()[0]  # e.g., '200601.csv'
    return z, inner

def add_chunk(df, counts, tone_sum, tone_n):
    """Fold the UK-related rows of one chunk into the running monthly totals.

    A row is UK-related when its ActionGeo country equals UK_FIPS or either
    actor country equals UK_ISO3. Rows whose SQLDATE does not parse as
    YYYYMMDD are ignored. The three mappings are updated in place, keyed by
    month-start Timestamp: ``counts`` receives the row count, ``tone_sum``
    the sum of non-missing tones and ``tone_n`` the number of non-missing
    tones.
    """
    is_uk = df[COL_ACTC].eq(UK_FIPS)
    for actor_col in (COL_A1C, COL_A2C):
        is_uk = is_uk | df[actor_col].eq(UK_ISO3)
    if not is_uk.any():
        return

    picked = df.loc[is_uk, [COL_SQLDATE, COL_TONE]]
    parsed = pd.to_datetime(picked[COL_SQLDATE], format="%Y%m%d", errors="coerce")
    frame = pd.DataFrame({
        "Month": parsed.dt.to_period("M").dt.to_timestamp(),  # month start
        "AvgTone": pd.to_numeric(picked[COL_TONE], errors="coerce"),
    })
    frame = frame[frame["Month"].notna()]
    if frame.empty:
        return

    # One pass per month: "size" counts every row, "sum"/"count" skip missing tones.
    monthly = frame.groupby("Month")["AvgTone"].agg(Docs="size", ToneSum="sum", ToneN="count")
    for rec in monthly.itertuples():
        month = rec.Index
        counts[month] += int(rec.Docs)
        tone_sum[month] += float(rec.ToneSum)
        tone_n[month] += int(rec.ToneN)

def add_chunk_econ(df, counts, tone_sum, tone_n):
    """Accumulate only the rows whose EventRootCode text starts with '04'.

    The selected rows are handed to add_chunk, so the UK filter and the
    monthly bookkeeping are applied there. Nothing happens when no row
    matches.

    NOTE(review): the CAMEO codebook lists root code 04 as "Consult";
    confirm that this is the intended definition of the "economic" subset.
    """
    root_text = df[COL_ROOT].astype(str)
    is_econ = root_text.str.startswith("04", na=False)
    if not is_econ.any():
        return
    add_chunk(df.loc[is_econ, :], counts, tone_sum, tone_n)

def finalize(counts, tone_sum, tone_n, year:int):
    """Turn the accumulated monthly totals into one row per month of ``year``.

    Args:
        counts: mapping of month-start Timestamp to number of rows.
        tone_sum: mapping of month to the sum of non-missing tone values.
        tone_n: mapping of month to the number of non-missing tone values.
        year: months belonging to any other year are discarded.

    Returns:
        DataFrame with columns Month, Docs, AvgTone covering January to
        December (January to March when ``year`` is 2013). Months without
        data are present with NaN values; AvgTone is also NaN when a month
        has rows but no usable tone.
    """
    rows = []
    for m in sorted(counts):
        if m.year != year:
            continue
        n_tone = tone_n.get(m, 0)
        avg = (tone_sum.get(m, 0.0) / n_tone) if n_tone else np.nan
        rows.append({"Month": m, "Docs": counts[m], "AvgTone": avg})
    # Naming the columns up front keeps "Month" present even when no month
    # matched; an empty column-less frame would raise KeyError below.
    df = pd.DataFrame(rows, columns=["Month", "Docs", "AvgTone"])
    # reindex to the expected months so any missing appear explicitly
    last_month = f"{year}-03-01" if year == 2013 else f"{year}-12-01"
    allm = pd.date_range(f"{year}-01-01", last_month, freq="MS")
    return (df.set_index("Month")
              .reindex(allm)
              .rename_axis("Month")
              .reset_index())

# ---------- run ----------
# Download each monthly archive of YEAR, stream it in CHUNK-sized pieces through
# the UK-all and UK-economic accumulators, then write one CSV per series.
urls = monthly_urls(YEAR)
print(f"{YEAR}: attempting {len(urls)} monthly files")
uk_counts, uk_tsum, uk_tn = defaultdict(int), defaultdict(float), defaultdict(int)
ec_counts, ec_tsum, ec_tn = defaultdict(int), defaultdict(float), defaultdict(int)

for url in urls:
    got = stream_month(url)
    if got is None:
        print("SKIP (not found):", url)
        continue
    z, inner = got
    print("Processing:", inner)
    # Close the in-memory archive and its member stream as soon as the month
    # has been consumed instead of leaving both handles open.
    with z, z.open(inner) as fh:
        reader = pd.read_csv(
            fh,
            sep="\t", header=None, low_memory=False, chunksize=CHUNK,
            usecols=[COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT],
            dtype={COL_SQLDATE:str, COL_A1C:str, COL_A2C:str, COL_ACTC:str, COL_TONE:float, COL_ROOT:str}
        )
        rows = 0
        for i, chunk in enumerate(reader, 1):
            rows += len(chunk)
            add_chunk(chunk,   uk_counts, uk_tsum, uk_tn)
            add_chunk_econ(chunk, ec_counts, ec_tsum, ec_tn)
            if i % 5 == 0:  # progress line every fifth chunk
                print(f"  read ~{rows:,} rows…")

# finalize
df_uk   = finalize(uk_counts, uk_tsum, uk_tn, YEAR)
df_econ = finalize(ec_counts, ec_tsum, ec_tn, YEAR)

# save
out_uk   = OUT_DIR / f"events_uk_monthly_{YEAR}.csv"
out_econ = OUT_DIR / f"events_uk_economic_monthly_{YEAR}.csv"
df_uk.to_csv(out_uk, index=False, date_format="%Y-%m-%d")
df_econ.to_csv(out_econ, index=False, date_format="%Y-%m-%d")

print("\n--- Done ---")
print("UK-all monthly:")
print(df_uk.to_string(index=False))
print("\nUK-economic monthly (EventRootCode '04*'):")
print(df_econ.to_string(index=False))
print(f"\nWrote:\n  {out_uk}\n  {out_econ}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
2012: attempting 12 monthly files
Processing: 201201.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
Processing: 201202.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
  read ~3,000,000 rows…
Processing: 201203.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
  read ~3,000,000 rows…
Processing: 201204.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
  read ~3,000,000 rows…
Processing: 201205.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
  read ~3,000,000 rows…
Processing: 201206.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
  read ~2,824,395 rows…
Processing: 201207.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
Processing: 201208.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
Processing: 201209.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
Processing: 201210.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
Process

2013

In [None]:
# ======================= GDELT 1.0 EVENTS → 2013 monthly UK AvgTone =======================
# Saves to Google Drive: /content/drive/MyDrive/msc_project/gdelt_events_1p0
# Outputs:
#   - events_uk_monthly_2013.csv
#   - events_uk_economic_monthly_2013.csv
# Notes:
#   • 2013 has monthly ZIPs for Jan–Mar: 201301.zip, 201302.zip, 201303.zip
#   • and daily ZIPs from 2013-04-01: YYYYMMDD.export.CSV.zip

!pip -q install pandas requests

from google.colab import drive
drive.mount("/content/drive")

import io, zipfile, requests, pandas as pd, numpy as np, math
from pathlib import Path
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from collections import defaultdict

# ---------- CONFIG ----------
YEAR = 2013   # used for the Jan–Mar monthly URLs, the year filter in finalize and the output file names
BASE = "http://data.gdeltproject.org/events"  # both monthly (YYYYMM.zip) and daily (YYYYMMDD.export.CSV.zip) files are requested under this prefix
OUT_DIR = Path("/content/drive/MyDrive/msc_project/gdelt_events_1p0")  # output folder on the mounted Drive
OUT_DIR.mkdir(parents=True, exist_ok=True)  # create it (and its parents) if missing

UK_FIPS = "UK"    # ActionGeo_* country code (FIPS-2)
UK_ISO3 = "GBR"   # Actor*CountryCode (ISO-3/CAMEO)
CHUNK = 200_000   # rows per chunk

# Events 1.0 column positions (57 cols total: 0..56), zero-based. The files are
# read with header=None, so these integers are also the DataFrame labels.
# NOTE(review): the daily export files may carry an extra trailing column;
# TODO confirm. Only positions up to 51 are read here.
# 1=SQLDATE, 7=Actor1CountryCode, 17=Actor2CountryCode, 51=ActionGeo_CountryCode,
# 34=AvgTone, 28=EventRootCode
COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT = 1, 7, 17, 51, 34, 28

# ---------- helpers ----------
def session():
    """Build a requests.Session with a custom User-Agent and retrying adapters.

    Both the http:// and https:// prefixes get an adapter that retries up to
    five times (backoff factor 0.5) on status 429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(total=5, backoff_factor=0.5, status_forcelist=[429,500,502,503,504])
    sess = requests.Session()
    sess.headers.update({"User-Agent": "gdelt-events-2013/1.0"})
    for scheme in ("http://", "https://"):
        sess.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return sess

def stream_zip(url: str):
    """Download one ZIP archive into memory and open it.

    Returns:
        ``(ZipFile, inner_name)`` where ``inner_name`` is the first member
        of the archive, or ``None`` when the response status is not OK
        (any non-OK status, not only "not found").

    The whole archive is buffered in memory; the caller owns the returned
    ZipFile.
    """
    # session() builds a new Session on every call and this function is
    # invoked once per daily file, so close the session after reading the
    # body instead of leaving a connection pool open per download.
    with session() as s:
        r = s.get(url, timeout=180)
        if not r.ok:
            return None
        payload = r.content
    z = zipfile.ZipFile(io.BytesIO(payload))
    inner = z.namelist()[0]
    return z, inner

def add_chunk(df, counts, tone_sum, tone_n):
    """Fold the UK-related rows of one chunk into the running monthly totals.

    A row is UK-related when its ActionGeo country equals UK_FIPS or either
    actor country equals UK_ISO3. Rows whose SQLDATE does not parse as
    YYYYMMDD are ignored. The three mappings are updated in place, keyed by
    month-start Timestamp: ``counts`` receives the row count, ``tone_sum``
    the sum of non-missing tones and ``tone_n`` the number of non-missing
    tones.
    """
    is_uk = df[COL_ACTC].eq(UK_FIPS)
    for actor_col in (COL_A1C, COL_A2C):
        is_uk = is_uk | df[actor_col].eq(UK_ISO3)
    if not is_uk.any():
        return

    picked = df.loc[is_uk, [COL_SQLDATE, COL_TONE]]
    parsed = pd.to_datetime(picked[COL_SQLDATE], format="%Y%m%d", errors="coerce")
    frame = pd.DataFrame({
        "Month": parsed.dt.to_period("M").dt.to_timestamp(),  # month start
        "AvgTone": pd.to_numeric(picked[COL_TONE], errors="coerce"),
    })
    frame = frame[frame["Month"].notna()]
    if frame.empty:
        return

    # One pass per month: "size" counts every row, "sum"/"count" skip missing tones.
    monthly = frame.groupby("Month")["AvgTone"].agg(Docs="size", ToneSum="sum", ToneN="count")
    for rec in monthly.itertuples():
        month = rec.Index
        counts[month] += int(rec.Docs)
        tone_sum[month] += float(rec.ToneSum)
        tone_n[month] += int(rec.ToneN)

def add_chunk_econ(df, counts, tone_sum, tone_n):
    """Accumulate only the rows whose EventRootCode text starts with '04'.

    The selected rows are handed to add_chunk, so the UK filter and the
    monthly bookkeeping are applied there. Nothing happens when no row
    matches.

    NOTE(review): the CAMEO codebook lists root code 04 as "Consult";
    confirm that this is the intended definition of the "economic" subset.
    """
    root_text = df[COL_ROOT].astype(str)
    is_econ = root_text.str.startswith("04", na=False)
    if not is_econ.any():
        return
    add_chunk(df.loc[is_econ, :], counts, tone_sum, tone_n)

def finalize(counts, tone_sum, tone_n, year:int):
    """Turn the accumulated monthly totals into one row per month of ``year``.

    Args:
        counts: mapping of month-start Timestamp to number of rows.
        tone_sum: mapping of month to the sum of non-missing tone values.
        tone_n: mapping of month to the number of non-missing tone values.
        year: months belonging to any other year are discarded.

    Returns:
        DataFrame with columns Month, Docs, AvgTone covering January to
        December of ``year``. Months without data are present with NaN
        values; AvgTone is also NaN when a month has rows but no usable
        tone.
    """
    rows = []
    for m in sorted(counts):
        if m.year != year:
            continue
        n_tone = tone_n.get(m, 0)
        avg = (tone_sum.get(m, 0.0) / n_tone) if n_tone else np.nan
        rows.append({"Month": m, "Docs": counts[m], "AvgTone": avg})
    # Naming the columns up front keeps "Month" present even when no month
    # matched; an empty column-less frame would raise KeyError below.
    df = pd.DataFrame(rows, columns=["Month", "Docs", "AvgTone"])
    # Reindex to full-year months (Jan–Dec)
    allm = pd.date_range(f"{year}-01-01", f"{year}-12-01", freq="MS")
    return (df.set_index("Month")
              .reindex(allm)
              .rename_axis("Month")
              .reset_index())

# ---------- run ----------
# Stream the Jan–Mar monthly archives and then the daily archives from
# 2013-04-01 through both accumulators, then write one CSV per series.
uk_counts, uk_tsum, uk_tn = defaultdict(int), defaultdict(float), defaultdict(int)
ec_counts, ec_tsum, ec_tn = defaultdict(int), defaultdict(float), defaultdict(int)

# 1) Monthly ZIPs for Jan–Mar 2013
for m in [1,2,3]:
    url = f"{BASE}/{YEAR}{m:02d}.zip"   # e.g., http://.../201301.zip
    got = stream_zip(url)
    if got is None:
        print("SKIP (not found):", url)
        continue
    z, inner = got
    print("Processing monthly:", inner)
    # Close the in-memory archive and its member stream as soon as the month
    # has been consumed instead of leaving both handles open.
    with z, z.open(inner) as fh:
        reader = pd.read_csv(
            fh,
            sep="\t", header=None, low_memory=False, chunksize=CHUNK,
            usecols=[COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT],
            dtype={COL_SQLDATE:str, COL_A1C:str, COL_A2C:str, COL_ACTC:str, COL_TONE:float, COL_ROOT:str}
        )
        rows = 0
        for i, chunk in enumerate(reader, 1):
            rows += len(chunk)
            add_chunk(chunk, uk_counts, uk_tsum, uk_tn)
            add_chunk_econ(chunk, ec_counts, ec_tsum, ec_tn)
            if i % 5 == 0:  # progress line every fifth chunk
                print(f"  read ~{rows:,} rows…")

# 2) Daily ZIPs from 2013-04-01 to 2013-12-31 (YYYYMMDD.export.CSV.zip)
dates = pd.date_range("2013-04-01", "2013-12-31", freq="D")
for d in dates:
    url = f"{BASE}/{d:%Y%m%d}.export.CSV.zip"
    got = stream_zip(url)
    if got is None:
        # some days can be missing; skip quietly
        continue
    z, inner = got
    # inner is usually like '20130401.export.CSV'
    # print("Processing daily:", inner)  # uncomment if you want verbose logs
    # Hundreds of daily archives pass through here, so release each one's
    # handles before fetching the next.
    with z, z.open(inner) as fh:
        reader = pd.read_csv(
            fh,
            sep="\t", header=None, low_memory=False, chunksize=CHUNK,
            usecols=[COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT],
            dtype={COL_SQLDATE:str, COL_A1C:str, COL_A2C:str, COL_ACTC:str, COL_TONE:float, COL_ROOT:str}
        )
        for chunk in reader:
            add_chunk(chunk, uk_counts, uk_tsum, uk_tn)
            add_chunk_econ(chunk, ec_counts, ec_tsum, ec_tn)

# finalize + save
df_uk   = finalize(uk_counts, uk_tsum, uk_tn, YEAR)
df_econ = finalize(ec_counts, ec_tsum, ec_tn, YEAR)

out_uk   = OUT_DIR / f"events_uk_monthly_{YEAR}.csv"
out_econ = OUT_DIR / f"events_uk_economic_monthly_{YEAR}.csv"
df_uk.to_csv(out_uk, index=False, date_format="%Y-%m-%d")
df_econ.to_csv(out_econ, index=False, date_format="%Y-%m-%d")

print("\n--- Done ---")
print("UK-all monthly:")
print(df_uk.to_string(index=False))
print("\nUK-economic monthly (EventRootCode '04*'):")
print(df_econ.to_string(index=False))
print(f"\nWrote:\n  {out_uk}\n  {out_econ}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Processing monthly: 201301.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
Processing monthly: 201302.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
Processing monthly: 201303.csv
  read ~1,000,000 rows…
  read ~2,000,000 rows…
  read ~2,834,527 rows…

--- Done ---
UK-all monthly:
     Month   Docs  AvgTone
2013-01-01 126048 2.425202
2013-02-01 116075 2.565489
2013-03-01 125238 2.609551
2013-04-01  68333 2.500042
2013-05-01  80872 2.296863
2013-06-01 131886 2.576806
2013-07-01 185674 2.691628
2013-08-01 218965 2.447774
2013-09-01 217858 2.529328
2013-10-01 215743 2.679382
2013-11-01 220830 2.679942
2013-12-01 160578 2.708217

UK-economic monthly (EventRootCode '04*'):
     Month  Docs  AvgTone
2013-01-01 32178 2.481781
2013-02-01 33896 2.601469
2013-03-01 31667 2.749431
2013-04-01 17039 2.496610
2013-05-01 20432 2.407361
2013-06-01 37077 2.733151
201

2014

In [None]:
# ======================= GDELT 1.0 EVENTS → 2014 monthly UK AvgTone =======================
# Saves to Google Drive: /content/drive/MyDrive/msc_project/gdelt_events_1p0
# Outputs:
#   - events_uk_monthly_2014.csv
#   - events_uk_economic_monthly_2014.csv

!pip -q install pandas requests

from google.colab import drive
drive.mount("/content/drive")

import io, zipfile, requests, pandas as pd, numpy as np, math
from pathlib import Path
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from collections import defaultdict

# ---------- CONFIG ----------
YEAR = 2014   # calendar year processed by this cell
BASE = "http://data.gdeltproject.org/events"  # daily files: YYYYMMDD.export.CSV.zip
OUT_DIR = Path("/content/drive/MyDrive/msc_project/gdelt_events_1p0")  # output folder on the mounted Drive
OUT_DIR.mkdir(parents=True, exist_ok=True)  # create it (and its parents) if missing

UK_FIPS = "UK"    # ActionGeo_* country code (FIPS-2)
UK_ISO3 = "GBR"   # Actor*CountryCode (ISO-3/CAMEO)
CHUNK = 200_000   # rows per chunk

# Events 1.0 column positions (57 cols total: 0..56), zero-based.
# NOTE(review): the daily export files may carry an extra trailing column;
# TODO confirm. Only positions up to 51 are referenced by these constants.
# 1=SQLDATE, 7=Actor1CountryCode, 17=Actor2CountryCode, 51=ActionGeo_CountryCode,
# 34=AvgTone, 28=EventRootCode
COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT = 1, 7, 17, 51, 34, 28

# ---------- helpers ----------
def session():
    """Build a requests.Session with a custom User-Agent and retrying adapters.

    Both the http:// and https:// prefixes get an adapter that retries up to
    five times (backoff factor 0.5) on status 429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(total=5, backoff_factor=0.5, status_forcelist=[429,500,502,503,504])
    sess = requests.Session()
    sess.headers.update({"User-Agent": "gdelt-events-2014/1.0"})
    for scheme in ("http://", "https://"):
        sess.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return sess

def stream_zip(url: str):
    """Download a ZIP archive into memory and open it.

    Args:
        url: Address of the ZIP file to fetch.

    Returns:
        (ZipFile, inner_csv_name), where inner_csv_name is the first member of
        the archive, or None when the response is not successful, the payload
        is not a readable ZIP, or the archive has no members. Returning None
        for a damaged download lets the caller count that day as missing
        instead of aborting the whole run on one bad file.
    """
    resp = session().get(url, timeout=180)
    if not resp.ok:
        return None
    try:
        archive = zipfile.ZipFile(io.BytesIO(resp.content))
    except zipfile.BadZipFile:
        # A successful response whose body is not a ZIP (e.g. an error page).
        print(f"!! Not a valid ZIP, skipping: {url}")
        return None
    members = archive.namelist()
    if not members:
        print(f"!! Empty ZIP, skipping: {url}")
        return None
    return archive, members[0]

def add_chunk(df, counts, tone_sum, tone_n):
    """Fold the UK-related rows of one chunk into the monthly accumulators.

    A row is UK-related when its ActionGeo country code equals UK_FIPS or
    either actor country code equals UK_ISO3. Rows are bucketed by the month
    of SQLDATE; rows whose SQLDATE cannot be parsed are ignored.

    Args:
        df: Chunk read with integer column labels (the COL_* positions).
        counts: Mapping month -> number of rows; updated in place.
        tone_sum: Mapping month -> sum of non-missing AvgTone; updated in place.
        tone_n: Mapping month -> count of non-missing AvgTone; updated in place.
    """
    uk_mask = df[COL_ACTC].eq(UK_FIPS) | df[COL_A1C].eq(UK_ISO3) | df[COL_A2C].eq(UK_ISO3)
    if not uk_mask.any():
        return

    uk_rows = df.loc[uk_mask, [COL_SQLDATE, COL_TONE]].copy()
    event_day = pd.to_datetime(uk_rows[COL_SQLDATE], format="%Y%m%d", errors="coerce")
    uk_rows["Month"] = event_day.dt.to_period("M").dt.to_timestamp()  # month start
    uk_rows["AvgTone"] = pd.to_numeric(uk_rows[COL_TONE], errors="coerce")
    uk_rows = uk_rows.dropna(subset=["Month"])
    if uk_rows.empty:
        return

    # One pass per month: row count, tone total and number of usable tones
    # ("sum" and "count" both skip missing tones; "size" counts every row).
    monthly = uk_rows.groupby("Month")["AvgTone"].agg(Docs="size", ToneSum="sum", ToneN="count")

    for rec in monthly.itertuples():
        month = rec.Index
        counts[month] += int(rec.Docs)
        tone_sum[month] += float(rec.ToneSum)
        tone_n[month] += int(rec.ToneN)

def add_chunk_econ(df, counts, tone_sum, tone_n):
    """Aggregate only the rows whose EventRootCode starts with '04'.

    The selected rows are handed to add_chunk, which applies the UK filter and
    updates the three accumulators in place.

    NOTE(review): the notebook calls this the "economic" family, but the CAMEO
    codebook appears to define root code 04 as "Consult" — confirm that '04'
    is the intended filter.
    """
    root_codes = df[COL_ROOT].astype(str)
    is_econ = root_codes.str.startswith("04", na=False)
    if not is_econ.any():
        return
    add_chunk(df.loc[is_econ, :], counts, tone_sum, tone_n)

def finalize(counts, tone_sum, tone_n, year:int):
    """Turn the monthly accumulators into a Jan–Dec table for one year.

    Args:
        counts: Mapping month-start Timestamp -> number of rows.
        tone_sum: Mapping month-start Timestamp -> sum of AvgTone values.
        tone_n: Mapping month-start Timestamp -> number of AvgTone values.
        year: Calendar year to keep; months from other years are dropped.

    Returns:
        DataFrame with columns Month, Docs, AvgTone and exactly twelve rows
        (month starts of `year`). Months without data hold NaN, and AvgTone is
        NaN when a month has no usable tone values. An empty accumulator
        yields twelve NaN rows instead of raising.
    """
    records = [
        {
            "Month": m,
            "Docs": counts[m],
            "AvgTone": (tone_sum[m] / tone_n[m]) if tone_n[m] else np.nan,
        }
        for m in sorted(counts)
        if m.year == year
    ]
    # Passing the column names keeps "Month" present even when no record
    # survived the year filter; without them sorting/indexing on "Month" fails.
    df = pd.DataFrame(records, columns=["Month", "Docs", "AvgTone"])
    df["Month"] = pd.to_datetime(df["Month"])

    # Reindex to full-year months (Jan–Dec)
    allm = pd.date_range(f"{year}-01-01", f"{year}-12-01", freq="MS")
    return (df.set_index("Month")
              .reindex(allm)
              .rename_axis("Month")
              .reset_index())

# ---------- run ----------
# Accumulators keyed by month start: row count, tone total, tone count.
uk_counts, uk_tsum, uk_tn = defaultdict(int), defaultdict(float), defaultdict(int)
ec_counts, ec_tsum, ec_tn = defaultdict(int), defaultdict(float), defaultdict(int)

# Build the download window from YEAR so that changing the config constant
# moves the downloads together with the finalize() filter and the file names.
dates = pd.date_range(f"{YEAR}-01-01", f"{YEAR}-12-31", freq="D")
miss = 0
for j, d in enumerate(dates, 1):
    url = f"{BASE}/{d:%Y%m%d}.export.CSV.zip"
    got = stream_zip(url)
    if got is None:
        # No archive for this day: count it and keep the progress cadence.
        miss += 1
        if j % 30 == 0:
            print(f"{j}/{len(dates)} days, missing so far: {miss}")
        continue
    z, inner = got
    # print("Processing:", inner)  # uncomment for verbose logs
    # Close the archive and its member as soon as the day has been aggregated.
    with z, z.open(inner) as fh:
        reader = pd.read_csv(
            fh,
            sep="\t", header=None, low_memory=False, chunksize=CHUNK,
            usecols=[COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT],
            dtype={COL_SQLDATE:str, COL_A1C:str, COL_A2C:str, COL_ACTC:str, COL_TONE:float, COL_ROOT:str}
        )
        for chunk in reader:
            add_chunk(chunk, uk_counts, uk_tsum, uk_tn)
            add_chunk_econ(chunk, ec_counts, ec_tsum, ec_tn)
    if j % 30 == 0:
        print(f"Processed {j}/{len(dates)} days… (missing so far: {miss})")

# One row per month of YEAR for each series.
df_uk   = finalize(uk_counts, uk_tsum, uk_tn, YEAR)
df_econ = finalize(ec_counts, ec_tsum, ec_tn, YEAR)

out_uk   = OUT_DIR / f"events_uk_monthly_{YEAR}.csv"
out_econ = OUT_DIR / f"events_uk_economic_monthly_{YEAR}.csv"
df_uk.to_csv(out_uk, index=False, date_format="%Y-%m-%d")
df_econ.to_csv(out_econ, index=False, date_format="%Y-%m-%d")

print("\n--- Done ---")
print("UK-all monthly:")
print(df_uk.to_string(index=False))
print("\nUK-economic monthly (EventRootCode '04*'):")
print(df_econ.to_string(index=False))
print(f"\nWrote:\n  {out_uk}\n  {out_econ}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Processed 30/365 days… (missing so far: 3)
Processed 60/365 days… (missing so far: 3)
Processed 90/365 days… (missing so far: 4)
Processed 120/365 days… (missing so far: 4)
Processed 150/365 days… (missing so far: 4)
Processed 180/365 days… (missing so far: 4)
Processed 210/365 days… (missing so far: 4)
Processed 240/365 days… (missing so far: 4)
Processed 270/365 days… (missing so far: 4)
Processed 300/365 days… (missing so far: 4)
Processed 330/365 days… (missing so far: 4)
Processed 360/365 days… (missing so far: 4)

--- Done ---
UK-all monthly:
     Month   Docs  AvgTone
2014-01-01 139963 2.678215
2014-02-01 166052 2.698572
2014-03-01 178579 2.651045
2014-04-01 184551 2.720858
2014-05-01 195643 2.677188
2014-06-01 191109 2.667295
2014-07-01 214956 2.612385
2014-08-01 205392 2.564620
2014-09-01 278268 2.626609
2014-10-01 265410 2.680726
2014-11-01 243234 2

Jan 2015

In [None]:
# ======================= GDELT 1.0 EVENTS → Jan 2015 (monthly UK AvgTone) =======================
# Saves to Google Drive: /content/drive/MyDrive/msc_project/gdelt_events_1p0
# Outputs:
#   - events_uk_monthly_2015_Jan.csv
#   - events_uk_economic_monthly_2015_Jan.csv
# Notes: GDELT 2.0 starts Feb 19, 2015. Jan 2015 is from GDELT 1.0 daily files: YYYYMMDD.export.CSV.zip

!pip -q install pandas requests

from google.colab import drive
drive.mount("/content/drive")

import io, zipfile, requests, pandas as pd, numpy as np, math
from pathlib import Path
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from collections import defaultdict

# ---------- CONFIG ----------
# Inclusive day window whose daily Events 1.0 exports are downloaded.
START_DATE = "2015-01-01"          # you can extend to "2015-02-18" if you also want early Feb 2015
END_DATE   = "2015-01-31"
BASE = "http://data.gdeltproject.org/events"   # daily: YYYYMMDD.export.CSV.zip

OUT_DIR = Path("/content/drive/MyDrive/msc_project/gdelt_events_1p0")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Values that mark a row as UK-related, plus the CSV chunk size.
UK_FIPS = "UK"    # ActionGeo_* country code (FIPS-2)
UK_ISO3 = "GBR"   # Actor*CountryCode (ISO-3/CAMEO)
CHUNK = 200_000   # rows per chunk

# Events 1.0 column positions (0-based), one constant per column read.
COL_SQLDATE = 1    # SQLDATE
COL_A1C = 7        # Actor1CountryCode
COL_A2C = 17       # Actor2CountryCode
COL_ACTC = 51      # ActionGeo_CountryCode
COL_TONE = 34      # AvgTone
COL_ROOT = 28      # EventRootCode

# ---------- helpers ----------
def session():
    """Return a requests.Session set up for the Jan-2015 downloads.

    The session carries a fixed User-Agent header. Requests to http:// and
    https:// URLs go through adapters that retry up to 5 times (backoff factor
    0.5) when the server answers 429, 500, 502, 503 or 504.
    """
    http = requests.Session()
    http.headers.update({"User-Agent": "gdelt-events-2015-Jan/1.0"})
    policy = Retry(total=5, backoff_factor=0.5, status_forcelist=[429,500,502,503,504])
    adapters = {
        "http://": HTTPAdapter(max_retries=policy),
        "https://": HTTPAdapter(max_retries=policy),
    }
    for prefix, adapter in adapters.items():
        http.mount(prefix, adapter)
    return http

def stream_zip(url: str):
    """Fetch a ZIP archive into memory and open it.

    Args:
        url: Address of the ZIP file to fetch.

    Returns:
        (ZipFile, inner_csv_name) if available, where inner_csv_name is the
        first member of the archive; else None. None is returned for an
        unsuccessful response, a payload that is not a readable ZIP, and an
        archive without members, so that one damaged download is treated as a
        missing day rather than stopping the run.
    """
    resp = session().get(url, timeout=180)
    if not resp.ok:
        return None
    try:
        archive = zipfile.ZipFile(io.BytesIO(resp.content))
    except zipfile.BadZipFile:
        # A successful response whose body is not a ZIP (e.g. an error page).
        print(f"!! Not a valid ZIP, skipping: {url}")
        return None
    members = archive.namelist()
    if not members:
        print(f"!! Empty ZIP, skipping: {url}")
        return None
    return archive, members[0]

def add_chunk(df, counts, tone_sum, tone_n):
    """Add one chunk's UK-related rows to the per-month running totals.

    Rows qualify when the ActionGeo country code is UK_FIPS or either actor
    country code is UK_ISO3. Each qualifying row is assigned to the month
    start of its SQLDATE; rows with an unparsable SQLDATE are discarded.

    Args:
        df: Chunk read with integer column labels (the COL_* positions).
        counts: Mapping month -> row count; incremented in place.
        tone_sum: Mapping month -> sum of non-missing AvgTone; incremented in place.
        tone_n: Mapping month -> number of non-missing AvgTone; incremented in place.
    """
    is_uk = df[COL_ACTC].eq(UK_FIPS) | df[COL_A1C].eq(UK_ISO3) | df[COL_A2C].eq(UK_ISO3)
    if not is_uk.any():
        return

    picked = df.loc[is_uk, [COL_SQLDATE, COL_TONE]].copy()
    day = pd.to_datetime(picked[COL_SQLDATE], format="%Y%m%d", errors="coerce")
    picked["Month"] = day.dt.to_period("M").dt.to_timestamp()    # month start
    picked["AvgTone"] = pd.to_numeric(picked[COL_TONE], errors="coerce")
    picked = picked.dropna(subset=["Month"])
    if picked.empty:
        return

    # Per-month statistics from a single grouping of the tone column:
    # size() counts every row, sum()/count() ignore missing tones.
    tone_by_month = picked.groupby("Month")["AvgTone"]
    stats = pd.DataFrame({
        "Docs": tone_by_month.size(),
        "ToneSum": tone_by_month.sum(),
        "ToneN": tone_by_month.count(),
    })

    for rec in stats.itertuples():
        counts[rec.Index]   += int(rec.Docs)
        tone_sum[rec.Index] += float(rec.ToneSum)
        tone_n[rec.Index]   += int(rec.ToneN)

def add_chunk_econ(df, counts, tone_sum, tone_n):
    """Pass the rows with an EventRootCode beginning '04' on to add_chunk.

    add_chunk then applies the UK filter and updates the accumulators in place.

    NOTE(review): labelled "economic" in this notebook, but root code 04 looks
    like "Consult" in the CAMEO codebook — confirm the intended code family.
    """
    econ_mask = df[COL_ROOT].astype(str).str.startswith("04", na=False)
    if not econ_mask.any():
        return
    econ_rows = df.loc[econ_mask, :]
    add_chunk(econ_rows, counts, tone_sum, tone_n)

def finalize(counts, tone_sum, tone_n, start_date: str, end_date: str):
    """Turn the monthly accumulators into a table covering a date window.

    Args:
        counts: Mapping month-start Timestamp -> number of rows.
        tone_sum: Mapping month-start Timestamp -> sum of AvgTone values.
        tone_n: Mapping month-start Timestamp -> number of AvgTone values.
        start_date: First day of the window; its month is the first row.
        end_date: Last day of the window; its month is the last row.

    Returns:
        DataFrame with columns Month, Docs, AvgTone and one row per month
        start from the month of `start_date` to the month of `end_date`.
        Months without data hold NaN, and AvgTone is NaN when a month has no
        usable tone values. An empty accumulator yields all-NaN rows instead
        of raising.
    """
    # Month bounds are loop-invariant, so compute them once up front.
    mstart = pd.to_datetime(start_date).to_period("M").to_timestamp()
    mend   = pd.to_datetime(end_date).to_period("M").to_timestamp()

    records = [
        {
            "Month": m,
            "Docs": counts[m],
            "AvgTone": (tone_sum[m] / tone_n[m]) if tone_n[m] else np.nan,
        }
        for m in sorted(counts)
        if mstart <= m <= mend
    ]
    # Passing the column names keeps "Month" present even when no record
    # fell inside the window; without them sorting/indexing on "Month" fails.
    df = pd.DataFrame(records, columns=["Month", "Docs", "AvgTone"])
    df["Month"] = pd.to_datetime(df["Month"])

    # Reindex to every month touched by the date window (here, just Jan 2015)
    allm = pd.date_range(mstart, mend, freq="MS")
    return (df.set_index("Month")
              .reindex(allm)
              .rename_axis("Month")
              .reset_index())

# ---------- run ----------
dates = pd.date_range(START_DATE, END_DATE, freq="D")

# Accumulators keyed by month start: row count, tone total, tone count.
uk_counts, uk_tsum, uk_tn = defaultdict(int), defaultdict(float), defaultdict(int)
ec_counts, ec_tsum, ec_tn = defaultdict(int), defaultdict(float), defaultdict(int)

# Only these columns are read, with fixed dtypes so codes stay strings.
wanted_cols = [COL_SQLDATE, COL_A1C, COL_A2C, COL_ACTC, COL_TONE, COL_ROOT]
col_dtypes = {COL_SQLDATE:str, COL_A1C:str, COL_A2C:str, COL_ACTC:str, COL_TONE:float, COL_ROOT:str}

missing = 0
for j, d in enumerate(dates, 1):
    report_now = (j % 10 == 0)   # progress line every tenth day
    fetched = stream_zip(f"{BASE}/{d:%Y%m%d}.export.CSV.zip")
    if fetched is None:
        # No archive for this day.
        missing += 1
        if report_now:
            print(f"{j}/{len(dates)} days, missing so far: {missing}")
    else:
        archive, member = fetched
        # print("Processing:", member)  # uncomment for verbose logs
        chunks = pd.read_csv(
            archive.open(member),
            sep="\t", header=None, low_memory=False, chunksize=CHUNK,
            usecols=wanted_cols,
            dtype=col_dtypes,
        )
        for part in chunks:
            add_chunk(part, uk_counts, uk_tsum, uk_tn)
            add_chunk_econ(part, ec_counts, ec_tsum, ec_tn)
        if report_now:
            print(f"Processed {j}/{len(dates)} days… (missing so far: {missing})")

# One row per month of the window for each series.
df_uk   = finalize(uk_counts, uk_tsum, uk_tn, START_DATE, END_DATE)
df_econ = finalize(ec_counts, ec_tsum, ec_tn, START_DATE, END_DATE)

out_suffix = "2015_Jan"
uk_csv   = OUT_DIR / f"events_uk_monthly_{out_suffix}.csv"
econ_csv = OUT_DIR / f"events_uk_economic_monthly_{out_suffix}.csv"
df_uk.to_csv(uk_csv, index=False, date_format="%Y-%m-%d")
df_econ.to_csv(econ_csv, index=False, date_format="%Y-%m-%d")

print("\n--- Done ---")
print("UK-all monthly:")
print(df_uk.to_string(index=False))
print("\nUK-economic monthly (EventRootCode '04*'):")
print(df_econ.to_string(index=False))
print("\nWrote:")
for written in (uk_csv, econ_csv):
    print(" ", written)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Processed 10/31 days… (missing so far: 0)
Processed 20/31 days… (missing so far: 0)
Processed 30/31 days… (missing so far: 0)

--- Done ---
UK-all monthly:
     Month   Docs  AvgTone
2015-01-01 219962 2.650666

UK-economic monthly (EventRootCode '04*'):
     Month  Docs  AvgTone
2015-01-01 60775 2.742955

Wrote:
  /content/drive/MyDrive/msc_project/gdelt_events_1p0/events_uk_monthly_2015_Jan.csv
  /content/drive/MyDrive/msc_project/gdelt_events_1p0/events_uk_economic_monthly_2015_Jan.csv


Merging 2005 to Jan 2015

In [None]:
# ===================== Merge GDELT Events 1.0 monthly outputs (2005 → Jan 2015) =====================
import pandas as pd
from pathlib import Path

# Folder holding the per-period monthly CSVs written by the earlier cells;
# BASE is the same location as a Path for globbing and joining.
BASE_DIR = "/content/drive/MyDrive/msc_project/gdelt_events_1p0"
BASE = Path(BASE_DIR)

def load_concat(pattern, label=None, base_dir=None):
    """Load all CSVs matching pattern, normalize columns, concat, sort, dedupe by Month.

    Args:
        pattern: Glob pattern evaluated inside the input folder.
        label: Optional name used in the summary line (defaults to pattern).
        base_dir: Folder to search; defaults to the module-level BASE.

    Returns:
        DataFrame with columns Month, Docs, AvgTone, one row per month,
        clipped to 2005-01-01 .. 2015-01-01 and sorted by Month. When a month
        occurs in several files the row from the file that sorts last by name
        is kept. An empty frame with those columns is returned when nothing
        matches.

    Raises:
        KeyError: if a matched file lacks one of the three expected columns.
    """
    root = BASE if base_dir is None else Path(base_dir)
    wanted = ["Month", "Docs", "AvgTone"]

    # Files named "..._to_..." are the merged outputs this notebook writes into
    # the same folder; they also match the yearly patterns, so without this
    # filter a re-run would ingest its own results (and fail on the _WIDE file,
    # which has no "Docs" column).
    files = sorted(p for p in root.glob(pattern) if "_to_" not in p.name)
    if not files:
        print(f"No files found for pattern: {pattern}")
        return pd.DataFrame(columns=wanted)

    lower_to_canonical = {"month": "Month", "docs": "Docs", "avg_tone": "AvgTone"}
    frames = []
    for p in files:
        df = pd.read_csv(p)
        # Normalize column names: accept the lower-case spelling when the
        # canonical one is absent.
        renames = {
            low: canon
            for low, canon in lower_to_canonical.items()
            if canon not in df.columns and low in df.columns
        }
        df = df.rename(columns=renames)
        absent = [c for c in wanted if c not in df.columns]
        if absent:
            raise KeyError(f"{p.name}: missing expected column(s) {absent}")
        # Parse types
        df["Month"] = pd.to_datetime(df["Month"], errors="coerce")
        df["Docs"] = pd.to_numeric(df["Docs"], errors="coerce")
        df["AvgTone"] = pd.to_numeric(df["AvgTone"], errors="coerce")
        frames.append(df[wanted])

    # A stable sort keeps rows of equal Month in file order, so keep="last"
    # really selects the row from the last file.
    out = pd.concat(frames, ignore_index=True).sort_values("Month", kind="mergesort")
    out = out.drop_duplicates(subset="Month", keep="last")

    # Clip to 2005-01 to 2015-01 (Jan-2015 only)
    start = pd.Timestamp("2005-01-01")
    end   = pd.Timestamp("2015-01-01")  # adjust to 2015-02-01 if you included early Feb 2015
    out = out[(out["Month"] >= start) & (out["Month"] <= end)].reset_index(drop=True)
    print(f"{label or pattern}: merged {len(files)} files → {len(out)} rows")
    return out

# 1) + 2) Build the two merged series: All-UK (the pattern does not match the
#         'economic' file names) and Economic-only.
all_df  = load_concat("events_uk_monthly_*.csv", label="All-UK")
econ_df = load_concat("events_uk_economic_monthly_*.csv", label="Economic-only")

# 3) Save each separately (optional but handy)
out_all  = BASE / "events_uk_monthly_2005_to_2015Jan.csv"
out_econ = BASE / "events_uk_economic_monthly_2005_to_2015Jan.csv"
for frame, target in ((all_df, out_all), (econ_df, out_econ)):
    frame.to_csv(target, index=False, date_format="%Y-%m-%d")
for target in (out_all, out_econ):
    print("Wrote:", target)

# 4) Also build ONE wide file with both series side-by-side; the outer join
#    keeps months that appear in only one of the two series.
all_cols  = all_df.rename(columns={"Docs":"Docs_All","AvgTone":"AvgTone_All"})
econ_cols = econ_df.rename(columns={"Docs":"Docs_Econ","AvgTone":"AvgTone_Econ"})
wide = all_cols.merge(econ_cols, on="Month", how="outer")
wide = wide.sort_values("Month").reset_index(drop=True)

out_wide = BASE / "events_uk_monthly_2005_to_2015Jan_WIDE.csv"
wide.to_csv(out_wide, index=False, date_format="%Y-%m-%d")
print("Wrote:", out_wide)

# Quick peek
print("\nPreview (last 6 rows):")
print(wide.tail(6).to_string(index=False))


All-UK: merged 11 files → 121 rows
Economic-only: merged 11 files → 121 rows
Wrote: /content/drive/MyDrive/msc_project/gdelt_events_1p0/events_uk_monthly_2005_to_2015Jan.csv
Wrote: /content/drive/MyDrive/msc_project/gdelt_events_1p0/events_uk_economic_monthly_2005_to_2015Jan.csv
Wrote: /content/drive/MyDrive/msc_project/gdelt_events_1p0/events_uk_monthly_2005_to_2015Jan_WIDE.csv

Preview (last 6 rows):
     Month  Docs_All  AvgTone_All  Docs_Econ  AvgTone_Econ
2014-08-01    205392     2.564620      54274      2.678771
2014-09-01    278268     2.626609      75431      2.690719
2014-10-01    265410     2.680726      71706      2.814015
2014-11-01    243234     2.656559      64486      2.790142
2014-12-01    198960     2.724392      53564      2.859509
2015-01-01    219962     2.650666      60775      2.742955


Merge Events 1.0 (2005–Jan 2015) with GKG 2.0 (Feb 2015 onward)

In [None]:
# ================== Merge GDELT 2005–Jan 2015 (Events 1.0) with Feb 2015–2025 (GKG 2.0) ==================
# Inputs (adjust paths if yours differ):
#   /content/drive/MyDrive/msc_project/gdelt_events_1p0/events_uk_monthly_2005_to_2015Jan.csv
#   /content/drive/MyDrive/msc_project/gdelt/gdelt_uk_housing_monthly_STITCHING.csv
# Outputs:
#   /content/drive/MyDrive/msc_project/gdelt/gdelt_tone_2005_2025_wide.csv
#   /content/drive/MyDrive/msc_project/gdelt/gdelt_tone_2005_2025_stitched.csv

import pandas as pd
import numpy as np
from pathlib import Path
from google.colab import drive

# Mount Drive (no-op if already mounted)
drive.mount("/content/drive")

# ---- paths ----
EVENTS_PATH = "/content/drive/MyDrive/msc_project/gdelt_events_1p0/events_uk_monthly_2005_to_2015Jan.csv"
GKG_MASTER  = "/content/drive/MyDrive/msc_project/gdelt/gdelt_uk_housing_monthly_STITCHING.csv"
OUT_DIR     = Path("/content/drive/MyDrive/msc_project/gdelt")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---- load Events v1.0 merged (2005–2015-01) ----
ev = pd.read_csv(EVENTS_PATH, parse_dates=["Month"])
# One row per month: the joins below are keyed on Month, so a repeated month
# would otherwise produce repeated rows in the wide/stitched outputs.
ev = (ev.dropna(subset=["Month"])
        .sort_values("Month", kind="mergesort")
        .drop_duplicates(subset="Month", keep="last")
        .reset_index(drop=True))

# ---- load GKG master (2015-02 onward) ----
gkg = pd.read_csv(GKG_MASTER)
# normalize columns: accept lower-case headers when present
gkg = gkg.rename(columns={
    low: canon
    for low, canon in {"month": "Month", "docs": "Docs", "avg_tone": "AvgTone"}.items()
    if low in gkg.columns
})

# parse + clean types
gkg["Month"]   = pd.to_datetime(gkg["Month"], errors="coerce")
gkg["Docs"]    = pd.to_numeric(gkg["Docs"], errors="coerce")
gkg["AvgTone"] = pd.to_numeric(gkg["AvgTone"], errors="coerce")
# Same one-row-per-month guarantee as for the Events series; rows whose Month
# could not be parsed are dropped.
gkg = (gkg.dropna(subset=["Month"])
          .sort_values("Month", kind="mergesort")
          .drop_duplicates(subset="Month", keep="last"))

# ---- clip windows ----
start = pd.Timestamp("2005-01-01")
cutover_last_events = pd.Timestamp("2015-01-01")  # keep Events through Jan-2015
cutover_first_gkg   = pd.Timestamp("2015-02-01")  # use GKG from Feb-2015 onward
end   = pd.Timestamp("2025-06-01")                # set your project end

ev = ev[(ev["Month"] >= start) & (ev["Month"] <= cutover_last_events)].copy()
gkg = gkg[(gkg["Month"] >= cutover_first_gkg) & (gkg["Month"] <= end)].copy()

# ---- make a complete monthly index and merge wide ----
full_idx = pd.date_range(start, end, freq="MS")
ev_w  = ev.set_index("Month").rename(columns={"Docs":"Docs_Events", "AvgTone":"AvgTone_Events"})
gkg_w = gkg.set_index("Month").rename(columns={"Docs":"Docs_GKG",   "AvgTone":"AvgTone_GKG"})

wide = (
    pd.DataFrame(index=full_idx)
      .join(ev_w[["Docs_Events","AvgTone_Events"]], how="left")
      .join(gkg_w[["Docs_GKG","AvgTone_GKG"]],      how="left")
      .rename_axis("Month")
      .reset_index()
)

# ---- build a stitched single series (Events until 2015-01, then GKG) ----
# NOTE(review): the previews in this notebook show Events AvgTone around +2.6
# and GKG housing AvgTone around -1, so the stitched column changes level at
# the cutover — confirm that downstream analysis accounts for this.
from_events = wide["Month"] <= cutover_last_events
wide["AvgTone_Stitched"] = np.where(from_events, wide["AvgTone_Events"], wide["AvgTone_GKG"])
wide["Docs_Stitched"]    = np.where(from_events, wide["Docs_Events"], wide["Docs_GKG"])
wide["Source"]           = np.where(from_events, "Events_v1_0", "GKG_v2_0_Housing")

# ---- save outputs ----
out_wide = OUT_DIR / "gdelt_tone_2005_2025_wide.csv"
out_st   = OUT_DIR / "gdelt_tone_2005_2025_stitched.csv"

wide.to_csv(out_wide, index=False, date_format="%Y-%m-%d")

wide[["Month","AvgTone_Stitched","Docs_Stitched","Source"]].to_csv(
    out_st, index=False, date_format="%Y-%m-%d"
)

print("Saved:")
print(" ", out_wide)
print(" ", out_st)

# quick peek
print("\nPreview (tail):")
print(wide.tail(6).to_string(index=False))


Mounted at /content/drive
Saved:
  /content/drive/MyDrive/msc_project/gdelt/gdelt_tone_2005_2025_wide.csv
  /content/drive/MyDrive/msc_project/gdelt/gdelt_tone_2005_2025_stitched.csv

Preview (tail):
     Month  Docs_Events  AvgTone_Events  Docs_GKG  AvgTone_GKG  AvgTone_Stitched  Docs_Stitched           Source
2025-01-01          NaN             NaN   14180.0    -1.096796         -1.096796        14180.0 GKG_v2_0_Housing
2025-02-01          NaN             NaN   16683.0    -0.884768         -0.884768        16683.0 GKG_v2_0_Housing
2025-03-01          NaN             NaN   17850.0    -0.767261         -0.767261        17850.0 GKG_v2_0_Housing
2025-04-01          NaN             NaN   14902.0    -1.109188         -1.109188        14902.0 GKG_v2_0_Housing
2025-05-01          NaN             NaN   18253.0    -0.360499         -0.360499        18253.0 GKG_v2_0_Housing
2025-06-01          NaN             NaN    8278.0    -0.826562         -0.826562         8278.0 GKG_v2_0_Housing


In [None]:
# ================== Merge GDELT economic-only (2005–Jan 2015) with GKG housing (Feb 2015–Jun 2025) ==================
# Inputs (change paths if yours differ):
#   /content/drive/MyDrive/msc_project/gdelt_events_1p0/events_uk_economic_monthly_2005_to_2015Jan.csv
#   /content/drive/MyDrive/msc_project/gdelt/gdelt_uk_housing_monthly_STITCHING.csv
# Outputs:
#   /content/drive/MyDrive/msc_project/gdelt/gdelt_tone_2005_2025_wide_ECONxGKG.csv
#   /content/drive/MyDrive/msc_project/gdelt/gdelt_tone_2005_2025_stitched_ECONxGKG.csv

!pip -q install pandas

import pandas as pd
import numpy as np
from pathlib import Path
from google.colab import drive

drive.mount("/content/drive")

# ---------- paths ----------
EVENTS_PATH = "/content/drive/MyDrive/msc_project/gdelt_events_1p0/events_uk_economic_monthly_2005_to_2015Jan.csv"  # ECON ONLY
GKG_MASTER  = "/content/drive/MyDrive/msc_project/gdelt/gdelt_uk_housing_monthly_STITCHING.csv"                      # your GKG (housing) master
OUT_DIR     = Path("/content/drive/MyDrive/msc_project/gdelt")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------- window ----------
start = pd.Timestamp("2005-01-01")
cutover_last_events = pd.Timestamp("2015-01-01")  # use ECON-only events up to Jan 2015
cutover_first_gkg   = pd.Timestamp("2015-02-01")  # use GKG housing from Feb 2015
end   = pd.Timestamp("2025-06-01")                # adjust if needed


def _read_monthly(csv_path):
    """Read a monthly CSV, accept lower-case headers, coerce types, one row per Month."""
    frame = pd.read_csv(csv_path)
    lower_to_canonical = {"month": "Month", "docs": "Docs", "avg_tone": "AvgTone"}
    present = {low: canon for low, canon in lower_to_canonical.items() if low in frame.columns}
    if present:
        frame = frame.rename(columns=present)
    frame["Month"] = pd.to_datetime(frame["Month"], errors="coerce")
    for col in ("Docs", "AvgTone"):
        frame[col] = pd.to_numeric(frame[col], errors="coerce")
    # Drop unparsable months, then keep the last row seen for each month.
    return (frame.dropna(subset=["Month"])
                 .sort_values("Month")
                 .drop_duplicates(subset="Month", keep="last"))


# ---------- load ECON-only events, clipped to the Events window ----------
ev = _read_monthly(EVENTS_PATH)
ev = ev[ev["Month"].between(start, cutover_last_events)].copy()

# ---------- load GKG master (housing filtered), clipped to Feb 2015 – Jun 2025 ----------
gkg = _read_monthly(GKG_MASTER)
gkg = gkg[gkg["Month"].between(cutover_first_gkg, end)].copy()

# ---------- build full index & merge wide ----------
full_idx = pd.date_range(start, end, freq="MS")

ev_w  = ev.set_index("Month").rename(columns={"Docs":"Docs_Events_Econ", "AvgTone":"AvgTone_Events_Econ"})
gkg_w = gkg.set_index("Month").rename(columns={"Docs":"Docs_GKG",        "AvgTone":"AvgTone_GKG"})

wide = pd.DataFrame(index=full_idx)
wide = wide.join(ev_w[["Docs_Events_Econ","AvgTone_Events_Econ"]], how="left")
wide = wide.join(gkg_w[["Docs_GKG","AvgTone_GKG"]], how="left")
wide = wide.rename_axis("Month").reset_index()

# ---------- stitched single series (ECON Events -> GKG) ----------
# Months up to and including the cutover take the Events columns, later months
# take the GKG columns.
use_events = wide["Month"] <= cutover_last_events
wide["AvgTone_Stitched"] = np.where(use_events, wide["AvgTone_Events_Econ"], wide["AvgTone_GKG"])
wide["Docs_Stitched"]    = np.where(use_events, wide["Docs_Events_Econ"], wide["Docs_GKG"])
wide["Source"]           = np.where(use_events, "Events_v1_0_ECON", "GKG_v2_0_Housing")

# ---------- save ----------
out_wide = OUT_DIR / "gdelt_tone_2005_2025_wide_ECONxGKG.csv"
out_st   = OUT_DIR / "gdelt_tone_2005_2025_stitched_ECONxGKG.csv"

wide.to_csv(out_wide, index=False, date_format="%Y-%m-%d")
stitched_cols = ["Month","AvgTone_Stitched","Docs_Stitched","Source"]
wide[stitched_cols].to_csv(out_st, index=False, date_format="%Y-%m-%d")

print("Saved:")
for written in (out_wide, out_st):
    print(" ", written)

# quick peek
print("\nPreview (tail):")
print(wide.tail(6).to_string(index=False))


Mounted at /content/drive
Saved:
  /content/drive/MyDrive/msc_project/gdelt/gdelt_tone_2005_2025_wide_ECONxGKG.csv
  /content/drive/MyDrive/msc_project/gdelt/gdelt_tone_2005_2025_stitched_ECONxGKG.csv

Preview (tail):
     Month  Docs_Events_Econ  AvgTone_Events_Econ  Docs_GKG  AvgTone_GKG  AvgTone_Stitched  Docs_Stitched           Source
2025-01-01               NaN                  NaN   14180.0    -1.096796         -1.096796        14180.0 GKG_v2_0_Housing
2025-02-01               NaN                  NaN   16683.0    -0.884768         -0.884768        16683.0 GKG_v2_0_Housing
2025-03-01               NaN                  NaN   17850.0    -0.767261         -0.767261        17850.0 GKG_v2_0_Housing
2025-04-01               NaN                  NaN   14902.0    -1.109188         -1.109188        14902.0 GKG_v2_0_Housing
2025-05-01               NaN                  NaN   18253.0    -0.360499         -0.360499        18253.0 GKG_v2_0_Housing
2025-06-01               NaN                

Merge with 2018, 2019, 2020, 2021

In [None]:
# === Update stitched series with 2019, 2020, 2021 (now present in /gdelt) ===
!pip -q install pandas

from google.colab import drive
drive.mount("/content/drive")

import pandas as pd
from pathlib import Path

BASE_DIR = Path("/content/drive/MyDrive/msc_project/gdelt")

# Stitched inputs in order of preference: ECON×GKG first, plain stitched second.
_stitched_candidates = (
    BASE_DIR / "gdelt_tone_2005_2025_stitched_ECONxGKG.csv",
    BASE_DIR / "gdelt_tone_2005_2025_stitched.csv",
)
# First candidate that exists on disk, or None when neither does.
STITCHED = next((path for path in _stitched_candidates if path.exists()), None)
if STITCHED is None:
    raise FileNotFoundError("No stitched file found in: " + str(BASE_DIR))

print("Using stitched:", STITCHED.name)

# Years whose yearly CSVs replace the matching months of the stitched series.
YEARS = [2019, 2020, 2021]

def find_year_file(year:int):
    """Locate the yearly housing CSV for `year` under BASE_DIR.

    Checks BASE_DIR itself, then BASE_DIR/gdelt_run_<year>/, and finally takes
    the first match of a recursive search below BASE_DIR.

    Returns:
        Path of the file, or None (after printing a notice) when no file is found.
    """
    filename = f"gdelt_uk_housing_monthly_{year}.csv"
    preferred = (
        BASE_DIR / filename,
        BASE_DIR / f"gdelt_run_{year}" / filename,
    )
    found = next((loc for loc in preferred if loc.exists()), None)
    if found is None:
        # fallback: search anywhere under BASE_DIR
        found = next(iter(BASE_DIR.rglob(filename)), None)
    if found is None:
        print(f"!! Could not find a yearly CSV for {year}")
    return found

def load_year(year:int) -> pd.DataFrame:
    """Load one yearly housing CSV as a Month / AvgTone / Docs table.

    Lower-case headers (month, docs, avg_tone) are renamed to the canonical
    spelling, values are coerced to datetime/numeric, and only rows whose
    Month falls in `year` are kept, one per month (last occurrence wins).

    Returns:
        DataFrame with columns Month, AvgTone, Docs sorted by Month; an empty
        frame with those columns when no file exists for `year`.
    """
    p = find_year_file(year)
    if p is None:
        return pd.DataFrame(columns=["Month","AvgTone","Docs"])

    raw = pd.read_csv(p)
    raw = raw.rename(columns={"month":"Month","docs":"Docs","avg_tone":"AvgTone"})
    raw["Month"] = pd.to_datetime(raw["Month"], errors="coerce")
    for col in ("AvgTone", "Docs"):
        raw[col] = pd.to_numeric(raw.get(col), errors="coerce")

    in_year = raw["Month"].dt.year.eq(year)
    df = raw[in_year].dropna(subset=["Month"])
    df = df.sort_values("Month").drop_duplicates(subset="Month", keep="last")
    print(f"Loaded {year}: {p.relative_to(BASE_DIR)} → {len(df)} rows")
    return df[["Month","AvgTone","Docs"]]

# Load stitched base: the stitched CSV chosen above, validated and indexed by Month
base = pd.read_csv(STITCHED)
# Accept a lower-case "month" header
if "Month" not in base.columns:
    base.rename(columns={"month":"Month"}, inplace=True)
# Fail early if the file does not have the stitched schema
for c in ["AvgTone_Stitched","Docs_Stitched","Source"]:
    if c not in base.columns:
        raise ValueError(f"Expected column '{c}' in {STITCHED.name}")

# Coerce types; unparsable values become NaT/NaN
base["Month"]            = pd.to_datetime(base["Month"], errors="coerce")
base["AvgTone_Stitched"] = pd.to_numeric(base["AvgTone_Stitched"], errors="coerce")
base["Docs_Stitched"]    = pd.to_numeric(base["Docs_Stitched"], errors="coerce")
# One row per month (last occurrence wins), indexed by Month for the overwrite below
base = (base.dropna(subset=["Month"])
             .sort_values("Month")
             .drop_duplicates(subset="Month", keep="last")
             .set_index("Month"))

# Load every year listed in YEARS and combine into one Month-indexed frame
parts = [load_year(y) for y in YEARS]
new_df = pd.concat(parts, ignore_index=True).dropna(subset=["Month"]).drop_duplicates(subset="Month", keep="last").set_index("Month")

if new_df.empty:
    print("\nNo yearly rows found; stitched file unchanged.")
else:
    # Re-shape the yearly rows into the stitched schema
    upd = pd.DataFrame(index=new_df.index)
    upd["AvgTone_Stitched"] = pd.to_numeric(new_df["AvgTone"], errors="coerce")
    upd["Docs_Stitched"]    = pd.to_numeric(new_df["Docs"], errors="coerce")
    upd["Source"]           = "GKG_v2_0_Housing"

    # Only months already present in the stitched file are replaced; months
    # that exist only in the yearly files are not appended.
    to_overwrite = base.index.intersection(upd.index)
    # .values makes the assignment positional (rows in to_overwrite order)
    base.loc[to_overwrite, ["AvgTone_Stitched","Docs_Stitched","Source"]] = \
        upd.loc[to_overwrite, ["AvgTone_Stitched","Docs_Stitched","Source"]].values

    # Report the replaced months, listing at most the first 24
    print(f"\nOverwrote {len(to_overwrite)} months:",
          ", ".join(pd.Index(to_overwrite).strftime("%Y-%m").tolist()[:24]) + (" ..." if len(to_overwrite) > 24 else ""))

# Save updated stitched next to the input, with "_UPDATED" appended to its stem
# (written even when nothing was overwritten)
OUT_PATH = BASE_DIR / (STITCHED.stem + "_UPDATED.csv")
base.reset_index().to_csv(OUT_PATH, index=False, date_format="%Y-%m-%d")
print("\nSaved:", OUT_PATH)

# Quick peek at the rows whose year is listed in YEARS (last 12 shown)
peek = base.loc[base.index.year.isin(YEARS)].sort_index()
print("\nUpdated span preview (tail):")
print(peek.tail(12))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using stitched: gdelt_tone_2005_2025_stitched_ECONxGKG.csv
Loaded 2019: gdelt_uk_housing_monthly_2019.csv → 12 rows
Loaded 2020: gdelt_uk_housing_monthly_2020.csv → 12 rows
Loaded 2021: gdelt_uk_housing_monthly_2021.csv → 12 rows

Overwrote 36 months: 2019-01, 2019-02, 2019-03, 2019-04, 2019-05, 2019-06, 2019-07, 2019-08, 2019-09, 2019-10, 2019-11, 2019-12, 2020-01, 2020-02, 2020-03, 2020-04, 2020-05, 2020-06, 2020-07, 2020-08, 2020-09, 2020-10, 2020-11, 2020-12 ...

Saved: /content/drive/MyDrive/msc_project/gdelt/gdelt_tone_2005_2025_stitched_ECONxGKG_UPDATED.csv

Updated span preview (tail):
            AvgTone_Stitched  Docs_Stitched            Source
Month                                                        
2021-01-01         -1.772631        16649.0  GKG_v2_0_Housing
2021-02-01         -1.397794        13488.0  GKG_v2_0_Housing
2021-03-01         -1.

In [None]:
# === Merge 2018 into the ECON×GKG UPDATED stitched file ===
!pip -q install pandas

from google.colab import drive
drive.mount("/content/drive")

import pandas as pd
from pathlib import Path

BASE = Path("/content/drive/MyDrive/msc_project/gdelt")
YEAR = 2018

# Prefer to edit the UPDATED stitched file; fall back to base if needed
CAND_IN = [
    BASE / "gdelt_tone_2005_2025_stitched_ECONxGKG_UPDATED.csv",
    BASE / "gdelt_tone_2005_2025_stitched_ECONxGKG.csv",
]
for p in CAND_IN:
    if p.exists():
        STITCHED = p
        break
else:
    raise FileNotFoundError("Could not find stitched ECONxGKG file in " + str(BASE))

print("Editing stitched file:", STITCHED.name)

# --- load stitched (base) ---
s = pd.read_csv(STITCHED)
if "Month" not in s.columns:
    s.rename(columns={"month":"Month"}, inplace=True)

# enforce schema + types
needed_cols = ["Month","AvgTone_Stitched","Docs_Stitched","Source"]
missing = [c for c in needed_cols if c not in s.columns]
if missing:
    raise ValueError(f"Stitched file missing columns: {missing}")

s["Month"]            = pd.to_datetime(s["Month"], errors="coerce")
s["AvgTone_Stitched"] = pd.to_numeric(s["AvgTone_Stitched"], errors="coerce")
s["Docs_Stitched"]    = pd.to_numeric(s["Docs_Stitched"], errors="coerce")
s = (s.dropna(subset=["Month"])
       .sort_values("Month")
       .drop_duplicates(subset="Month", keep="last")
       .set_index("Month"))

# --- load 2018 year file ---
year_path_opts = [
    BASE / f"gdelt_uk_housing_monthly_{YEAR}.csv",
    BASE / f"gdelt_run_{YEAR}" / f"gdelt_uk_housing_monthly_{YEAR}.csv",
]
for yp in year_path_opts:
    if yp.exists():
        YEAR_FILE = yp
        break
else:
    raise FileNotFoundError(f"Could not find gdelt_uk_housing_monthly_{YEAR}.csv under {BASE}")

y = pd.read_csv(YEAR_FILE).rename(columns={"month":"Month","docs":"Docs","avg_tone":"AvgTone"})
y["Month"]   = pd.to_datetime(y["Month"], errors="coerce")
y["AvgTone"] = pd.to_numeric(y.get("AvgTone"), errors="coerce")
y["Docs"]    = pd.to_numeric(y.get("Docs"), errors="coerce")

y = (y[y["Month"].dt.year.eq(YEAR)]
       .dropna(subset=["Month"])
       .sort_values("Month")
       .drop_duplicates(subset="Month", keep="last")
       .set_index("Month"))

if y.empty:
    raise ValueError(f"No rows for {YEAR} in {YEAR_FILE.name}")

# --- prepare update frame and overwrite matching months ---
upd = pd.DataFrame(index=y.index)
upd["AvgTone_Stitched"] = y["AvgTone"]
upd["Docs_Stitched"]    = y["Docs"]
upd["Source"]           = "GKG_v2_0_Housing"

to_overwrite = s.index.intersection(upd.index)
s.loc[to_overwrite, ["AvgTone_Stitched","Docs_Stitched","Source"]] = \
    upd.loc[to_overwrite, ["AvgTone_Stitched","Docs_Stitched","Source"]].values

print(f"Overwrote {len(to_overwrite)} months:",
      ", ".join(pd.Index(to_overwrite).strftime("%Y-%m").tolist()))

# --- save back to UPDATED file (overwrite-in-place if it exists) ---
OUT = BASE / "gdelt_tone_2005_2025_stitched_ECONxGKG_UPDATED.csv"
s.reset_index().to_csv(OUT, index=False, date_format="%Y-%m-%d")
print("Saved:", OUT)

# quick peek of 2018 after merge
print("\n2018 preview:")
print(s.loc[s.index.year.eq(YEAR)][["AvgTone_Stitched","Docs_Stitched","Source"]]
        .sort_index().to_string())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Editing stitched file: gdelt_tone_2005_2025_stitched_ECONxGKG_UPDATED.csv
Overwrote 12 months: 2018-01, 2018-02, 2018-03, 2018-04, 2018-05, 2018-06, 2018-07, 2018-08, 2018-09, 2018-10, 2018-11, 2018-12
Saved: /content/drive/MyDrive/msc_project/gdelt/gdelt_tone_2005_2025_stitched_ECONxGKG_UPDATED.csv

2018 preview:


AttributeError: 'Index' object has no attribute 'eq'

In [None]:
import pandas as pd
from pathlib import Path

BASE = Path("/content/drive/MyDrive/msc_project/gdelt")
PATH = BASE / "gdelt_tone_2005_2025_stitched_ECONxGKG_UPDATED.csv"

# Re-read the saved UPDATED file with Month parsed as dates and used as index.
s = pd.read_csv(PATH, parse_dates=["Month"]).set_index("Month")

cols = ["AvgTone_Stitched","Docs_Stitched","Source"]

# 2018 preview: printed unconditionally, even if the selection is empty
mask_2018 = s.index.year == 2018
print("2018 preview:")
print(s.loc[mask_2018, cols].sort_index().to_string())

# (Optional) same check for 2019–2021, with a placeholder line for any year
# that has no rows in the file
for yr in (2019, 2020, 2021):
    m = s.index.year == yr
    n_months = int(m.sum())
    print(f"\n{yr} ({n_months} months):")
    if n_months == 0:
        print("No rows in stitched file for this year yet.")
        continue
    print(s.loc[m, cols].sort_index().to_string())


2018 preview:
            AvgTone_Stitched  Docs_Stitched            Source
Month                                                        
2018-01-01         -1.257888        23140.0  GKG_v2_0_Housing
2018-02-01         -1.336428        21142.0  GKG_v2_0_Housing
2018-03-01         -1.652378        30247.0  GKG_v2_0_Housing
2018-04-01         -1.951460        30498.0  GKG_v2_0_Housing
2018-05-01         -1.159840        27642.0  GKG_v2_0_Housing
2018-06-01         -1.025702        26249.0  GKG_v2_0_Housing
2018-07-01         -1.313607        26933.0  GKG_v2_0_Housing
2018-08-01         -1.576400        19712.0  GKG_v2_0_Housing
2018-09-01         -1.553960        18445.0  GKG_v2_0_Housing
2018-10-01         -1.314133        20027.0  GKG_v2_0_Housing
2018-11-01         -1.401884        20941.0  GKG_v2_0_Housing
2018-12-01         -1.599224        18113.0  GKG_v2_0_Housing

2019 (12 months):
            AvgTone_Stitched  Docs_Stitched            Source
Month                                