2015

In [None]:

!pip -q install pandas requests

# ================== CONFIG (set this per run) ==================
# Year to process; year_bounds(YEAR) turns it into the timestamp window.
YEAR = 2015

# When True, build_gkg_urls keeps only files whose minute field is "00".
HOURLY_ONLY = True
# Write the monthly CSV + state.json every N files of the main loop.
CHECKPOINT_EVERY = 200
# Print a progress/ETA line every N files of the main loop.
PROGRESS_EVERY   = 100
# Cap on files handled in one run; None means "all remaining files".
MAX_FILES = None

# Filters (same as before)
# Both patterns are compiled with re.I further down and applied with
# Series.str.contains, i.e. as unanchored substring searches.
UK_REGEX = r'#(?:GBR|GB|UK|United Kingdom|England|Scotland|Wales|Northern Ireland)#'
# NOTE(review): the alternatives have no word boundaries, so e.g. RENT also
# matches inside PARENT or CURRENT -- confirm this breadth is intended.
THEME_PATTERN = r'(?:HOUS|MORTGAG|REMORTGAG|RENT|TENAN|REAL[ _]?ESTATE|PROPERTY|HOME[ _]?PRIC|HOUSE[ _]?PRIC|RIGHTMOVE|ZOOPLA|LANDLORD|BUY[ -]?TO[ -]?LET)'

# GKG v2.1 columns (0-based)
# Positional indices of the four columns read from each tab-separated file.
DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I = 1, 8, 10, 15

# Where to write for this YEAR
from pathlib import Path
OUT_DIR = Path(f"/content/gdelt_run_{YEAR}")
OUT_DIR.mkdir(parents=True, exist_ok=True)
# Monthly summary (month, docs, avg_tone) written by write_year_output.
YEARLY_CSV  = OUT_DIR / f"gdelt_uk_housing_monthly_{YEAR}.csv"
# Resume checkpoint written next to the CSV by write_year_output.
STATE_PATH  = OUT_DIR / "state.json"

# Listing that build_gkg_urls scans for *.gkg.csv.zip URLs.
MASTER_URL  = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"


import re, io, zipfile, math, time, json, logging, requests, pandas as pd
from collections import defaultdict
from datetime import datetime
logging.getLogger("urllib3").setLevel(logging.ERROR)

def year_bounds(y:int):
    """Return the (start, end) 14-digit timestamp strings that bound year ``y``.

    2015 and 2025 get shortened windows; every other year spans
    January 1st 00:00:00 through December 31st 23:59:59.
    """
    if y == 2015:
        # GKG v2 starts mid-Feb 2015
        bounds = ("20150218000000", "20151231235959")
    elif y == 2025:
        bounds = ("20250101000000", "20250630235959")
    else:
        bounds = (f"{y}0101000000", f"{y}1231235959")
    return bounds

START_TS, END_TS = year_bounds(YEAR)

def session():
    """Create a requests.Session with a fixed User-Agent and retrying adapters.

    The same Retry policy (5 attempts, 0.5 backoff factor, retry on
    429/500/502/503/504) is attached to both the http:// and https:// mounts.
    """
    from urllib3.util.retry import Retry
    from requests.adapters import HTTPAdapter

    client = requests.Session()
    client.headers.update({"User-Agent": "gdelt-colab/1.0"})
    policy = Retry(total=5, backoff_factor=0.5, status_forcelist=[429,500,502,503,504])
    for prefix in ("http://", "https://"):
        client.mount(prefix, HTTPAdapter(max_retries=policy))
    return client

def get(url, timeout=60, s=None):
    """GET ``url`` and return the response, raising on an HTTP error status.

    Args:
        url: Address to fetch; an ``https://`` prefix is rewritten to ``http://``.
        timeout: Timeout passed through to the session's ``get``.
        s: Session to use; when falsy a new one is built with ``session()``.
    """
    client = s or session()
    secure = "https://"
    if url.startswith(secure):
        # force http to dodge cert issues
        url = "http://" + url[len(secure):]
    response = client.get(url, timeout=timeout)
    response.raise_for_status()
    return response

def build_gkg_urls():
    """Return the sorted list of GKG zip URLs inside [START_TS, END_TS].

    Downloads the listing at MASTER_URL, keeps three-field lines whose third
    field ends in ``/gdeltv2/<14 digits>.gkg.csv.zip``, and, when HOURLY_ONLY
    is set, drops every file whose minute digits are not "00".
    """
    listing = get(MASTER_URL, timeout=90).text
    selected = []
    for row in listing.splitlines():
        fields = row.split()
        if len(fields) != 3:
            continue
        candidate = fields[2]
        hit = re.search(r'/gdeltv2/(\d{14})\.gkg\.csv\.zip$', candidate)
        if not hit:
            continue
        stamp = hit.group(1)
        if not (START_TS <= stamp <= END_TS):
            continue
        # one per hour
        if HOURLY_ONLY and stamp[10:12] != "00":
            continue
        selected.append(candidate.replace("https://", "http://"))
    return sorted(selected)

def month_from_dateint(dateint):
    """Convert a YYYYMMDDHHMMSS value (str or int) into a "YYYY-MM" key.

    Raises ValueError when the value does not parse with that format.
    """
    parsed = datetime.strptime(str(dateint), "%Y%m%d%H%M%S")
    return "{:04d}-{:02d}".format(parsed.year, parsed.month)

def doc_tone(v2tone):
    """Return the first comma-separated field of ``v2tone`` as a float.

    Anything that cannot be converted yields ``math.nan``.
    """
    try:
        # first field is doc tone
        leading, _sep, _rest = str(v2tone).partition(",")
        value = float(leading)
    except Exception:
        value = math.nan
    return value

def process_df(df, counts, tone_sum, tone_n, uk_re, theme_re):
    """Fold one file's matching rows into the monthly accumulators.

    A row is kept when its V2LOCS_I column matches ``uk_re`` and its
    V2THEMES_I column matches ``theme_re``. For each kept row the month's
    entry in ``counts`` is incremented; when the tone parses to a number it
    is added to ``tone_sum`` and ``tone_n`` is incremented as well.

    Returns:
        The number of rows kept from ``df``.
    """
    is_uk = df[V2LOCS_I].astype(str).str.contains(uk_re, na=False, regex=True)
    is_theme = df[V2THEMES_I].astype(str).str.contains(theme_re, na=False, regex=True)
    hits = df.loc[is_uk & is_theme, [DATE_I, V2TONE_I]]
    if hits.empty:
        return 0

    month_keys = hits[DATE_I].astype(str).apply(month_from_dateint)
    tone_values = hits[V2TONE_I].apply(doc_tone)
    for key, tone in zip(month_keys, tone_values):
        counts[key] += 1
        if math.isnan(tone):
            continue
        tone_sum[key] += tone
        tone_n[key] += 1
    return len(hits)

def write_year_output(processed_count, processed_tail, counts, tone_sum, tone_n):
    """Write the monthly CSV for the year and persist the resume checkpoint.

    Args:
        processed_count: Number of files processed so far (stored in the state).
        processed_tail: List of processed URLs; only the last 20,000 are stored.
        counts: Mapping of "YYYY-MM" month key to kept-document count.
        tone_sum: Mapping of month key to the sum of parsed document tones.
        tone_n: Mapping of month key to the number of tones in ``tone_sum``.

    Side effects:
        Overwrites YEARLY_CSV (columns month, docs, avg_tone) and STATE_PATH.
    """
    rows = []
    for m in sorted(counts.keys()):
        n = counts[m]
        # No parsable tone for the month -> empty avg_tone cell in the CSV.
        avg = (tone_sum[m]/tone_n[m]) if tone_n[m] else None
        rows.append({"month": f"{m}-01", "docs": n, "avg_tone": avg})
    monthly = pd.DataFrame(rows, columns=["month","docs","avg_tone"]).sort_values("month")
    monthly.to_csv(YEARLY_CSV, index=False)

    state = {
        "processed_count": processed_count,
        "processed_urls_tail": processed_tail[-20000:],
        "counts": counts, "tone_sum": tone_sum, "tone_n": tone_n
    }
    # Write to a sibling temp file and swap it in: an interrupted run can then
    # never leave a truncated state.json that would break the next resume, and
    # the `with` block guarantees the handle is closed.
    tmp_path = STATE_PATH.with_name(STATE_PATH.name + ".tmp")
    with open(tmp_path, "w") as fh:
        json.dump(state, fh)
    tmp_path.replace(STATE_PATH)
    print(f"Saved → {YEARLY_CSV}")

# ---------- plan + resume for this YEAR ----------
# Build the URL plan, restore the checkpoint if one exists, then stream every
# remaining archive through process_df with periodic checkpoints.
s = session()
urls_all = build_gkg_urls()
print(f"[{YEAR}] planned files: {len(urls_all):,}")

if STATE_PATH.exists():
    # Context manager so the checkpoint file handle is closed after loading.
    with open(STATE_PATH) as state_fh:
        state = json.load(state_fh)
    processed_count      = state.get("processed_count", 0)
    processed_urls_tail  = state.get("processed_urls_tail", [])
    counts   = defaultdict(int,   state.get("counts", {}))
    tone_sum = defaultdict(float, state.get("tone_sum", {}))
    tone_n   = defaultdict(int,   state.get("tone_n", {}))
    print(f"Resuming: {processed_count} files done previously.")
else:
    processed_count, processed_urls_tail = 0, []
    counts, tone_sum, tone_n = defaultdict(int), defaultdict(float), defaultdict(int)

# NOTE(review): only the last 20,000 processed URLs are remembered, so a plan
# with more files than that would re-process (and double-count) older files
# after a resume.
already = set(processed_urls_tail)
todo = [u for u in urls_all if u not in already]
# MAX_FILES=None -> no slicing; process all remaining files
if MAX_FILES is not None:
    todo = todo[:MAX_FILES]
print(f"[{YEAR}] remaining this run: {len(todo):,}")

uk_re = re.compile(UK_REGEX, re.I)
theme_re = re.compile(THEME_PATTERN, re.I)

# ---------------- main loop ----------------
t0 = time.time()
docs_seen = 0
failed = 0  # archives that could not be fetched or parsed in this session

for i, url in enumerate(todo, 1):
    try:
        payload = get(url, timeout=90, s=s).content
        # Context managers close the archive and its member even on errors.
        with zipfile.ZipFile(io.BytesIO(payload)) as z:
            name = z.namelist()[0]
            with z.open(name) as member:
                df = pd.read_csv(
                    member,
                    sep="\t", header=None, quoting=3, low_memory=False,
                    usecols=[DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I],
                    dtype={DATE_I:str, V2THEMES_I:str, V2LOCS_I:str, V2TONE_I:str}
                )
        kept = process_df(df, counts, tone_sum, tone_n, uk_re, theme_re)
    except Exception as exc:
        # Best effort: one bad archive must not abort the run, but it should
        # not vanish silently either. Failed URLs are not recorded as
        # processed, so a later resume will try them again.
        failed += 1
        if failed <= 10:
            print(f"[{YEAR}] skipped {url}: {type(exc).__name__}: {exc}")
        elif failed == 11:
            print(f"[{YEAR}] further skip messages suppressed")
    else:
        docs_seen += kept
        processed_count += 1
        processed_urls_tail.append(url)
        if len(processed_urls_tail) > 20000:
            processed_urls_tail = processed_urls_tail[-20000:]

    if i % PROGRESS_EVERY == 0:
        elapsed = max(1e-6, time.time()-t0)
        rate = i / elapsed
        eta_min = (len(todo)-i) / max(1e-6, rate) / 60
        print(f"[{YEAR}] {i}/{len(todo)} | {rate:.2f} files/s | ETA ≈ {eta_min:.1f} min | docs_kept={docs_seen}")

    if i % CHECKPOINT_EVERY == 0:
        write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)

# final write for the year
write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)
# Report successes and failures separately; len(todo) alone counted skipped files as processed.
print(f"[{YEAR}] Done. This session processed: {len(todo) - failed} | Failed/skipped: {failed} | Total processed: {processed_count} | Docs kept: {docs_seen}")


[2015] planned files: 7,536
Resuming: 997 files done previously.
[2015] remaining this run: 6,539
[2015] 100/6539 | 1.02 files/s | ETA ≈ 105.7 min | docs_kept=1215
[2015] 200/6539 | 0.95 files/s | ETA ≈ 111.5 min | docs_kept=2911
Saved → /content/gdelt_run_2015/gdelt_uk_housing_monthly_2015.csv
[2015] 300/6539 | 0.98 files/s | ETA ≈ 106.3 min | docs_kept=4174
[2015] 400/6539 | 0.94 files/s | ETA ≈ 109.3 min | docs_kept=6214
Saved → /content/gdelt_run_2015/gdelt_uk_housing_monthly_2015.csv
[2015] 500/6539 | 0.95 files/s | ETA ≈ 105.5 min | docs_kept=7456
[2015] 600/6539 | 0.94 files/s | ETA ≈ 105.5 min | docs_kept=9238
Saved → /content/gdelt_run_2015/gdelt_uk_housing_monthly_2015.csv
[2015] 700/6539 | 0.91 files/s | ETA ≈ 106.4 min | docs_kept=12145
[2015] 800/6539 | 0.92 files/s | ETA ≈ 104.0 min | docs_kept=13953
Saved → /content/gdelt_run_2015/gdelt_uk_housing_monthly_2015.csv
[2015] 900/6539 | 0.90 files/s | ETA ≈ 104.3 min | docs_kept=17574
[2015] 1000/6539 | 0.91 files/s | ETA ≈ 1

In [None]:
import pandas as pd
YEAR = 2015  # change per year you just processed

# Load the monthly summary written by the processing cell for this YEAR.
df = pd.read_csv(f"/content/gdelt_run_{YEAR}/gdelt_uk_housing_monthly_{YEAR}.csv",
                 parse_dates=["month"]).sort_values("month")

# Reindex to full year so you see empty months explicitly
full = (df.set_index("month")
          .reindex(pd.date_range(f"{YEAR}-01-01", f"{YEAR}-12-01", freq="MS"))
          .rename_axis("month")
          .reset_index())

# Reindexing inserts NaN rows, which turns the integer "docs" column into
# floats (printed as "1,594.000"). The nullable Int64 dtype keeps the counts
# as integers while still showing missing months.
full["docs"] = pd.to_numeric(full["docs"], errors="coerce").round().astype("Int64")

pd.set_option("display.float_format", lambda x: f"{x:,.3f}")
print(full.rename(columns={"month":"Month","docs":"Docs","avg_tone":"AvgTone"})
          .to_string(index=False))


     Month       Docs  AvgTone
2015-01-01        NaN      NaN
2015-02-01  1,594.000   -0.542
2015-03-01 13,219.000   -0.226
2015-04-01 12,775.000   -0.327
2015-05-01 17,417.000   -0.999
2015-06-01 18,734.000   -1.570
2015-07-01 23,645.000   -1.352
2015-08-01 18,591.000   -1.222
2015-09-01 20,365.000   -1.225
2015-10-01 18,921.000   -0.969
2015-11-01 22,705.000   -1.871
2015-12-01 22,439.000   -1.730


In [None]:
# Merge this YEAR's monthly CSV into the running master CSV under BASE_DIR.
from google.colab import drive
drive.mount('/content/drive')

from pathlib import Path
import pandas as pd

BASE_DIR = Path("/content")  # Corrected path to where the files are located
YEAR = 2015

year_file   = BASE_DIR / f"gdelt_run_{YEAR}/gdelt_uk_housing_monthly_{YEAR}.csv"
master_file = BASE_DIR / "gdelt_uk_housing_monthly_STITCHING.csv"  # running master

# Monthly rows for the year just processed, in chronological order.
y = pd.read_csv(year_file, parse_dates=["month"]).sort_values("month")

if master_file.exists():
    m = pd.read_csv(master_file, parse_dates=["month"])
    # The year's rows are concatenated last, so keep="last" lets them replace
    # any master rows that already exist for the same month.
    m = (pd.concat([m, y], ignore_index=True)
           .drop_duplicates(subset="month", keep="last")
           .sort_values("month")
           .reset_index(drop=True))
else:
    # No master yet: this year's table becomes the master.
    m = y

m.to_csv(master_file, index=False)
print("Saved master:", master_file, m.shape)
print(m.tail(12).to_string(index=False))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved master: /content/gdelt_uk_housing_monthly_STITCHING.csv (11, 3)
     month  docs  avg_tone
2015-02-01  1594    -0.542
2015-03-01 13219    -0.226
2015-04-01 12775    -0.327
2015-05-01 17417    -0.999
2015-06-01 18734    -1.570
2015-07-01 23645    -1.352
2015-08-01 18591    -1.222
2015-09-01 20365    -1.225
2015-10-01 18921    -0.969
2015-11-01 22705    -1.871
2015-12-01 22439    -1.730


In [None]:
from pathlib import Path
import shutil, os

# Relocate the master CSV from the Colab scratch disk into the Drive workspace.
# 1) Confirm mount
print("Drive mounted:", os.path.ismount("/content/drive"))

# 2) Your Drive workspace
BASE_DIR = Path("/content/drive/MyDrive/msc_project/gdelt")
BASE_DIR.mkdir(parents=True, exist_ok=True)

# 3) Move the master file from /content -> Drive (if it exists there)
src = Path("/content/gdelt_uk_housing_monthly_STITCHING.csv")
dst = BASE_DIR / src.name
if src.exists():
    # shutil.move is given plain strings rather than Path objects.
    shutil.move(str(src), str(dst))
    print("Moved master to:", dst)
else:
    print("No master file found in /content; nothing to move.")

# 4) Always use this path for the master file
MASTER_FILE = BASE_DIR / "gdelt_uk_housing_monthly_STITCHING.csv"
print("Master path to use:", MASTER_FILE)

# 5) Quick listing
!ls -lh "/content/drive/MyDrive/msc_project/gdelt"


Drive mounted: True
Moved master to: /content/drive/MyDrive/msc_project/gdelt/gdelt_uk_housing_monthly_STITCHING.csv
Master path to use: /content/drive/MyDrive/msc_project/gdelt/gdelt_uk_housing_monthly_STITCHING.csv
total 512
-rw------- 1 root root 423 Oct  5 13:40 gdelt_uk_housing_monthly_STITCHING.csv


2016

In [None]:
# ===================== GDELT GKG v2.1 — UK Housing — YEAR 2016 =====================
# Runs in chunks with resume; writes to Google Drive; auto-appends to master.

!pip -q install pandas requests

# --------- Mount Drive & set project folder ---------
from google.colab import drive
drive.mount('/content/drive')

from pathlib import Path
BASE_DIR = Path("/content/drive/MyDrive/msc_project/gdelt")  # <- change if you prefer
BASE_DIR.mkdir(parents=True, exist_ok=True)

# ================== CONFIG (2016) ==================
YEAR = 2016          # <- this cell is for 2016
MAX_FILES = None     # <- chunk size per run; set None to process ALL remaining files this year
HOURLY_ONLY = True   # take top-of-hour files only (lighter than every 15 min)
CHECKPOINT_EVERY = 200   # write CSV + state.json every N files of the main loop
PROGRESS_EVERY   = 100   # print a progress/ETA line every N files

# Filters
# Both patterns are compiled with re.I further down and applied with
# Series.str.contains, i.e. as unanchored substring searches.
UK_REGEX = r'#(?:GBR|GB|UK|United Kingdom|England|Scotland|Wales|Northern Ireland)#'
# NOTE(review): the alternatives have no word boundaries, so e.g. RENT also
# matches inside PARENT or CURRENT -- confirm this breadth is intended.
THEME_PATTERN = r'(?:HOUS|MORTGAG|REMORTGAG|RENT|TENAN|REAL[ _]?ESTATE|PROPERTY|HOME[ _]?PRIC|HOUSE[ _]?PRIC|RIGHTMOVE|ZOOPLA|LANDLORD|BUY[ -]?TO[ -]?LET)'

# GKG v2.1 columns (0-based indices)
DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I = 1, 8, 10, 15

# Paths (on Drive)
OUT_DIR = BASE_DIR / f"gdelt_run_{YEAR}"
OUT_DIR.mkdir(parents=True, exist_ok=True)
YEARLY_CSV = OUT_DIR / f"gdelt_uk_housing_monthly_{YEAR}.csv"
STATE_PATH = OUT_DIR / "state.json"
MASTER_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"

# Year bounds (full calendar year), derived from YEAR so that changing YEAR
# can never leave the window pointing at a different year than the output
# paths. For YEAR = 2016 the values are identical to the former literals.
START_TS, END_TS = f"{YEAR}0101000000", f"{YEAR}1231235959"

# ----------------- Imports & setup -----------------
import re, io, zipfile, math, time, json, logging, requests, pandas as pd
from collections import defaultdict
from datetime import datetime
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

logging.getLogger("urllib3").setLevel(logging.ERROR)

def session():
    """Create a requests.Session with a fixed User-Agent and retrying adapters.

    One Retry policy (5 attempts, 0.5 backoff factor, retry on
    429/500/502/503/504) is attached to both the http:// and https:// mounts.
    """
    client = requests.Session()
    client.headers.update({"User-Agent": "gdelt-colab/1.0"})
    policy = Retry(total=5, backoff_factor=0.5, status_forcelist=[429,500,502,503,504])
    for prefix in ("http://", "https://"):
        client.mount(prefix, HTTPAdapter(max_retries=policy))
    return client

def get(url, timeout=60, s=None):
    """GET ``url`` and return the response, raising on an HTTP error status.

    Args:
        url: Address to fetch; an ``https://`` prefix is rewritten to ``http://``.
        timeout: Timeout passed through to the session's ``get``.
        s: Session to use; when falsy a new one is built with ``session()``.
    """
    client = s or session()
    secure = "https://"
    if url.startswith(secure):
        # avoid SSL hiccups in Colab
        url = "http://" + url[len(secure):]
    response = client.get(url, timeout=timeout)
    response.raise_for_status()
    return response

def build_gkg_urls():
    """Return the sorted list of GKG zip URLs inside [START_TS, END_TS].

    Downloads the listing at MASTER_URL, keeps three-field lines whose third
    field ends in ``/gdeltv2/<14 digits>.gkg.csv.zip``, and, when HOURLY_ONLY
    is set, drops every file whose minute digits are not "00".
    """
    listing = get(MASTER_URL, timeout=90).text
    selected = []
    for row in listing.splitlines():
        fields = row.split()
        if len(fields) != 3:
            continue
        candidate = fields[2]
        hit = re.search(r'/gdeltv2/(\d{14})\.gkg\.csv\.zip$', candidate)
        if not hit:
            continue
        stamp = hit.group(1)
        if not (START_TS <= stamp <= END_TS):
            continue
        # one per hour
        if HOURLY_ONLY and stamp[10:12] != "00":
            continue
        selected.append(candidate.replace("https://","http://"))
    return sorted(selected)

def month_from_dateint(dateint):
    """Convert a YYYYMMDDHHMMSS value (str or int) into a "YYYY-MM" key.

    Raises ValueError when the value does not parse with that format.
    """
    parsed = datetime.strptime(str(dateint), "%Y%m%d%H%M%S")
    return "{:04d}-{:02d}".format(parsed.year, parsed.month)

def doc_tone(v2tone):
    """Return the first comma-separated field of ``v2tone`` as a float.

    Anything that cannot be converted yields ``math.nan``.
    """
    try:
        # first item is doc tone
        leading, _sep, _rest = str(v2tone).partition(",")
        value = float(leading)
    except Exception:
        value = math.nan
    return value

def process_df(df, counts, tone_sum, tone_n, uk_re, theme_re):
    """Fold one file's matching rows into the monthly accumulators.

    A row is kept when its V2LOCS_I column matches ``uk_re`` and its
    V2THEMES_I column matches ``theme_re``. For each kept row the month's
    entry in ``counts`` is incremented; when the tone parses to a number it
    is added to ``tone_sum`` and ``tone_n`` is incremented as well.

    Returns:
        The number of rows kept from ``df``.
    """
    is_uk = df[V2LOCS_I].astype(str).str.contains(uk_re, na=False, regex=True)
    is_theme = df[V2THEMES_I].astype(str).str.contains(theme_re, na=False, regex=True)
    hits = df.loc[is_uk & is_theme, [DATE_I, V2TONE_I]]
    if hits.empty:
        return 0

    month_keys = hits[DATE_I].astype(str).apply(month_from_dateint)
    tone_values = hits[V2TONE_I].apply(doc_tone)
    for key, tone in zip(month_keys, tone_values):
        counts[key] += 1
        if math.isnan(tone):
            continue
        tone_sum[key] += tone
        tone_n[key] += 1
    return len(hits)

def write_year_output(processed_count, processed_tail, counts, tone_sum, tone_n):
    """Write the monthly CSV for the year and persist the resume checkpoint.

    Args:
        processed_count: Number of files processed so far (stored in the state).
        processed_tail: List of processed URLs; only the last 20,000 are stored.
        counts: Mapping of "YYYY-MM" month key to kept-document count.
        tone_sum: Mapping of month key to the sum of parsed document tones.
        tone_n: Mapping of month key to the number of tones in ``tone_sum``.

    Side effects:
        Overwrites YEARLY_CSV (columns month, docs, avg_tone) and STATE_PATH.
    """
    rows = []
    for m in sorted(counts.keys()):
        n = counts[m]
        # No parsable tone for the month -> empty avg_tone cell in the CSV.
        avg = (tone_sum[m]/tone_n[m]) if tone_n[m] else None
        rows.append({"month": f"{m}-01", "docs": n, "avg_tone": avg})
    monthly = pd.DataFrame(rows, columns=["month","docs","avg_tone"]).sort_values("month")
    monthly.to_csv(YEARLY_CSV, index=False)

    state = {
        "processed_count": processed_count,
        "processed_urls_tail": processed_tail[-20000:],
        "counts": counts, "tone_sum": tone_sum, "tone_n": tone_n
    }
    # Write to a sibling temp file and swap it in: an interrupted run can then
    # never leave a truncated state.json that would break the next resume, and
    # the `with` block guarantees the handle is closed.
    tmp_path = STATE_PATH.with_name(STATE_PATH.name + ".tmp")
    with open(tmp_path, "w") as fh:
        json.dump(state, fh)
    tmp_path.replace(STATE_PATH)
    print(f"Saved → {YEARLY_CSV}")

# ----------------- Plan + resume -----------------
# Build the URL plan, restore the checkpoint if one exists, then stream every
# remaining archive through process_df with periodic checkpoints.
s = session()
urls_all = build_gkg_urls()
print(f"[{YEAR}] planned hourly files: {len(urls_all):,}")

if STATE_PATH.exists():
    # Context manager so the checkpoint file handle is closed after loading.
    with open(STATE_PATH) as state_fh:
        state = json.load(state_fh)
    processed_count     = int(state.get("processed_count", 0))
    processed_urls_tail = state.get("processed_urls_tail", [])
    counts   = defaultdict(int,   state.get("counts", {}))
    tone_sum = defaultdict(float, state.get("tone_sum", {}))
    tone_n   = defaultdict(int,   state.get("tone_n", {}))
    print(f"Resuming: {processed_count:,} files done previously.")
else:
    processed_count, processed_urls_tail = 0, []
    counts, tone_sum, tone_n = defaultdict(int), defaultdict(float), defaultdict(int)

# NOTE(review): only the last 20,000 processed URLs are remembered, so a plan
# with more files than that would re-process (and double-count) older files
# after a resume.
already = set(processed_urls_tail)
todo = [u for u in urls_all if u not in already]
if MAX_FILES is not None:
    todo = todo[:MAX_FILES]
print(f"[{YEAR}] processing this session: {len(todo):,} files")

uk_re = re.compile(UK_REGEX, re.I)
theme_re = re.compile(THEME_PATTERN, re.I)

# ----------------- Main loop -----------------
t0 = time.time()
docs_seen = 0
failed = 0  # archives that could not be fetched or parsed in this session

for i, url in enumerate(todo, 1):
    try:
        payload = get(url, timeout=90, s=s).content
        # Context managers close the archive and its member even on errors.
        with zipfile.ZipFile(io.BytesIO(payload)) as z:
            name = z.namelist()[0]
            with z.open(name) as member:
                df = pd.read_csv(
                    member,
                    sep="\t", header=None, quoting=3, low_memory=False,
                    usecols=[DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I],
                    dtype={DATE_I:str, V2THEMES_I:str, V2LOCS_I:str, V2TONE_I:str}
                )
        kept = process_df(df, counts, tone_sum, tone_n, uk_re, theme_re)
    except Exception as exc:
        # Best effort: one bad archive must not abort the run, but it should
        # not vanish silently either. Failed URLs are not recorded as
        # processed, so a later resume will try them again.
        failed += 1
        if failed <= 10:
            print(f"[{YEAR}] skipped {url}: {type(exc).__name__}: {exc}")
        elif failed == 11:
            print(f"[{YEAR}] further skip messages suppressed")
    else:
        docs_seen += kept
        processed_count += 1
        processed_urls_tail.append(url)
        if len(processed_urls_tail) > 20000:
            processed_urls_tail = processed_urls_tail[-20000:]

    if i % PROGRESS_EVERY == 0:
        elapsed = max(1e-6, time.time()-t0)
        rate = i / elapsed
        eta_min = (len(todo)-i) / max(1e-6, rate) / 60
        print(f"[{YEAR}] {i}/{len(todo)} | {rate:.2f} files/s | ETA≈{eta_min:.1f}m | docs_kept={docs_seen}")

    if i % CHECKPOINT_EVERY == 0:
        write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)

# Final write for the year
write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)
# Report successes and failures separately; len(todo) alone counted skipped files as processed.
print(f"[{YEAR}] Done. This session processed: {len(todo) - failed:,} | Failed/skipped: {failed:,} | Total processed so far: {processed_count:,} | Docs kept: {docs_seen:,}")

# ----------------- Auto-append to Drive master -----------------
# Merge this year's monthly CSV into the master CSV under BASE_DIR. Any error
# is reported and swallowed so a failed merge does not abort the cell.
try:
    master_file = BASE_DIR / "gdelt_uk_housing_monthly_STITCHING.csv"
    y = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")
    if master_file.exists():
        m = pd.read_csv(master_file, parse_dates=["month"])
        # The year's rows are concatenated last, so keep="last" lets them
        # replace any master rows that already exist for the same month.
        m = (pd.concat([m, y], ignore_index=True)
               .drop_duplicates(subset="month", keep="last")
               .sort_values("month")
               .reset_index(drop=True))
    else:
        # No master yet: this year's table becomes the master.
        m = y
    m.to_csv(master_file, index=False)
    print("Appended to master:", master_file, m.shape)
except Exception as e:
    print("Append-to-master skipped:", e)

# ----------------- Pretty print monthly (quick check) -----------------
# Re-read the yearly CSV and print one row per calendar month of YEAR.
try:
    df = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")
    # Reindex onto every month-start of YEAR so missing months appear as empty rows.
    full = (df.set_index("month")
              .reindex(pd.date_range(f"{YEAR}-01-01", f"{YEAR}-12-01", freq="MS"))
              .rename_axis("Month").reset_index())
    # Nullable Int64 keeps counts as integers even when some months are missing.
    full["Docs"]    = pd.to_numeric(full["docs"], errors="coerce").round().astype("Int64")
    full["AvgTone"] = pd.to_numeric(full["avg_tone"], errors="coerce")
    print(full[["Month","Docs","AvgTone"]].to_string(index=False))
except Exception as e:
    # The preview is informational only; report the problem and carry on.
    print("Preview skipped:", e)
# ===================== end of cell =====================


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[2016] planned hourly files: 8,774
[2016] processing this session: 8,774 files
[2016] 100/8774 | 1.01 files/s | ETA≈143.0m | docs_kept=1979
[2016] 200/8774 | 0.81 files/s | ETA≈177.0m | docs_kept=5754
Saved → /content/drive/MyDrive/msc_project/gdelt/gdelt_run_2016/gdelt_uk_housing_monthly_2016.csv
[2016] 300/8774 | 0.81 files/s | ETA≈174.0m | docs_kept=8347
[2016] 400/8774 | 0.79 files/s | ETA≈175.6m | docs_kept=11532
Saved → /content/drive/MyDrive/msc_project/gdelt/gdelt_run_2016/gdelt_uk_housing_monthly_2016.csv
[2016] 500/8774 | 0.77 files/s | ETA≈178.2m | docs_kept=15409
[2016] 600/8774 | 0.78 files/s | ETA≈175.0m | docs_kept=17834
Saved → /content/drive/MyDrive/msc_project/gdelt/gdelt_run_2016/gdelt_uk_housing_monthly_2016.csv
[2016] 700/8774 | 0.76 files/s | ETA≈177.8m | docs_kept=21314
[2016] 800/8774 | 0.76 files/s | ETA≈174.1m | docs_kept=23675
Saved

2017

In [None]:
# ========================= GDELT GKG (UK housing) → monthly (docs + avg tone) =========================
# Runs year-by-year. Set YEAR below (e.g., 2017). Saves to Google Drive and appends to a master CSV.

!pip -q install pandas requests

from google.colab import drive
drive.mount('/content/drive')

from pathlib import Path
import re, io, zipfile, math, time, json, logging, requests, pandas as pd
from collections import defaultdict
from datetime import datetime
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# ------------------------ CONFIG ------------------------
BASE_DIR = Path("/content/drive/MyDrive/msc_project/gdelt")  # change if you like
BASE_DIR.mkdir(parents=True, exist_ok=True)

YEAR = 2017                     # <<< set the year you want to process
# WARNING: while RESET_STATE is True the cell deletes state.json and the yearly
# CSV before planning, so every run starts the year from scratch and the
# resume branch is never taken. Set it to False to resume an interrupted run.
RESET_STATE = True              # <<< set True if you previously ran the wrong year; else False
MAX_FILES = None                # cap files processed this run; None = process all remaining
HOURLY_ONLY = True              # process top-of-hour files only (lighter than every 15 minutes)
CHECKPOINT_EVERY = 200          # write CSV + state.json every N files of the main loop
PROGRESS_EVERY   = 100          # print a progress/ETA line every N files

# Date window from YEAR (dynamic!)
START_TS = f"{YEAR}0101000000"  # YYYY-01-01 00:00:00
END_TS   = f"{YEAR}1231235959"  # YYYY-12-31 23:59:59
# Filters
# Both patterns are compiled with re.I further down and applied with
# Series.str.contains, i.e. as unanchored substring searches.
UK_REGEX = r'#(?:GBR|GB|UK|United Kingdom|England|Scotland|Wales|Northern Ireland)#'
# NOTE(review): the alternatives have no word boundaries, so e.g. RENT also
# matches inside PARENT or CURRENT -- confirm this breadth is intended.
THEME_PATTERN = r'(?:HOUS|MORTGAG|REMORTGAG|RENT|TENAN|REAL[ _]?ESTATE|PROPERTY|HOME[ _]?PRIC|HOUSE[ _]?PRIC|RIGHTMOVE|ZOOPLA|LANDLORD|BUY[ -]?TO[ -]?LET)'

# GKG v2.1 columns (0-based indices)
DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I = 1, 8, 10, 15

# Paths
OUT_DIR = BASE_DIR / f"gdelt_run_{YEAR}"
OUT_DIR.mkdir(parents=True, exist_ok=True)
# Monthly summary (month, docs, avg_tone) written by write_year_output.
YEARLY_CSV = OUT_DIR / f"gdelt_uk_housing_monthly_{YEAR}.csv"
# Resume checkpoint written next to the CSV by write_year_output.
STATE_PATH = OUT_DIR / "state.json"
# Listing that build_gkg_urls scans for *.gkg.csv.zip URLs.
MASTER_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"
# Multi-year CSV that the "Append to master" step merges this year into.
MASTER_CSV = BASE_DIR / "gdelt_uk_housing_monthly_STITCHING.csv"

# ------------------------ Networking helpers ------------------------
logging.getLogger("urllib3").setLevel(logging.ERROR)

def session():
    """Create a requests.Session with a fixed User-Agent and retrying adapters.

    One Retry policy (5 attempts, 0.5 backoff factor, retry on
    429/500/502/503/504) is attached to both the http:// and https:// mounts.
    """
    client = requests.Session()
    client.headers.update({"User-Agent": "gdelt-colab/1.0"})
    policy = Retry(total=5, backoff_factor=0.5, status_forcelist=[429,500,502,503,504])
    for prefix in ("http://", "https://"):
        client.mount(prefix, HTTPAdapter(max_retries=policy))
    return client

def get(url, timeout=60, s=None):
    """GET ``url`` and return the response, raising on an HTTP error status.

    Args:
        url: Address to fetch; an ``https://`` prefix is rewritten to ``http://``.
        timeout: Timeout passed through to the session's ``get``.
        s: Session to use; when falsy a new one is built with ``session()``.
    """
    client = s or session()
    secure = "https://"
    if url.startswith(secure):
        # avoid SSL hiccups in Colab
        url = "http://" + url[len(secure):]
    response = client.get(url, timeout=timeout)
    response.raise_for_status()
    return response

# ------------------------ GDELT plumbing ------------------------
def build_gkg_urls():
    """Return the sorted GKG v2.1 zip URLs whose timestamp lies in [START_TS, END_TS].

    Only three-field lines of the MASTER_URL listing are considered; with
    HOURLY_ONLY set, files whose minute digits are not "00" are dropped.
    """
    listing = get(MASTER_URL, timeout=90).text
    selected = []
    for row in listing.splitlines():
        fields = row.split()
        if len(fields) != 3:
            continue
        candidate = fields[2]
        hit = re.search(r'/gdeltv2/(\d{14})\.gkg\.csv\.zip$', candidate)
        if not hit:
            continue
        stamp = hit.group(1)
        if not (START_TS <= stamp <= END_TS):
            continue
        # only top-of-hour files
        if HOURLY_ONLY and stamp[10:12] != "00":
            continue
        selected.append(candidate.replace("https://", "http://"))
    return sorted(selected)

def month_from_dateint(dateint):
    """Convert a YYYYMMDDHHMMSS value (str or int) into a "YYYY-MM" key.

    Raises ValueError when the value does not parse with that format.
    """
    parsed = datetime.strptime(str(dateint), "%Y%m%d%H%M%S")
    return "{:04d}-{:02d}".format(parsed.year, parsed.month)

def doc_tone(v2tone):
    """Return the first comma-separated field of ``v2tone`` as a float.

    Anything that cannot be converted yields ``math.nan``.
    """
    try:
        # V2Tone = "docTone, pos, neg, polarity, activityRefDensity, selfGroupRefDensity, wordCount"
        leading, _sep, _rest = str(v2tone).partition(",")
        value = float(leading)
    except Exception:
        value = math.nan
    return value

def process_df(df, counts, tone_sum, tone_n, uk_re, theme_re):
    """Fold one file's matching rows into the monthly accumulators.

    A row is kept when its V2LOCS_I column matches ``uk_re`` and its
    V2THEMES_I column matches ``theme_re``. For each kept row the month's
    entry in ``counts`` is incremented; when the tone parses to a number it
    is added to ``tone_sum`` and ``tone_n`` is incremented as well.

    Returns:
        The number of rows kept from ``df``.
    """
    is_uk = df[V2LOCS_I].astype(str).str.contains(uk_re, na=False, regex=True)
    is_theme = df[V2THEMES_I].astype(str).str.contains(theme_re, na=False, regex=True)
    hits = df.loc[is_uk & is_theme, [DATE_I, V2TONE_I]]
    if hits.empty:
        return 0

    month_keys = hits[DATE_I].astype(str).apply(month_from_dateint)
    tone_values = hits[V2TONE_I].apply(doc_tone)
    for key, tone in zip(month_keys, tone_values):
        counts[key] += 1
        if math.isnan(tone):
            continue
        tone_sum[key] += tone
        tone_n[key] += 1
    return len(hits)

def write_year_output(processed_count, processed_tail, counts, tone_sum, tone_n):
    """Write the monthly CSV for the year and persist the resume checkpoint.

    Args:
        processed_count: Number of files processed so far (stored in the state).
        processed_tail: List of processed URLs; only the last 20,000 are stored.
        counts: Mapping of "YYYY-MM" month key to kept-document count.
        tone_sum: Mapping of month key to the sum of parsed document tones.
        tone_n: Mapping of month key to the number of tones in ``tone_sum``.

    Side effects:
        Overwrites YEARLY_CSV (columns month, docs, avg_tone) and STATE_PATH.
    """
    rows = []
    for m in sorted(counts.keys()):
        n = counts[m]
        # No parsable tone for the month -> empty avg_tone cell in the CSV.
        avg = (tone_sum[m]/tone_n[m]) if tone_n[m] else None
        rows.append({"month": f"{m}-01", "docs": n, "avg_tone": avg})
    monthly = pd.DataFrame(rows, columns=["month","docs","avg_tone"]).sort_values("month")
    monthly.to_csv(YEARLY_CSV, index=False)

    state = {
        "processed_count": processed_count,
        "processed_urls_tail": processed_tail[-20000:],  # retain a long tail for resume
        "counts": counts, "tone_sum": tone_sum, "tone_n": tone_n
    }
    # Write to a sibling temp file and swap it in: an interrupted run can then
    # never leave a truncated state.json that would break the next resume, and
    # the `with` block guarantees the handle is closed.
    tmp_path = STATE_PATH.with_name(STATE_PATH.name + ".tmp")
    with open(tmp_path, "w") as fh:
        json.dump(state, fh)
    tmp_path.replace(STATE_PATH)
    print(f"Saved → {YEARLY_CSV}")

# ------------------------ Reset (if previously wrong year) ------------------------
# Discards the checkpoint and the yearly CSV so the year is rebuilt from scratch.
if RESET_STATE:
    if STATE_PATH.exists():
        STATE_PATH.unlink()
    if YEARLY_CSV.exists():
        YEARLY_CSV.unlink()

# ------------------------ Plan + resume ------------------------
# Build the URL plan, restore the checkpoint if one exists, then stream every
# remaining archive through process_df with periodic checkpoints.
s = session()
urls_all = build_gkg_urls()
print(f"[{YEAR}] planned hourly files: {len(urls_all):,}")

if STATE_PATH.exists():
    # Context manager so the checkpoint file handle is closed after loading.
    with open(STATE_PATH) as state_fh:
        state = json.load(state_fh)
    processed_count     = int(state.get("processed_count", 0))
    processed_urls_tail = state.get("processed_urls_tail", [])
    counts   = defaultdict(int,   state.get("counts", {}))
    tone_sum = defaultdict(float, state.get("tone_sum", {}))
    tone_n   = defaultdict(int,   state.get("tone_n", {}))
    print(f"Resuming: {processed_count:,} files done previously.")
else:
    processed_count, processed_urls_tail = 0, []
    counts, tone_sum, tone_n = defaultdict(int), defaultdict(float), defaultdict(int)

# NOTE(review): only the last 20,000 processed URLs are remembered, so a plan
# with more files than that would re-process (and double-count) older files
# after a resume.
already = set(processed_urls_tail)
todo = [u for u in urls_all if u not in already]
if MAX_FILES is not None:
    todo = todo[:MAX_FILES]
print(f"[{YEAR}] processing this session: {len(todo):,} files")

uk_re = re.compile(UK_REGEX, re.I)
theme_re = re.compile(THEME_PATTERN, re.I)

# ------------------------ Main loop ------------------------
t0 = time.time()
docs_seen = 0
failed = 0  # archives that could not be fetched or parsed in this session

for i, url in enumerate(todo, 1):
    try:
        payload = get(url, timeout=90, s=s).content
        # Context managers close the archive and its member even on errors.
        with zipfile.ZipFile(io.BytesIO(payload)) as z:
            name = z.namelist()[0]
            with z.open(name) as member:
                df = pd.read_csv(
                    member,
                    sep="\t", header=None, quoting=3, low_memory=False,
                    usecols=[DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I],
                    dtype={DATE_I:str, V2THEMES_I:str, V2LOCS_I:str, V2TONE_I:str}
                )
        kept = process_df(df, counts, tone_sum, tone_n, uk_re, theme_re)
    except Exception as exc:
        # Best effort: one bad archive must not abort the run, but it should
        # not vanish silently either. Failed URLs are not recorded as
        # processed, so a later resume will try them again.
        failed += 1
        if failed <= 10:
            print(f"[{YEAR}] skipped {url}: {type(exc).__name__}: {exc}")
        elif failed == 11:
            print(f"[{YEAR}] further skip messages suppressed")
    else:
        docs_seen += kept
        processed_count += 1
        processed_urls_tail.append(url)
        if len(processed_urls_tail) > 20000:
            processed_urls_tail = processed_urls_tail[-20000:]

    if i % PROGRESS_EVERY == 0:
        elapsed = max(1e-6, time.time()-t0)
        rate = i / elapsed
        eta_min = (len(todo)-i) / max(1e-6, rate) / 60
        print(f"[{YEAR}] {i}/{len(todo)} | {rate:.2f} files/s | ETA≈{eta_min:.1f}m | docs_kept={docs_seen:,}")

    if i % CHECKPOINT_EVERY == 0:
        write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)

# final write for the year
write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)
# Report successes and failures separately; len(todo) alone counted skipped files as processed.
print(f"[{YEAR}] Done. This session processed: {len(todo) - failed:,} | Failed/skipped: {failed:,} | Total processed so far: {processed_count:,} | Docs kept: {docs_seen:,}")

# ------------------------ Append to master ------------------------
# Merge this year's monthly CSV into MASTER_CSV. Any error (including the
# year-mismatch check below) is reported and swallowed, leaving the master untouched.
try:
    ydf = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")
    # sanity: ensure we really wrote YEAR
    years_found = set(ydf["month"].dt.year.dropna().unique().tolist())
    if years_found != {YEAR}:
        raise ValueError(f"Year mismatch: expected {YEAR}, found {sorted(years_found)} in {YEARLY_CSV}")

    if MASTER_CSV.exists():
        m = pd.read_csv(MASTER_CSV, parse_dates=["month"])
        # The year's rows are concatenated last, so keep="last" lets them
        # replace any master rows that already exist for the same month.
        m = (pd.concat([m, ydf], ignore_index=True)
               .drop_duplicates(subset="month", keep="last")
               .sort_values("month")
               .reset_index(drop=True))
    else:
        # No master yet: this year's table becomes the master.
        m = ydf
    m.to_csv(MASTER_CSV, index=False)
    print("Appended to master:", MASTER_CSV, m.shape)
except Exception as e:
    print("Append-to-master skipped:", e)

# ------------------------ Pretty print monthly for YEAR ------------------------
# Re-read the yearly CSV and print one row per calendar month of YEAR.
try:
    df = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")
    # build a complete monthly index for YEAR so missing months show as NaN
    full = (df.set_index("month")
              .reindex(pd.date_range(f"{YEAR}-01-01", f"{YEAR}-12-01", freq="MS"))
              .rename_axis("Month").reset_index())
    # Nullable Int64 keeps counts as integers even when some months are missing.
    full["Docs"]    = pd.to_numeric(full["docs"], errors="coerce").round().astype("Int64")
    full["AvgTone"] = pd.to_numeric(full["avg_tone"], errors="coerce")
    print(full[["Month","Docs","AvgTone"]].to_string(index=False))
except Exception as e:
    # The preview is informational only; report the problem and carry on.
    print("Preview skipped:", e)
# ========================= end =========================


Mounted at /content/drive
[2017] planned hourly files: 8,503
[2017] processing this session: 8,503 files
[2017] 100/8503 | 0.78 files/s | ETA≈180.2m | docs_kept=3,422
[2017] 200/8503 | 0.76 files/s | ETA≈182.9m | docs_kept=7,183
Saved → /content/drive/MyDrive/msc_project/gdelt/gdelt_run_2017/gdelt_uk_housing_monthly_2017.csv
[2017] 300/8503 | 0.69 files/s | ETA≈197.5m | docs_kept=12,752
[2017] 400/8503 | 0.70 files/s | ETA≈191.6m | docs_kept=17,049
Saved → /content/drive/MyDrive/msc_project/gdelt/gdelt_run_2017/gdelt_uk_housing_monthly_2017.csv
[2017] 500/8503 | 0.67 files/s | ETA≈198.1m | docs_kept=24,366
[2017] 600/8503 | 0.67 files/s | ETA≈197.4m | docs_kept=31,267
Saved → /content/drive/MyDrive/msc_project/gdelt/gdelt_run_2017/gdelt_uk_housing_monthly_2017.csv
[2017] 700/8503 | 0.67 files/s | ETA≈194.4m | docs_kept=38,433
[2017] 800/8503 | 0.65 files/s | ETA≈197.8m | docs_kept=45,617
Saved → /content/drive/MyDrive/msc_project/gdelt/gdelt_run_2017/gdelt_uk_housing_monthly_2017.csv
[

2018

In [None]:
# --- GDELT UK housing monthly for a single year (no auto-append to master) ---
!pip -q install pandas requests
from google.colab import drive
drive.mount('/content/drive')

import re, io, zipfile, math, time, json, logging, requests, pandas as pd
from pathlib import Path
from collections import defaultdict
from datetime import datetime
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# Drive workspace that holds the per-year run folders.
BASE_DIR = Path("/content/drive/MyDrive/msc_project/gdelt")
BASE_DIR.mkdir(parents=True, exist_ok=True)

# Year to process; the timestamp window and output paths are derived from it.
YEAR = 2018
# NOTE(review): presumably discards any existing checkpoint/CSV before the
# run; the code that reads this flag is not visible here -- confirm.
RESET_STATE = True
# NOTE(review): presumably caps the files handled per run (None = all) -- confirm.
MAX_FILES = None
# When True, build_gkg_urls keeps only files whose minute field is "00".
HOURLY_ONLY = True
# NOTE(review): presumably checkpoint/progress intervals in files; the loop
# that reads them is not visible here -- confirm.
CHECKPOINT_EVERY = 200
PROGRESS_EVERY   = 100
# NOTE(review): presumably disables merging into the master CSV, per the
# cell's header comment; the code that reads this flag is not visible here.
APPEND_TO_MASTER = False

# Inclusive YYYYMMDDHHMMSS window covering the whole calendar YEAR.
START_TS = f"{YEAR}0101000000"
END_TS   = f"{YEAR}1231235959"

# Filter patterns handed to process_df as uk_re / theme_re.
UK_REGEX = r'#(?:GBR|GB|UK|United Kingdom|England|Scotland|Wales|Northern Ireland)#'
# NOTE(review): the alternatives have no word boundaries, so an unanchored
# search would match e.g. RENT inside PARENT or CURRENT -- confirm intended.
THEME_PATTERN = r'(?:HOUS|MORTGAG|REMORTGAG|RENT|TENAN|REAL[ _]?ESTATE|PROPERTY|HOME[ _]?PRIC|HOUSE[ _]?PRIC|RIGHTMOVE|ZOOPLA|LANDLORD|BUY[ -]?TO[ -]?LET)'

# Positional indices of the columns process_df reads from each file.
DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I = 1, 8, 10, 15

OUT_DIR = BASE_DIR / f"gdelt_run_{YEAR}"
OUT_DIR.mkdir(parents=True, exist_ok=True)
YEARLY_CSV = OUT_DIR / f"gdelt_uk_housing_monthly_{YEAR}.csv"
STATE_PATH = OUT_DIR / "state.json"
# Listing that build_gkg_urls scans for *.gkg.csv.zip URLs.
MASTER_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"

logging.getLogger("urllib3").setLevel(logging.ERROR)

def session():
    """Build a requests.Session with a custom User-Agent and retrying adapters.

    The http:// and https:// prefixes each get their own HTTPAdapter; both
    share one Retry policy (total=5, backoff_factor=0.5, retried statuses
    429/500/502/503/504).
    """
    retry_policy = Retry(total=5, backoff_factor=0.5, status_forcelist=[429,500,502,503,504])
    client = requests.Session()
    client.headers["User-Agent"] = "gdelt-colab/1.0"
    for prefix in ("http://", "https://"):
        client.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return client

def get(url, timeout=60, s=None):
    """Fetch *url* over plain HTTP and return the response.

    Args:
        url: Address to fetch; a leading "https://" is rewritten to "http://".
        timeout: Value forwarded as the request's ``timeout`` argument.
        s: Session-like object to use; when falsy a new one comes from ``session()``.

    Returns:
        The response object, after ``raise_for_status()`` has been called on it.
    """
    client = s if s else session()
    target = url
    if target.startswith("https://"):
        # Only the leading scheme is swapped; the rest of the URL is untouched.
        target = target.replace("https://", "http://", 1)
    response = client.get(target, timeout=timeout)
    response.raise_for_status()
    return response

def build_gkg_urls():
    """Return the sorted list of GKG zip URLs inside the START_TS..END_TS window.

    The master file list is downloaded from MASTER_URL. Only lines with exactly
    three whitespace-separated fields are considered, and the third field must
    end in ``/gdeltv2/<14 digits>.gkg.csv.zip``. When HOURLY_ONLY is set, files
    whose timestamp minutes are not "00" are dropped. URLs are returned with
    "https://" replaced by "http://".
    """
    name_pattern = re.compile(r'/gdeltv2/(\d{14})\.gkg\.csv\.zip$')
    listing = get(MASTER_URL, timeout=90).text
    selected = []
    for row in listing.splitlines():
        fields = row.split()
        if len(fields) != 3:
            continue
        link = fields[2]
        hit = name_pattern.search(link)
        if hit is None:
            continue
        stamp = hit.group(1)
        if not (START_TS <= stamp <= END_TS):
            continue
        if HOURLY_ONLY and stamp[10:12] != "00":  # characters 10-11 are the minutes
            continue
        selected.append(link.replace("https://","http://"))
    return sorted(selected)

def month_from_dateint(dateint):
    """Return the "YYYY-MM" key for a YYYYMMDDHHMMSS timestamp (int or str).

    Raises:
        ValueError: if the value does not parse with the "%Y%m%d%H%M%S" format.
    """
    parsed = datetime.strptime(str(dateint), "%Y%m%d%H%M%S")
    return "%04d-%02d" % (parsed.year, parsed.month)

def doc_tone(v2tone):
    """Return the first comma-separated field of *v2tone* as a float.

    Any failure while converting (non-numeric text, empty string, ...) yields
    ``math.nan`` instead of raising.
    """
    try:
        first_field, _, _ = str(v2tone).partition(",")
        tone = float(first_field)
    except Exception:
        tone = math.nan
    return tone

def process_df(df, counts, tone_sum, tone_n, uk_re, theme_re):
    """Fold one file's matching rows into the per-month accumulators.

    A row is kept when its locations column matches *uk_re* and its themes
    column matches *theme_re*. For every kept row the month's entry in
    *counts* is incremented; when the row's tone parses to a number it is also
    added to *tone_sum* and counted in *tone_n*.

    Returns:
        The number of rows kept from this frame.
    """
    in_uk = df[V2LOCS_I].astype(str).str.contains(uk_re, na=False, regex=True)
    on_theme = df[V2THEMES_I].astype(str).str.contains(theme_re, na=False, regex=True)
    hits = df.loc[in_uk & on_theme, [DATE_I, V2TONE_I]]
    if len(hits.index) == 0:
        return 0
    # Derive every key/tone first so a parse error leaves the accumulators untouched.
    month_keys = [month_from_dateint(stamp) for stamp in hits[DATE_I].astype(str)]
    tone_values = [doc_tone(raw) for raw in hits[V2TONE_I]]
    for key, tone in zip(month_keys, tone_values):
        counts[key] += 1
        if math.isnan(tone):
            continue
        tone_sum[key] += tone
        tone_n[key] += 1
    return len(month_keys)

def write_year_output(processed_count, processed_tail, counts, tone_sum, tone_n):
    """Write the yearly monthly-aggregate CSV and the resume-state JSON.

    Args:
        processed_count: Number of files processed so far (stored in the state).
        processed_tail: List of processed URLs; only the last 20000 are stored.
        counts: Mapping "YYYY-MM" -> number of kept documents.
        tone_sum: Mapping "YYYY-MM" -> sum of parsed tones.
        tone_n: Mapping "YYYY-MM" -> number of documents with a parsed tone.

    The CSV (YEARLY_CSV) has columns month ("YYYY-MM-01"), docs and avg_tone;
    avg_tone is empty for months without any parsed tone.
    """
    rows = []
    for m in sorted(counts.keys()):
        n = counts[m]
        avg = (tone_sum[m]/tone_n[m]) if tone_n[m] else None
        rows.append({"month": f"{m}-01", "docs": n, "avg_tone": avg})
    monthly = pd.DataFrame(rows, columns=["month","docs","avg_tone"]).sort_values("month")
    monthly.to_csv(YEARLY_CSV, index=False)
    state = {
        "processed_count": processed_count,
        "processed_urls_tail": processed_tail[-20000:],
        "counts": counts, "tone_sum": tone_sum, "tone_n": tone_n
    }
    # Use a context manager so the state file is flushed and closed before
    # returning, instead of leaving the handle for the garbage collector.
    with open(STATE_PATH, "w") as state_file:
        json.dump(state, state_file)
    print(f"Saved → {YEARLY_CSV}")

# Reset per-year state if needed.
# NOTE: with RESET_STATE = True the state file is removed here, so the resume
# branch further down is not taken for this run.
if RESET_STATE:
    if STATE_PATH.exists(): STATE_PATH.unlink()
    if YEARLY_CSV.exists(): YEARLY_CSV.unlink()

# Plan + resume
s = session()
urls_all = build_gkg_urls()
print(f"[{YEAR}] planned hourly files: {len(urls_all):,}")

if STATE_PATH.exists():
    # Load through a context manager so the state file handle is closed.
    with open(STATE_PATH) as state_file:
        state = json.load(state_file)
    processed_count     = int(state.get("processed_count", 0))
    processed_urls_tail = state.get("processed_urls_tail", [])
    counts   = defaultdict(int,   state.get("counts", {}))
    tone_sum = defaultdict(float, state.get("tone_sum", {}))
    tone_n   = defaultdict(int,   state.get("tone_n", {}))
    print(f"Resuming: {processed_count:,} files done previously.")
else:
    processed_count, processed_urls_tail = 0, []
    counts, tone_sum, tone_n = defaultdict(int), defaultdict(float), defaultdict(int)

# Skip URLs that an earlier session recorded as processed.
already = set(processed_urls_tail)
todo = [u for u in urls_all if u not in already]
if MAX_FILES is not None:
    todo = todo[:MAX_FILES]
print(f"[{YEAR}] processing this session: {len(todo):,} files")

uk_re = re.compile(UK_REGEX, re.I)
theme_re = re.compile(THEME_PATTERN, re.I)

# Main loop
t0 = time.time()
docs_seen = 0
failed_urls = []            # URLs whose download or parse raised
FAILURE_DETAIL_LIMIT = 20   # print details only for the first few failures

for i, url in enumerate(todo, 1):
    try:
        payload = get(url, timeout=90, s=s).content
        # Context managers release the archive and member handles per file.
        with zipfile.ZipFile(io.BytesIO(payload)) as z:
            name = z.namelist()[0]
            with z.open(name) as member:
                df = pd.read_csv(
                    member,
                    sep="\t", header=None, quoting=3, low_memory=False,
                    usecols=[DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I],
                    dtype={DATE_I:str, V2THEMES_I:str, V2LOCS_I:str, V2TONE_I:str}
                )
        docs_seen += process_df(df, counts, tone_sum, tone_n, uk_re, theme_re)
        processed_count += 1
        processed_urls_tail.append(url)
        if len(processed_urls_tail) > 20000:
            processed_urls_tail = processed_urls_tail[-20000:]
    except Exception as exc:
        # Best effort: one bad file must not stop the run, but record it so
        # gaps in the monthly totals are visible instead of silent.
        failed_urls.append(url)
        if len(failed_urls) <= FAILURE_DETAIL_LIMIT:
            print(f"[{YEAR}] skipped {url} ({type(exc).__name__}: {exc})")

    if i % PROGRESS_EVERY == 0:
        elapsed = max(1e-6, time.time()-t0)
        rate = i / elapsed
        eta_min = (len(todo)-i) / max(1e-6, rate) / 60
        print(f"[{YEAR}] {i}/{len(todo)} | {rate:.2f} files/s | ETA≈{eta_min:.1f}m | docs_kept={docs_seen:,}")

    if i % CHECKPOINT_EVERY == 0:
        write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)

# Final write for the year
write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)
print(f"[{YEAR}] Done. Processed this session: {len(todo):,} | Total processed: {processed_count:,} | Docs kept: {docs_seen:,}")
if failed_urls:
    # Failed URLs are not added to processed_urls_tail, so a later run that
    # keeps the state file will attempt them again.
    print(f"[{YEAR}] Files skipped after errors: {len(failed_urls):,}")

# Quick print for the year: reindex onto all 12 month starts so months with
# no rows show up as missing values.
df = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")
full = (df.set_index("month")
          .reindex(pd.date_range(f"{YEAR}-01-01", f"{YEAR}-12-01", freq="MS"))
          .rename_axis("Month").reset_index())
full["Docs"]    = pd.to_numeric(full["docs"], errors="coerce").round().astype("Int64")
full["AvgTone"] = pd.to_numeric(full["avg_tone"], errors="coerce")
print(full[["Month","Docs","AvgTone"]].to_string(index=False))


2019

In [None]:
!pip -q install pandas requests

from google.colab import drive
drive.mount('/content/drive')

from pathlib import Path
import re, io, zipfile, math, time, json, logging, requests, pandas as pd
from collections import defaultdict
from datetime import datetime
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# ------------------------ CONFIG ------------------------
YEAR = 2019                                    # calendar year handled by this cell
RESET_STATE = True                             # True: saved state + yearly CSV are deleted before planning
MAX_FILES = None                               # cap on files handled per session (None = no cap)
HOURLY_ONLY = True                             # keep only files whose timestamp minutes are "00"
CHECKPOINT_EVERY, PROGRESS_EVERY = 200, 100    # loop iterations between saves / progress lines

# Inclusive YYYYMMDDHHMMSS window derived from YEAR (compared as strings)
START_TS, END_TS = f"{YEAR}0101000000", f"{YEAR}1231235959"

# Filters — both are compiled with re.I and applied as substring searches.
# NOTE(review): if the locations column carries FIPS country codes, "GB" may
# denote a country other than the UK — confirm against the GKG codebook.
UK_REGEX = r'#(?:GBR|GB|UK|United Kingdom|England|Scotland|Wales|Northern Ireland)#'
# NOTE(review): short stems such as RENT or HOUS also match inside longer
# theme tokens (e.g. "CURRENT", "WAREHOUSE") — confirm this is intended.
THEME_PATTERN = r'(?:HOUS|MORTGAG|REMORTGAG|RENT|TENAN|REAL[ _]?ESTATE|PROPERTY|HOME[ _]?PRIC|HOUSE[ _]?PRIC|RIGHTMOVE|ZOOPLA|LANDLORD|BUY[ -]?TO[ -]?LET)'

# GKG v2.1 columns (0-based indices)
DATE_I = 1
V2THEMES_I = 8
V2LOCS_I = 10
V2TONE_I = 15

# Paths
BASE_DIR = Path("/content/drive/MyDrive/msc_project/gdelt")  # change if you like
BASE_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR = BASE_DIR.joinpath(f"gdelt_run_{YEAR}")
OUT_DIR.mkdir(parents=True, exist_ok=True)
YEARLY_CSV = OUT_DIR.joinpath(f"gdelt_uk_housing_monthly_{YEAR}.csv")
STATE_PATH = OUT_DIR.joinpath("state.json")
MASTER_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"
MASTER_CSV = BASE_DIR.joinpath("gdelt_uk_housing_monthly_STITCHING.csv")  # multi-year file the yearly CSV is merged into

# ------------------------ Networking helpers ------------------------
# Silence urllib3 messages below ERROR
logging.getLogger("urllib3").setLevel(logging.ERROR)

def session():
    """Build a requests.Session with a custom User-Agent and retrying adapters.

    The http:// and https:// prefixes each get their own HTTPAdapter; both
    share one Retry policy (total=5, backoff_factor=0.5, retried statuses
    429/500/502/503/504).
    """
    retry_policy = Retry(total=5, backoff_factor=0.5, status_forcelist=[429,500,502,503,504])
    client = requests.Session()
    client.headers["User-Agent"] = "gdelt-colab/1.0"
    for prefix in ("http://", "https://"):
        client.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return client

def get(url, timeout=60, s=None):
    """Fetch *url* over plain HTTP and return the response.

    Args:
        url: Address to fetch; a leading "https://" is rewritten to "http://"
            (the original note: avoid SSL hiccups in Colab).
        timeout: Value forwarded as the request's ``timeout`` argument.
        s: Session-like object to use; when falsy a new one comes from ``session()``.

    Returns:
        The response object, after ``raise_for_status()`` has been called on it.
    """
    client = s if s else session()
    target = url
    if target.startswith("https://"):
        # Only the leading scheme is swapped; the rest of the URL is untouched.
        target = target.replace("https://", "http://", 1)
    response = client.get(target, timeout=timeout)
    response.raise_for_status()
    return response

# ------------------------ GDELT plumbing ------------------------
def build_gkg_urls():
    """Return the sorted list of GKG zip URLs inside the START_TS..END_TS window.

    The master file list is downloaded from MASTER_URL. Only lines with exactly
    three whitespace-separated fields are considered, and the third field must
    end in ``/gdeltv2/<14 digits>.gkg.csv.zip``. When HOURLY_ONLY is set, files
    whose timestamp minutes are not "00" are dropped. URLs are returned with
    "https://" replaced by "http://".
    """
    name_pattern = re.compile(r'/gdeltv2/(\d{14})\.gkg\.csv\.zip$')
    listing = get(MASTER_URL, timeout=90).text
    selected = []
    for row in listing.splitlines():
        fields = row.split()
        if len(fields) != 3:
            continue
        link = fields[2]
        hit = name_pattern.search(link)
        if hit is None:
            continue
        stamp = hit.group(1)
        if not (START_TS <= stamp <= END_TS):
            continue
        if HOURLY_ONLY and stamp[10:12] != "00":  # characters 10-11 are the minutes
            continue
        selected.append(link.replace("https://", "http://"))
    return sorted(selected)

def month_from_dateint(dateint):
    """Return the "YYYY-MM" key for a YYYYMMDDHHMMSS timestamp (int or str).

    Raises:
        ValueError: if the value does not parse with the "%Y%m%d%H%M%S" format.
    """
    parsed = datetime.strptime(str(dateint), "%Y%m%d%H%M%S")
    return "%04d-%02d" % (parsed.year, parsed.month)

def doc_tone(v2tone):
    """Return the first comma-separated field of *v2tone* as a float.

    V2Tone = "docTone, pos, neg, polarity, activityRefDensity,
    selfGroupRefDensity, wordCount"; only docTone is used. Any failure while
    converting yields ``math.nan`` instead of raising.
    """
    try:
        first_field, _, _ = str(v2tone).partition(",")
        tone = float(first_field)
    except Exception:
        tone = math.nan
    return tone

def process_df(df, counts, tone_sum, tone_n, uk_re, theme_re):
    """Fold one file's matching rows into the per-month accumulators.

    A row is kept when its locations column matches *uk_re* and its themes
    column matches *theme_re*. For every kept row the month's entry in
    *counts* is incremented; when the row's tone parses to a number it is also
    added to *tone_sum* and counted in *tone_n*.

    Returns:
        The number of rows kept from this frame.
    """
    in_uk = df[V2LOCS_I].astype(str).str.contains(uk_re, na=False, regex=True)
    on_theme = df[V2THEMES_I].astype(str).str.contains(theme_re, na=False, regex=True)
    hits = df.loc[in_uk & on_theme, [DATE_I, V2TONE_I]]
    if len(hits.index) == 0:
        return 0
    # Derive every key/tone first so a parse error leaves the accumulators untouched.
    month_keys = [month_from_dateint(stamp) for stamp in hits[DATE_I].astype(str)]
    tone_values = [doc_tone(raw) for raw in hits[V2TONE_I]]
    for key, tone in zip(month_keys, tone_values):
        counts[key] += 1
        if math.isnan(tone):
            continue
        tone_sum[key] += tone
        tone_n[key] += 1
    return len(month_keys)

def write_year_output(processed_count, processed_tail, counts, tone_sum, tone_n):
    """Write the yearly monthly-aggregate CSV and the resume-state JSON.

    Args:
        processed_count: Number of files processed so far (stored in the state).
        processed_tail: List of processed URLs; only the last 20000 are stored.
        counts: Mapping "YYYY-MM" -> number of kept documents.
        tone_sum: Mapping "YYYY-MM" -> sum of parsed tones.
        tone_n: Mapping "YYYY-MM" -> number of documents with a parsed tone.

    The CSV (YEARLY_CSV) has columns month ("YYYY-MM-01"), docs and avg_tone;
    avg_tone is empty for months without any parsed tone.
    """
    rows = []
    for m in sorted(counts.keys()):
        n = counts[m]
        avg = (tone_sum[m]/tone_n[m]) if tone_n[m] else None
        rows.append({"month": f"{m}-01", "docs": n, "avg_tone": avg})
    monthly = pd.DataFrame(rows, columns=["month","docs","avg_tone"]).sort_values("month")
    monthly.to_csv(YEARLY_CSV, index=False)
    state = {
        "processed_count": processed_count,
        "processed_urls_tail": processed_tail[-20000:],  # retain a long tail for resume
        "counts": counts, "tone_sum": tone_sum, "tone_n": tone_n
    }
    # Use a context manager so the state file is flushed and closed before
    # returning, instead of leaving the handle for the garbage collector.
    with open(STATE_PATH, "w") as state_file:
        json.dump(state, state_file)
    print(f"Saved → {YEARLY_CSV}")

# ------------------------ Reset (if previously wrong year) ------------------------
# NOTE: with RESET_STATE = True the state file is removed here, so the resume
# branch further down is not taken for this run.
if RESET_STATE:
    if STATE_PATH.exists():
        STATE_PATH.unlink()
    if YEARLY_CSV.exists():
        YEARLY_CSV.unlink()

# ------------------------ Plan + resume ------------------------
s = session()
urls_all = build_gkg_urls()
print(f"[{YEAR}] planned hourly files: {len(urls_all):,}")

if STATE_PATH.exists():
    # Load through a context manager so the state file handle is closed.
    with open(STATE_PATH) as state_file:
        state = json.load(state_file)
    processed_count     = int(state.get("processed_count", 0))
    processed_urls_tail = state.get("processed_urls_tail", [])
    counts   = defaultdict(int,   state.get("counts", {}))
    tone_sum = defaultdict(float, state.get("tone_sum", {}))
    tone_n   = defaultdict(int,   state.get("tone_n", {}))
    print(f"Resuming: {processed_count:,} files done previously.")
else:
    processed_count, processed_urls_tail = 0, []
    counts, tone_sum, tone_n = defaultdict(int), defaultdict(float), defaultdict(int)

# Skip URLs that an earlier session recorded as processed.
already = set(processed_urls_tail)
todo = [u for u in urls_all if u not in already]
if MAX_FILES is not None:
    todo = todo[:MAX_FILES]
print(f"[{YEAR}] processing this session: {len(todo):,} files")

uk_re = re.compile(UK_REGEX, re.I)
theme_re = re.compile(THEME_PATTERN, re.I)

# ------------------------ Main loop ------------------------
t0 = time.time()
docs_seen = 0
failed_urls = []            # URLs whose download or parse raised
FAILURE_DETAIL_LIMIT = 20   # print details only for the first few failures

for i, url in enumerate(todo, 1):
    try:
        payload = get(url, timeout=90, s=s).content
        # Context managers release the archive and member handles per file.
        with zipfile.ZipFile(io.BytesIO(payload)) as z:
            name = z.namelist()[0]
            with z.open(name) as member:
                df = pd.read_csv(
                    member,
                    sep="\t", header=None, quoting=3, low_memory=False,
                    usecols=[DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I],
                    dtype={DATE_I:str, V2THEMES_I:str, V2LOCS_I:str, V2TONE_I:str}
                )
        docs_seen += process_df(df, counts, tone_sum, tone_n, uk_re, theme_re)
        processed_count += 1
        processed_urls_tail.append(url)
        if len(processed_urls_tail) > 20000:
            processed_urls_tail = processed_urls_tail[-20000:]
    except Exception as exc:
        # Best effort: one bad file must not stop the run, but record it so
        # gaps in the monthly totals are visible instead of silent.
        failed_urls.append(url)
        if len(failed_urls) <= FAILURE_DETAIL_LIMIT:
            print(f"[{YEAR}] skipped {url} ({type(exc).__name__}: {exc})")

    if i % PROGRESS_EVERY == 0:
        elapsed = max(1e-6, time.time()-t0)
        rate = i / elapsed
        eta_min = (len(todo)-i) / max(1e-6, rate) / 60
        print(f"[{YEAR}] {i}/{len(todo)} | {rate:.2f} files/s | ETA≈{eta_min:.1f}m | docs_kept={docs_seen:,}")

    if i % CHECKPOINT_EVERY == 0:
        write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)

# final write for the year
write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)
print(f"[{YEAR}] Done. This session processed: {len(todo):,} | Total processed so far: {processed_count:,} | Docs kept: {docs_seen:,}")
if failed_urls:
    # Failed URLs are not added to processed_urls_tail, so a later run that
    # keeps the state file will attempt them again.
    print(f"[{YEAR}] Files skipped after errors: {len(failed_urls):,}")

# ------------------------ Append to master ------------------------
# Merge this year's rows into MASTER_CSV; for a month present in both, the
# row from this run wins (keep="last"). Any error only skips the merge.
try:
    ydf = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")
    # sanity: ensure we really wrote YEAR
    years_found = set(ydf["month"].dt.year.dropna().unique().tolist())
    if years_found != {YEAR}:
        raise ValueError(f"Year mismatch: expected {YEAR}, found {sorted(years_found)} in {YEARLY_CSV}")

    if MASTER_CSV.exists():
        m = pd.read_csv(MASTER_CSV, parse_dates=["month"])
        m = (pd.concat([m, ydf], ignore_index=True)
               .drop_duplicates(subset="month", keep="last")
               .sort_values("month")
               .reset_index(drop=True))
    else:
        m = ydf
    m.to_csv(MASTER_CSV, index=False)
    print("Appended to master:", MASTER_CSV, m.shape)
except Exception as e:
    print("Append-to-master skipped:", e)

# ------------------------ Pretty print monthly for YEAR ------------------------
try:
    df = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")
    # build a complete monthly index for YEAR so missing months show as NaN
    full = (df.set_index("month")
              .reindex(pd.date_range(f"{YEAR}-01-01", f"{YEAR}-12-01", freq="MS"))
              .rename_axis("Month").reset_index())
    full["Docs"]    = pd.to_numeric(full["docs"], errors="coerce").round().astype("Int64")
    full["AvgTone"] = pd.to_numeric(full["avg_tone"], errors="coerce")
    print(full[["Month","Docs","AvgTone"]].to_string(index=False))
except Exception as e:
    print("Preview skipped:", e)

2020

In [None]:
!pip -q install pandas requests

from google.colab import drive
drive.mount('/content/drive')

from pathlib import Path
import re, io, zipfile, math, time, json, logging, requests, pandas as pd
from collections import defaultdict
from datetime import datetime
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# ------------------------ CONFIG ------------------------
YEAR = 2020                                    # calendar year handled by this cell
RESET_STATE = True                             # True: saved state + yearly CSV are deleted before planning
MAX_FILES = None                               # cap on files handled per session (None = no cap)
HOURLY_ONLY = True                             # keep only files whose timestamp minutes are "00"
CHECKPOINT_EVERY, PROGRESS_EVERY = 200, 100    # loop iterations between saves / progress lines

# Inclusive window derived from YEAR: YYYY-01-01 00:00:00 .. YYYY-12-31 23:59:59
START_TS, END_TS = f"{YEAR}0101000000", f"{YEAR}1231235959"

# Filters — both are compiled with re.I and applied as substring searches.
# NOTE(review): if the locations column carries FIPS country codes, "GB" may
# denote a country other than the UK — confirm against the GKG codebook.
UK_REGEX = r'#(?:GBR|GB|UK|United Kingdom|England|Scotland|Wales|Northern Ireland)#'
# NOTE(review): short stems such as RENT or HOUS also match inside longer
# theme tokens (e.g. "CURRENT", "WAREHOUSE") — confirm this is intended.
THEME_PATTERN = r'(?:HOUS|MORTGAG|REMORTGAG|RENT|TENAN|REAL[ _]?ESTATE|PROPERTY|HOME[ _]?PRIC|HOUSE[ _]?PRIC|RIGHTMOVE|ZOOPLA|LANDLORD|BUY[ -]?TO[ -]?LET)'

# GKG v2.1 columns (0-based indices)
DATE_I = 1
V2THEMES_I = 8
V2LOCS_I = 10
V2TONE_I = 15

# Paths
BASE_DIR = Path("/content/drive/MyDrive/msc_project/gdelt")  # change if you like
BASE_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR = BASE_DIR.joinpath(f"gdelt_run_{YEAR}")
OUT_DIR.mkdir(parents=True, exist_ok=True)
YEARLY_CSV = OUT_DIR.joinpath(f"gdelt_uk_housing_monthly_{YEAR}.csv")
STATE_PATH = OUT_DIR.joinpath("state.json")
MASTER_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"
MASTER_CSV = BASE_DIR.joinpath("gdelt_uk_housing_monthly_STITCHING.csv")  # multi-year file the yearly CSV is merged into

# ------------------------ Networking helpers ------------------------
# Silence urllib3 messages below ERROR
logging.getLogger("urllib3").setLevel(logging.ERROR)

def session():
    """Build a requests.Session with a custom User-Agent and retrying adapters.

    The http:// and https:// prefixes each get their own HTTPAdapter; both
    share one Retry policy (total=5, backoff_factor=0.5, retried statuses
    429/500/502/503/504).
    """
    retry_policy = Retry(total=5, backoff_factor=0.5, status_forcelist=[429,500,502,503,504])
    client = requests.Session()
    client.headers["User-Agent"] = "gdelt-colab/1.0"
    for prefix in ("http://", "https://"):
        client.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return client

def get(url, timeout=60, s=None):
    """Fetch *url* over plain HTTP and return the response.

    Args:
        url: Address to fetch; a leading "https://" is rewritten to "http://"
            (the original note: avoid SSL hiccups in Colab).
        timeout: Value forwarded as the request's ``timeout`` argument.
        s: Session-like object to use; when falsy a new one comes from ``session()``.

    Returns:
        The response object, after ``raise_for_status()`` has been called on it.
    """
    client = s if s else session()
    target = url
    if target.startswith("https://"):
        # Only the leading scheme is swapped; the rest of the URL is untouched.
        target = target.replace("https://", "http://", 1)
    response = client.get(target, timeout=timeout)
    response.raise_for_status()
    return response

# ------------------------ GDELT plumbing ------------------------
def build_gkg_urls():
    """Return the sorted list of GKG zip URLs inside the START_TS..END_TS window.

    The master file list is downloaded from MASTER_URL. Only lines with exactly
    three whitespace-separated fields are considered, and the third field must
    end in ``/gdeltv2/<14 digits>.gkg.csv.zip``. When HOURLY_ONLY is set, files
    whose timestamp minutes are not "00" are dropped. URLs are returned with
    "https://" replaced by "http://".
    """
    name_pattern = re.compile(r'/gdeltv2/(\d{14})\.gkg\.csv\.zip$')
    listing = get(MASTER_URL, timeout=90).text
    selected = []
    for row in listing.splitlines():
        fields = row.split()
        if len(fields) != 3:
            continue
        link = fields[2]
        hit = name_pattern.search(link)
        if hit is None:
            continue
        stamp = hit.group(1)
        if not (START_TS <= stamp <= END_TS):
            continue
        if HOURLY_ONLY and stamp[10:12] != "00":  # characters 10-11 are the minutes
            continue
        selected.append(link.replace("https://", "http://"))
    return sorted(selected)

def month_from_dateint(dateint):
    """Return the "YYYY-MM" key for a YYYYMMDDHHMMSS timestamp (int or str).

    Raises:
        ValueError: if the value does not parse with the "%Y%m%d%H%M%S" format.
    """
    parsed = datetime.strptime(str(dateint), "%Y%m%d%H%M%S")
    return "%04d-%02d" % (parsed.year, parsed.month)

def doc_tone(v2tone):
    """Return the first comma-separated field of *v2tone* as a float.

    V2Tone = "docTone, pos, neg, polarity, activityRefDensity,
    selfGroupRefDensity, wordCount"; only docTone is used. Any failure while
    converting yields ``math.nan`` instead of raising.
    """
    try:
        first_field, _, _ = str(v2tone).partition(",")
        tone = float(first_field)
    except Exception:
        tone = math.nan
    return tone

def process_df(df, counts, tone_sum, tone_n, uk_re, theme_re):
    """Fold one file's matching rows into the per-month accumulators.

    A row is kept when its locations column matches *uk_re* and its themes
    column matches *theme_re*. For every kept row the month's entry in
    *counts* is incremented; when the row's tone parses to a number it is also
    added to *tone_sum* and counted in *tone_n*.

    Returns:
        The number of rows kept from this frame.
    """
    in_uk = df[V2LOCS_I].astype(str).str.contains(uk_re, na=False, regex=True)
    on_theme = df[V2THEMES_I].astype(str).str.contains(theme_re, na=False, regex=True)
    hits = df.loc[in_uk & on_theme, [DATE_I, V2TONE_I]]
    if len(hits.index) == 0:
        return 0
    # Derive every key/tone first so a parse error leaves the accumulators untouched.
    month_keys = [month_from_dateint(stamp) for stamp in hits[DATE_I].astype(str)]
    tone_values = [doc_tone(raw) for raw in hits[V2TONE_I]]
    for key, tone in zip(month_keys, tone_values):
        counts[key] += 1
        if math.isnan(tone):
            continue
        tone_sum[key] += tone
        tone_n[key] += 1
    return len(month_keys)

def write_year_output(processed_count, processed_tail, counts, tone_sum, tone_n):
    """Write the yearly monthly-aggregate CSV and the resume-state JSON.

    Args:
        processed_count: Number of files processed so far (stored in the state).
        processed_tail: List of processed URLs; only the last 20000 are stored.
        counts: Mapping "YYYY-MM" -> number of kept documents.
        tone_sum: Mapping "YYYY-MM" -> sum of parsed tones.
        tone_n: Mapping "YYYY-MM" -> number of documents with a parsed tone.

    The CSV (YEARLY_CSV) has columns month ("YYYY-MM-01"), docs and avg_tone;
    avg_tone is empty for months without any parsed tone.
    """
    rows = []
    for m in sorted(counts.keys()):
        n = counts[m]
        avg = (tone_sum[m]/tone_n[m]) if tone_n[m] else None
        rows.append({"month": f"{m}-01", "docs": n, "avg_tone": avg})
    monthly = pd.DataFrame(rows, columns=["month","docs","avg_tone"]).sort_values("month")
    monthly.to_csv(YEARLY_CSV, index=False)
    state = {
        "processed_count": processed_count,
        "processed_urls_tail": processed_tail[-20000:],  # retain a long tail for resume
        "counts": counts, "tone_sum": tone_sum, "tone_n": tone_n
    }
    # Use a context manager so the state file is flushed and closed before
    # returning, instead of leaving the handle for the garbage collector.
    with open(STATE_PATH, "w") as state_file:
        json.dump(state, state_file)
    print(f"Saved → {YEARLY_CSV}")

# ------------------------ Reset (if previously wrong year) ------------------------
# NOTE: with RESET_STATE = True the state file is removed here, so the resume
# branch further down is not taken for this run.
if RESET_STATE:
    if STATE_PATH.exists():
        STATE_PATH.unlink()
    if YEARLY_CSV.exists():
        YEARLY_CSV.unlink()

# ------------------------ Plan + resume ------------------------
s = session()
urls_all = build_gkg_urls()
print(f"[{YEAR}] planned hourly files: {len(urls_all):,}")

if STATE_PATH.exists():
    # Load through a context manager so the state file handle is closed.
    with open(STATE_PATH) as state_file:
        state = json.load(state_file)
    processed_count     = int(state.get("processed_count", 0))
    processed_urls_tail = state.get("processed_urls_tail", [])
    counts   = defaultdict(int,   state.get("counts", {}))
    tone_sum = defaultdict(float, state.get("tone_sum", {}))
    tone_n   = defaultdict(int,   state.get("tone_n", {}))
    print(f"Resuming: {processed_count:,} files done previously.")
else:
    processed_count, processed_urls_tail = 0, []
    counts, tone_sum, tone_n = defaultdict(int), defaultdict(float), defaultdict(int)

# Skip URLs that an earlier session recorded as processed.
already = set(processed_urls_tail)
todo = [u for u in urls_all if u not in already]
if MAX_FILES is not None:
    todo = todo[:MAX_FILES]
print(f"[{YEAR}] processing this session: {len(todo):,} files")

uk_re = re.compile(UK_REGEX, re.I)
theme_re = re.compile(THEME_PATTERN, re.I)

# ------------------------ Main loop ------------------------
t0 = time.time()
docs_seen = 0
failed_urls = []            # URLs whose download or parse raised
FAILURE_DETAIL_LIMIT = 20   # print details only for the first few failures

for i, url in enumerate(todo, 1):
    try:
        payload = get(url, timeout=90, s=s).content
        # Context managers release the archive and member handles per file.
        with zipfile.ZipFile(io.BytesIO(payload)) as z:
            name = z.namelist()[0]
            with z.open(name) as member:
                df = pd.read_csv(
                    member,
                    sep="\t", header=None, quoting=3, low_memory=False,
                    usecols=[DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I],
                    dtype={DATE_I:str, V2THEMES_I:str, V2LOCS_I:str, V2TONE_I:str}
                )
        docs_seen += process_df(df, counts, tone_sum, tone_n, uk_re, theme_re)
        processed_count += 1
        processed_urls_tail.append(url)
        if len(processed_urls_tail) > 20000:
            processed_urls_tail = processed_urls_tail[-20000:]
    except Exception as exc:
        # Best effort: one bad file must not stop the run, but record it so
        # gaps in the monthly totals are visible instead of silent.
        failed_urls.append(url)
        if len(failed_urls) <= FAILURE_DETAIL_LIMIT:
            print(f"[{YEAR}] skipped {url} ({type(exc).__name__}: {exc})")

    if i % PROGRESS_EVERY == 0:
        elapsed = max(1e-6, time.time()-t0)
        rate = i / elapsed
        eta_min = (len(todo)-i) / max(1e-6, rate) / 60
        print(f"[{YEAR}] {i}/{len(todo)} | {rate:.2f} files/s | ETA≈{eta_min:.1f}m | docs_kept={docs_seen:,}")

    if i % CHECKPOINT_EVERY == 0:
        write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)

# final write for the year
write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)
print(f"[{YEAR}] Done. This session processed: {len(todo):,} | Total processed so far: {processed_count:,} | Docs kept: {docs_seen:,}")
if failed_urls:
    # Failed URLs are not added to processed_urls_tail, so a later run that
    # keeps the state file will attempt them again.
    print(f"[{YEAR}] Files skipped after errors: {len(failed_urls):,}")

# ------------------------ Append to master ------------------------
# Merge this year's rows into MASTER_CSV; for a month present in both, the
# row from this run wins (keep="last"). Any error only skips the merge.
try:
    ydf = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")
    # sanity: ensure we really wrote YEAR
    years_found = set(ydf["month"].dt.year.dropna().unique().tolist())
    if years_found != {YEAR}:
        raise ValueError(f"Year mismatch: expected {YEAR}, found {sorted(years_found)} in {YEARLY_CSV}")

    if MASTER_CSV.exists():
        m = pd.read_csv(MASTER_CSV, parse_dates=["month"])
        m = (pd.concat([m, ydf], ignore_index=True)
               .drop_duplicates(subset="month", keep="last")
               .sort_values("month")
               .reset_index(drop=True))
    else:
        m = ydf
    m.to_csv(MASTER_CSV, index=False)
    print("Appended to master:", MASTER_CSV, m.shape)
except Exception as e:
    print("Append-to-master skipped:", e)

# ------------------------ Pretty print monthly for YEAR ------------------------
try:
    df = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")
    # build a complete monthly index for YEAR so missing months show as NaN
    full = (df.set_index("month")
              .reindex(pd.date_range(f"{YEAR}-01-01", f"{YEAR}-12-01", freq="MS"))
              .rename_axis("Month").reset_index())
    full["Docs"]    = pd.to_numeric(full["docs"], errors="coerce").round().astype("Int64")
    full["AvgTone"] = pd.to_numeric(full["avg_tone"], errors="coerce")
    print(full[["Month","Docs","AvgTone"]].to_string(index=False))
except Exception as e:
    print("Preview skipped:", e)

2021

In [None]:
!pip -q install pandas requests

from google.colab import drive
drive.mount('/content/drive')

from pathlib import Path
import re, io, zipfile, math, time, json, logging, requests, pandas as pd
from collections import defaultdict
from datetime import datetime
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# ------------------------ CONFIG ------------------------
YEAR = 2021                                    # calendar year handled by this cell
RESET_STATE = True                             # True: saved state + yearly CSV are deleted before planning
MAX_FILES = None                               # cap on files handled per session (None = no cap)
HOURLY_ONLY = True                             # keep only files whose timestamp minutes are "00"
CHECKPOINT_EVERY, PROGRESS_EVERY = 200, 100    # loop iterations between saves / progress lines

# Inclusive window derived from YEAR: YYYY-01-01 00:00:00 .. YYYY-12-31 23:59:59
START_TS, END_TS = f"{YEAR}0101000000", f"{YEAR}1231235959"

# Filters — both are compiled with re.I and applied as substring searches.
# NOTE(review): if the locations column carries FIPS country codes, "GB" may
# denote a country other than the UK — confirm against the GKG codebook.
UK_REGEX = r'#(?:GBR|GB|UK|United Kingdom|England|Scotland|Wales|Northern Ireland)#'
# NOTE(review): short stems such as RENT or HOUS also match inside longer
# theme tokens (e.g. "CURRENT", "WAREHOUSE") — confirm this is intended.
THEME_PATTERN = r'(?:HOUS|MORTGAG|REMORTGAG|RENT|TENAN|REAL[ _]?ESTATE|PROPERTY|HOME[ _]?PRIC|HOUSE[ _]?PRIC|RIGHTMOVE|ZOOPLA|LANDLORD|BUY[ -]?TO[ -]?LET)'

# GKG v2.1 columns (0-based indices)
DATE_I = 1
V2THEMES_I = 8
V2LOCS_I = 10
V2TONE_I = 15

# Paths
BASE_DIR = Path("/content/drive/MyDrive/msc_project/gdelt")  # change if you like
BASE_DIR.mkdir(parents=True, exist_ok=True)
OUT_DIR = BASE_DIR.joinpath(f"gdelt_run_{YEAR}")
OUT_DIR.mkdir(parents=True, exist_ok=True)
YEARLY_CSV = OUT_DIR.joinpath(f"gdelt_uk_housing_monthly_{YEAR}.csv")
STATE_PATH = OUT_DIR.joinpath("state.json")
MASTER_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"
MASTER_CSV = BASE_DIR.joinpath("gdelt_uk_housing_monthly_STITCHING.csv")  # multi-year file the yearly CSV is merged into

# ------------------------ Networking helpers ------------------------
# Silence urllib3 messages below ERROR
logging.getLogger("urllib3").setLevel(logging.ERROR)

def session():
    """Build a requests.Session with a custom User-Agent and retrying adapters.

    The http:// and https:// prefixes each get their own HTTPAdapter; both
    share one Retry policy (total=5, backoff_factor=0.5, retried statuses
    429/500/502/503/504).
    """
    retry_policy = Retry(total=5, backoff_factor=0.5, status_forcelist=[429,500,502,503,504])
    client = requests.Session()
    client.headers["User-Agent"] = "gdelt-colab/1.0"
    for prefix in ("http://", "https://"):
        client.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return client

def get(url, timeout=60, s=None):
    """Fetch *url* over plain HTTP and return the response.

    Args:
        url: Address to fetch; a leading "https://" is rewritten to "http://"
            (the original note: avoid SSL hiccups in Colab).
        timeout: Value forwarded as the request's ``timeout`` argument.
        s: Session-like object to use; when falsy a new one comes from ``session()``.

    Returns:
        The response object, after ``raise_for_status()`` has been called on it.
    """
    client = s if s else session()
    target = url
    if target.startswith("https://"):
        # Only the leading scheme is swapped; the rest of the URL is untouched.
        target = target.replace("https://", "http://", 1)
    response = client.get(target, timeout=timeout)
    response.raise_for_status()
    return response

# ------------------------ GDELT plumbing ------------------------
def build_gkg_urls():
    """Return the sorted list of GKG zip URLs inside the START_TS..END_TS window.

    The master file list is downloaded from MASTER_URL. Only lines with exactly
    three whitespace-separated fields are considered, and the third field must
    end in ``/gdeltv2/<14 digits>.gkg.csv.zip``. When HOURLY_ONLY is set, files
    whose timestamp minutes are not "00" are dropped. URLs are returned with
    "https://" replaced by "http://".
    """
    name_pattern = re.compile(r'/gdeltv2/(\d{14})\.gkg\.csv\.zip$')
    listing = get(MASTER_URL, timeout=90).text
    selected = []
    for row in listing.splitlines():
        fields = row.split()
        if len(fields) != 3:
            continue
        link = fields[2]
        hit = name_pattern.search(link)
        if hit is None:
            continue
        stamp = hit.group(1)
        if not (START_TS <= stamp <= END_TS):
            continue
        if HOURLY_ONLY and stamp[10:12] != "00":  # characters 10-11 are the minutes
            continue
        selected.append(link.replace("https://", "http://"))
    return sorted(selected)

def month_from_dateint(dateint):
    """Return the "YYYY-MM" key for a YYYYMMDDHHMMSS timestamp (int or str).

    Raises:
        ValueError: if the value does not parse with the "%Y%m%d%H%M%S" format.
    """
    parsed = datetime.strptime(str(dateint), "%Y%m%d%H%M%S")
    return "%04d-%02d" % (parsed.year, parsed.month)

def doc_tone(v2tone):
    """Return the first comma-separated field of *v2tone* as a float.

    V2Tone = "docTone, pos, neg, polarity, activityRefDensity,
    selfGroupRefDensity, wordCount"; only docTone is used. Any failure while
    converting yields ``math.nan`` instead of raising.
    """
    try:
        first_field, _, _ = str(v2tone).partition(",")
        tone = float(first_field)
    except Exception:
        tone = math.nan
    return tone

def process_df(df, counts, tone_sum, tone_n, uk_re, theme_re):
    """Fold one file's matching rows into the per-month accumulators.

    A row is kept when its locations column matches *uk_re* and its themes
    column matches *theme_re*. For every kept row the month's entry in
    *counts* is incremented; when the row's tone parses to a number it is also
    added to *tone_sum* and counted in *tone_n*.

    Returns:
        The number of rows kept from this frame.
    """
    in_uk = df[V2LOCS_I].astype(str).str.contains(uk_re, na=False, regex=True)
    on_theme = df[V2THEMES_I].astype(str).str.contains(theme_re, na=False, regex=True)
    hits = df.loc[in_uk & on_theme, [DATE_I, V2TONE_I]]
    if len(hits.index) == 0:
        return 0
    # Derive every key/tone first so a parse error leaves the accumulators untouched.
    month_keys = [month_from_dateint(stamp) for stamp in hits[DATE_I].astype(str)]
    tone_values = [doc_tone(raw) for raw in hits[V2TONE_I]]
    for key, tone in zip(month_keys, tone_values):
        counts[key] += 1
        if math.isnan(tone):
            continue
        tone_sum[key] += tone
        tone_n[key] += 1
    return len(month_keys)

def write_year_output(processed_count, processed_tail, counts, tone_sum, tone_n):
    """Write the yearly monthly-aggregate CSV and the resume-state JSON.

    Args:
        processed_count: Number of files processed so far (stored in the state).
        processed_tail: List of processed URLs; only the last 20000 are stored.
        counts: Mapping "YYYY-MM" -> number of kept documents.
        tone_sum: Mapping "YYYY-MM" -> sum of parsed tones.
        tone_n: Mapping "YYYY-MM" -> number of documents with a parsed tone.

    The CSV (YEARLY_CSV) has columns month ("YYYY-MM-01"), docs and avg_tone;
    avg_tone is empty for months without any parsed tone.
    """
    rows = []
    for m in sorted(counts.keys()):
        n = counts[m]
        avg = (tone_sum[m]/tone_n[m]) if tone_n[m] else None
        rows.append({"month": f"{m}-01", "docs": n, "avg_tone": avg})
    monthly = pd.DataFrame(rows, columns=["month","docs","avg_tone"]).sort_values("month")
    monthly.to_csv(YEARLY_CSV, index=False)
    state = {
        "processed_count": processed_count,
        "processed_urls_tail": processed_tail[-20000:],  # retain a long tail for resume
        "counts": counts, "tone_sum": tone_sum, "tone_n": tone_n
    }
    # Use a context manager so the state file is flushed and closed before
    # returning, instead of leaving the handle for the garbage collector.
    with open(STATE_PATH, "w") as state_file:
        json.dump(state, state_file)
    print(f"Saved → {YEARLY_CSV}")

# ------------------------ Reset (if previously wrong year) ------------------------
# NOTE: with RESET_STATE = True the state file is removed here, so the resume
# branch further down is not taken for this run.
if RESET_STATE:
    if STATE_PATH.exists():
        STATE_PATH.unlink()
    if YEARLY_CSV.exists():
        YEARLY_CSV.unlink()

# ------------------------ Plan + resume ------------------------
s = session()
urls_all = build_gkg_urls()
print(f"[{YEAR}] planned hourly files: {len(urls_all):,}")

if STATE_PATH.exists():
    # Load through a context manager so the state file handle is closed.
    with open(STATE_PATH) as state_file:
        state = json.load(state_file)
    processed_count     = int(state.get("processed_count", 0))
    processed_urls_tail = state.get("processed_urls_tail", [])
    counts   = defaultdict(int,   state.get("counts", {}))
    tone_sum = defaultdict(float, state.get("tone_sum", {}))
    tone_n   = defaultdict(int,   state.get("tone_n", {}))
    print(f"Resuming: {processed_count:,} files done previously.")
else:
    processed_count, processed_urls_tail = 0, []
    counts, tone_sum, tone_n = defaultdict(int), defaultdict(float), defaultdict(int)

# Skip URLs that an earlier session recorded as processed.
already = set(processed_urls_tail)
todo = [u for u in urls_all if u not in already]
if MAX_FILES is not None:
    todo = todo[:MAX_FILES]
print(f"[{YEAR}] processing this session: {len(todo):,} files")

uk_re = re.compile(UK_REGEX, re.I)
theme_re = re.compile(THEME_PATTERN, re.I)

# ------------------------ Main loop ------------------------
t0 = time.time()
docs_seen = 0
failed_urls = []            # URLs whose download or parse raised
FAILURE_DETAIL_LIMIT = 20   # print details only for the first few failures

for i, url in enumerate(todo, 1):
    try:
        payload = get(url, timeout=90, s=s).content
        # Context managers release the archive and member handles per file.
        with zipfile.ZipFile(io.BytesIO(payload)) as z:
            name = z.namelist()[0]
            with z.open(name) as member:
                df = pd.read_csv(
                    member,
                    sep="\t", header=None, quoting=3, low_memory=False,
                    usecols=[DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I],
                    dtype={DATE_I:str, V2THEMES_I:str, V2LOCS_I:str, V2TONE_I:str}
                )
        docs_seen += process_df(df, counts, tone_sum, tone_n, uk_re, theme_re)
        processed_count += 1
        processed_urls_tail.append(url)
        if len(processed_urls_tail) > 20000:
            processed_urls_tail = processed_urls_tail[-20000:]
    except Exception as exc:
        # Best effort: one bad file must not stop the run, but record it so
        # gaps in the monthly totals are visible instead of silent.
        failed_urls.append(url)
        if len(failed_urls) <= FAILURE_DETAIL_LIMIT:
            print(f"[{YEAR}] skipped {url} ({type(exc).__name__}: {exc})")

    if i % PROGRESS_EVERY == 0:
        elapsed = max(1e-6, time.time()-t0)
        rate = i / elapsed
        eta_min = (len(todo)-i) / max(1e-6, rate) / 60
        print(f"[{YEAR}] {i}/{len(todo)} | {rate:.2f} files/s | ETA≈{eta_min:.1f}m | docs_kept={docs_seen:,}")

    if i % CHECKPOINT_EVERY == 0:
        write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)

# final write for the year
write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)
print(f"[{YEAR}] Done. This session processed: {len(todo):,} | Total processed so far: {processed_count:,} | Docs kept: {docs_seen:,}")
if failed_urls:
    # Failed URLs are not added to processed_urls_tail, so a later run that
    # keeps the state file will attempt them again.
    print(f"[{YEAR}] Files skipped after errors: {len(failed_urls):,}")

# ------------------------ Append to master ------------------------
# Merge this year's rows into MASTER_CSV; for a month present in both, the
# row from this run wins (keep="last"). Any error only skips the merge.
try:
    ydf = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")
    # sanity: ensure we really wrote YEAR
    years_found = set(ydf["month"].dt.year.dropna().unique().tolist())
    if years_found != {YEAR}:
        raise ValueError(f"Year mismatch: expected {YEAR}, found {sorted(years_found)} in {YEARLY_CSV}")

    if MASTER_CSV.exists():
        m = pd.read_csv(MASTER_CSV, parse_dates=["month"])
        m = (pd.concat([m, ydf], ignore_index=True)
               .drop_duplicates(subset="month", keep="last")
               .sort_values("month")
               .reset_index(drop=True))
    else:
        m = ydf
    m.to_csv(MASTER_CSV, index=False)
    print("Appended to master:", MASTER_CSV, m.shape)
except Exception as e:
    print("Append-to-master skipped:", e)

# ------------------------ Pretty print monthly for YEAR ------------------------
try:
    df = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")
    # build a complete monthly index for YEAR so missing months show as NaN
    full = (df.set_index("month")
              .reindex(pd.date_range(f"{YEAR}-01-01", f"{YEAR}-12-01", freq="MS"))
              .rename_axis("Month").reset_index())
    full["Docs"]    = pd.to_numeric(full["docs"], errors="coerce").round().astype("Int64")
    full["AvgTone"] = pd.to_numeric(full["avg_tone"], errors="coerce")
    print(full[["Month","Docs","AvgTone"]].to_string(index=False))
except Exception as e:
    print("Preview skipped:", e)

2022

In [None]:
!pip -q install pandas requests

from google.colab import drive
drive.mount('/content/drive')

from pathlib import Path
import re, io, zipfile, math, time, json, logging, requests, pandas as pd
from collections import defaultdict
from datetime import datetime
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# ------------------------ CONFIG ------------------------
# Everything below is read as module-level globals by the helpers and the
# script sections further down in this cell.
BASE_DIR = Path("/content/drive/MyDrive/msc_project/gdelt")  # change if you like
BASE_DIR.mkdir(parents=True, exist_ok=True)

YEAR = 2022                     # <<< set the year you want to process
RESET_STATE = True              # <<< True deletes state.json and the yearly CSV before running (no resume); else False
MAX_FILES = None                # cap files processed this run; None = process all remaining
HOURLY_ONLY = True              # process top-of-hour files only (lighter than every 15 minutes)
CHECKPOINT_EVERY = 200          # write_year_output is called every this many loop iterations
PROGRESS_EVERY   = 100          # a progress/ETA line is printed every this many loop iterations

# Date window from YEAR (dynamic!)
# Compared as strings against the 14-digit timestamp in each GKG file name.
START_TS = f"{YEAR}0101000000"  # YYYY-01-01 00:00:00
END_TS   = f"{YEAR}1231235959"  # YYYY-12-31 23:59:59
# Filters
# Both patterns are compiled with re.I and applied with str.contains in process_df:
# UK_REGEX to column V2LOCS_I, THEME_PATTERN to column V2THEMES_I.
UK_REGEX = r'#(?:GBR|GB|UK|United Kingdom|England|Scotland|Wales|Northern Ireland)#'
THEME_PATTERN = r'(?:HOUS|MORTGAG|REMORTGAG|RENT|TENAN|REAL[ _]?ESTATE|PROPERTY|HOME[ _]?PRIC|HOUSE[ _]?PRIC|RIGHTMOVE|ZOOPLA|LANDLORD|BUY[ -]?TO[ -]?LET)'

# GKG v2.1 columns (0-based indices)
DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I = 1, 8, 10, 15

# Paths
OUT_DIR = BASE_DIR / f"gdelt_run_{YEAR}"
OUT_DIR.mkdir(parents=True, exist_ok=True)
YEARLY_CSV = OUT_DIR / f"gdelt_uk_housing_monthly_{YEAR}.csv"  # monthly docs / avg_tone for YEAR
STATE_PATH = OUT_DIR / "state.json"                            # resume state written at each checkpoint
MASTER_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"
MASTER_CSV = BASE_DIR / "gdelt_uk_housing_monthly_STITCHING.csv"  # multi-year file the yearly rows are merged into

# ------------------------ Networking helpers ------------------------
# Only let urllib3 log records at ERROR level or above through.
logging.getLogger("urllib3").setLevel(logging.ERROR)

def session():
    """Create a requests session with a custom User-Agent and retrying adapters.

    Each of the ``http://`` and ``https://`` prefixes gets its own
    ``HTTPAdapter`` configured for up to 5 retries (backoff factor 0.5) on
    status codes 429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429,500,502,503,504],
    )
    sess = requests.Session()
    sess.headers.update({"User-Agent": "gdelt-colab/1.0"})
    for prefix in ("http://", "https://"):
        sess.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return sess

def get(url, timeout=60, s=None):
    """Fetch *url* and return the response, raising on an HTTP error status.

    An ``https://`` URL is rewritten to ``http://`` before the request (avoids
    SSL hiccups in Colab). When ``s`` is not supplied (or is falsy) a fresh
    session is built with ``session()``.
    """
    client = s if s else session()
    secure_prefix = "https://"
    if url.startswith(secure_prefix):
        url = "http://" + url[len(secure_prefix):]
    response = client.get(url, timeout=timeout)
    response.raise_for_status()
    return response

# ------------------------ GDELT plumbing ------------------------
def build_gkg_urls():
    """Return the sorted GKG v2.1 .zip URLs whose timestamp lies in the window.

    Downloads the listing at MASTER_URL and keeps three-field lines whose URL
    ends in ``/gdeltv2/<14 digits>.gkg.csv.zip`` with START_TS <= digits <=
    END_TS. With HOURLY_ONLY set, only files whose minute digits are "00"
    are kept. Any ``https://`` in a kept URL is replaced by ``http://``.
    """
    gkg_name = re.compile(r'/gdeltv2/(\d{14})\.gkg\.csv\.zip$')
    listing = get(MASTER_URL, timeout=90).text
    selected = []
    for row in listing.splitlines():
        fields = row.split()
        if len(fields) != 3:
            continue
        hit = gkg_name.search(fields[2])
        if hit is None:
            continue
        stamp = hit.group(1)
        if not (START_TS <= stamp <= END_TS):
            continue
        if HOURLY_ONLY and stamp[10:12] != "00":  # only top-of-hour files
            continue
        selected.append(fields[2].replace("https://", "http://"))
    return sorted(selected)

def month_from_dateint(dateint):
    """Convert a ``YYYYMMDDHHMMSS`` value (int or str) to a ``"YYYY-MM"`` key.

    The value is stringified and parsed with ``datetime.strptime``, so a
    value that does not match the format raises ``ValueError``.
    """
    parsed = datetime.strptime(str(dateint), "%Y%m%d%H%M%S")
    return "{:04d}-{:02d}".format(parsed.year, parsed.month)

def doc_tone(v2tone):
    """Return the leading number of a V2Tone field as a float.

    V2Tone = "docTone, pos, neg, polarity, activityRefDensity,
    selfGroupRefDensity, wordCount". If the text before the first comma
    cannot be converted, ``math.nan`` is returned instead of raising.
    """
    try:
        leading = str(v2tone).partition(",")[0]
        return float(leading)
    except Exception:
        return math.nan

def process_df(df, counts, tone_sum, tone_n, uk_re, theme_re):
    """Fold the matching rows of one GKG frame into the monthly accumulators.

    A row is kept when column V2LOCS_I matches ``uk_re`` and column
    V2THEMES_I matches ``theme_re``. For every kept row the month key derived
    from column DATE_I gets ``counts`` incremented; when the row's tone is
    not NaN it is also added to ``tone_sum`` and counted in ``tone_n``.
    The three mappings are mutated in place.

    Returns:
        The number of rows kept from ``df``.
    """
    loc_hit = df[V2LOCS_I].astype(str).str.contains(uk_re, na=False, regex=True)
    theme_hit = df[V2THEMES_I].astype(str).str.contains(theme_re, na=False, regex=True)
    matched = df.loc[loc_hit & theme_hit, [DATE_I, V2TONE_I]]
    if matched.empty:
        return 0
    # both derived series are built before any accumulator is touched
    month_keys = matched[DATE_I].astype(str).apply(month_from_dateint)
    tone_vals = matched[V2TONE_I].apply(doc_tone)
    for key, tone in zip(month_keys, tone_vals):
        counts[key] += 1
        if math.isnan(tone):
            continue
        tone_sum[key] += tone
        tone_n[key] += 1
    return len(matched)

def write_year_output(processed_count, processed_tail, counts, tone_sum, tone_n):
    """Write the monthly CSV for the year and the JSON resume state.

    Args:
        processed_count: number of files processed so far (stored in the state).
        processed_tail: list of processed URLs; only the last 20,000 are stored.
        counts: mapping month key -> number of kept documents.
        tone_sum: mapping month key -> sum of the tones recorded for that month.
        tone_n: mapping month key -> how many tones were summed.

    Side effects:
        Overwrites YEARLY_CSV with columns ``month`` (``"<key>-01"``), ``docs``
        and ``avg_tone`` (empty when no tone was recorded for the month),
        replaces STATE_PATH, and prints a confirmation line.
    """
    rows = []
    for m in sorted(counts):
        n_tones = tone_n[m]
        rows.append({
            "month": f"{m}-01",
            "docs": counts[m],
            "avg_tone": (tone_sum[m] / n_tones) if n_tones else None,
        })
    monthly = pd.DataFrame(rows, columns=["month","docs","avg_tone"]).sort_values("month")
    monthly.to_csv(YEARLY_CSV, index=False)

    state = {
        "processed_count": processed_count,
        "processed_urls_tail": processed_tail[-20000:],  # retain a long tail for resume
        "counts": counts, "tone_sum": tone_sum, "tone_n": tone_n
    }
    # Write to a sibling temp file and rename it over the target: the handle is
    # always closed, and an interrupted run cannot leave a half-written
    # state.json that would break the next resume.
    tmp_path = STATE_PATH.with_name(STATE_PATH.name + ".tmp")
    with open(tmp_path, "w") as fh:
        json.dump(state, fh)
    tmp_path.replace(STATE_PATH)
    print(f"Saved → {YEARLY_CSV}")

# ------------------------ Reset (if previously wrong year) ------------------------
# With RESET_STATE set, the saved progress and the yearly CSV are deleted, so
# the resume branch below is not taken and the year is rebuilt from scratch.
if RESET_STATE:
    if STATE_PATH.exists():
        STATE_PATH.unlink()
    if YEARLY_CSV.exists():
        YEARLY_CSV.unlink()

# ------------------------ Plan + resume ------------------------
s = session()  # one shared session for every file download in the main loop
urls_all = build_gkg_urls()
print(f"[{YEAR}] planned hourly files: {len(urls_all):,}")

if STATE_PATH.exists():
    # read through a context manager so the file handle is closed right away
    with open(STATE_PATH) as fh:
        state = json.load(fh)
    processed_count     = int(state.get("processed_count", 0))
    processed_urls_tail = state.get("processed_urls_tail", [])
    # re-wrap the plain JSON dicts so unseen months still default to 0 / 0.0
    counts   = defaultdict(int,   state.get("counts", {}))
    tone_sum = defaultdict(float, state.get("tone_sum", {}))
    tone_n   = defaultdict(int,   state.get("tone_n", {}))
    print(f"Resuming: {processed_count:,} files done previously.")
else:
    processed_count, processed_urls_tail = 0, []
    counts, tone_sum, tone_n = defaultdict(int), defaultdict(float), defaultdict(int)

# skip every URL already recorded as processed; optionally cap this run
already = set(processed_urls_tail)
todo = [u for u in urls_all if u not in already]
if MAX_FILES is not None:
    todo = todo[:MAX_FILES]
print(f"[{YEAR}] processing this session: {len(todo):,} files")

# compiled once here and passed to process_df for every file
uk_re = re.compile(UK_REGEX, re.I)
theme_re = re.compile(THEME_PATTERN, re.I)

# ------------------------ Main loop ------------------------
# Download each planned file, fold its matching rows into the monthly
# accumulators, and checkpoint periodically. A file that fails is skipped
# (best-effort) but recorded, so the run reports what it could not process.
t0 = time.time()
docs_seen = 0
failed = []  # (url, reason) pairs; these URLs are not added to the resume tail

for i, url in enumerate(todo, 1):
    try:
        payload = get(url, timeout=90, s=s).content
        # context managers close the archive and its member even if parsing fails
        with zipfile.ZipFile(io.BytesIO(payload)) as z:
            name = z.namelist()[0]
            with z.open(name) as member:
                df = pd.read_csv(
                    member,
                    sep="\t", header=None, quoting=3, low_memory=False,
                    usecols=[DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I],
                    dtype={DATE_I:str, V2THEMES_I:str, V2LOCS_I:str, V2TONE_I:str}
                )
        docs_seen += process_df(df, counts, tone_sum, tone_n, uk_re, theme_re)
        processed_count += 1
        processed_urls_tail.append(url)
        if len(processed_urls_tail) > 20000:
            processed_urls_tail = processed_urls_tail[-20000:]
    except Exception as exc:
        # a bad download/zip/parse must not stop the run, but it is no longer silent
        failed.append((url, f"{type(exc).__name__}: {exc}"))

    if i % PROGRESS_EVERY == 0:
        elapsed = max(1e-6, time.time()-t0)
        rate = i / elapsed
        eta_min = (len(todo)-i) / max(1e-6, rate) / 60
        print(f"[{YEAR}] {i}/{len(todo)} | {rate:.2f} files/s | ETA≈{eta_min:.1f}m | docs_kept={docs_seen:,}")

    if i % CHECKPOINT_EVERY == 0:
        write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)

# final write for the year
write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)
print(f"[{YEAR}] Done. This session processed: {len(todo):,} | Total processed so far: {processed_count:,} | Docs kept: {docs_seen:,}")
if failed:
    # failed URLs are absent from processed_urls_tail, so a resumed run picks them up again
    print(f"[{YEAR}] WARNING: {len(failed):,} of {len(todo):,} files failed and were skipped; rerun with RESET_STATE = False to retry them. First failures:")
    for bad_url, reason in failed[:5]:
        print(f"    {bad_url} -> {reason}")

# ------------------------ Append to master ------------------------
# Merge this year's monthly rows into the multi-year master CSV (best-effort:
# any failure is reported and the master is left untouched).
try:
    ydf = pd.read_csv(YEARLY_CSV, parse_dates=["month"])
    ydf = ydf.sort_values("month")
    # sanity: the yearly file must hold rows for YEAR and for no other year
    years_found = set(ydf["month"].dt.year.dropna().unique().tolist())
    if years_found != {YEAR}:
        raise ValueError(f"Year mismatch: expected {YEAR}, found {sorted(years_found)} in {YEARLY_CSV}")

    if not MASTER_CSV.exists():
        # nothing stitched yet: the master is just this year's rows
        m = ydf
    else:
        # stack old + new; for a month present in both, the row from ydf wins
        m = pd.read_csv(MASTER_CSV, parse_dates=["month"])
        m = pd.concat([m, ydf], ignore_index=True)
        m = m.drop_duplicates(subset="month", keep="last")
        m = m.sort_values("month").reset_index(drop=True)
    m.to_csv(MASTER_CSV, index=False)
    print("Appended to master:", MASTER_CSV, m.shape)
except Exception as e:
    print("Append-to-master skipped:", e)

# ------------------------ Pretty print monthly for YEAR ------------------------
# Show one line per calendar month of YEAR; months absent from the CSV print as NaN.
try:
    df = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")
    year_months = pd.date_range(f"{YEAR}-01-01", f"{YEAR}-12-01", freq="MS")
    full = df.set_index("month").reindex(year_months)
    full = full.rename_axis("Month").reset_index()
    docs_numeric = pd.to_numeric(full["docs"], errors="coerce")
    full["Docs"] = docs_numeric.round().astype("Int64")  # nullable int keeps missing months
    full["AvgTone"] = pd.to_numeric(full["avg_tone"], errors="coerce")
    print(full[["Month","Docs","AvgTone"]].to_string(index=False))
except Exception as e:
    print("Preview skipped:", e)



Mounted at /content/drive
[2022] planned hourly files: 8,759
[2022] processing this session: 8,759 files
[2022] 100/8759 | 1.87 files/s | ETA≈77.0m | docs_kept=1,105
[2022] 200/8759 | 1.63 files/s | ETA≈87.7m | docs_kept=2,397
Saved → /content/drive/MyDrive/msc_project/gdelt/gdelt_run_2022/gdelt_uk_housing_monthly_2022.csv
[2022] 300/8759 | 1.58 files/s | ETA≈89.4m | docs_kept=3,681
[2022] 400/8759 | 1.63 files/s | ETA≈85.6m | docs_kept=4,779
Saved → /content/drive/MyDrive/msc_project/gdelt/gdelt_run_2022/gdelt_uk_housing_monthly_2022.csv
[2022] 500/8759 | 1.57 files/s | ETA≈87.6m | docs_kept=6,773
[2022] 600/8759 | 1.60 files/s | ETA≈84.8m | docs_kept=8,119
Saved → /content/drive/MyDrive/msc_project/gdelt/gdelt_run_2022/gdelt_uk_housing_monthly_2022.csv
[2022] 700/8759 | 1.62 files/s | ETA≈83.1m | docs_kept=9,261
[2022] 800/8759 | 1.62 files/s | ETA≈82.0m | docs_kept=10,709
Saved → /content/drive/MyDrive/msc_project/gdelt/gdelt_run_2022/gdelt_uk_housing_monthly_2022.csv
[2022] 900/875

2023

In [None]:
!pip -q install pandas requests

from google.colab import drive
drive.mount('/content/drive')

from pathlib import Path
import re, io, zipfile, math, time, json, logging, requests, pandas as pd
from collections import defaultdict
from datetime import datetime
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# ------------------------ CONFIG ------------------------
# Everything below is read as module-level globals by the helpers and the
# script sections further down in this cell.
BASE_DIR = Path("/content/drive/MyDrive/msc_project/gdelt")
BASE_DIR.mkdir(parents=True, exist_ok=True)

YEAR = 2023               # year to process
RESET_STATE = True        # True deletes state.json and the yearly CSV before running (no resume)
MAX_FILES = None          # cap on files handled this run (todo[:MAX_FILES]); None = no cap
HOURLY_ONLY = True        # keep only files whose minute digits are "00" (see build_gkg_urls)
CHECKPOINT_EVERY = 200    # write_year_output is called every this many loop iterations
PROGRESS_EVERY   = 100    # a progress/ETA line is printed every this many loop iterations

# Date window from YEAR (dynamic!)
# Compared as strings against the 14-digit timestamp in each GKG file name.
START_TS = f"{YEAR}0101000000"  # YYYY-01-01 00:00:00
END_TS   = f"{YEAR}1231235959"  # YYYY-12-31 23:59:59
# Filters
# Both patterns are compiled with re.I and applied with str.contains in process_df:
# UK_REGEX to column V2LOCS_I, THEME_PATTERN to column V2THEMES_I.
UK_REGEX = r'#(?:GBR|GB|UK|United Kingdom|England|Scotland|Wales|Northern Ireland)#'
THEME_PATTERN = r'(?:HOUS|MORTGAG|REMORTGAG|RENT|TENAN|REAL[ _]?ESTATE|PROPERTY|HOME[ _]?PRIC|HOUSE[ _]?PRIC|RIGHTMOVE|ZOOPLA|LANDLORD|BUY[ -]?TO[ -]?LET)'

# GKG v2.1 columns (0-based indices)
DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I = 1, 8, 10, 15

# Paths
OUT_DIR = BASE_DIR / f"gdelt_run_{YEAR}"
OUT_DIR.mkdir(parents=True, exist_ok=True)
YEARLY_CSV = OUT_DIR / f"gdelt_uk_housing_monthly_{YEAR}.csv"  # monthly docs / avg_tone for YEAR
STATE_PATH = OUT_DIR / "state.json"                            # resume state written at each checkpoint
MASTER_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"
MASTER_CSV = BASE_DIR / "gdelt_uk_housing_monthly_STITCHING.csv"  # multi-year file the yearly rows are merged into

# ------------------------ Networking helpers ------------------------
# Only let urllib3 log records at ERROR level or above through.
logging.getLogger("urllib3").setLevel(logging.ERROR)

def session():
    """Create a requests session with a custom User-Agent and retrying adapters.

    Each of the ``http://`` and ``https://`` prefixes gets its own
    ``HTTPAdapter`` configured for up to 5 retries (backoff factor 0.5) on
    status codes 429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429,500,502,503,504],
    )
    sess = requests.Session()
    sess.headers.update({"User-Agent": "gdelt-colab/1.0"})
    for prefix in ("http://", "https://"):
        sess.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return sess

def get(url, timeout=60, s=None):
    """Fetch *url* and return the response, raising on an HTTP error status.

    An ``https://`` URL is rewritten to ``http://`` before the request (avoids
    SSL hiccups in Colab). When ``s`` is not supplied (or is falsy) a fresh
    session is built with ``session()``.
    """
    client = s if s else session()
    secure_prefix = "https://"
    if url.startswith(secure_prefix):
        url = "http://" + url[len(secure_prefix):]
    response = client.get(url, timeout=timeout)
    response.raise_for_status()
    return response

# ------------------------ GDELT plumbing ------------------------
def build_gkg_urls():
    """Return the sorted GKG v2.1 .zip URLs whose timestamp lies in the window.

    Downloads the listing at MASTER_URL and keeps three-field lines whose URL
    ends in ``/gdeltv2/<14 digits>.gkg.csv.zip`` with START_TS <= digits <=
    END_TS. With HOURLY_ONLY set, only files whose minute digits are "00"
    are kept. Any ``https://`` in a kept URL is replaced by ``http://``.
    """
    gkg_name = re.compile(r'/gdeltv2/(\d{14})\.gkg\.csv\.zip$')
    listing = get(MASTER_URL, timeout=90).text
    selected = []
    for row in listing.splitlines():
        fields = row.split()
        if len(fields) != 3:
            continue
        hit = gkg_name.search(fields[2])
        if hit is None:
            continue
        stamp = hit.group(1)
        if not (START_TS <= stamp <= END_TS):
            continue
        if HOURLY_ONLY and stamp[10:12] != "00":  # only top-of-hour files
            continue
        selected.append(fields[2].replace("https://", "http://"))
    return sorted(selected)

def month_from_dateint(dateint):
    """Convert a ``YYYYMMDDHHMMSS`` value (int or str) to a ``"YYYY-MM"`` key.

    The value is stringified and parsed with ``datetime.strptime``, so a
    value that does not match the format raises ``ValueError``.
    """
    parsed = datetime.strptime(str(dateint), "%Y%m%d%H%M%S")
    return "{:04d}-{:02d}".format(parsed.year, parsed.month)

def doc_tone(v2tone):
    """Return the leading number of a V2Tone field as a float.

    V2Tone = "docTone, pos, neg, polarity, activityRefDensity,
    selfGroupRefDensity, wordCount". If the text before the first comma
    cannot be converted, ``math.nan`` is returned instead of raising.
    """
    try:
        leading = str(v2tone).partition(",")[0]
        return float(leading)
    except Exception:
        return math.nan

def process_df(df, counts, tone_sum, tone_n, uk_re, theme_re):
    """Fold the matching rows of one GKG frame into the monthly accumulators.

    A row is kept when column V2LOCS_I matches ``uk_re`` and column
    V2THEMES_I matches ``theme_re``. For every kept row the month key derived
    from column DATE_I gets ``counts`` incremented; when the row's tone is
    not NaN it is also added to ``tone_sum`` and counted in ``tone_n``.
    The three mappings are mutated in place.

    Returns:
        The number of rows kept from ``df``.
    """
    loc_hit = df[V2LOCS_I].astype(str).str.contains(uk_re, na=False, regex=True)
    theme_hit = df[V2THEMES_I].astype(str).str.contains(theme_re, na=False, regex=True)
    matched = df.loc[loc_hit & theme_hit, [DATE_I, V2TONE_I]]
    if matched.empty:
        return 0
    # both derived series are built before any accumulator is touched
    month_keys = matched[DATE_I].astype(str).apply(month_from_dateint)
    tone_vals = matched[V2TONE_I].apply(doc_tone)
    for key, tone in zip(month_keys, tone_vals):
        counts[key] += 1
        if math.isnan(tone):
            continue
        tone_sum[key] += tone
        tone_n[key] += 1
    return len(matched)

def write_year_output(processed_count, processed_tail, counts, tone_sum, tone_n):
    """Write the monthly CSV for the year and the JSON resume state.

    Args:
        processed_count: number of files processed so far (stored in the state).
        processed_tail: list of processed URLs; only the last 20,000 are stored.
        counts: mapping month key -> number of kept documents.
        tone_sum: mapping month key -> sum of the tones recorded for that month.
        tone_n: mapping month key -> how many tones were summed.

    Side effects:
        Overwrites YEARLY_CSV with columns ``month`` (``"<key>-01"``), ``docs``
        and ``avg_tone`` (empty when no tone was recorded for the month),
        replaces STATE_PATH, and prints a confirmation line.
    """
    rows = []
    for m in sorted(counts):
        n_tones = tone_n[m]
        rows.append({
            "month": f"{m}-01",
            "docs": counts[m],
            "avg_tone": (tone_sum[m] / n_tones) if n_tones else None,
        })
    monthly = pd.DataFrame(rows, columns=["month","docs","avg_tone"]).sort_values("month")
    monthly.to_csv(YEARLY_CSV, index=False)

    state = {
        "processed_count": processed_count,
        "processed_urls_tail": processed_tail[-20000:],  # retain a long tail for resume
        "counts": counts, "tone_sum": tone_sum, "tone_n": tone_n
    }
    # Write to a sibling temp file and rename it over the target: the handle is
    # always closed, and an interrupted run cannot leave a half-written
    # state.json that would break the next resume.
    tmp_path = STATE_PATH.with_name(STATE_PATH.name + ".tmp")
    with open(tmp_path, "w") as fh:
        json.dump(state, fh)
    tmp_path.replace(STATE_PATH)
    print(f"Saved → {YEARLY_CSV}")

# ------------------------ Reset (if previously wrong year) ------------------------
# With RESET_STATE set, the saved progress and the yearly CSV are deleted, so
# the resume branch below is not taken and the year is rebuilt from scratch.
if RESET_STATE:
    if STATE_PATH.exists():
        STATE_PATH.unlink()
    if YEARLY_CSV.exists():
        YEARLY_CSV.unlink()

# ------------------------ Plan + resume ------------------------
s = session()  # one shared session for every file download in the main loop
urls_all = build_gkg_urls()
print(f"[{YEAR}] planned hourly files: {len(urls_all):,}")

if STATE_PATH.exists():
    # read through a context manager so the file handle is closed right away
    with open(STATE_PATH) as fh:
        state = json.load(fh)
    processed_count     = int(state.get("processed_count", 0))
    processed_urls_tail = state.get("processed_urls_tail", [])
    # re-wrap the plain JSON dicts so unseen months still default to 0 / 0.0
    counts   = defaultdict(int,   state.get("counts", {}))
    tone_sum = defaultdict(float, state.get("tone_sum", {}))
    tone_n   = defaultdict(int,   state.get("tone_n", {}))
    print(f"Resuming: {processed_count:,} files done previously.")
else:
    processed_count, processed_urls_tail = 0, []
    counts, tone_sum, tone_n = defaultdict(int), defaultdict(float), defaultdict(int)

# skip every URL already recorded as processed; optionally cap this run
already = set(processed_urls_tail)
todo = [u for u in urls_all if u not in already]
if MAX_FILES is not None:
    todo = todo[:MAX_FILES]
print(f"[{YEAR}] processing this session: {len(todo):,} files")

# compiled once here and passed to process_df for every file
uk_re = re.compile(UK_REGEX, re.I)
theme_re = re.compile(THEME_PATTERN, re.I)

# ------------------------ Main loop ------------------------
# Download each planned file, fold its matching rows into the monthly
# accumulators, and checkpoint periodically. A file that fails is skipped
# (best-effort) but recorded, so the run reports what it could not process.
t0 = time.time()
docs_seen = 0
failed = []  # (url, reason) pairs; these URLs are not added to the resume tail

for i, url in enumerate(todo, 1):
    try:
        payload = get(url, timeout=90, s=s).content
        # context managers close the archive and its member even if parsing fails
        with zipfile.ZipFile(io.BytesIO(payload)) as z:
            name = z.namelist()[0]
            with z.open(name) as member:
                df = pd.read_csv(
                    member,
                    sep="\t", header=None, quoting=3, low_memory=False,
                    usecols=[DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I],
                    dtype={DATE_I:str, V2THEMES_I:str, V2LOCS_I:str, V2TONE_I:str}
                )
        docs_seen += process_df(df, counts, tone_sum, tone_n, uk_re, theme_re)
        processed_count += 1
        processed_urls_tail.append(url)
        if len(processed_urls_tail) > 20000:
            processed_urls_tail = processed_urls_tail[-20000:]
    except Exception as exc:
        # a bad download/zip/parse must not stop the run, but it is no longer silent
        failed.append((url, f"{type(exc).__name__}: {exc}"))

    if i % PROGRESS_EVERY == 0:
        elapsed = max(1e-6, time.time()-t0)
        rate = i / elapsed
        eta_min = (len(todo)-i) / max(1e-6, rate) / 60
        print(f"[{YEAR}] {i}/{len(todo)} | {rate:.2f} files/s | ETA≈{eta_min:.1f}m | docs_kept={docs_seen:,}")

    if i % CHECKPOINT_EVERY == 0:
        write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)

# final write for the year
write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)
print(f"[{YEAR}] Done. This session processed: {len(todo):,} | Total processed so far: {processed_count:,} | Docs kept: {docs_seen:,}")
if failed:
    # failed URLs are absent from processed_urls_tail, so a resumed run picks them up again
    print(f"[{YEAR}] WARNING: {len(failed):,} of {len(todo):,} files failed and were skipped; rerun with RESET_STATE = False to retry them. First failures:")
    for bad_url, reason in failed[:5]:
        print(f"    {bad_url} -> {reason}")

# ------------------------ Append to master ------------------------
# Merge this year's monthly rows into the multi-year master CSV (best-effort:
# any failure is reported and the master is left untouched).
try:
    ydf = pd.read_csv(YEARLY_CSV, parse_dates=["month"])
    ydf = ydf.sort_values("month")
    # sanity: the yearly file must hold rows for YEAR and for no other year
    years_found = set(ydf["month"].dt.year.dropna().unique().tolist())
    if years_found != {YEAR}:
        raise ValueError(f"Year mismatch: expected {YEAR}, found {sorted(years_found)} in {YEARLY_CSV}")

    if not MASTER_CSV.exists():
        # nothing stitched yet: the master is just this year's rows
        m = ydf
    else:
        # stack old + new; for a month present in both, the row from ydf wins
        m = pd.read_csv(MASTER_CSV, parse_dates=["month"])
        m = pd.concat([m, ydf], ignore_index=True)
        m = m.drop_duplicates(subset="month", keep="last")
        m = m.sort_values("month").reset_index(drop=True)
    m.to_csv(MASTER_CSV, index=False)
    print("Appended to master:", MASTER_CSV, m.shape)
except Exception as e:
    print("Append-to-master skipped:", e)

# ------------------------ Pretty print monthly for YEAR ------------------------
# Show one line per calendar month of YEAR; months absent from the CSV print as NaN.
try:
    df = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")
    year_months = pd.date_range(f"{YEAR}-01-01", f"{YEAR}-12-01", freq="MS")
    full = df.set_index("month").reindex(year_months)
    full = full.rename_axis("Month").reset_index()
    docs_numeric = pd.to_numeric(full["docs"], errors="coerce")
    full["Docs"] = docs_numeric.round().astype("Int64")  # nullable int keeps missing months
    full["AvgTone"] = pd.to_numeric(full["avg_tone"], errors="coerce")
    print(full[["Month","Docs","AvgTone"]].to_string(index=False))
except Exception as e:
    print("Preview skipped:", e)



2024

In [None]:
!pip -q install pandas requests

from google.colab import drive
drive.mount('/content/drive')

from pathlib import Path
import re, io, zipfile, math, time, json, logging, requests, pandas as pd
from collections import defaultdict
from datetime import datetime
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# ------------------------ CONFIG ------------------------
# Everything below is read as module-level globals by the helpers and the
# script sections further down in this cell.
BASE_DIR = Path("/content/drive/MyDrive/msc_project/gdelt")  # change if you like
BASE_DIR.mkdir(parents=True, exist_ok=True)

YEAR = 2024                     # <<< set the year you want to process
RESET_STATE = True              # <<< True deletes state.json and the yearly CSV before running (no resume); else False
MAX_FILES = None                # cap files processed this run; None = process all remaining
HOURLY_ONLY = True              # process top-of-hour files only (lighter than every 15 minutes)
CHECKPOINT_EVERY = 200          # write_year_output is called every this many loop iterations
PROGRESS_EVERY   = 100          # a progress/ETA line is printed every this many loop iterations

# Date window from YEAR (dynamic!)
# Compared as strings against the 14-digit timestamp in each GKG file name.
START_TS = f"{YEAR}0101000000"  # YYYY-01-01 00:00:00
END_TS   = f"{YEAR}1231235959"  # YYYY-12-31 23:59:59
# Filters
# Both patterns are compiled with re.I and applied with str.contains in process_df:
# UK_REGEX to column V2LOCS_I, THEME_PATTERN to column V2THEMES_I.
UK_REGEX = r'#(?:GBR|GB|UK|United Kingdom|England|Scotland|Wales|Northern Ireland)#'
THEME_PATTERN = r'(?:HOUS|MORTGAG|REMORTGAG|RENT|TENAN|REAL[ _]?ESTATE|PROPERTY|HOME[ _]?PRIC|HOUSE[ _]?PRIC|RIGHTMOVE|ZOOPLA|LANDLORD|BUY[ -]?TO[ -]?LET)'

# GKG v2.1 columns (0-based indices)
DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I = 1, 8, 10, 15

# Paths
OUT_DIR = BASE_DIR / f"gdelt_run_{YEAR}"
OUT_DIR.mkdir(parents=True, exist_ok=True)
YEARLY_CSV = OUT_DIR / f"gdelt_uk_housing_monthly_{YEAR}.csv"  # monthly docs / avg_tone for YEAR
STATE_PATH = OUT_DIR / "state.json"                            # resume state written at each checkpoint
MASTER_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"
MASTER_CSV = BASE_DIR / "gdelt_uk_housing_monthly_STITCHING.csv"  # multi-year file the yearly rows are merged into

# ------------------------ Networking helpers ------------------------
# Only let urllib3 log records at ERROR level or above through.
logging.getLogger("urllib3").setLevel(logging.ERROR)

def session():
    """Create a requests session with a custom User-Agent and retrying adapters.

    Each of the ``http://`` and ``https://`` prefixes gets its own
    ``HTTPAdapter`` configured for up to 5 retries (backoff factor 0.5) on
    status codes 429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429,500,502,503,504],
    )
    sess = requests.Session()
    sess.headers.update({"User-Agent": "gdelt-colab/1.0"})
    for prefix in ("http://", "https://"):
        sess.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return sess

def get(url, timeout=60, s=None):
    """Fetch *url* and return the response, raising on an HTTP error status.

    An ``https://`` URL is rewritten to ``http://`` before the request (avoids
    SSL hiccups in Colab). When ``s`` is not supplied (or is falsy) a fresh
    session is built with ``session()``.
    """
    client = s if s else session()
    secure_prefix = "https://"
    if url.startswith(secure_prefix):
        url = "http://" + url[len(secure_prefix):]
    response = client.get(url, timeout=timeout)
    response.raise_for_status()
    return response

# ------------------------ GDELT plumbing ------------------------
def build_gkg_urls():
    """Return the sorted GKG v2.1 .zip URLs whose timestamp lies in the window.

    Downloads the listing at MASTER_URL and keeps three-field lines whose URL
    ends in ``/gdeltv2/<14 digits>.gkg.csv.zip`` with START_TS <= digits <=
    END_TS. With HOURLY_ONLY set, only files whose minute digits are "00"
    are kept. Any ``https://`` in a kept URL is replaced by ``http://``.
    """
    gkg_name = re.compile(r'/gdeltv2/(\d{14})\.gkg\.csv\.zip$')
    listing = get(MASTER_URL, timeout=90).text
    selected = []
    for row in listing.splitlines():
        fields = row.split()
        if len(fields) != 3:
            continue
        hit = gkg_name.search(fields[2])
        if hit is None:
            continue
        stamp = hit.group(1)
        if not (START_TS <= stamp <= END_TS):
            continue
        if HOURLY_ONLY and stamp[10:12] != "00":  # only top-of-hour files
            continue
        selected.append(fields[2].replace("https://", "http://"))
    return sorted(selected)

def month_from_dateint(dateint):
    """Convert a ``YYYYMMDDHHMMSS`` value (int or str) to a ``"YYYY-MM"`` key.

    The value is stringified and parsed with ``datetime.strptime``, so a
    value that does not match the format raises ``ValueError``.
    """
    parsed = datetime.strptime(str(dateint), "%Y%m%d%H%M%S")
    return "{:04d}-{:02d}".format(parsed.year, parsed.month)

def doc_tone(v2tone):
    """Return the leading number of a V2Tone field as a float.

    V2Tone = "docTone, pos, neg, polarity, activityRefDensity,
    selfGroupRefDensity, wordCount". If the text before the first comma
    cannot be converted, ``math.nan`` is returned instead of raising.
    """
    try:
        leading = str(v2tone).partition(",")[0]
        return float(leading)
    except Exception:
        return math.nan

def process_df(df, counts, tone_sum, tone_n, uk_re, theme_re):
    """Fold the matching rows of one GKG frame into the monthly accumulators.

    A row is kept when column V2LOCS_I matches ``uk_re`` and column
    V2THEMES_I matches ``theme_re``. For every kept row the month key derived
    from column DATE_I gets ``counts`` incremented; when the row's tone is
    not NaN it is also added to ``tone_sum`` and counted in ``tone_n``.
    The three mappings are mutated in place.

    Returns:
        The number of rows kept from ``df``.
    """
    loc_hit = df[V2LOCS_I].astype(str).str.contains(uk_re, na=False, regex=True)
    theme_hit = df[V2THEMES_I].astype(str).str.contains(theme_re, na=False, regex=True)
    matched = df.loc[loc_hit & theme_hit, [DATE_I, V2TONE_I]]
    if matched.empty:
        return 0
    # both derived series are built before any accumulator is touched
    month_keys = matched[DATE_I].astype(str).apply(month_from_dateint)
    tone_vals = matched[V2TONE_I].apply(doc_tone)
    for key, tone in zip(month_keys, tone_vals):
        counts[key] += 1
        if math.isnan(tone):
            continue
        tone_sum[key] += tone
        tone_n[key] += 1
    return len(matched)

def write_year_output(processed_count, processed_tail, counts, tone_sum, tone_n):
    """Write the monthly CSV for the year and the JSON resume state.

    Args:
        processed_count: number of files processed so far (stored in the state).
        processed_tail: list of processed URLs; only the last 20,000 are stored.
        counts: mapping month key -> number of kept documents.
        tone_sum: mapping month key -> sum of the tones recorded for that month.
        tone_n: mapping month key -> how many tones were summed.

    Side effects:
        Overwrites YEARLY_CSV with columns ``month`` (``"<key>-01"``), ``docs``
        and ``avg_tone`` (empty when no tone was recorded for the month),
        replaces STATE_PATH, and prints a confirmation line.
    """
    rows = []
    for m in sorted(counts):
        n_tones = tone_n[m]
        rows.append({
            "month": f"{m}-01",
            "docs": counts[m],
            "avg_tone": (tone_sum[m] / n_tones) if n_tones else None,
        })
    monthly = pd.DataFrame(rows, columns=["month","docs","avg_tone"]).sort_values("month")
    monthly.to_csv(YEARLY_CSV, index=False)

    state = {
        "processed_count": processed_count,
        "processed_urls_tail": processed_tail[-20000:],  # retain a long tail for resume
        "counts": counts, "tone_sum": tone_sum, "tone_n": tone_n
    }
    # Write to a sibling temp file and rename it over the target: the handle is
    # always closed, and an interrupted run cannot leave a half-written
    # state.json that would break the next resume.
    tmp_path = STATE_PATH.with_name(STATE_PATH.name + ".tmp")
    with open(tmp_path, "w") as fh:
        json.dump(state, fh)
    tmp_path.replace(STATE_PATH)
    print(f"Saved → {YEARLY_CSV}")

# ------------------------ Reset (if previously wrong year) ------------------------
# With RESET_STATE set, the saved progress and the yearly CSV are deleted, so
# the resume branch below is not taken and the year is rebuilt from scratch.
if RESET_STATE:
    if STATE_PATH.exists():
        STATE_PATH.unlink()
    if YEARLY_CSV.exists():
        YEARLY_CSV.unlink()

# ------------------------ Plan + resume ------------------------
s = session()  # one shared session for every file download in the main loop
urls_all = build_gkg_urls()
print(f"[{YEAR}] planned hourly files: {len(urls_all):,}")

if STATE_PATH.exists():
    # read through a context manager so the file handle is closed right away
    with open(STATE_PATH) as fh:
        state = json.load(fh)
    processed_count     = int(state.get("processed_count", 0))
    processed_urls_tail = state.get("processed_urls_tail", [])
    # re-wrap the plain JSON dicts so unseen months still default to 0 / 0.0
    counts   = defaultdict(int,   state.get("counts", {}))
    tone_sum = defaultdict(float, state.get("tone_sum", {}))
    tone_n   = defaultdict(int,   state.get("tone_n", {}))
    print(f"Resuming: {processed_count:,} files done previously.")
else:
    processed_count, processed_urls_tail = 0, []
    counts, tone_sum, tone_n = defaultdict(int), defaultdict(float), defaultdict(int)

# skip every URL already recorded as processed; optionally cap this run
already = set(processed_urls_tail)
todo = [u for u in urls_all if u not in already]
if MAX_FILES is not None:
    todo = todo[:MAX_FILES]
print(f"[{YEAR}] processing this session: {len(todo):,} files")

# compiled once here and passed to process_df for every file
uk_re = re.compile(UK_REGEX, re.I)
theme_re = re.compile(THEME_PATTERN, re.I)

# ------------------------ Main loop ------------------------
# Download each planned file, fold its matching rows into the monthly
# accumulators, and checkpoint periodically. A file that fails is skipped
# (best-effort) but recorded, so the run reports what it could not process.
t0 = time.time()
docs_seen = 0
failed = []  # (url, reason) pairs; these URLs are not added to the resume tail

for i, url in enumerate(todo, 1):
    try:
        payload = get(url, timeout=90, s=s).content
        # context managers close the archive and its member even if parsing fails
        with zipfile.ZipFile(io.BytesIO(payload)) as z:
            name = z.namelist()[0]
            with z.open(name) as member:
                df = pd.read_csv(
                    member,
                    sep="\t", header=None, quoting=3, low_memory=False,
                    usecols=[DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I],
                    dtype={DATE_I:str, V2THEMES_I:str, V2LOCS_I:str, V2TONE_I:str}
                )
        docs_seen += process_df(df, counts, tone_sum, tone_n, uk_re, theme_re)
        processed_count += 1
        processed_urls_tail.append(url)
        if len(processed_urls_tail) > 20000:
            processed_urls_tail = processed_urls_tail[-20000:]
    except Exception as exc:
        # a bad download/zip/parse must not stop the run, but it is no longer silent
        failed.append((url, f"{type(exc).__name__}: {exc}"))

    if i % PROGRESS_EVERY == 0:
        elapsed = max(1e-6, time.time()-t0)
        rate = i / elapsed
        eta_min = (len(todo)-i) / max(1e-6, rate) / 60
        print(f"[{YEAR}] {i}/{len(todo)} | {rate:.2f} files/s | ETA≈{eta_min:.1f}m | docs_kept={docs_seen:,}")

    if i % CHECKPOINT_EVERY == 0:
        write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)

# final write for the year
write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)
print(f"[{YEAR}] Done. This session processed: {len(todo):,} | Total processed so far: {processed_count:,} | Docs kept: {docs_seen:,}")
if failed:
    # failed URLs are absent from processed_urls_tail, so a resumed run picks them up again
    print(f"[{YEAR}] WARNING: {len(failed):,} of {len(todo):,} files failed and were skipped; rerun with RESET_STATE = False to retry them. First failures:")
    for bad_url, reason in failed[:5]:
        print(f"    {bad_url} -> {reason}")

# ------------------------ Append to master ------------------------
# Merge this year's monthly rows into the multi-year master CSV (best-effort:
# any failure is reported and the master is left untouched).
try:
    ydf = pd.read_csv(YEARLY_CSV, parse_dates=["month"])
    ydf = ydf.sort_values("month")
    # sanity: the yearly file must hold rows for YEAR and for no other year
    years_found = set(ydf["month"].dt.year.dropna().unique().tolist())
    if years_found != {YEAR}:
        raise ValueError(f"Year mismatch: expected {YEAR}, found {sorted(years_found)} in {YEARLY_CSV}")

    if not MASTER_CSV.exists():
        # nothing stitched yet: the master is just this year's rows
        m = ydf
    else:
        # stack old + new; for a month present in both, the row from ydf wins
        m = pd.read_csv(MASTER_CSV, parse_dates=["month"])
        m = pd.concat([m, ydf], ignore_index=True)
        m = m.drop_duplicates(subset="month", keep="last")
        m = m.sort_values("month").reset_index(drop=True)
    m.to_csv(MASTER_CSV, index=False)
    print("Appended to master:", MASTER_CSV, m.shape)
except Exception as e:
    print("Append-to-master skipped:", e)

# ------------------------ Pretty print monthly for YEAR ------------------------
# Show one line per calendar month of YEAR; months absent from the CSV print as NaN.
try:
    df = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")
    year_months = pd.date_range(f"{YEAR}-01-01", f"{YEAR}-12-01", freq="MS")
    full = df.set_index("month").reindex(year_months)
    full = full.rename_axis("Month").reset_index()
    docs_numeric = pd.to_numeric(full["docs"], errors="coerce")
    full["Docs"] = docs_numeric.round().astype("Int64")  # nullable int keeps missing months
    full["AvgTone"] = pd.to_numeric(full["avg_tone"], errors="coerce")
    print(full[["Month","Docs","AvgTone"]].to_string(index=False))
except Exception as e:
    print("Preview skipped:", e)
# ========================= end =========================


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[2024] planned hourly files: 8,784
[2024] processing this session: 8,784 files
[2024] 100/8784 | 1.29 files/s | ETA≈112.3m | docs_kept=2,007
[2024] 200/8784 | 1.34 files/s | ETA≈106.6m | docs_kept=3,537
Saved → /content/drive/MyDrive/msc_project/gdelt/gdelt_run_2024/gdelt_uk_housing_monthly_2024.csv
[2024] 300/8784 | 1.24 files/s | ETA≈114.4m | docs_kept=6,104
[2024] 400/8784 | 1.26 files/s | ETA≈111.1m | docs_kept=7,980
Saved → /content/drive/MyDrive/msc_project/gdelt/gdelt_run_2024/gdelt_uk_housing_monthly_2024.csv
[2024] 500/8784 | 1.23 files/s | ETA≈112.0m | docs_kept=10,277
[2024] 600/8784 | 1.20 files/s | ETA≈113.9m | docs_kept=12,709
Saved → /content/drive/MyDrive/msc_project/gdelt/gdelt_run_2024/gdelt_uk_housing_monthly_2024.csv
[2024] 700/8784 | 1.22 files/s | ETA≈110.2m | docs_kept=14,558
[2024] 800/8784 | 1.20 files/s | ETA≈110.5m | docs_kept=16,85

2025

In [None]:
!pip -q install pandas requests

from google.colab import drive
drive.mount('/content/drive')

from pathlib import Path
import re, io, zipfile, math, time, json, logging, requests, pandas as pd
from collections import defaultdict
from datetime import datetime
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# ------------------------ CONFIG ------------------------
# Everything below is read as module-level globals by the helpers further
# down in this cell.
BASE_DIR = Path("/content/drive/MyDrive/msc_project/gdelt")
BASE_DIR.mkdir(parents=True, exist_ok=True)

YEAR = 2025               # year to process
RESET_STATE = True        # True deletes state.json and the yearly CSV before running (see Reset section)
MAX_FILES = None          # cap on files processed this run; None = process all remaining
HOURLY_ONLY = True        # keep only files whose minute digits are "00" (see build_gkg_urls)
CHECKPOINT_EVERY = 200    # checkpoint interval in files (presumably used by the main loop, as in the other year cells)
PROGRESS_EVERY   = 100    # progress-print interval in files (presumably used by the main loop, as in the other year cells)

# Date window from YEAR (dynamic!)
# Compared as strings against the 14-digit timestamp in each GKG file name.
# NOTE: for this year the window deliberately stops at the end of June.
START_TS = f"{YEAR}0101000000"  # YYYY-01-01 00:00:00
END_TS   = f"{YEAR}0630235959"  # YYYY-06-30 23:59:59 (half-year window, not year end)
# Filters
# Both patterns are compiled with re.I by the caller and applied with
# str.contains in process_df: UK_REGEX to column V2LOCS_I, THEME_PATTERN to
# column V2THEMES_I.
UK_REGEX = r'#(?:GBR|GB|UK|United Kingdom|England|Scotland|Wales|Northern Ireland)#'
THEME_PATTERN = r'(?:HOUS|MORTGAG|REMORTGAG|RENT|TENAN|REAL[ _]?ESTATE|PROPERTY|HOME[ _]?PRIC|HOUSE[ _]?PRIC|RIGHTMOVE|ZOOPLA|LANDLORD|BUY[ -]?TO[ -]?LET)'

# GKG v2.1 columns (0-based indices)
DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I = 1, 8, 10, 15

# Paths
OUT_DIR = BASE_DIR / f"gdelt_run_{YEAR}"
OUT_DIR.mkdir(parents=True, exist_ok=True)
YEARLY_CSV = OUT_DIR / f"gdelt_uk_housing_monthly_{YEAR}.csv"  # monthly docs / avg_tone for YEAR
STATE_PATH = OUT_DIR / "state.json"                            # resume state written by write_year_output
MASTER_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"
MASTER_CSV = BASE_DIR / "gdelt_uk_housing_monthly_STITCHING.csv"  # multi-year file (the other year cells merge their yearly rows into it)

# ------------------------ Networking helpers ------------------------
# Only let urllib3 log records at ERROR level or above through.
logging.getLogger("urllib3").setLevel(logging.ERROR)

def session():
    """Create a requests session with a custom User-Agent and retrying adapters.

    Each of the ``http://`` and ``https://`` prefixes gets its own
    ``HTTPAdapter`` configured for up to 5 retries (backoff factor 0.5) on
    status codes 429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429,500,502,503,504],
    )
    sess = requests.Session()
    sess.headers.update({"User-Agent": "gdelt-colab/1.0"})
    for prefix in ("http://", "https://"):
        sess.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return sess

def get(url, timeout=60, s=None):
    """Fetch *url* and return the response, raising on an HTTP error status.

    An ``https://`` URL is rewritten to ``http://`` before the request (avoids
    SSL hiccups in Colab). When ``s`` is not supplied (or is falsy) a fresh
    session is built with ``session()``.
    """
    client = s if s else session()
    secure_prefix = "https://"
    if url.startswith(secure_prefix):
        url = "http://" + url[len(secure_prefix):]
    response = client.get(url, timeout=timeout)
    response.raise_for_status()
    return response

# ------------------------ GDELT plumbing ------------------------
def build_gkg_urls():
    """Return the sorted GKG v2.1 .zip URLs whose timestamp lies in the window.

    Downloads the listing at MASTER_URL and keeps three-field lines whose URL
    ends in ``/gdeltv2/<14 digits>.gkg.csv.zip`` with START_TS <= digits <=
    END_TS. With HOURLY_ONLY set, only files whose minute digits are "00"
    are kept. Any ``https://`` in a kept URL is replaced by ``http://``.
    """
    gkg_name = re.compile(r'/gdeltv2/(\d{14})\.gkg\.csv\.zip$')
    listing = get(MASTER_URL, timeout=90).text
    selected = []
    for row in listing.splitlines():
        fields = row.split()
        if len(fields) != 3:
            continue
        hit = gkg_name.search(fields[2])
        if hit is None:
            continue
        stamp = hit.group(1)
        if not (START_TS <= stamp <= END_TS):
            continue
        if HOURLY_ONLY and stamp[10:12] != "00":  # only top-of-hour files
            continue
        selected.append(fields[2].replace("https://", "http://"))
    return sorted(selected)

def month_from_dateint(dateint):
    """Turn a YYYYMMDDHHMMSS timestamp (int or str) into a 'YYYY-MM' label.

    Raises ValueError (from strptime) when the value does not match that format.
    """
    parsed = datetime.strptime(str(dateint), "%Y%m%d%H%M%S")
    return "{:04d}-{:02d}".format(parsed.year, parsed.month)

def doc_tone(v2tone):
    """Extract the leading (document tone) number from a V2Tone field.

    V2Tone = "docTone, pos, neg, polarity, activityRefDensity, selfGroupRefDensity, wordCount"
    Returns the first comma-separated value as a float, or NaN when it cannot
    be converted.
    """
    try:
        leading = str(v2tone).partition(",")[0]
        tone = float(leading)
    except Exception:
        tone = math.nan
    return tone

def process_df(df, counts, tone_sum, tone_n, uk_re, theme_re):
    """Fold one GKG file's matching rows into the monthly accumulators.

    A row matches when its V2Locations column matches ``uk_re`` and its
    V2Themes column matches ``theme_re``. For every matching row the month's
    entry in ``counts`` is incremented; when the row's document tone parses to
    a number it is also added to ``tone_sum`` and counted in ``tone_n``.
    The three accumulators are mutated in place.

    Returns the number of matching rows (0 when there are none).
    """
    is_uk = df[V2LOCS_I].astype(str).str.contains(uk_re, na=False, regex=True)
    on_theme = df[V2THEMES_I].astype(str).str.contains(theme_re, na=False, regex=True)
    matched = df.loc[is_uk & on_theme, [DATE_I, V2TONE_I]]
    if matched.empty:
        return 0

    # Both conversions finish before any accumulator is touched.
    month_labels = matched[DATE_I].astype(str).apply(month_from_dateint)
    tone_values = matched[V2TONE_I].apply(doc_tone)

    for label, tone in zip(month_labels, tone_values):
        counts[label] += 1
        if math.isnan(tone):
            continue  # unparseable tone: count the document, skip the average
        tone_sum[label] += tone
        tone_n[label] += 1
    return len(matched)

def write_year_output(processed_count, processed_tail, counts, tone_sum, tone_n):
    """Write the monthly CSV for YEAR and checkpoint the resume state.

    processed_count: number of files processed so far (stored in the state).
    processed_tail:  processed URLs; only the last 20,000 are stored.
    counts:          month label ('YYYY-MM') -> matching document count.
    tone_sum:        month label -> sum of parsed document tones.
    tone_n:          month label -> number of tones in that sum.

    The CSV (YEARLY_CSV) gets one row per month in ``counts`` with columns
    month ('YYYY-MM-01'), docs and avg_tone (empty when no tone was recorded).
    The state (STATE_PATH) is written to a sibling temporary file and then
    moved into place, so an interrupted run cannot leave a truncated
    checkpoint behind.
    """
    rows = []
    for m in sorted(counts):
        # .get keeps this writer from inserting keys into the accumulators
        n_tone = tone_n.get(m, 0)
        avg = (tone_sum.get(m, 0.0) / n_tone) if n_tone else None
        rows.append({"month": f"{m}-01", "docs": counts[m], "avg_tone": avg})
    monthly = pd.DataFrame(rows, columns=["month","docs","avg_tone"]).sort_values("month")
    monthly.to_csv(YEARLY_CSV, index=False)

    state = {
        "processed_count": processed_count,
        "processed_urls_tail": processed_tail[-20000:],  # retain a long tail for resume
        "counts": counts, "tone_sum": tone_sum, "tone_n": tone_n
    }
    tmp_path = STATE_PATH.with_name(STATE_PATH.name + ".tmp")
    with open(tmp_path, "w") as fh:  # closed (and flushed) before the swap
        json.dump(state, fh)
    tmp_path.replace(STATE_PATH)
    print(f"Saved → {YEARLY_CSV}")

# ------------------------ Reset (if previously wrong year) ------------------------
# When a reset is requested, delete the checkpoint and the yearly CSV if present.
if RESET_STATE:
    for stale_path in (STATE_PATH, YEARLY_CSV):
        if stale_path.exists():
            stale_path.unlink()

# ------------------------ Plan + resume ------------------------
# One shared session for all per-file downloads in the main loop.
s = session()
urls_all = build_gkg_urls()
print(f"[{YEAR}] planned hourly files: {len(urls_all):,}")

if STATE_PATH.exists():
    # Read the checkpoint inside a context manager so the handle is closed
    # right away instead of being left to the garbage collector.
    with open(STATE_PATH) as state_file:
        state = json.load(state_file)
    processed_count     = int(state.get("processed_count", 0))
    processed_urls_tail = state.get("processed_urls_tail", [])
    # Rebuild the accumulators as defaultdicts so unseen months start at zero.
    counts   = defaultdict(int,   state.get("counts", {}))
    tone_sum = defaultdict(float, state.get("tone_sum", {}))
    tone_n   = defaultdict(int,   state.get("tone_n", {}))
    print(f"Resuming: {processed_count:,} files done previously.")
else:
    processed_count, processed_urls_tail = 0, []
    counts, tone_sum, tone_n = defaultdict(int), defaultdict(float), defaultdict(int)

# NOTE(review): only the last 20,000 processed URLs are persisted, so a plan
# with more files than that would re-queue the oldest ones on resume.
already = set(processed_urls_tail)
todo = [u for u in urls_all if u not in already]
if MAX_FILES is not None:
    todo = todo[:MAX_FILES]
print(f"[{YEAR}] processing this session: {len(todo):,} files")

# Both filters are matched case-insensitively.
uk_re = re.compile(UK_REGEX, re.I)
theme_re = re.compile(THEME_PATTERN, re.I)

# ------------------------ Main loop ------------------------
t0 = time.time()
docs_seen = 0
failed = []  # (url, reason) for every file skipped in this session

for i, url in enumerate(todo, 1):
    try:
        raw = io.BytesIO(get(url, timeout=90, s=s).content)
        # Context managers close the archive and its member even if parsing fails.
        with zipfile.ZipFile(raw) as z:
            name = z.namelist()[0]
            with z.open(name) as member:
                df = pd.read_csv(
                    member,
                    sep="\t", header=None, quoting=3, low_memory=False,
                    usecols=[DATE_I, V2THEMES_I, V2LOCS_I, V2TONE_I],
                    dtype={DATE_I:str, V2THEMES_I:str, V2LOCS_I:str, V2TONE_I:str}
                )
        docs_seen += process_df(df, counts, tone_sum, tone_n, uk_re, theme_re)
        processed_count += 1
        processed_urls_tail.append(url)
        if len(processed_urls_tail) > 20000:
            processed_urls_tail = processed_urls_tail[-20000:]
    except Exception as exc:
        # Best-effort: one bad file must not abort the run. The URL is not
        # marked as processed, so a later resume will try it again; the reason
        # is recorded so failures are visible rather than silently dropped.
        failed.append((url, f"{type(exc).__name__}: {exc}"))

    if i % PROGRESS_EVERY == 0:
        elapsed = max(1e-6, time.time()-t0)
        rate = i / elapsed
        eta_min = (len(todo)-i) / max(1e-6, rate) / 60
        print(f"[{YEAR}] {i}/{len(todo)} | {rate:.2f} files/s | ETA≈{eta_min:.1f}m | docs_kept={docs_seen:,} | failed={len(failed):,}")

    if i % CHECKPOINT_EVERY == 0:
        write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)

# final write for the year
write_year_output(processed_count, processed_urls_tail, counts, tone_sum, tone_n)
print(f"[{YEAR}] Done. This session processed: {len(todo):,} | Total processed so far: {processed_count:,} | Docs kept: {docs_seen:,}")
if failed:
    print(f"[{YEAR}] {len(failed):,} file(s) failed and were skipped; they will be retried on the next run. First few:")
    for bad_url, reason in failed[:10]:
        print(f"  {bad_url} -> {reason}")

# ------------------------ Append to master ------------------------
try:
    ydf = pd.read_csv(YEARLY_CSV, parse_dates=["month"])
    ydf = ydf.sort_values("month")

    # Refuse to merge unless every row of the yearly file belongs to YEAR.
    years_found = set(ydf["month"].dt.year.dropna().unique().tolist())
    if years_found != {YEAR}:
        raise ValueError(f"Year mismatch: expected {YEAR}, found {sorted(years_found)} in {YEARLY_CSV}")

    if not MASTER_CSV.exists():
        m = ydf
    else:
        m = pd.read_csv(MASTER_CSV, parse_dates=["month"])
        combined = pd.concat([m, ydf], ignore_index=True)
        # On a month collision the row from this year's file (appended last) wins.
        deduped = combined.drop_duplicates(subset="month", keep="last")
        m = deduped.sort_values("month").reset_index(drop=True)
    m.to_csv(MASTER_CSV, index=False)
    print("Appended to master:", MASTER_CSV, m.shape)
except Exception as e:
    # Any failure here only skips the merge; the yearly CSV is already written.
    print("Append-to-master skipped:", e)

# ------------------------ Pretty print monthly for YEAR ------------------------
try:
    df = pd.read_csv(YEARLY_CSV, parse_dates=["month"]).sort_values("month")
    # Reindex onto all twelve month starts of YEAR so months without data
    # still get a row (with missing values).
    month_starts = pd.date_range(f"{YEAR}-01-01", f"{YEAR}-12-01", freq="MS")
    full = df.set_index("month").reindex(month_starts)
    full = full.rename_axis("Month").reset_index()
    docs_numeric = pd.to_numeric(full["docs"], errors="coerce")
    full["Docs"]    = docs_numeric.round().astype("Int64")  # nullable int keeps gaps
    full["AvgTone"] = pd.to_numeric(full["avg_tone"], errors="coerce")
    print(full[["Month","Docs","AvgTone"]].to_string(index=False))
except Exception as e:
    print("Preview skipped:", e)

