In [1]:
# build_modelling_dataset.py

In [3]:
import pandas as pd

# Load the two inputs: per-LSOA static features and the dynamic crime table.
static = pd.read_csv("feature_static_london.csv")
crime_dyn = pd.read_csv("crime_rolling6m.csv")

# Left-join the static features onto every crime row (key: LSOA_Code), then
# discard any row that still contains at least one missing value.
panel = pd.merge(crime_dyn, static, how="left", on="LSOA_Code").dropna(axis=0, how="any")

# Persist the merged panel and report its final (rows, columns) shape.
panel.to_csv("modelling_dataset.csv", index=False)
print("✓ modelling_dataset.csv", panel.shape)

✓ modelling_dataset.csv (851499, 28)


In [4]:
import pandas as pd

# Old "<domain>_pct" column names -> new "dep_<domain>" names.
rename_map = {
    "income_pct":     "dep_income",
    "employment_pct": "dep_employment",
    "education_pct":  "dep_education",
    "health_pct":     "dep_health",
    "barriers_pct":   "dep_barriers",
    "livingenv_pct":  "dep_livingenv",
}

# Rewrite the IMD table in place (same filename) with the new column names.
imd = pd.read_csv("imd_london.csv").rename(columns=rename_map)
imd.to_csv("imd_london.csv", index=False)
# NOTE: this message is emitted before the modelling dataset below is rewritten.
print("✓ columns renamed and saved")

# Apply the same mapping to the modelling dataset and overwrite it as well.
# Columns that are not present in a table are left untouched by rename().
panel = pd.read_csv("modelling_dataset.csv").rename(columns=rename_map)
panel.to_csv("modelling_dataset.csv", index=False)

✓ columns renamed and saved


In [5]:
# build_modelling_dataset_ready.py
import pandas as pd
import numpy as np

CRIME_FILE  = "crime_rolling6m.csv"
STATIC_FILE = "feature_static_london.csv"

# 1| load both tables; the LSOA key is read as a pandas string column in each
key_dtype = {"LSOA_Code": "string"}
crime  = pd.read_csv(CRIME_FILE, parse_dates=["date"], dtype=key_dtype)
static = pd.read_csv(STATIC_FILE, dtype=key_dtype)

# 2| pick the per-category count columns: the "_roll6" ones when the rolling
#    file is the input, otherwise the "cnt_" columns without that suffix
uses_roll6 = CRIME_FILE.endswith("crime_rolling6m.csv")
cat_cols = [
    c for c in crime.columns
    if c.startswith("cnt_") and c.endswith("_roll6") == uses_roll6
]

# 3| row-wise total over the selected categories, plus calendar features
crime = crime.assign(
    cnt_total=lambda d: d[cat_cols].sum(axis=1),
    year=lambda d: d["date"].dt.year,
    month=lambda d: d["date"].dt.month,
)
# month mapped onto the unit circle (period 12) for sin/cos seasonality terms
month_angle = 2*np.pi*crime["month"]/12.0
crime = crime.assign(sin_m=np.sin(month_angle), cos_m=np.cos(month_angle))

# previous-month total per LSOA; only built for the monthly snapshot table,
# with the first month of each LSOA filled with 0
if CRIME_FILE.endswith("crime_monthly_wide.csv"):
    crime = crime.sort_values(["LSOA_Code","date"])
    prev_total = crime.groupby("LSOA_Code")["cnt_total"].shift(1)
    crime["cnt_total_lag1"] = prev_total.fillna(0)

# 4| attach static features to every crime row
panel = pd.merge(crime, static, how="left", on="LSOA_Code")

# sanity checks (reported only; nothing is dropped or raised here)
print("shape:", panel.shape)
print("missing values:", panel.isna().sum().sum())

# 5| targets: regression target is the total itself; the class flag marks rows
#    strictly above the 75th percentile of that total
thr = panel["cnt_total"].quantile(0.75)
panel["y_reg"] = panel["cnt_total"]
panel["y_cls"] = panel["cnt_total"].gt(thr).astype(int)

# 6| save and summarise
panel.to_csv("modelling_dataset_ready.csv", index=False)
print("✓ saved modelling_dataset_ready.csv", panel.shape)
print("date range:", panel["date"].min(), "→", panel["date"].max())
print("positive class share (y_cls=1):", round(panel["y_cls"].mean(), 3))

shape: (851499, 33)
missing values: 0
✓ saved modelling_dataset_ready.csv (851499, 35)
date range: 2010-04-01 00:00:00 → 2025-06-01 00:00:00
positive class share (y_cls=1): 0.248


In [6]:
# Build a leakage-free panel:
#   X: lagged (t-1) rolling-6m features + seasonality + static
#   y: next-month (t+1) total count & high-risk flag
import pandas as pd
import numpy as np

ROLL6   = "crime_rolling6m.csv"
MONTHLY = "crime_monthly_wide.csv"
STATIC  = "feature_static_london.csv"

KEYS       = ["LSOA_Code", "date"]          # panel key: one row per LSOA per month
SPLIT_DATE = pd.Timestamp("2020-01-01")     # rows on/after this date form the test set

# 1| FEATURES: rolling-6m at t-1 (lagged)
feat = (
    pd.read_csv(ROLL6, parse_dates=["date"], dtype={"LSOA_Code": "string"})
      .sort_values(KEYS)
)
roll_cols = [c for c in feat.columns if c.startswith("cnt_") and c.endswith("_roll6")]
if not roll_cols:
    # Without any rolling columns the dropna() below removes no rows and the
    # panel would silently carry no crime features, so fail loudly instead.
    raise ValueError(f"no cnt_*_roll6 columns found in {ROLL6}")

# Create 1-lag versions so that X_t only uses info up to t-1. All rolling
# columns are shifted in a single grouped call; shift(1) moves by one ROW
# within each LSOA, which equals one month only if the file has no gaps
# (NOTE(review): assumes a gap-free monthly panel -- confirm upstream).
lagged = feat.groupby("LSOA_Code")[roll_cols].shift(1).add_suffix("_lag1")
lag_cols = list(lagged.columns)

# Keep only keys + lagged columns; the first row of every LSOA has no lag and
# is removed by dropna().
feat = pd.concat([feat[KEYS], lagged], axis=1).dropna()

# seasonality: month encoded on the unit circle (period 12)
feat["month"] = feat["date"].dt.month
feat["sin_m"] = np.sin(2*np.pi*feat["month"]/12.0)
feat["cos_m"] = np.cos(2*np.pi*feat["month"]/12.0)

# 2| TARGETS: next-month totals (t+1) from monthly-wide
# FIX: sort by (LSOA_Code, date) before the grouped shift. shift(-1) takes the
# following ROW of each group, so on an unsorted file y_next would not be the
# next month's total.
mon = (
    pd.read_csv(MONTHLY, parse_dates=["date"], dtype={"LSOA_Code": "string"})
      .sort_values(KEYS)
)
mon_cols = [c for c in mon.columns if c.startswith("cnt_") and not c.endswith("_roll6")]
mon["cnt_total"] = mon[mon_cols].sum(axis=1)

# y_next aligns with features at time t; the last row of each LSOA has no
# successor and is dropped.
mon["y_next"] = mon.groupby("LSOA_Code")["cnt_total"].shift(-1)
target = mon[KEYS + ["y_next"]].dropna()

# 3| MERGE: X (lagged roll6) + static + y_next
# validate= makes pandas raise a MergeError if a key is duplicated, instead of
# silently multiplying rows.
static = pd.read_csv(STATIC, dtype={"LSOA_Code": "string"})
df = (feat.merge(static, on="LSOA_Code", how="left", validate="many_to_one")
          .merge(target, on=KEYS, how="inner", validate="one_to_one"))

# classification target (global 75% threshold on y_next)
# NOTE(review): the quantile is taken over all rows, including the test period.
thr = df["y_next"].quantile(0.75)
df["y_cls"] = (df["y_next"] > thr).astype(int)

# remove ID/time cols from X later; save dataset
df.to_csv("modelling_dataset_no_leak.csv", index=False)
print("Saved:", df.shape, "→ modelling_dataset_no_leak.csv")

# 4| BASELINES (classification)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

# time-based split: everything before SPLIT_DATE trains, the rest is held out
train = df[df["date"] < SPLIT_DATE]
test  = df[df["date"] >= SPLIT_DATE]

# identifiers and both targets must not be used as predictors
drop_cols = ["LSOA_Code","date","y_next","y_cls"]
X_train = train.drop(columns=drop_cols)
y_train = train["y_cls"]
X_test  = test.drop(columns=drop_cols)
y_test  = test["y_cls"]

# Logistic Regression
# NOTE(review): features are passed unscaled; consider standardising them for
# the linear model (tree models such as the forest below do not need it).
logit = LogisticRegression(max_iter=1000, n_jobs=-1)
logit.fit(X_train, y_train)
print("Logistic AUC:", round(roc_auc_score(y_test, logit.predict_proba(X_test)[:,1]), 3))

# Random Forest (fixed random_state for reproducible results)
rf = RandomForestClassifier(n_estimators=400, n_jobs=-1, random_state=42)
rf.fit(X_train, y_train)
print("RF AUC:", round(roc_auc_score(y_test, rf.predict_proba(X_test)[:,1]), 3))


Saved: (842193, 33) → modelling_dataset_no_leak.csv
Logistic AUC: 0.915
RF AUC: 0.917
