<a href="https://colab.research.google.com/github/NigelWilliamUOP/vibe-coding/blob/main/Passport_bro_05b_event_study_design_early_treatment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 05b — Event-study design (revised): **early high-confidence policing** as treatment
This revision fixes the failure mode seen in Notebook 05: almost all *active* threads have *some* policing at the default threshold, so the only untreated threads were effectively dead threads.

**Treatment definition (default):** a thread is **treated** if its maximum `policing_prob` in the first **K=10** interactions is ≥ **0.70**.
Controls are threads (size ≥ `MIN_THREAD_SIZE`) where max(prob) in first K interactions is < threshold.

**Inputs**: `policing_labels.parquet`, `thread_events.parquet` (from 05), and optionally `author_month_authority.parquet`, `raw.parquet`.

**Outputs**:
- `artefacts/thread_events_v2.parquet`
- `artefacts/matched_threads_v2.parquet`
- `artefacts/matching_diagnostics_v2.json`


In [8]:
# --- Install deps (Colab-safe) ---
!pip -q install -U pyarrow tqdm scikit-learn

import sys, platform, json, math
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Record interpreter, OS and key library versions so the outputs below can be
# tied to a concrete environment.
env_report = {
    "Python": sys.version.split()[0],
    "Platform": platform.platform(),
    "pandas": pd.__version__,
    "sklearn": __import__("sklearn").__version__,
    "pyarrow": __import__("pyarrow").__version__,
}
for env_label, env_value in env_report.items():
    print(f"{env_label}:", env_value)


Python: 3.12.12
Platform: Linux-6.6.105+-x86_64-with-glibc2.35
pandas: 2.2.2
sklearn: 1.8.0
pyarrow: 22.0.0


## 1) Locate and load inputs

In [9]:
# Directory that receives every output written by this notebook; created
# (including parents) if it does not exist yet.
ARTEFACT_DIR = Path("/content") / "artefacts"
ARTEFACT_DIR.mkdir(exist_ok=True, parents=True)

def find_file(name: str, candidates, search_root="/content"):
    """Locate an input file by trying known paths, then searching a directory tree.

    Args:
        name: File name used for the recursive fallback search.
        candidates: Iterable of paths (str or Path), tried in the given order.
        search_root: Directory scanned recursively when no candidate matches.
            Defaults to "/content", the location this notebook has always searched.

    Returns:
        The first candidate that is an existing regular file; otherwise the
        first file named `name` under `search_root` in sorted path order;
        otherwise None.
    """
    for candidate in candidates:
        candidate = Path(candidate)
        # is_file() rather than exists(): a directory with the same name is not a usable input.
        if candidate.is_file():
            return candidate

    # rglob yields entries in filesystem order; sort so that repeated runs pick
    # the same file when several copies exist.
    hits = sorted(p for p in Path(search_root).rglob(name) if p.is_file())
    return hits[0] if hits else None

# Preferred locations for each required input, tried in order before
# find_file falls back to its recursive search.
_input_candidates = {
    "policing_labels.parquet": [
        "/content/artefacts/policing_labels.parquet",
        "/content/policing_labels.parquet",
    ],
    "thread_events.parquet": [
        "/content/artefacts/thread_events.parquet",
        "/content/thread_events.parquet",
    ],
}
_resolved_inputs = {
    fname: find_file(fname, paths) for fname, paths in _input_candidates.items()
}

POLICING_PATH = _resolved_inputs["policing_labels.parquet"]
THREAD_EVENTS_PATH = _resolved_inputs["thread_events.parquet"]

POLICING_PATH, THREAD_EVENTS_PATH


(PosixPath('/content/policing_labels.parquet'),
 PosixPath('/content/thread_events.parquet'))

In [10]:
# Optional upload (uncomment if needed)
# from google.colab import files
# uploaded = files.upload()
# list(uploaded.keys())


In [11]:
# Fail fast, naming the first required input that could not be located.
required_inputs = {
    "policing_labels.parquet": POLICING_PATH,
    "thread_events.parquet": THREAD_EVENTS_PATH,
}
first_missing = next((name for name, p in required_inputs.items() if p is None), None)
if first_missing is not None:
    raise FileNotFoundError(f"{first_missing} not found. Upload it or mount Drive, then rerun.")
print("Found inputs.")


Found inputs.


In [12]:
# Read both inputs and report their raw shapes before any filtering.
pol = pd.read_parquet(POLICING_PATH, engine="pyarrow")
te = pd.read_parquet(THREAD_EVENTS_PATH, engine="pyarrow")

for frame_label, frame in (("policing_labels:", pol), ("thread_events:", te)):
    print(frame_label, frame.shape)

# Normalise dtypes; unparseable timestamps become NaT.
pol = pol.assign(
    date_dt=pd.to_datetime(pol["date_dt"], errors="coerce"),
    type=pol["type"].astype("string"),
)

# Keep only comment/reply rows, order them within each thread, and number
# them 1, 2, 3, ... in that order.
is_interaction = pol["type"].str.lower().isin(["comment", "reply"])
pol = pol.loc[is_interaction].sort_values(["root_submission_id", "date_dt", "id"])
pol["i_in_thread"] = pol.groupby("root_submission_id").cumcount() + 1

# Use the same string dtype for the thread key in both frames.
te = te.assign(
    thread_start_dt=pd.to_datetime(te["thread_start_dt"], errors="coerce"),
    root_submission_id=te["root_submission_id"].astype("string"),
)
pol["root_submission_id"] = pol["root_submission_id"].astype("string")


policing_labels: (75811, 10)
thread_events: (989, 19)


## 2) Re-define treatment: early max(prob) in first K interactions

In [13]:
# --- Config ---
MIN_THREAD_SIZE = 20        # keep: focus on active threads
K_EARLY = 10                # early window by interaction count
PROB_THRESHOLD = 0.70       # "high-confidence" policing cutoff
REQUIRE_AT_LEAST_K = True   # if True, only include threads with >=K interactions

# Thread sizes (taken from thread_events, looked up per labelled interaction)
sizes = te.set_index("root_submission_id")["thread_size"]
pol["thread_size"] = pol["root_submission_id"].map(sizes)

# Early window: the first K_EARLY interactions of sufficiently large threads
pol_early = pol[(pol["thread_size"] >= MIN_THREAD_SIZE) & (pol["i_in_thread"] <= K_EARLY)].copy()
early_by_thread = pol_early.groupby("root_submission_id")

# Max policing probability per thread inside the early window (NaN values are skipped)
max_early = early_by_thread["policing_prob"].max().rename("max_policing_prob_firstK")

# Eligibility
eligible_threads = max_early.index
if REQUIRE_AT_LEAST_K:
    # pol_early holds at most K_EARLY rows per thread, so >= K_EARLY means the
    # early window is completely filled
    cnt_early = early_by_thread["id"].size()
    eligible_threads = cnt_early[cnt_early >= K_EARLY].index
    max_early = max_early.loc[eligible_threads]

# A thread with no usable policing_prob in its early window has a NaN max.
# NaN >= threshold is False, so such a thread would silently become a control
# although its treatment status is unknown. Exclude it instead.
unscored = max_early.isna()
if unscored.any():
    print(f"Excluding {int(unscored.sum())} thread(s) with no policing_prob in the first {K_EARLY} interactions.")
    max_early = max_early[~unscored]
    eligible_threads = max_early.index

# Treatment indicator
treated_v2 = (max_early >= PROB_THRESHOLD).astype(int).rename("treated_v2")

# Merge into thread_events
te2 = te.merge(max_early.reset_index(), on="root_submission_id", how="left").merge(
    treated_v2.reset_index(), on="root_submission_id", how="left"
)

# Keep matching universe: active + eligible
te2 = te2[te2["root_submission_id"].isin(eligible_threads)].copy()
# The left merge made the column float. Every eligible thread has a value, so
# a plain cast is enough, and it fails loudly if that ever stops being true.
te2["treated_v2"] = te2["treated_v2"].astype(int)

print("Eligible threads:", len(te2))
print("Treated_v2:", int(te2["treated_v2"].sum()), "Controls_v2:", int((te2["treated_v2"]==0).sum()))
te2[["root_submission_id","thread_size","max_policing_prob_firstK","treated_v2"]].head(5)


Eligible threads: 559
Treated_v2: 207 Controls_v2: 352


Unnamed: 0,root_submission_id,thread_size,max_policing_prob_firstK,treated_v2
0,162huet,301,0.652546,0
1,16wowgc,185,0.653987,0
2,17slf39,454,0.712891,1
3,1880wns,227,0.712891,1
4,18oldj8,193,0.712891,1


## 3) Matching covariates
We reuse early-thread covariates already computed in `thread_events.parquet` (first 5 interactions).
We include `thread_start_month` and `thread_cluster` as *covariates* (not exact strata) to avoid empty strata.

In [14]:
# Numeric matching covariates: every early_* column carried by thread_events
cov_cols = [c for c in te2.columns if c.startswith("early_")]
needed = cov_cols + ["thread_start_month","thread_cluster","thread_size","treated_v2","max_policing_prob_firstK"]
missing = [c for c in needed if c not in te2.columns]
if missing:
    raise ValueError(f"Missing required columns in thread_events: {missing}")

match_df = te2.copy()

# Apply the same active-thread filter
match_df = match_df[match_df["thread_size"] >= MIN_THREAD_SIZE].copy()

# Require numeric covariates; keep rows whose categorical values are missing
# by giving them an explicit "UNK" level. Both categoricals are filled the
# same way so no pd.NA reaches the propensity model.
match_df = match_df.dropna(subset=cov_cols)
for cat_col in ["thread_start_month", "thread_cluster"]:
    match_df[cat_col] = match_df[cat_col].astype("string").fillna("UNK")

print("Matching sample:", len(match_df))
print(match_df["treated_v2"].value_counts())


Matching sample: 554
treated_v2
0    349
1    205
Name: count, dtype: int64


## 4) Propensity scores + nearest-neighbour matching
We fit a single propensity model over the whole matching sample, including month and cluster as categorical predictors.
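Then we do greedy 1:1 nearest-neighbour matching on logit(PS) with caliper = 0.2 × SD(logit PS). Matching is **with replacement** by default (`MATCH_WITH_REPLACEMENT = True`): each treated thread gets at most one control, but the same control thread can be matched to several treated threads.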

In [15]:
def logit(p):
    """Return the log-odds of `p`, with `p` clipped to [1e-6, 1 - 1e-6].

    The clipping keeps the result finite for probabilities of exactly 0 or 1.
    Accepts scalars or array-likes, as supported by numpy.
    """
    lower, upper = 1e-6, 1-1e-6
    clipped = np.clip(p, lower, upper)
    odds = clipped / (1 - clipped)
    return np.log(odds)

CALIPER_MULT = 0.2              # caliper width as a multiple of SD(logit PS)
MATCH_WITH_REPLACEMENT = True   # if True, one control may serve several treated threads

# Feature lists are defined once and reused for the transformer, fit and predict.
cat_features = ["thread_start_month", "thread_cluster"]
ps_features = cov_cols + cat_features

# Positional index from here on, so the score arrays line up with the rows.
match_df = match_df.reset_index(drop=True)
y = match_df["treated_v2"].astype(int).to_numpy()

# A propensity model needs both treated and control threads.
if len(np.unique(y)) < 2:
    raise ValueError("Propensity model needs both treated and control threads in match_df.")

# Numeric covariates: median-impute, then standardise.
numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
# Categorical covariates: mode-impute, then one-hot encode.
categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("oh", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, cov_cols),
        ("cat", categorical_pipe, cat_features),
    ]
)

ps_model = Pipeline(steps=[
    ("prep", preprocess),
    ("lr", LogisticRegression(max_iter=2000, solver="lbfgs"))
])

# Fit and score on the same sample: the scores are only used for matching.
X_ps = match_df[ps_features]
ps_model.fit(X_ps, y)
ps = ps_model.predict_proba(X_ps)[:, 1]
lps = logit(ps)

match_df["ps"] = ps
match_df["lps"] = lps

# Caliper is defined on the logit scale, using the population SD (ddof=0).
sd_lps = float(np.std(lps, ddof=0))
caliper = CALIPER_MULT * sd_lps
print("SD(logit PS):", sd_lps, "Caliper:", caliper)

t_idx = match_df.index[match_df["treated_v2"] == 1].tolist()
c_idx = match_df.index[match_df["treated_v2"] == 0].tolist()

# Sort treated by lps descending for stable greedy matching
t_idx = sorted(t_idx, key=lambda i: match_df.loc[i, "lps"], reverse=True)

# Pull the control scores into one array so each treated thread is compared
# against all controls in a single vectorised step instead of a Python loop.
c_lps = match_df.loc[c_idx, "lps"].to_numpy(dtype=float)
# Availability mask; only consulted when matching without replacement.
c_free = np.ones(len(c_idx), dtype=bool)

matched_rows = []
distances = []

for ti in tqdm(t_idx, desc="Matching"):
    # Nothing left to match against.
    if c_lps.size == 0 or (not MATCH_WITH_REPLACEMENT and not c_free.any()):
        break

    tlps = match_df.loc[ti, "lps"]

    # Distance to every control on the logit scale; used-up controls are
    # pushed to +inf so they can never be selected.
    gaps = np.abs(tlps - c_lps)
    if not MATCH_WITH_REPLACEMENT:
        gaps = np.where(c_free, gaps, np.inf)

    # argmin returns the first minimum, so ties go to the control that comes
    # first in c_idx, which is deterministic across runs.
    j = int(np.argmin(gaps))
    best_d = float(gaps[j])
    best_ci = c_idx[j]

    # Accept the nearest control only if it lies within the caliper.
    if best_d <= caliper:
        matched_rows.append({
            "treated_root": match_df.loc[ti, "root_submission_id"],
            "control_root": match_df.loc[best_ci, "root_submission_id"],
            "treated_ps": float(match_df.loc[ti, "ps"]),
            "control_ps": float(match_df.loc[best_ci, "ps"]),
            "abs_lps_diff": best_d,
            "caliper": float(caliper),
        })
        distances.append(best_d)
        if not MATCH_WITH_REPLACEMENT:
            c_free[j] = False

# Explicit columns keep the schema stable even when no pair was matched.
matched = pd.DataFrame(
    matched_rows,
    columns=["treated_root", "control_root", "treated_ps", "control_ps", "abs_lps_diff", "caliper"],
)
print("Matched pairs:", len(matched), "| unique treated:", matched["treated_root"].nunique() if len(matched) else 0)
matched.head(5)


SD(logit PS): 0.3758185721097169 Caliper: 0.07516371442194339


Matching:   0%|          | 0/205 [00:00<?, ?it/s]

Matched pairs: 199 | unique treated: 199


Unnamed: 0,treated_root,control_root,treated_ps,control_ps,abs_lps_diff,caliper
0,1q04ulc,1kk6xgw,0.67495,0.669154,0.026298,0.075164
1,1f5hk91,1kk6xgw,0.657297,0.669154,0.053089,0.075164
2,1880wns,1km1lrt,0.591077,0.600485,0.039067,0.075164
3,1j610ha,1km1lrt,0.58984,0.600485,0.044181,0.075164
4,1pufhy0,1k7mpzb,0.572822,0.557694,0.061567,0.075164


## 5) Balance diagnostics (SMD)
SMD = (mean_t − mean_c) / pooled_sd, where pooled_sd = sqrt((var_t + var_c) / 2), computed for each covariate before and after matching.

In [16]:
def smd(x_t, x_c):
    """Standardised mean difference between a treated and a control sample.

    SMD = (mean_t - mean_c) / sqrt((var_t + var_c) / 2), with sample variances
    (ddof=1). Values that cannot be parsed as numbers are dropped first.

    Args:
        x_t: Values for the treated group (any sequence or Series).
        x_c: Values for the control group (any sequence or Series).

    Returns:
        The SMD as a float. NaN if either group has fewer than two usable
        values or the pooled SD is not a number. When the pooled SD is zero
        (both groups constant): 0.0 if the means agree, otherwise +/-inf with
        the sign of the mean difference, so a real difference is never
        reported as perfect balance.
    """
    x_t = pd.to_numeric(pd.Series(x_t), errors="coerce").dropna()
    x_c = pd.to_numeric(pd.Series(x_c), errors="coerce").dropna()
    if len(x_t) < 2 or len(x_c) < 2:
        return np.nan

    diff = x_t.mean() - x_c.mean()
    pooled = math.sqrt((x_t.var(ddof=1) + x_c.var(ddof=1)) / 2)

    if np.isnan(pooled):
        # Undefined spread (e.g. infinite inputs): report "unknown", not "balanced".
        return np.nan
    if pooled == 0:
        # Both groups are constant: balanced only if the constants are equal.
        if diff == 0:
            return 0.0
        return float(math.copysign(math.inf, diff))
    return float(diff / pooled)

# Pre-match balance: all treated vs all controls in the matching sample.
treated_df = match_df[match_df["treated_v2"] == 1]
control_df = match_df[match_df["treated_v2"] == 0]
pre = {col: smd(treated_df[col], control_df[col]) for col in cov_cols}

# Post-match balance: one row per matched pair on each side, so a control
# that was matched several times appears several times.
if len(matched) > 0:
    by_root = match_df.set_index("root_submission_id")
    matched_treated = by_root.loc[matched["treated_root"]]
    matched_control = by_root.loc[matched["control_root"]]
    post = {col: smd(matched_treated[col], matched_control[col]) for col in cov_cols}
else:
    post = {col: np.nan for col in cov_cols}

# Show the covariates with the largest pre-match imbalance first.
bal = pd.DataFrame({"pre_smd": pre, "post_smd": post}).sort_values(
    "pre_smd", key=lambda s: s.abs(), ascending=False
)
bal.head(15)


Unnamed: 0,pre_smd,post_smd
early_reply_gini,0.132486,-0.068716
early_max_depth,0.11587,-0.12068
early_mean_textlen,-0.039179,-0.049241
early_mean_authority_z,-0.036584,-0.078446
early_participation_entropy,-0.032078,0.021711
early_mean_sentiment,-0.03163,-0.020236
early_unique_authors,-0.02942,0.005497
early_mean_toxicity,0.028397,-0.082059
early_p95_authority_z,-0.014079,-0.07658


## 6) Export outputs

In [17]:
# Output locations, all under ARTEFACT_DIR.
THREAD_EVENTS_V2_PATH = ARTEFACT_DIR / "thread_events_v2.parquet"
MATCHED_V2_PATH = ARTEFACT_DIR / "matched_threads_v2.parquet"
DIAG_V2_PATH = ARTEFACT_DIR / "matching_diagnostics_v2.json"

# Both tables are written with the same parquet settings.
parquet_opts = dict(engine="pyarrow", compression="snappy", index=False)
te2.to_parquet(THREAD_EVENTS_V2_PATH, **parquet_opts)
matched.to_parquet(MATCHED_V2_PATH, **parquet_opts)

# Diagnostics: the configuration used, sample sizes, and balance statistics.
treated_flag = match_df["treated_v2"]
config_part = {
    "MIN_THREAD_SIZE": MIN_THREAD_SIZE,
    "K_EARLY": K_EARLY,
    "PROB_THRESHOLD": PROB_THRESHOLD,
    "REQUIRE_AT_LEAST_K": REQUIRE_AT_LEAST_K,
    "CALIPER_MULT_SD_LOGIT": CALIPER_MULT,
    "MATCH_WITH_REPLACEMENT": MATCH_WITH_REPLACEMENT,
}
counts_part = {
    "n_eligible_threads": int(len(te2)),
    "n_match_sample": int(len(match_df)),
    "n_treated": int((treated_flag == 1).sum()),
    "n_controls": int((treated_flag == 0).sum()),
    "n_pairs": int(len(matched)),
    "mean_abs_lps_diff": float(np.mean(distances)) if distances else None,
}
diag = {
    **config_part,
    **counts_part,
    "balance": {"pre_match_smd": pre, "post_match_smd": post},
}

with DIAG_V2_PATH.open("w", encoding="utf-8") as f:
    json.dump(diag, f, indent=2)

for written_path in (THREAD_EVENTS_V2_PATH, MATCHED_V2_PATH, DIAG_V2_PATH):
    print("Wrote:", written_path)


Wrote: /content/artefacts/thread_events_v2.parquet
Wrote: /content/artefacts/matched_threads_v2.parquet
Wrote: /content/artefacts/matching_diagnostics_v2.json


## 7) Next notebook
Once `matched_threads_v2.parquet` exists and is non-empty, proceed to `06_models_confirmatory.ipynb` to build pre/post outcome windows and estimate the preregistered models.