In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

# -----------------------------
# Load data (WITH Dark already computed)
# -----------------------------
BIG_PATH = "../data/mdm2_data_files/big_table_with_dark.csv"

big = pd.read_csv(BIG_PATH)

# Validate the schema BEFORE touching any column, so a missing column
# (including "datetime", which is parsed right below) surfaces as the
# descriptive ValueError rather than as a bare KeyError.
req = ["datetime","ped","cyc","Dark","sensor_id"]
missing = [c for c in req if c not in big.columns]
if missing:
    raise ValueError(f"Missing columns in big_table_with_dark.csv: {missing}")

# Parse timestamps as tz-aware UTC; unparseable values become NaT and the
# corresponding rows are dropped.
big["datetime"] = pd.to_datetime(big["datetime"], utc=True, errors="coerce")
big = big.dropna(subset=["datetime"])

# Numeric outcomes: non-numeric entries are coerced to NaN and then set to 0.
# NOTE(review): this treats a missing/unparseable reading as a true zero
# count -- confirm that is intended rather than dropping those rows.
big["ped"] = pd.to_numeric(big["ped"], errors="coerce").fillna(0)
big["cyc"] = pd.to_numeric(big["cyc"], errors="coerce").fillna(0)

# Time controls, derived from the UTC-converted timestamp.
# NOTE(review): hour/weekday/month are therefore UTC values, not local
# clock time -- confirm this matches how Dark was computed.
big["hour"] = big["datetime"].dt.hour.astype(int)
big["weekday"] = big["datetime"].dt.dayofweek.astype(int)
big["month"] = big["datetime"].dt.month.astype(int)

# Dark indicator: unparseable/missing values become 0 before the int cast.
# NOTE(review): rows with an unknown Dark value are thus labelled 0 -- verify.
big["Dark"] = pd.to_numeric(big["Dark"], errors="coerce").fillna(0).astype(int)

# -----------------------------
# Stack into long format
# -----------------------------
# Columns carried unchanged into each per-mode frame.
_id_cols = ["sensor_id","datetime","hour","weekday","month","Dark"]


def _mode_frame(frame, col):
    """Return the id columns plus `col` renamed to "count", tagged with mode=col."""
    subset = frame[_id_cols + [col]]
    return subset.rename(columns={col: "count"}).assign(mode=col)


ped_long = _mode_frame(big, "ped")
cyc_long = _mode_frame(big, "cyc")

# Pedestrian rows first, then cyclist rows, with a fresh 0..n-1 index.
long = pd.concat([ped_long, cyc_long], ignore_index=True)

print("Rows (combined long):", len(long))
mode_stats = long.groupby("mode")["count"].agg(["mean","var","count"])
print(mode_stats.to_string())

# Convert to an ordered-category column only after the summary above, with
# "ped" listed first so it is the reference level in the model formulas.
long["mode"] = pd.Categorical(long["mode"], categories=["ped","cyc"])

# -----------------------------
# Fit Negative Binomial models
# -----------------------------
# Two nested specifications, fitted in this order on the stacked data:
#   base -> one shared Dark effect plus a mode shift
#   int  -> Dark * C(mode), letting the Dark effect differ by mode
# Both include hour, weekday and month as categorical controls.
#
# NOTE(review): the NegativeBinomial family is built with default arguments,
# so its dispersion parameter is the library default rather than a value
# estimated from these data -- confirm that is acceptable.
# NOTE(review): .fit() is called with default covariance settings (the
# summary reports "nonrobust"); sensor_id is not used in either model, so
# repeated measurements per sensor are treated as independent -- verify.
_formulas = {
    "base": "count ~ Dark + C(mode) + C(hour) + C(weekday) + C(month)",
    "int": "count ~ Dark * C(mode) + C(hour) + C(weekday) + C(month)",
}

_fits = {}
for _name, _formula in _formulas.items():
    _model = smf.glm(
        formula=_formula,
        data=long,
        family=sm.families.NegativeBinomial(),
    )
    _fits[_name] = _model.fit()

nb_base = _fits["base"]
nb_int = _fits["int"]

print(nb_int.summary())

# AIC of each specification, printed for comparison.
print("\nAIC base:", nb_base.aic)
print("AIC interaction:", nb_int.aic)

# -----------------------------
# Interpret effects (ped is reference)
# -----------------------------
# With "ped" as the reference level, the main-effect "Dark" coefficient is the
# pedestrian darkness effect; the cyclist effect is that coefficient plus the
# Dark x mode interaction coefficient.
beta_dark_ped = nb_int.params["Dark"]

term = "Dark:C(mode)[T.cyc]"
# Look the interaction up once. If it is absent the cyclist effect falls back
# to the pedestrian effect (interaction treated as 0.0) and a warning line is
# printed further down.
has_interaction = term in nb_int.params
beta_interaction = nb_int.params[term] if has_interaction else 0.0
beta_dark_cyc = beta_dark_ped + beta_interaction

# The model uses a log link, so exp(beta) - 1 is the proportional change in
# the expected count when Dark goes from 0 to 1; expressed here in percent.
pct_ped = (np.exp(beta_dark_ped) - 1) * 100
pct_cyc = (np.exp(beta_dark_cyc) - 1) * 100

print("\n" + "="*105)
print("RESULTS: DARKNESS EFFECT BY MODE (COMBINED NEGATIVE BINOMIAL, WITH INTERACTION)".center(105))
print("="*105)
print(f"Pedestrians (reference): beta={beta_dark_ped:.4f}  ->  {pct_ped:.2f}% change")
print(f"Cyclists:               beta={beta_dark_cyc:.4f}  ->  {pct_cyc:.2f}% change")

if has_interaction:
    print(f"\nInteraction term ({term}): {nb_int.params[term]:.4f} (p={nb_int.pvalues[term]:.3g})")
    print("If p is small, the darkness effect differs significantly between cyclists and pedestrians.")
else:
    # The em dash was previously stored as mis-decoded bytes and printed as garbage.
    print("\nInteraction term not found — check mode reference category.")
print("="*105 + "\n")

  from pandas.core.computation.check import NUMEXPR_INSTALLED


Rows (combined long): 956930
           mean           var   count
mode                                 
cyc   15.929635   1069.254193  478465
ped   83.357537  30026.660446  478465
                 Generalized Linear Model Regression Results                  
Dep. Variable:                  count   No. Observations:               956930
Model:                            GLM   Df Residuals:                   956886
Model Family:        NegativeBinomial   Df Model:                           43
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -4.0479e+06
Date:                Tue, 17 Feb 2026   Deviance:                   2.3705e+06
Time:                        17:41:25   Pearson chi2:                 6.52e+06
No. Iterations:                    15   Pseudo R-squ. (CS):             0.7458
Covariance Type:            nonrobust                                         
                          coe