In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

# -----------------------------
# Load data (WITH Dark already computed)
# -----------------------------
BIG_PATH = "../data/mdm2_data_files/big_table_with_dark.csv"

big = pd.read_csv(BIG_PATH)
big["datetime"] = pd.to_datetime(big["datetime"], utc=True, errors="coerce")
big = big.dropna(subset=["datetime"])

# Ensure required variables exist
req = ["ped","Dark","sensor_id"]
missing = [c for c in req if c not in big.columns]
if missing:
    raise ValueError(f"Missing columns in big_table_with_dark.csv: {missing}")

# Numeric outcome + time controls
big["ped"] = pd.to_numeric(big["ped"], errors="coerce").fillna(0)
big["hour"] = big["datetime"].dt.hour.astype(int)
big["weekday"] = big["datetime"].dt.dayofweek.astype(int)
big["month"] = big["datetime"].dt.month.astype(int)
big["Dark"] = pd.to_numeric(big["Dark"], errors="coerce").fillna(0).astype(int)

print("Rows (ped model):", len(big))
print("Dark proportion:", big["Dark"].mean())

# -----------------------------
# Overdispersion intuition
# -----------------------------
mean_y = big["ped"].mean()
var_y = big["ped"].var()
print("\nMean ped:", mean_y)
print("Var ped:", var_y)
print("Var/Mean:", var_y/(mean_y+1e-9))

# -----------------------------
# Poisson GLM + dispersion check
# -----------------------------
poisson_ped = smf.glm(
    formula="ped ~ Dark + C(hour) + C(weekday) + C(month)",
    data=big,
    family=sm.families.Poisson()
).fit()

disp = np.sum(poisson_ped.resid_pearson**2) / poisson_ped.df_resid
print("\nPoisson dispersion (ped):", disp)

# -----------------------------
# Negative Binomial GLM (main)
# -----------------------------
nb_ped = smf.glm(
    formula="ped ~ Dark + C(hour) + C(weekday) + C(month)",
    data=big,
    family=sm.families.NegativeBinomial()
).fit()

print(nb_ped.summary())

beta = nb_ped.params["Dark"]
se = nb_ped.bse["Dark"]
pct = (np.exp(beta) - 1) * 100

print("\n" + "="*95)
print("RESULTS: DARKNESS EFFECT (PEDESTRIANS)".center(95))
print("="*95)
print(f"Coefficient (log scale): {beta:.4f}")
print(f"Std. Error: {se:.4f}")
print(f"Estimated % change due to darkness: {pct:.2f}%")
print("="*95 + "\n")

print("AIC Poisson (ped):", poisson_ped.aic)
print("AIC NegBin (ped):", nb_ped.aic)

  from pandas.core.computation.check import NUMEXPR_INSTALLED


Rows (ped model): 478465
Dark proportion: 0.4665336022488583

Mean ped: 83.35753712392756
Var ped: 30026.660445891746
Var/Mean: 360.21530243739

Poisson dispersion (ped): 231.69624859434964
                 Generalized Linear Model Regression Results                  
Dep. Variable:                    ped   No. Observations:               478465
Model:                            GLM   Df Residuals:                   478423
Model Family:        NegativeBinomial   Df Model:                           41
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -2.3525e+06
Date:                Tue, 17 Feb 2026   Deviance:                   1.3399e+06
Time:                        17:37:28   Pearson chi2:                 2.41e+06
No. Iterations:                    14   Pseudo R-squ. (CS):             0.6411
Covariance Type:            nonrobust                                         
                    