In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
from scipy.stats import linregress   # If scipy is not available, numpy.polyfit can be used instead

# ========= Configuration =========
# Analysis parameters
ALPHA = 0.1   # p-value cutoff for significance (0.05 is the stricter common choice)
YEARS = list(range(2020, 2025))   # 2020 through 2024 inclusive

# Input tables and the output workbook
PATH_CAP = Path("./employment/state_dc_capacity.xlsx")    # capacity data
PATH_JOB = Path("./employment/state_5182_avg_emp.xlsx")   # employment data
OUT_PATH = Path("./employment/capacity_employment_effect.xlsx")

# ========= Read Tables =========
def read_table(path: Path) -> pd.DataFrame:
    """Load a tabular file into a DataFrame.

    Paths whose suffix is ``.xls`` or ``.xlsx`` (compared case-insensitively)
    are parsed with ``pd.read_excel``; every other path is handed to
    ``pd.read_csv``. Both readers are called with their default options.
    """
    is_excel = path.suffix.lower() in (".xls", ".xlsx")
    reader = pd.read_excel if is_excel else pd.read_csv
    return reader(path)

def _with_year_labels(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* with year column labels coerced to the ints listed in YEARS.

    Excel headers such as 2020 are read as ints, but CSV headers are always
    read as strings ("2020"), so selecting columns by the int labels in YEARS
    would raise KeyError for CSV input. Labels whose text is not one of YEARS
    are left untouched.
    """
    year_by_text = {str(year): year for year in YEARS}
    return df.rename(
        columns=lambda label: year_by_text.get(str(label).strip(), label)
    )


df_cap = _with_year_labels(read_table(PATH_CAP))
df_job = _with_year_labels(read_table(PATH_JOB))

# Keep only State + Year columns (raises KeyError if any of them is missing)
cap_cols = ["State"] + YEARS
job_cols = ["State"] + YEARS
df_cap = df_cap[cap_cols]
df_job = df_job[job_cols]

# Keep only states present in both tables. Rows with a missing state name are
# ignored: a NaN in the intersection cannot be ordered against strings and
# would make sorted() raise TypeError.
states_common = sorted(
    set(df_cap["State"].dropna()) & set(df_job["State"].dropna())
)
print(f"Total states participating in analysis: {len(states_common)}")

# ========= Regression per State: Jobs ~ Capacity =========
# Minimum number of (capacity, jobs) pairs required before fitting.
# A straight line through two points always fits exactly (zero residual
# degrees of freedom), so its p-value is not evidence of a relationship;
# linregress reports p = 0 for two distinct points, which would pass any ALPHA.
MIN_OBS = 3

records = []

for st in states_common:
    cap_row = df_cap.loc[df_cap["State"] == st, YEARS]
    job_row = df_job.loc[df_job["State"] == st, YEARS]

    if cap_row.empty or job_row.empty:
        continue

    # If a state occupies several rows, only its first row is used.
    cap_vals = cap_row.iloc[0].astype(float).values
    job_vals = job_row.iloc[0].astype(float).values

    # Regress on level values; drop years where either series is NaN
    mask = ~(np.isnan(cap_vals) | np.isnan(job_vals))
    d_cap = cap_vals[mask]
    d_job = job_vals[mask]
    n_obs = len(d_cap)

    # Default record: used as-is when no meaningful fit is possible
    record = {
        "State": st,
        "n_obs": n_obs,
        "slope_jobs_per_cap": np.nan,
        "intercept": np.nan,
        "r_value": np.nan,
        "p_value": np.nan,
        "supports_hypothesis": False,
    }

    # Fit only with enough points and a non-constant x (a constant x has no
    # defined slope). The n_obs check runs first, so d_cap[0] is never read
    # from an empty array.
    if n_obs >= MIN_OBS and not np.allclose(d_cap, d_cap[0]):
        # Linear Regression: job = β0 + β1 * cap
        res = linregress(d_cap, d_job)
        record.update({
            "slope_jobs_per_cap": res.slope,
            "intercept": res.intercept,
            "r_value": res.rvalue,
            "p_value": res.pvalue,
            # Hypothesis: more capacity goes with more jobs, significantly so
            "supports_hypothesis": bool(res.slope > 0 and res.pvalue < ALPHA),
        })

    records.append(record)


# ========= Aggregate & Export =========
# Explicit columns keep the frame well-formed even when no state produced a
# record; without them an empty frame has no "supports_hypothesis" column and
# sort_values raises KeyError.
RESULT_COLUMNS = [
    "State",
    "n_obs",
    "slope_jobs_per_cap",
    "intercept",
    "r_value",
    "p_value",
    "supports_hypothesis",
]

# Supporting states first; mergesort is stable, so records keep their original
# relative order within each group.
result_df = pd.DataFrame(records, columns=RESULT_COLUMNS).sort_values(
    "supports_hypothesis", ascending=False, kind="mergesort"
)

print(result_df)

# Save to Excel (create the output directory if it does not exist yet)
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
result_df.to_excel(OUT_PATH, index=False)
print(f"Results saved to: {OUT_PATH}")
