In [1]:
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from sklearn.preprocessing import StandardScaler

MERGED_DATA_DIR = "../../data/merged data"
OUTPUT_DIR = "../../output/assumption"

# Read the merged extract of each year from its own sub-folder
# (2020 first, then 2019).
tnp_20, tnp_19 = (
    pd.read_csv(os.path.join(MERGED_DATA_DIR, year_dir, "merged_tnp_data.csv"))
    for year_dir in ("2020", "2019")
)

In [2]:
# Year indicator: 1 for the 2020 extract, 0 for the 2019 extract.
tnp_20["is_2020"] = 1
tnp_19["is_2020"] = 0

# Standardise the ridership columns separately within each year. The same
# scaler object is re-fitted per frame, so afterwards it holds the 2019 fit.
scaler = StandardScaler()
cols_to_scale = ["daily_bus_rides", "rides"]
for yearly_frame in (tnp_20, tnp_19):
    yearly_frame[cols_to_scale] = scaler.fit_transform(yearly_frame[cols_to_scale])

# Stack both years; the outcome "price" is fare plus additional charges.
merged_df = pd.concat([tnp_20, tnp_19], ignore_index=True)
merged_df["price"] = merged_df["fare"].add(merged_df["additional_charges"])
# Optional log transform of the outcome (disabled):
# merged_df['price'] = np.log1p(merged_df['price'])

# The date column must be datetime for the RDiT running variable built later.
merged_df["trip_start_date"] = pd.to_datetime(merged_df["trip_start_date"])

# Day-of-week and area-type dummies; drop_first=True drops one reference
# category from each set to avoid the dummy-variable trap.
day_dummies = pd.get_dummies(merged_df["day_of_week"], prefix="dow", drop_first=True)
area_dummies = pd.get_dummies(merged_df["area_type"], prefix="at", drop_first=True)

# Append both dummy sets to the stacked data (day-of-week first, then area type).
merged_df = pd.concat([merged_df, day_dummies, area_dummies], axis=1)

In [3]:
# Sanity checks on the stacked frame: row count, missing prices, date coverage.
print("数据总行数：", merged_df.shape[0])
print("是否有NaN：", merged_df["price"].isna().sum())
print("时间列范围：", *merged_df["trip_start_date"].agg(["min", "max"]))

数据总行数： 3187906
是否有NaN： 0
时间列范围： 2018-12-10 00:00:00 2020-02-05 00:00:00


In [4]:
# Trip-level covariates.
trip_controls = ["trip_seconds", "trip_miles", "trip_during_peak"]

# Weather covariates.
weather_controls = ["Avg_Temp_C", "Precipitation_mm", "Snowfall_mm", "Avg_Wind_Speed_mps"]

# Substitute-mode covariates (an earlier variant used "total_rides", "taxi").
substitutes_controls = ["rides", "daily_bus_rides", "taxi"]

# Dummy columns produced by pd.get_dummies(..., drop_first=True) upstream.
day_of_week_controls = [f"dow_{day}" for day in range(1, 5)]
area_type_controls = [f"at_{area}" for area in (1, 2)]

# Full covariate list passed to the regressions, in a fixed order.
control_vars = [
    *trip_controls,
    *weather_controls,
    *substitutes_controls,
    *day_of_week_controls,
    *area_type_controls,
]

# One sub-sample per value (0, 1, 2) of the "Cluster" column.
merged_df_0, merged_df_1, merged_df_2 = (
    merged_df[merged_df["Cluster"] == cluster_id] for cluster_id in range(3)
)

In [5]:
def estimate_dif_in_rdit(
    df: pd.DataFrame,
    outcome: str,
    time_var: str,
    treat_year_var: str,
    cutoff_date,
    placebo_cutoff_date="2019-01-07",
    heterogeneity_vars: list = None,
    covariates: list = None,
    trend_order: int = 1,
    bandwidth: int = 29
):
    """
    Estimate a Dif-in-RDiT model by OLS with HC3 robust standard errors,
    optionally with several heterogeneity variables (continuous or dummy).

    Model:
    Y_{it} = β0 + β1·Post_t + β2·TreatYear_i + β3·(Post_t·TreatYear_i)
           + f(TFC_t) + f(TFC_t)·Post_t + f(TFC_t)·TreatYear_i + f(TFC_t)·Post_t·TreatYear_i
           + ∑ δ_k·H_{ik} + ∑ θ_k·(Post_t·H_{ik}) + γ^T X + u_{it}

    where TFC_t is the number of days from the cutoff and f is a polynomial of
    degree ``trend_order``. β3 is the coefficient on ``post_treat``.

    Parameters
    ----------
    df : input data; it is not modified.
    outcome : name of the dependent-variable column.
    time_var : name of the date column (converted with ``pd.to_datetime``).
    treat_year_var : name of the indicator column; rows where it equals 1 are
        measured against ``cutoff_date``, all other rows against
        ``placebo_cutoff_date``.
    cutoff_date, placebo_cutoff_date : anything ``pd.to_datetime`` accepts.
    heterogeneity_vars : optional column names H_k; each enters as a main
        effect and interacted with ``post_cutoff``.
    covariates : optional column names added linearly.
    trend_order : polynomial degree of the days-from-cutoff trend.
    bandwidth : rows are kept when days-from-cutoff lies in
        [-bandwidth, bandwidth] (inclusive).

    Returns
    -------
    pd.DataFrame
        One row per regressor with columns ``coef``, ``std_err``, ``ci_lower``,
        ``ci_upper``, ``p_value`` and ``variable`` (plus the remaining
        statsmodels summary columns under their original names).

    Raises
    ------
    ValueError
        If no row falls inside the bandwidth window.

    Side effects
    ------------
    Prints the R² of the fitted model.
    """
    # statsmodels is imported locally, as in the original cell.
    import statsmodels.formula.api as smf

    event_time = pd.to_datetime(df[time_var])
    cutoff_date = pd.to_datetime(cutoff_date)
    placebo_cutoff_date = pd.to_datetime(placebo_cutoff_date)

    # Pick the cutoff per row with a vectorised select instead of a row-wise
    # apply: treatment-year rows use the real cutoff, the rest the placebo one.
    is_treat_year = (df[treat_year_var] == 1).to_numpy()
    days_from_cutoff = pd.Series(
        np.where(
            is_treat_year,
            (event_time - cutoff_date).dt.days,
            (event_time - placebo_cutoff_date).dt.days,
        ),
        index=df.index,
    )
    in_window = days_from_cutoff.between(-bandwidth, bandwidth).to_numpy()

    # Copy only the rows inside the window. The explicit copy keeps the caller's
    # frame untouched and avoids assigning columns onto a filtered slice.
    df = df.loc[in_window].copy()
    if df.empty:
        raise ValueError(
            f"No observations within ±{bandwidth} days of the cutoff dates."
        )
    df[time_var] = event_time[in_window]
    df["days_from_cutoff"] = days_from_cutoff[in_window]

    df["post_cutoff"] = (df["days_from_cutoff"] >= 0).astype(int)
    df["post_treat"] = df["post_cutoff"] * df[treat_year_var]

    # Polynomial trend in the running variable, fully interacted with the
    # post-cutoff and treatment-year indicators.
    trend_terms = []
    for i in range(1, trend_order + 1):
        base = f"days_from_cutoff_pow{i}"
        df[base] = df["days_from_cutoff"] ** i

        post = f"{base}_x_post"
        treat = f"{base}_x_treat"
        post_treat = f"{base}_x_post_treat"

        df[post] = df[base] * df["post_cutoff"]
        df[treat] = df[base] * df[treat_year_var]
        df[post_treat] = df[base] * df["post_cutoff"] * df[treat_year_var]

        trend_terms += [base, post, treat, post_treat]

    rhs = ["post_cutoff", treat_year_var, "post_treat"] + trend_terms

    if covariates:
        rhs += list(covariates)

    if heterogeneity_vars:
        for var in heterogeneity_vars:
            rhs.append(var)  # H_k
            rhs.append(f"post_cutoff:{var}")  # Post_t * H_k

    formula = f"{outcome} ~ " + " + ".join(rhs)
    model = smf.ols(formula=formula, data=df).fit(cov_type='HC3')

    summary_df = model.summary2().tables[1].copy()
    summary_df.columns = summary_df.columns.astype(str)
    summary_df = summary_df.rename(columns={
        "Coef.": "coef",
        "Std.Err.": "std_err",
        "P>|t|": "p_value",
        "[0.025": "ci_lower",
        "0.975]": "ci_upper"
    })
    # With a robust covariance the p-value column is labelled "P>|z|", which
    # the rename above does not match. Expose it as "p_value" as well, keeping
    # the original column so code that reads "P>|z|" still works.
    if "p_value" not in summary_df.columns and "P>|z|" in summary_df.columns:
        summary_df["p_value"] = summary_df["P>|z|"]
    summary_df["variable"] = summary_df.index
    summary_df = summary_df.reset_index(drop=True)
    print(model.rsquared)

    return summary_df

In [None]:
def run_robustness_all_trend_orders(trend_orders=(1, 3), bandwidth=15, datasets=None):
    """
    Re-estimate the Dif-in-RDiT price model for every sample and trend order.

    Parameters
    ----------
    trend_orders : iterable of polynomial degrees to try. The default (1, 3)
        matches the list that was hard-coded here before.
    bandwidth : window half-width in days passed to ``estimate_dif_in_rdit``.
    datasets : optional ``{label: DataFrame}`` mapping. When omitted, the
        notebook-level frames are used: "pooled" -> ``merged_df``,
        "low" -> ``merged_df_0``, "high" -> ``merged_df_1``,
        "mid" -> ``merged_df_2``.

    Returns
    -------
    dict
        ``results[label]["trend_order_<k>"]`` is the coefficient table returned
        by ``estimate_dif_in_rdit`` for that sample and trend order.
    """
    if datasets is None:
        datasets = {
            "pooled": merged_df,
            "low": merged_df_0,
            "high": merged_df_1,
            "mid": merged_df_2,
        }
    # Materialise once so a generator can be reused for every sample.
    trend_orders = list(trend_orders)

    results = {}
    for name, df in datasets.items():
        results[name] = {}
        for trend in trend_orders:
            results[name][f"trend_order_{trend}"] = estimate_dif_in_rdit(
                df=df,
                outcome="price",
                time_var="trip_start_date",
                treat_year_var="is_2020",
                cutoff_date="2020-01-06",
                placebo_cutoff_date="2019-01-07",
                covariates=control_vars,
                heterogeneity_vars=None,
                trend_order=trend,
                bandwidth=bandwidth,
            )

    return results

# A single call runs every sample / trend-order combination.
# NOTE(review): the stored output below lists four R² values per sample, which
# suggests it was produced with trend orders 1-4; pass trend_orders=range(1, 5)
# to reproduce that run, or re-run the cell to refresh the output.
all_results = run_robustness_all_trend_orders()

0.7795886814191689
0.7798326270812114




0.7798886542605641




0.7799060793012614
0.8139738498096765
0.813999185270152




0.8141217642183834




0.814140795669363
0.7688939921582473
0.7691713282723608




0.7692273914734633




0.7692497946709709
0.7951670006904098
0.7953920485430735




0.7954787081529067
0.7955197489622321




In [8]:
def export_trend_order_heterogeneity_table_to_tex(
    summary_df: pd.DataFrame,
    file_path: str = "output/custom_trend_hetero_table.tex",
    sig_levels: list = [0.1, 0.05, 0.01],
    caption: str = "Estimated effects across income groups and trend orders.",
    label: str = "tab:custom_trend_hetero",
    group_order: list = None,
    trend_order_list: list = None
):
    """
    Write a LaTeX table of the ``post_treat`` coefficient: one row per trend
    order, one column per group. Each cell is wrapped in ``\\makecell`` so the
    coefficient (with significance stars) and the standard error sit on
    separate lines.

    Parameters
    ----------
    summary_df : long table with columns ``variable``, ``coef``, ``std_err``,
        ``hetero_group``, ``trend_order`` and a p-value column named
        ``P>|z|``, ``p_value`` or ``P>|t|`` (looked up in that order).
    file_path : destination ``.tex`` file; its directory is created if needed.
    sig_levels : thresholds for ``*``, ``**`` and ``***``, in that order
        (the list is only read, never modified).
    caption, label : LaTeX caption and label of the table.
    group_order : column order; defaults to the groups in order of appearance.
    trend_order_list : row order; defaults to the sorted trend orders present.

    Raises
    ------
    KeyError
        If a cell has to be rendered and no p-value column is available.

    Side effects
    ------------
    Writes ``file_path`` (UTF-8) and prints a confirmation line.
    """
    # os.path.dirname() is "" for a bare file name and os.makedirs("") raises,
    # so only create the directory when the path actually has one.
    out_dir = os.path.dirname(file_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # The p-value label depends on how the model was fitted and summarised;
    # "P>|z|" is checked first so existing inputs behave exactly as before.
    p_col = next(
        (c for c in ("P>|z|", "p_value", "P>|t|") if c in summary_df.columns),
        None,
    )

    def get_stars(p):
        if p < sig_levels[2]: return '***'
        elif p < sig_levels[1]: return '**'
        elif p < sig_levels[0]: return '*'
        return ''

    df = summary_df[summary_df["variable"] == "post_treat"].copy()

    if trend_order_list is None:
        trend_order_list = sorted(df["trend_order"].dropna().unique())
    if group_order is None:
        group_order = df["hetero_group"].dropna().unique().tolist()

    tex_lines = [
        "\\begin{table}[H]\\centering",
        "\\renewcommand{\\arraystretch}{1.3}",
        "\\caption{" + caption + "}",
        "\\label{" + label + "}",
        "\\footnotesize",
        "\\begin{tabular}{l" + "c" * len(group_order) + "}",
        "\\toprule",
        # str() so that non-string group labels (e.g. cluster ids) can be joined.
        "Trend Order & " + " & ".join(str(g) for g in group_order) + " \\\\",
        "\\midrule"
    ]

    for trend in trend_order_list:
        row = [f"Order {int(trend)}"]
        for grp in group_order:
            sub = df[(df["trend_order"] == trend) & (df["hetero_group"] == grp)]
            if not sub.empty:
                if p_col is None:
                    raise KeyError(
                        "summary_df needs a p-value column named "
                        "'P>|z|', 'p_value' or 'P>|t|'"
                    )
                r = sub.iloc[0]
                cell = f"\\makecell{{{r['coef']:.3f}{get_stars(r[p_col])} \\\\ ({r['std_err']:.3f})}}"
            else:
                # No estimate for this trend order / group combination.
                cell = "--"
            row.append(cell)
        tex_lines.append(" & ".join(row) + " \\\\")

    tex_lines += [
        "\\bottomrule",
        "\\end{tabular}",
        "\\vspace{0.5em}",
        "\\begin{minipage}{0.95\\textwidth}\\footnotesize\\textit{Notes:} This table reports the estimated treatment effects under different trend orders. Each cell shows the coefficient and the robust standard error (in parentheses). * $p<0.1$, ** $p<0.05$, *** $p<0.01$.\\end{minipage}",
        "\\end{table}"
    ]

    with open(file_path, "w", encoding="utf-8") as f:
        f.write("\n".join(tex_lines))

    print(f"LaTeX table saved to: {file_path}")


In [15]:
def flatten_results(all_results):
    """
    Stack the nested ``{group: {"trend_order_<k>": summary}}`` results into a
    single long DataFrame, tagging every row with its ``hetero_group`` and the
    integer ``trend_order`` parsed from the key's last ``_``-separated part.
    """
    tagged_frames = [
        summary.assign(
            hetero_group=group_name,
            trend_order=int(key.rsplit("_", 1)[-1]),
        )
        for group_name, per_trend in all_results.items()
        for key, summary in per_trend.items()
    ]
    return pd.concat(tagged_frames, ignore_index=True)

# Step 1: collapse the nested results into one long table.
flattened_df = flatten_results(all_results)

# Step 2: write the LaTeX table from it.
export_trend_order_heterogeneity_table_to_tex(
    flattened_df,
    "../../output/robustness/rdit_across_trend_orders.tex",
    caption="Treatment effects across trend orders.",
    label="tab:trend_order_hetero",
)

LaTeX table saved to: ../../output/robustness/rdit_across_trend_orders.tex
