In [1]:
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.preprocessing import StandardScaler

MERGED_DATA_DIR = "../../data/merged data"
OUTPUT_DIR = "../../output/assumption"

# Read the merged TNP data, one CSV per year directory.
_merged_csv_name = "merged_tnp_data.csv"
tnp_20 = pd.read_csv(os.path.join(MERGED_DATA_DIR, "2020", _merged_csv_name))
tnp_19 = pd.read_csv(os.path.join(MERGED_DATA_DIR, "2019", _merged_csv_name))

# Standardise the transit ridership columns. `fit_transform` refits the
# scaler on every call, so each year is scaled with its own mean / std.
cols_to_scale = ["daily_bus_rides", "rides"]
scaler = StandardScaler()
for _yearly_frame in (tnp_20, tnp_19):
    _yearly_frame[cols_to_scale] = scaler.fit_transform(_yearly_frame[cols_to_scale])


def _append_dummies(frame, column, prefix):
    """Return (`frame` with dummy columns for `column` appended, the dummy columns).

    `drop_first=True` omits the first category to avoid the dummy-variable trap.
    """
    dummies = pd.get_dummies(frame[column], prefix=prefix, drop_first=True)
    return pd.concat([frame, dummies], axis=1), dummies


# Day-of-week dummy variables (columns prefixed `dow_`).
tnp_20, day_dummies = _append_dummies(tnp_20, "day_of_week", "dow")
tnp_19, day_dummies = _append_dummies(tnp_19, "day_of_week", "dow")

# Area-type dummy variables (columns prefixed `at_`).
tnp_20, area_dummies = _append_dummies(tnp_20, "area_type", "at")
tnp_19, area_dummies = _append_dummies(tnp_19, "area_type", "at")

In [12]:
# Candidate control variables, grouped by theme.
trip_controls = [
    "trip_seconds",
    "trip_miles",
    "trip_during_peak",
]
weather_controls = [
    "Avg_Temp_C",
    "Precipitation_mm",
    "Snowfall_mm",
    "Avg_Wind_Speed_mps",
]
# Alternative specification: substitutes_controls = ["total_rides", "taxi"]
substitutes_controls = [
    "rides",
    "daily_bus_rides",
    "taxi",
]
day_of_week_controls = ["dow_1", "dow_2", "dow_3", "dow_4"]
area_type_controls = ["at_1", "at_2"]

# Controls actually passed on to the VIF tables. The day-of-week and
# area-type groups are currently left out; to include them, append
# *day_of_week_controls and/or *area_type_controls below.
control_vars = [
    *trip_controls,
    *weather_controls,
    *substitutes_controls,
]

In [3]:
def export_vif_table_to_tex(df, variables, file_path, custom_labels=None,
    label="tab:vif_controls",
    caption="Variance Inflation Factors for Control Variables",
    note=None):
    """
    Compute variance inflation factors (VIF) and save them as a LaTeX table.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data (e.g. ``tnp_20``).
    variables : list of str
        Column names of the control variables to report.
    file_path : str
        Destination ``.tex`` path. Missing parent directories are created.
    custom_labels : dict, optional
        Maps a variable name to the text shown in the table. Underscores in
        the displayed text are escaped for LaTeX.
    label : str
        LaTeX ``\\label`` of the table.
    caption : str
        LaTeX ``\\caption`` of the table.
    note : str, optional
        Text printed after ``Note:`` below the table. Defaults to a generic
        description that is valid for any sample.

    Returns
    -------
    str
        ``file_path``, unchanged.

    Notes
    -----
    An intercept is added to the design matrix so that the VIFs are
    centred, but only the requested ``variables`` are written to the table.
    Rows with missing values in any of ``variables`` are dropped first.
    """
    if note is None:
        # The previous hard-coded text named the "placebo year" even when the
        # function was used for the treatment-year table.
        note = (
            "This table reports variance inflation factors (VIF) for the control variables. "
            "VIF values above 5 are commonly interpreted as indicating potential multicollinearity."
        )

    requested = list(variables)

    # Cast to float so boolean dummy columns do not turn `X.values` into an
    # object array, and drop incomplete rows, which break the VIF regressions.
    X = df[requested].astype(float).dropna()
    X = add_constant(X)

    design = X.values
    wanted = set(requested)
    # The intercept stays in the design matrix but is not a control variable,
    # so its own VIF is not reported.
    vif_rows = [
        (name, variance_inflation_factor(design, position))
        for position, name in enumerate(X.columns)
        if name in wanted
    ]
    vif_df = pd.DataFrame(vif_rows, columns=["Variable", "VIF"])
    vif_df = vif_df.sort_values(by="VIF", ascending=False).reset_index(drop=True)

    def label_variable(var):
        # Prefer the custom display label; always escape underscores for LaTeX.
        if custom_labels and var in custom_labels:
            return custom_labels[var].replace("_", "\\_")
        return var.replace("_", "\\_")

    latex_lines = [
        "\\begin{table}[H]",
        "\\centering",
        f"\\caption{{{caption}}}",
        f"\\label{{{label}}}",
        "\\begin{tabular}{l r}",
        "\\hline",
        "Variable & VIF \\\\",
        "\\hline",
    ]
    latex_lines.extend(
        f"{label_variable(name)} & {vif:.2f} \\\\"
        for name, vif in zip(vif_df["Variable"], vif_df["VIF"])
    )
    latex_lines.extend([
        "\\hline",
        "\\end{tabular}",
        "\\vspace{1ex}",
        "\\begin{minipage}{0.9\\linewidth}",
        "\\footnotesize",
        f"\\textit{{Note:}} {note}",
        "\\end{minipage}",
        "\\end{table}",
    ])

    # `os.path.dirname` is "" for a bare file name, and `os.makedirs("")` raises.
    out_dir = os.path.dirname(file_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write("\n".join(latex_lines))

    return file_path

In [13]:
# Display names used in the exported tables, keyed by column name.
control_labels = dict([
    # Trip characteristics
    ("trip_seconds", "Trip Duration (s)"),
    ("trip_miles", "Trip Distance (miles)"),
    ("trip_during_peak", "Peak Hour Trip"),
    # Substitute modes (alternative: "total_rides" -> "Public Transportation Rides")
    ("taxi", "Taxi Trips"),
    ("rides", "``L'' Rail System Rides"),
    ("daily_bus_rides", "Bus Rides"),
    # Weather
    ("Avg_Temp_C", "Temperature (°C)"),
    ("Precipitation_mm", "Precipitation (mm)"),
    ("Snowfall_mm", "Snowfall (mm)"),
    ("Avg_Wind_Speed_mps", "Wind Speed (m/s)"),
    # Day-of-week dummies
    ("dow_1", "Tuesday"),
    ("dow_2", "Wednesday"),
    ("dow_3", "Thursday"),
    ("dow_4", "Friday"),
    # Area-type dummies
    ("at_1", "Loop"),
    ("at_2", "Tourist/Transit Area"),
])

# Arguments that are the same for both yearly tables.
_shared_vif_kwargs = dict(variables=control_vars, custom_labels=control_labels)

# 2020 table.
export_vif_table_to_tex(
    df=tnp_20,
    file_path="../../output/data_summarize/vif_20.tex",
    label="tab:vif_controls_20",
    caption="Variance Inflation Factors for Control Variables on Treatment Year",
    **_shared_vif_kwargs,
)

# 2019 table. Left as the final bare expression so the cell displays the
# returned path.
export_vif_table_to_tex(
    df=tnp_19,
    file_path="../../output/data_summarize/vif_19.tex",
    label="tab:vif_controls_19",
    caption="Variance Inflation Factors for Control Variables on Placebo Year",
    **_shared_vif_kwargs,
)

'../../output/data_summarize/vif_19.tex'