In [14]:
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from sklearn.preprocessing import StandardScaler

MERGED_DATA_DIR = "../../data/merged data"
OUTPUT_DIR = "../../output/assumption"

# Read "merged_tnp_data.csv" from each year's sub-folder of MERGED_DATA_DIR.
# The same file name is used for both years; only the folder differs.
tnp_19, tnp_20 = (
    pd.read_csv(os.path.join(MERGED_DATA_DIR, year_dir, "merged_tnp_data.csv"))
    for year_dir in ("2019", "2020")
)

In [15]:
# Columns that get z-scored, and the scaler used to do it.
cols_to_scale = ["daily_bus_rides", "rides"]
scaler = StandardScaler()

# For each yearly frame (modified in place): add the year indicator, then
# standardize the selected columns.
# NOTE(review): the scaler is re-fitted on each frame, so every year is
# standardized against its own mean/std and `scaler` ends up fitted on the
# 2019 data only. Confirm that per-year scaling (rather than pooled scaling)
# is intended.
for year_flag, yearly_frame in ((1, tnp_20), (0, tnp_19)):
    yearly_frame["is_2020"] = year_flag
    yearly_frame[cols_to_scale] = scaler.fit_transform(yearly_frame[cols_to_scale])

# Stack the two years (2020 rows first) under a fresh integer index.
merged_df = pd.concat([tnp_20, tnp_19], ignore_index=True)

# Derived columns:
#   price           -> fare plus additional charges
#   trip_start_date -> parsed to datetime (the later RDiT steps rely on this)
merged_df = merged_df.assign(
    price=merged_df["fare"].add(merged_df["additional_charges"]),
    trip_start_date=pd.to_datetime(merged_df["trip_start_date"]),
)

# Indicator columns for day of week and area type. drop_first=True omits one
# reference level from each set to avoid the dummy-variable trap.
day_dummies = pd.get_dummies(merged_df["day_of_week"], prefix="dow", drop_first=True)
area_dummies = pd.get_dummies(merged_df["area_type"], prefix="at", drop_first=True)

# Append both sets of indicators to the stacked data (day-of-week first).
merged_df = pd.concat([merged_df, day_dummies, area_dummies], axis=1)

In [16]:
# Export the two per-year frames and the stacked frame as Stata .dta files.
STATA_DIR = "../../data/STATA"

# DataFrame.to_stata does not create missing parent directories, so make sure
# the target folder exists before writing (no-op when it is already there).
os.makedirs(STATA_DIR, exist_ok=True)

# write_index=False keeps the pandas row index out of the exported files.
tnp_20.to_stata(os.path.join(STATA_DIR, "tnp_20.dta"), write_index=False)
tnp_19.to_stata(os.path.join(STATA_DIR, "tnp_19.dta"), write_index=False)
merged_df.to_stata(os.path.join(STATA_DIR, "merged_df.dta"), write_index=False)