In [6]:
import os
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from datetime import datetime
from workalendar.usa import Illinois
import geopandas as gpd
from shapely.wkt import loads

# Data folders, given relative to the notebook's working directory.
# RAW_DATA_DIR holds the input CSVs that are read below; PROCESSED_DATA_DIR
# receives the cleaned CSV written at the end of the notebook.
RAW_DATA_DIR = "../../data/raw data"
PROCESSED_DATA_DIR = "../../data/processed data"

In [7]:
# Read the two raw inputs from RAW_DATA_DIR in one pass:
#   cta     <- chicago_cta_L.csv           (ridership records)
#   cta_geo <- chicago_cta_L_stations.csv  (station reference table)
cta, cta_geo = (
    pd.read_csv(os.path.join(RAW_DATA_DIR, csv_name))
    for csv_name in ("chicago_cta_L.csv", "chicago_cta_L_stations.csv")
)

In [8]:
# Parse the date column so it can be compared against Timestamps.
cta["date"] = pd.to_datetime(cta["date"])
start_date = pd.to_datetime("2018-11-01")
end_date = pd.to_datetime("2020-03-01")
# NOTE(review): both bounds are exclusive, so rows dated exactly 2018-11-01
# are dropped along with 2020-03-01 — confirm the open start is intended.
cta = cta[(cta["date"] > start_date) & (cta["date"] < end_date)]

# The station table appears to be stop-level (it contributes stop_id,
# direction_id and stop_name columns), so one map_id can occur on several
# rows. Merging against it unchanged would repeat every daily ridership row
# once per stop and inflate the `rides` totals summed later. Keep a single row
# per map_id; validate= makes the merge fail loudly if uniqueness is ever lost.
cta = cta.merge(
    cta_geo.drop_duplicates(subset="map_id"),
    left_on="station_id",
    right_on="map_id",
    how="left",
    validate="many_to_one",
)

# Rows with no station location cannot be placed on the map; discard them.
cta = cta.dropna(subset=['location'])

In [9]:
# === 1. Load the tract boundaries and build a GeoDataFrame ===
# Resolve the file through RAW_DATA_DIR, like the other raw inputs, instead of
# repeating the literal directory path.
tract_df = pd.read_csv(os.path.join(RAW_DATA_DIR, "chicago_census_blocks_boundaries.csv"))

# Parse the WKT text stored in `the_geom` into shapely geometries
tract_df["geometry"] = tract_df["the_geom"].apply(loads)
tract_gdf = gpd.GeoDataFrame(tract_df, geometry="geometry", crs="EPSG:4326")

# Keep only the tract ID and its boundary (the CSV must provide a GEOID10 column)
tract_gdf = tract_gdf[["GEOID10", "geometry"]]

# === 2. Parse the CTA stations into a GeoDataFrame ===
from shapely.geometry import Point
import ast  # used to parse the dict-formatted location strings

# === Parse the `location` field into a Point ===
def extract_point(loc_str):
    """Convert a station ``location`` string into a shapely ``Point``.

    Parameters
    ----------
    loc_str : str
        Text of a Python-literal dict with ``"latitude"`` and ``"longitude"``
        entries, e.g. ``"{'latitude': '41.88', 'longitude': '-87.63'}"``.

    Returns
    -------
    Point or None
        ``Point(longitude, latitude)``, or ``None`` when ``loc_str`` is not a
        parseable literal, lacks either key, or holds non-numeric coordinates.
    """
    try:
        loc = ast.literal_eval(loc_str)
        lat = float(loc["latitude"])
        lon = float(loc["longitude"])
    except (ValueError, SyntaxError, TypeError, KeyError):
        # Only malformed *data* means "no location". Anything else (for example
        # a NameError from a missing import) must surface rather than silently
        # turning every row into None and emptying the dataset.
        return None
    return Point(lon, lat)  # order matters: Point(x, y) = Point(longitude, latitude)

# Attach a point geometry to every row, then discard the rows whose location
# could not be parsed (extract_point returned None for them).
cta = (
    cta.assign(geometry=cta["location"].apply(extract_point))
    .dropna(subset=["geometry"])
)

# Promote the table to a GeoDataFrame using CRS EPSG:4326
cta_gdf = gpd.GeoDataFrame(cta, geometry="geometry", crs="EPSG:4326")

# === 3. Spatial join: station points (cta_gdf) -> tract polygons (tract_gdf) ===
cta_joined = cta_gdf.sjoin(tract_gdf, how="left", predicate="within")
print(cta_joined.columns)

# The left join leaves GEOID10 empty for points that fall inside no polygon; drop those rows
cta_joined = cta_joined.dropna(subset=['GEOID10'])


Index(['station_id', 'stationname', 'date', 'daytype', 'rides', 'stop_id',
       'direction_id', 'stop_name', 'station_name', 'station_descriptive_name',
       'map_id', 'ada', 'red', 'blue', 'g', 'brn', 'p', 'pexp', 'y', 'pnk',
       'o', 'location', ':@computed_region_awaf_s7ux',
       ':@computed_region_6mkv_f3dw', ':@computed_region_vrxf_vc4k',
       ':@computed_region_bdys_3d7i', ':@computed_region_43wa_7qmu',
       'geometry', 'index_right', 'GEOID10'],
      dtype='object')


In [10]:
# Normalise the tract IDs to zero-padded, 11-character strings.
# The cast goes through an explicit 64-bit integer: a bare `int` resolves to
# the platform default, which is 32-bit on Windows with NumPy < 2 and cannot
# hold 11-digit IDs. `assign` builds a new frame instead of writing into the
# result of the earlier dropna().
cta_joined = cta_joined.assign(
    GEOID10=cta_joined["GEOID10"]
    .astype(float)
    .astype("int64")
    .astype(str)
    .str.zfill(11)
)

# One row per (tract, day): total rides, plus the first daytype label seen
# in each group.
cta_grouped = (
    cta_joined.groupby(['GEOID10', 'date'])
    .agg({
        "rides": "sum",
        "daytype": "first"
    })
    .reset_index()
)

In [11]:
# Create the output folder if needed so the export also works on a fresh
# checkout, then write the aggregated table without the index column.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
cta_grouped.to_csv(os.path.join(PROCESSED_DATA_DIR, "cleaned_cta_L_data.csv"), index=False)