In [38]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from tqdm import tqdm

INPUT_CSV_FILE = 'transport_data/mobike_shanghai_sample_updated.csv'  
OUTPUT_CSV_FILE = 'SH-S2S-net.csv'

geolocator = Nominatim(user_agent="my-location-converter-app")

geocode = RateLimiter(geolocator.reverse, min_delay_seconds=1, error_wait_seconds=10)


def get_suburb_from_coords(lat, lon):
    try:
        geolocator = Nominatim(user_agent="street_finder")
        location = geolocator.reverse((lat, lon), exactly_one=True)
        address = location.raw['address']
        street = address.get('suburb', '') 
        town = address.get('town', '')

        if street != '':
            return street
        
        elif street == '' and town != '':
            return town
        else: return "Not info"
            
    except Exception as e:
        return f"{e}"

df = pd.read_csv(INPUT_CSV_FILE)
tqdm.pandas(desc="generating network")
df['start_street'] = df.progress_apply(
    lambda row: get_suburb_from_coords(row['start_location_y'], row['start_location_x']),
    axis=1
)

df['end_street'] = df.progress_apply(
    lambda row: get_suburb_from_coords(row['end_location_y'], row['end_location_x']),
    axis=1
)


output_df = df[[
    'start_street',
    'end_street',
    'start_time',
    'end_time'
]].copy() 


output_df.rename(columns={
    'start_street': 'start',
    'end_street': 'end'
}, inplace=True)



output_df.to_csv(OUTPUT_CSV_FILE, index=False, encoding='utf-8-sig')

print(f"处理完成！结果已保存到文件: {OUTPUT_CSV_FILE}")

generating network: 100%|██████████| 102361/102361 [29:14:47<00:00,  1.03s/it]  
generating network: 100%|██████████| 102361/102361 [29:32:31<00:00,  1.04s/it]  

处理完成！结果已保存到文件: SH-S2S-net.csv





In [9]:
import pandas as pd
import numpy as np

def build_dynamic_network(
    input_csv: str,
    output_csv: str,
    window: str = "1H",
    offset: str = "1H",
    time_col: str = "start_time",
    start_col: str = "start",
    end_col: str = "end",
    drop_regexes = (r"not info", r"https"),
    encoding: str = "utf-8"
) -> pd.DataFrame:
    """
    读取CSV -> 清洗 -> 按给定窗口大小(window)与滑动步长(offset)构建动态网络层 -> 导出(start,end,layer)
    
    说明：
    - 窗口与步长均支持 pandas 的 Timedelta 格式（如 "1H", "30min", "90min"）。
    - 窗口对齐每天 00:00，窗口起点按 offset 等距推进：t0, t0+offset, t0+2*offset, ...
    - 若 offset < window（重叠滑窗），单条记录会被分配到所有覆盖其 start_time 的窗口层，导出多行。
    - layer 从 0 开始，按窗口起点时间排序编号。
    """
    # === 读取 ===
    df = pd.read_csv(input_csv, encoding=encoding)

    # 去掉字符串两端空白
    for col in df.columns:
        if df[col].dtype == object:
            df[col] = df[col].astype(str).str.strip()

    # === 清洗：正则剔除 ===
    if drop_regexes:
        for patt in drop_regexes:
            mask = df.apply(lambda s: s.astype(str).str.lower().str.contains(patt, na=False))
            df = df[~mask.any(axis=1)].copy()

    # === 解析时间 ===
    if time_col not in df.columns:
        raise KeyError(f"时间列 '{time_col}' 不在输入数据中。可通过参数 time_col 指定。")
    df[time_col] = pd.to_datetime(df[time_col], errors="coerce", infer_datetime_format=True)
    df = df.dropna(subset=[time_col]).copy()

    # === 参数校验 ===
    win_td = pd.to_timedelta(window)
    step_td = pd.to_timedelta(offset)
    if win_td <= pd.Timedelta(0):
        raise ValueError(f"非法的窗口大小: {window}")
    if step_td <= pd.Timedelta(0):
        raise ValueError(f"非法的步长(offset): {offset}")

    # === 设定锚点（全局最早日期的 00:00） ===
    anchor_start = df[time_col].min().normalize()

    # === 生成所有窗口起点（覆盖数据时间范围） ===
    t_min = df[time_col].min()
    t_max = df[time_col].max()

    # 从 anchor_start 开始，步进 offset，直到能覆盖到 t_max 所在窗口
    # 末端至少到 t_max 的窗口起点：max_start >= t_max - win_td
    starts = []
    cur = anchor_start
    last_needed_start = (t_max - win_td)
    while cur <= t_max:
        starts.append(cur)
        cur += step_td
        # 若已经超过最后需要的起点且也超过 t_max，适当提前结束
        if cur > t_max and cur > last_needed_start:
            break

    # 先准备窗口 -> layer 的映射
    unique_starts = pd.Index(sorted(pd.to_datetime(starts)))
    window_to_layer = {ts: i for i, ts in enumerate(unique_starts)}

    # 数组与单位
    starts_arr = unique_starts.values  # datetime64[ns]
    starts_ns_arr = starts_arr.astype("datetime64[ns]").astype("int64")
    win_ns = int(pd.to_timedelta(window).value)
    step_ns = int(pd.to_timedelta(offset).value)
    anchor_ns = int(pd.Timestamp(anchor_start).value)

    def covering_window_starts(t: pd.Timestamp):
        t_ns = int(t.value)
        n_last = (t_ns - anchor_ns) // step_ns
        span = int(np.ceil(win_ns / step_ns))
        n_min = max(0, n_last - (span - 1))
        res = []
        for n in range(n_min, n_last + 1):
            if 0 <= n < len(starts_ns_arr):
                s_ns = int(starts_ns_arr[n])
                if s_ns <= t_ns < s_ns + win_ns:
                    res.append(pd.to_datetime(s_ns))
        return res

    # 展开行：为每条记录复制到其所有覆盖窗口
    rows = []
    for _, r in df.iterrows():
        t = r[time_col]
        ws = covering_window_starts(t)
        for s in ws:
            layer = window_to_layer[s]
            rows.append((r[start_col], r[end_col], layer))

    out = pd.DataFrame(rows, columns=["start", "end", "layer"]).sort_values("layer")
    out.to_csv(output_csv, index=False, encoding=encoding)
    print(f"Done. Wrote {len(out)} rows to {output_csv}")
    return out


build_dynamic_network(
    input_csv="transport_data/SH-S2S-net.csv",
    output_csv="SH-S2S-edges.csv",
    window="3h",   # 窗口大小
    offset="30min",   # 步长（可改为 "30min" 以使用重叠滑窗）
    time_col="start_time",
    start_col="start",
    end_col="end",
    drop_regexes=(r"not info", r"https")
)

  df[time_col] = pd.to_datetime(df[time_col], errors="coerce", infer_datetime_format=True)


Done. Wrote 594909 rows to SH-S2S-edges.csv


Unnamed: 0,start,end,layer
344900,莘庄镇,莘庄镇,0
274870,长寿路街道,甘泉路街道,0
371841,大场镇,祁连一村二社区,0
334230,真如镇街道,长征镇,0
6654,大桥街道,北外滩街道,0
...,...,...,...
125901,陆家嘴街道,东明路街道,1487
255682,大场镇,大场镇,1487
450383,外滩街道,豫园街道,1487
144081,甘泉路街道,甘泉路街道,1487
