In [13]:
import pandas as pd
# Read the prediction table and the site-information table (paths are relative to the
# current working directory).
df1 = pd.read_csv("final_predict_df.csv")
df2 = pd.read_csv("gdot_sites_information.csv")

# Normalize both join keys to whitespace-stripped strings so they compare reliably
df1['Site_ID'] = df1['Site_ID'].astype(str).str.strip()
df2['cosit'] = df2['cosit'].astype(str).str.strip()

# Create a mapping dictionary: keys are 'cosit', values are {'latitude': ..., 'longitude': ...}.
# to_dict(orient='index') raises ValueError when the index is not unique, so only the
# first row of each 'cosit' is kept before indexing.
mapping_dict = (
    df2.drop_duplicates(subset='cosit', keep='first')
       .set_index('cosit')[['latitude', 'longitude']]
       .to_dict(orient='index')
)
# Example: {'000000010183': {'latitude': xxx, 'longitude': yyy}, ...}

# Define a function to match Site_ID to latitude and longitude: an exact 'cosit' match
# is preferred, substring containment is the fallback
def get_lat_lon(site_id, mapping=None):
    """
    Look up the coordinates of a site.

    Args:
        site_id (str): site identifier to resolve.
        mapping (dict | None): lookup of the form
            {key: {'latitude': ..., 'longitude': ...}}. Defaults to the
            module-level ``mapping_dict``.

    Returns:
        tuple: (latitude, longitude) of the matching entry, or (None, None)
        when nothing matches or ``site_id`` is empty.
    """
    if mapping is None:
        mapping = mapping_dict

    # An empty string is a substring of every key; treat it as "no match"
    # instead of silently returning the first site's coordinates.
    if not site_id:
        return None, None

    # Prefer the exact key so a longer key that merely contains the ID cannot
    # shadow it just because it appears earlier in the mapping.
    exact = mapping.get(site_id)
    if exact is not None:
        return exact['latitude'], exact['longitude']

    # Fallback: first key (in mapping order) that contains the ID as a substring.
    for key, latlon in mapping.items():
        if site_id in key:
            return latlon['latitude'], latlon['longitude']
    return None, None

# Resolve each distinct Site_ID exactly once; every row then reuses the cached result
unique_ids = df1['Site_ID'].unique()
siteid_to_latlon = {sid: get_lat_lon(sid) for sid in unique_ids}

# Attach the cached coordinates to df1 as 'latitude' and 'longitude' columns
_missing = (None, None)
site_ids = df1['Site_ID']
df1['latitude'] = site_ids.map(lambda sid: siteid_to_latlon.get(sid, _missing)[0])
df1['longitude'] = site_ids.map(lambda sid: siteid_to_latlon.get(sid, _missing)[1])

# Preview the first rows
print(df1.head())

# Persist the enriched table (row index is not written)
df1.to_csv("final_predict_pos.csv", index=False)

   Unnamed: 0  sensor_id  weekday  time_slot  weekpart    traffic  \
0           0          0        0          0         0  12.244678   
1           1          0        0          1         0  10.749207   
2           2          0        0          2         0   9.448955   
3           3          0        0          3         0   6.702497   
4           4          0        0          4         0   5.798923   

        Site_ID  latitude  longitude  
0  0000001_0105  31.76743  -82.35349  
1  0000001_0105  31.76743  -82.35349  
2  0000001_0105  31.76743  -82.35349  
3  0000001_0105  31.76743  -82.35349  
4  0000001_0105  31.76743  -82.35349  


In [16]:
import numpy as np
import pandas as pd

def predict_traffic_by_knn(lat, lon, weekday, time_slot, df, k=5):
    """
    Predict traffic at an arbitrary coordinate by averaging the k nearest sensors.

    Only rows of ``df`` whose ``weekday`` and ``time_slot`` equal the requested
    values are considered. Rows with a missing latitude, longitude or traffic
    value are ignored so that sensors without a known position (or without an
    observation) can never be selected as neighbours.

    Args:
        lat (float): latitude of the query point
        lon (float): longitude of the query point
        weekday (int): weekday code, 0 = Monday ... 6 = Sunday
        time_slot (int): 15-minute slot index, 0 ... 95
        df (DataFrame): DataFrame containing the columns
            ['latitude', 'longitude', 'weekday', 'time_slot', 'traffic']
        k (int): number of nearest neighbours; must be >= 1

    Returns:
        float or None: mean traffic of the k nearest sensors (fewer if fewer are
        available); None if there is no usable observation.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    # k == 0 would average an empty selection (NaN) and a negative k would
    # slice from the wrong end, so reject both explicitly.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # 1. Keep only the records for the same weekday and time slot
    sub = df[(df['weekday'] == weekday) & (df['time_slot'] == time_slot)]

    # Rows with NaN coordinates get a NaN distance; argsort puts those last but
    # would still pick them whenever k exceeds the number of located sensors.
    sub = sub.dropna(subset=['latitude', 'longitude', 'traffic'])
    if sub.empty:
        return None

    # 2. Euclidean distance to the target, computed directly on (lat, lon) degrees.
    # NOTE(review): a degree of longitude is shorter than a degree of latitude away
    # from the equator, so this is only an approximation of ground distance.
    coords = sub[['latitude', 'longitude']].to_numpy(dtype=float)
    target = np.array([lat, lon], dtype=float)
    dists = np.linalg.norm(coords - target, axis=1)

    # 3. Indices of the k smallest distances (stable sort keeps ties in row order)
    idx = np.argsort(dists, kind='stable')[:min(k, len(dists))]

    # 4. Mean traffic of the selected sensors
    return float(sub['traffic'].to_numpy()[idx].mean())


In [17]:
# -- Load the prediction data --
# (already contains the traffic for every time slot; used as the kNN reference set)
df_all = pd.read_csv(
    r"final_predict_pos.csv"
)

# Example: query the flow at a point in Atlanta on Wednesday at 08:15
lat, lon = 33.7490, -84.3880
weekday    = 2           # Wednesday
time_slot  = 8 * 4 + 1   # 08:15
pred = predict_traffic_by_knn(lat, lon, weekday, time_slot, df_all, k=5)

# predict_traffic_by_knn returns None when no record matches the weekday/time slot;
# formatting None with ':.2f' would raise TypeError, so report that case separately.
if pred is None:
    print(f"No observations available at ({lat}, {lon}) on Wed 08:15")
else:
    print(f"Predicted flow at ({lat}, {lon}) on Wed 08:15: {pred:.2f}")

Predicted flow at (33.749, -84.388) on Wed 08:15: 459.85
