In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cosine
from scipy.spatial.distance import jensenshannon
def generate_road_similarity_graph(file_path, road_feature_cols, k, epsilon=1e-8):
    """
    Build the road-feature similarity graph (Gf) as a weighted top-k adjacency matrix.

    Every CSV row is treated as one grid cell. The selected feature columns are
    normalised into a per-row probability distribution, the pairwise similarity is
    ``1 - JS divergence``, and each row keeps only its ``k`` most similar
    neighbours (so the returned matrix is not necessarily symmetric).

    :param file_path: path of the CSV file holding the road features
    :param road_feature_cols: list of feature column names (must be usable as a
        probability distribution once normalised)
    :param k: number of nearest neighbours kept per grid cell; values larger than
        ``n_grids - 1`` are clamped, ``0`` yields an all-zero matrix
    :param epsilon: small constant added to every feature to avoid zero probabilities
    :return: ``adj_matrix``, an ``(n_grids, n_grids)`` array where entry ``[i, j]``
        holds the similarity if ``j`` is among the top-k neighbours of ``i``, else 0
    :raises ValueError: if ``k`` is negative
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # --------------------------
    # Step 1: preprocessing
    # --------------------------
    data = pd.read_csv(file_path)

    # -1 is treated as the missing-value sentinel and replaced by the column median.
    # The median is taken over the observed (non-sentinel) values only; including
    # the -1 entries would bias it and, when they are the majority, leave them in place.
    for col in road_feature_cols:
        median = data.loc[data[col] != -1, col].median()
        if pd.isna(median):
            # No observed value to impute from: leave the column untouched.
            continue
        data[col] = data[col].replace(-1, median)

    # Turn each row into a probability distribution (row sum == 1);
    # epsilon keeps every entry away from zero.
    data[road_feature_cols] = data[road_feature_cols] + epsilon
    data[road_feature_cols] = data[road_feature_cols].div(data[road_feature_cols].sum(axis=1), axis=0)

    # Extract the feature block once as a float array instead of rebuilding a
    # pandas row for every pair inside the O(n^2) loop below.
    features = data[road_feature_cols].to_numpy(dtype=float)

    # --------------------------
    # Step 2: Jensen-Shannon similarity
    # --------------------------
    n_grids = len(features)
    similarity_matrix = np.zeros((n_grids, n_grids))

    for i in range(n_grids):
        for j in range(i + 1, n_grids):
            # scipy returns the JS *distance* (square root of the divergence),
            # so square it to recover the divergence itself.
            js_divergence = jensenshannon(features[i], features[j]) ** 2
            similarity = 1 - js_divergence

            # Fill both triangles: the measure is symmetric.
            similarity_matrix[i, j] = similarity
            similarity_matrix[j, i] = similarity

    # A grid cell is never its own neighbour.
    np.fill_diagonal(similarity_matrix, 0)

    # --------------------------
    # Step 3: top-k adjacency matrix
    # --------------------------
    adj_matrix = np.zeros((n_grids, n_grids))

    # At most n_grids - 1 real neighbours exist. Clamping also avoids the
    # out-of-bounds error of argpartition and the `[-0:]` "select everything" slice.
    n_neighbors = min(k, n_grids - 1)
    if n_neighbors <= 0:
        return adj_matrix

    for i in range(n_grids):
        sim_scores = similarity_matrix[i, :]

        # Indices of the n_neighbors largest similarities (unordered).
        top_k_indices = np.argpartition(sim_scores, -n_neighbors)[-n_neighbors:]

        # Keep the similarity value as the edge weight.
        adj_matrix[i, top_k_indices] = sim_scores[top_k_indices]

    return adj_matrix


# def generate_road_similarity_graph(file_path, road_feature_cols, k):
#     # 读取数据
#     data = pd.read_csv(file_path)

#     # 替换-1为列的中位数
#     for col in road_feature_cols:
#         median = data[col].median()
#         data[col] = data[col].replace(-1, median)

#     # 标准化特征
#     scaler = StandardScaler()
#     data[road_feature_cols] = scaler.fit_transform(data[road_feature_cols])

#     # 计算余弦相似度
#     n_grids = len(data)
#     similarity_matrix = np.zeros((n_grids, n_grids))

#     for i in range(n_grids):
#         for j in range(i + 1, n_grids):
#             similarity = 1 - cosine(data.iloc[i][road_feature_cols], data.iloc[j][road_feature_cols])
#             similarity_matrix[i, j] = similarity
#             similarity_matrix[j, i] = similarity

#     # 构建k近邻图
#     adj_matrix = np.zeros((n_grids, n_grids))
#     for i in range(n_grids):
#         sim_scores = similarity_matrix[i, :]
#         top_k_indices = sim_scores.argsort()[-k:][::-1]
#         adj_matrix[i, top_k_indices] = sim_scores[top_k_indices]

#     return adj_matrix


In [6]:
from scipy.spatial.distance import jensenshannon

def generate_poi_similarity_graph(file_path, feature_cols, k):
    """
    Build the POI similarity graph as a weighted top-k adjacency matrix.

    Every CSV row is treated as one grid cell. The POI count columns are
    add-one smoothed and normalised into a per-row probability distribution;
    the pairwise similarity is ``1 - Jensen-Shannon distance`` (scipy's
    ``jensenshannon`` returns the distance, i.e. the square root of the
    divergence, and it is used as-is here). Each row keeps only its ``k`` most
    similar neighbours, so the result is not necessarily symmetric.

    :param file_path: path of the CSV file holding the POI counts
    :param feature_cols: list of POI count column names
    :param k: number of nearest neighbours kept per grid cell; values larger than
        ``n_grids - 1`` are clamped, ``0`` yields an all-zero matrix
    :return: ``(n_grids, n_grids)`` array where entry ``[i, j]`` holds the
        similarity if ``j`` is among the top-k neighbours of ``i``, else 0
    :raises ValueError: if ``k`` is negative
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    data = pd.read_csv(file_path)

    # Add-one smoothing avoids zero probabilities, then normalise each row to sum to 1.
    smoothed = data[feature_cols] + 1
    # Extract once as a float array instead of rebuilding a pandas row per pair.
    probabilities = smoothed.div(smoothed.sum(axis=1), axis=0).to_numpy(dtype=float)

    # Pairwise Jensen-Shannon distance (symmetric, so only the upper triangle is computed).
    n_grids = len(probabilities)
    distance_matrix = np.zeros((n_grids, n_grids))

    for i in range(n_grids):
        for j in range(i + 1, n_grids):
            distance = jensenshannon(probabilities[i], probabilities[j])
            distance_matrix[i, j] = distance
            distance_matrix[j, i] = distance

    # Convert distance to similarity and remove self-loops.
    similarity_matrix = 1 - distance_matrix
    np.fill_diagonal(similarity_matrix, 0)

    # Build the k-nearest-neighbour graph.
    adj_matrix = np.zeros((n_grids, n_grids))

    # At most n_grids - 1 real neighbours exist; clamping also avoids the
    # `[-0:]` slice, which would select every column when k == 0.
    n_neighbors = min(k, n_grids - 1)
    if n_neighbors <= 0:
        return adj_matrix

    for i in range(n_grids):
        sim_scores = similarity_matrix[i, :]
        top_k_indices = sim_scores.argsort()[-n_neighbors:][::-1]
        adj_matrix[i, top_k_indices] = sim_scores[top_k_indices]

    return adj_matrix


In [7]:
from tslearn.metrics import dtw_path

def generate_risk_similarity_graph(file_path, k):
    """
    Build the accident-risk similarity graph as a weighted top-k adjacency matrix.

    The CSV is expected to contain the columns ``grid_id``, ``time_slot`` and
    ``risk_label``. It is pivoted into one risk time series per grid, the
    pairwise DTW distance between series is computed with ``dtw_path``, and the
    distance ``d`` is mapped to a similarity ``1 / (1 + d)``. Each row keeps only
    its ``k`` most similar neighbours, so the result is not necessarily symmetric.

    Rows/columns of the returned matrix follow the row order of the pivoted
    table (pandas is expected to sort the pivot index by ``grid_id`` - verify if
    the exact ordering matters downstream).

    :param file_path: path of the CSV file holding the accident-risk records
    :param k: number of nearest neighbours kept per grid; values larger than
        ``n_grids - 1`` are clamped, ``0`` yields an all-zero matrix
    :return: ``(n_grids, n_grids)`` array where entry ``[i, j]`` holds the
        similarity if ``j`` is among the top-k neighbours of ``i``, else 0
    :raises ValueError: if ``k`` is negative
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # Read the accident-risk data.
    data = pd.read_csv(file_path)

    # Reshape into a (grid x time slot) matrix of risk labels.
    # NOTE(review): (grid_id, time_slot) pairs absent from the CSV become NaN
    # here and propagate into the DTW distances - confirm the input is complete.
    risk_matrix = data.pivot(index='grid_id', columns='time_slot', values='risk_label')

    # Extract the series once instead of slicing the DataFrame for every pair.
    risk_series = risk_matrix.to_numpy()
    n_grids = len(risk_series)

    adj_matrix = np.zeros((n_grids, n_grids))

    # At most n_grids - 1 real neighbours exist; clamping also avoids the
    # `[-0:]` slice, which would select every column when k == 0.
    n_neighbors = min(k, n_grids - 1)
    if n_neighbors <= 0:
        return adj_matrix

    # Pairwise DTW distance (symmetric, so only the upper triangle is computed).
    distance_matrix = np.zeros((n_grids, n_grids))

    for i in range(n_grids):
        for j in range(i + 1, n_grids):
            # dtw_path returns (alignment path, distance); only the distance is needed.
            _, distance = dtw_path(risk_series[i], risk_series[j])
            distance_matrix[i, j] = distance
            distance_matrix[j, i] = distance

        # Progress indicator: one line per processed grid.
        print(f'{i + 1}/{n_grids}')

    # Convert distance to similarity and remove self-loops.
    similarity_matrix = 1 / (1 + distance_matrix)
    np.fill_diagonal(similarity_matrix, 0)

    # Build the k-nearest-neighbour graph.
    for i in range(n_grids):
        sim_scores = similarity_matrix[i, :]
        top_k_indices = sim_scores.argsort()[-n_neighbors:][::-1]
        adj_matrix[i, top_k_indices] = sim_scores[top_k_indices]

    return adj_matrix


In [8]:
def process_multiple_samples(sample_files, k, names, output_dir='/root/autodl-tmp/npy'):
    """
    Generate and save the road, POI and accident-risk adjacency matrices for each sample.

    For every ``(f1, f2)`` pair, ``f1`` is used for the road and POI graphs and
    ``f2`` for the accident-risk graph. Each matrix is written with ``np.save`` to
    ``<output_dir>/new_{road,poi,risk}_adj_matrix_<name>.npy``.

    :param sample_files: sequence of ``(f1, f2)`` CSV path pairs, one per sample
    :param k: number of top-k neighbours passed to every graph builder
    :param names: sequence of sample names, aligned with ``sample_files``
    :param output_dir: directory receiving the ``.npy`` files; created if missing
    :raises ValueError: if ``sample_files`` and ``names`` differ in length
    """
    import os

    sample_files = list(sample_files)
    names = list(names)
    if len(sample_files) != len(names):
        # zip() would silently drop the unmatched samples.
        raise ValueError(
            f"sample_files and names must have the same length, "
            f"got {len(sample_files)} and {len(names)}"
        )

    # Make sure the target directory exists before any expensive computation,
    # so a missing directory cannot discard finished work at save time.
    os.makedirs(output_dir, exist_ok=True)

    # Column selections are the same for every sample.
    road_feature_cols = ['rating_min', 'rating_max', 'rating_mean', 'rating_median', 'road_density']
    feature_cols = [f'faci_dom_{i}_count' for i in range(1, 19)]

    for (f1, f2), name in zip(sample_files, names):
        # Road similarity graph.
        road_adj_matrix = generate_road_similarity_graph(f1, road_feature_cols, k)
        np.save(os.path.join(output_dir, f'new_road_adj_matrix_{name}.npy'), road_adj_matrix)
        print("生成道路相似度图",name)

        # POI similarity graph.
        poi_adj_matrix = generate_poi_similarity_graph(f1, feature_cols, k)
        np.save(os.path.join(output_dir, f'new_poi_adj_matrix_{name}.npy'), poi_adj_matrix)
        print("生成POI相似度图",name)

        # Accident-risk similarity graph.
        risk_adj_matrix = generate_risk_similarity_graph(f2, k)
        np.save(os.path.join(output_dir, f'new_risk_adj_matrix_{name}.npy'), risk_adj_matrix)
        print("生成事故风险图",name)

In [9]:
# Build the three graphs for sample "c", keeping 8 neighbours per grid.
# Each entry is (road/POI feature CSV, accident-risk CSV).
sample_names = ['c']
sample_files = [('save/grid_data_c_1.csv', 'save/grid_data_c_2.csv')]
process_multiple_samples(sample_files, k=8, names=sample_names)

生成道路相似度图 c
生成POI相似度图 c
1/65
2/65
3/65
4/65
5/65
6/65
7/65
8/65
9/65
10/65
11/65
12/65
13/65
14/65
15/65
16/65
17/65
18/65
19/65
20/65
21/65
22/65
23/65
24/65
25/65
26/65
27/65
28/65
29/65
30/65
31/65
32/65
33/65
34/65
35/65
36/65
37/65
38/65
39/65
40/65
41/65
42/65
43/65
44/65
45/65
46/65
47/65
48/65
49/65
50/65
51/65
52/65
53/65
54/65
55/65
56/65
57/65
58/65
59/65
60/65
61/65
62/65
63/65
64/65
65/65
生成事故风险图 c


In [10]:
# sample_files = [('save/grid_data_f_1.csv', 'save/grid_data_f_2.csv'), ('save/grid_data_c_1.csv', 'save/grid_data_c_2.csv')]
# sample_names = ['f', 'c']
# process_multiple_samples(sample_files, 8, sample_names)

In [11]:
# sample_files = [('save/grid_data_f_1.csv', 'save/grid_data_f_3.csv'), ('save/grid_data_c_1.csv', 'save/grid_data_c_3.csv')]
# sample_names = ['f', 'c']
# process_multiple_samples(sample_files, 8, sample_names)