In [3]:
import pandas as pd
import numpy as np
import os
import json
from collections import defaultdict


def calculate_jaccard(set1, set2):
    """Return the Jaccard coefficient of two sets.

    The coefficient is ``|set1 & set2| / |set1 | set2|``. When the union is
    empty (both inputs empty) the result is ``0``.
    """
    shared = set1.intersection(set2)
    combined = set1.union(set2)
    if not combined:
        # Nothing to compare: avoid dividing by zero.
        return 0
    return len(shared) / len(combined)


def calculate_overlap(set1, set2):
    """Return the overlap coefficient of two sets.

    The coefficient is the size of the intersection divided by the size of
    the smaller set. When either set is empty the result is ``0``.
    """
    common = set1.intersection(set2)
    smaller_size = min(len(set1), len(set2))
    if smaller_size <= 0:
        # At least one set is empty: avoid dividing by zero.
        return 0
    return len(common) / smaller_size


def calculate_modified_similarity(set1, set2, lambda_param=0.6):
    """Return a blended similarity score intended for large communities.

    The score is ``lambda_param * jaccard + (1 - lambda_param) * max_overlap``
    where ``jaccard`` is ``|A & B| / |A | B|`` and ``max_overlap`` is the
    larger of ``|A & B| / |A|`` and ``|A & B| / |B|`` (i.e. the intersection
    divided by the smaller set). Any term whose denominator would be zero
    contributes ``0``.

    Args:
        set1: First node set.
        set2: Second node set.
        lambda_param: Weight of the Jaccard term; ``1 - lambda_param`` is the
            weight of the overlap term.

    Returns:
        The combined similarity as a float.
    """
    # Build the intersection once; every term below only needs its size.
    shared = len(set1.intersection(set2))
    size1 = len(set1)
    size2 = len(set2)

    # |A | B| = |A| + |B| - |A & B| for sets, so no union set is materialised.
    union_size = size1 + size2 - shared
    jaccard = shared / union_size if union_size > 0 else 0

    # max(shared/size1, shared/size2) is shared divided by the smaller size;
    # if either set is empty the intersection is empty and the term is 0.
    smaller_size = min(size1, size2)
    max_overlap = shared / smaller_size if smaller_size > 0 else 0

    return lambda_param * jaccard + (1 - lambda_param) * max_overlap


def load_community_data(date):
    """Load the community membership table for one month.

    Reads ``rank{date}.csv`` from
    ``../visualization/assets/data/{date}/handle/`` and groups the values of
    its ``id`` column by the value of its ``community`` column.

    Returns:
        A dict mapping each community ID to the set of node IDs in it, or
        ``None`` (after printing a message) when the file does not exist.
    """
    file_path = f'../visualization/assets/data/{date}/handle/rank{date}.csv'
    if os.path.exists(file_path):
        table = pd.read_csv(file_path)

        # Walk the rows one by one, creating each community's set on first
        # sight and adding the node to it.
        communities = {}
        for _, record in table.iterrows():
            communities.setdefault(record['community'], set()).add(record['id'])
        return communities

    print(f"文件不存在: {file_path}")
    return None


def analyze_community_events(t0_date, t1_date):
    """Detect community evolution events between two time points.

    Communities for both dates are loaded with ``load_community_data`` and
    split into "large" (>= 100 nodes) and "small" (< 100 nodes). Events are
    then assigned greedily in a fixed priority order, and a community that
    has been assigned to one event is not considered again:

    1. Large -> large one-to-one matches ("延续" continuation, relabelled
       "增加" growth / "减少" shrinkage when the size ratio leaves 0.8..1.2).
    2. Splits ("分裂") of remaining large t0 communities into large or small
       t1 communities.
    3. Merges ("合并") of large or small t0 communities into remaining large
       t1 communities.
    4. Remaining large t0 communities are recorded as "消亡" (dissolved) and
       remaining large t1 communities as "新生" (newly formed).
    5. Small -> small one-to-one matches (same labels as step 1, with size
       ratio bounds 0.7..1.3).
    6. Splits of small t0 communities with at least 30 nodes.
    7. Merges into small t1 communities with at least 30 nodes.

    Small communities still unassigned after step 7 produce no event.

    Args:
        t0_date: Identifier of the earlier snapshot, passed to
            ``load_community_data``.
        t1_date: Identifier of the later snapshot, passed to
            ``load_community_data``.

    Returns:
        A list of event dicts, or ``None`` if data for either date is
        missing. Every event has ``source_date``, ``source_community``,
        ``target_date``, ``target_community`` and ``event_type``.
        One-to-one events also carry ``similarity`` and
        ``size_change_ratio``; split/merge events carry ``overlap_score``
        and use a list of community IDs on the "many" side; dissolved/new
        events carry ``similarity`` 0 and ``None`` on the missing side.
    """
    print(f"分析 {t0_date} 和 {t1_date} 之间的社区事件")

    # Load the community data for both time points
    t0_communities = load_community_data(t0_date)
    t1_communities = load_community_data(t1_date)

    if t0_communities is None or t1_communities is None:
        print("缺少社区数据，跳过分析")
        return None

    # Track which communities have already been assigned to an event
    t0_assigned = set()
    t1_assigned = set()

    # Collected event records
    events = []

    # Partition communities into large (>= 100 nodes) and small (< 100 nodes)
    t0_large_communities = {cid: nodes for cid, nodes in t0_communities.items() if len(nodes) >= 100}
    t0_small_communities = {cid: nodes for cid, nodes in t0_communities.items() if len(nodes) < 100}
    t1_large_communities = {cid: nodes for cid, nodes in t1_communities.items() if len(nodes) >= 100}
    t1_small_communities = {cid: nodes for cid, nodes in t1_communities.items() if len(nodes) < 100}

    print(f"t0大社区数量: {len(t0_large_communities)}, 小社区数量: {len(t0_small_communities)}")
    print(f"t1大社区数量: {len(t1_large_communities)}, 小社区数量: {len(t1_small_communities)}")

    # Step 1: continuation events for large communities (highest priority).
    # Sources are visited from largest to smallest, so larger communities
    # get first pick of the still-unassigned targets.
    for t0_comm_id, t0_nodes in sorted(t0_large_communities.items(), key=lambda x: len(x[1]), reverse=True):
        if t0_comm_id in t0_assigned:
            continue

        best_match = None
        best_similarity = 0
        for t1_comm_id, t1_nodes in sorted(t1_large_communities.items(), key=lambda x: len(x[1]), reverse=True):
            if t1_comm_id in t1_assigned:
                continue

            # Use the modified (Jaccard + overlap) similarity.
            # The strict ">" keeps the first (larger) candidate on ties and
            # never selects a candidate with zero similarity.
            similarity = calculate_modified_similarity(t0_nodes, t1_nodes)
            if similarity > best_similarity:
                best_similarity = similarity
                best_match = (t1_comm_id, t1_nodes)

        # A high similarity indicates a continuation event
        if best_match and best_similarity >= 0.5:
            t1_comm_id, t1_nodes = best_match

            # Decide whether this is a plain continuation or one with a size change
            t0_size = len(t0_nodes)
            t1_size = len(t1_nodes)
            size_change_ratio = t1_size / t0_size if t0_size > 0 else float('inf')

            event_type = "延续"
            # Check for a noticeable growth or shrinkage
            if size_change_ratio > 1.2:  # grew by more than 20%
                event_type = "增加"
            elif size_change_ratio < 0.8:  # shrank by more than 20%
                event_type = "减少"

            events.append({
                "source_date": t0_date,
                "source_community": t0_comm_id,
                "target_date": t1_date,
                "target_community": t1_comm_id,
                "event_type": event_type,
                "similarity": best_similarity,
                "size_change_ratio": size_change_ratio
            })

            t0_assigned.add(t0_comm_id)
            t1_assigned.add(t1_comm_id)

    # Step 2: split events for large communities - tuned logic
    for t0_comm_id, t0_nodes in sorted(t0_large_communities.items(), key=lambda x: len(x[1]), reverse=True):
        if t0_comm_id in t0_assigned:
            continue

        # Look for possible split targets (both large and small communities)
        split_candidates = []

        # Check large communities first
        for t1_comm_id, t1_nodes in sorted(t1_large_communities.items(), key=lambda x: len(x[1]), reverse=True):
            if t1_comm_id in t1_assigned:
                continue

            # Fraction of the source community's nodes found in this target
            overlap = len(t0_nodes.intersection(t1_nodes)) / len(t0_nodes)
            # Threshold lowered from 0.2 to 0.15
            if overlap >= 0.15:
                split_candidates.append((t1_comm_id, overlap, len(t1_nodes)))

        # Then check small communities
        for t1_comm_id, t1_nodes in sorted(t1_small_communities.items(), key=lambda x: len(x[1]), reverse=True):
            if t1_comm_id in t1_assigned:
                continue

            overlap = len(t0_nodes.intersection(t1_nodes)) / len(t0_nodes)
            # Threshold lowered from 0.1 to 0.05
            if overlap >= 0.05:
                split_candidates.append((t1_comm_id, overlap, len(t1_nodes)))

        # Sort by overlap fraction, highest first
        split_candidates.sort(key=lambda x: x[1], reverse=True)

        # Relaxed requirements on candidate count and cumulative coverage
        if len(split_candidates) >= 1:
            cumulative_overlap = sum(overlap for _, overlap, _ in split_candidates)

            # Relaxed cumulative-coverage requirement; a single candidate is
            # also accepted when its overlap alone is high enough
            if (len(split_candidates) >= 2 and cumulative_overlap >= 0.4) or \
               (len(split_candidates) == 1 and cumulative_overlap >= 0.6):
                selected_targets = []
                selected_overlap = 0

                # Take candidates in order until the accumulated coverage
                # reaches the (lowered) stop threshold
                for t1_comm_id, overlap, _ in split_candidates:
                    if selected_overlap >= 0.7:  # lowered from 0.8 to 0.7
                        break

                    selected_targets.append(t1_comm_id)
                    selected_overlap += overlap
                    t1_assigned.add(t1_comm_id)

                # The first candidate is always taken (selected_overlap starts
                # at 0), so this list is non-empty whenever this point is reached
                if selected_targets:
                    events.append({
                        "source_date": t0_date,
                        "source_community": t0_comm_id,
                        "target_date": t1_date,
                        "target_community": selected_targets,
                        "event_type": "分裂",
                        "overlap_score": selected_overlap
                    })
                    t0_assigned.add(t0_comm_id)

    # Step 3: merge events for large communities - tuned logic
    # (mirror image of step 2, with overlap measured against the target)
    for t1_comm_id, t1_nodes in sorted(t1_large_communities.items(), key=lambda x: len(x[1]), reverse=True):
        if t1_comm_id in t1_assigned:
            continue

        # Look for possible merge sources (both large and small communities)
        merge_candidates = []

        # Check large communities first
        for t0_comm_id, t0_nodes in sorted(t0_large_communities.items(), key=lambda x: len(x[1]), reverse=True):
            if t0_comm_id in t0_assigned:
                continue

            # Fraction of the target community's nodes that came from this source
            overlap = len(t0_nodes.intersection(t1_nodes)) / len(t1_nodes)
            # Threshold lowered from 0.2 to 0.15
            if overlap >= 0.15:
                merge_candidates.append((t0_comm_id, overlap, len(t0_nodes)))

        # Then check small communities
        for t0_comm_id, t0_nodes in sorted(t0_small_communities.items(), key=lambda x: len(x[1]), reverse=True):
            if t0_comm_id in t0_assigned:
                continue

            overlap = len(t0_nodes.intersection(t1_nodes)) / len(t1_nodes)
            # Threshold lowered from 0.1 to 0.05
            if overlap >= 0.05:
                merge_candidates.append((t0_comm_id, overlap, len(t0_nodes)))

        # Sort by overlap fraction, highest first
        merge_candidates.sort(key=lambda x: x[1], reverse=True)

        # Relaxed requirements on candidate count and cumulative coverage
        if len(merge_candidates) >= 1:
            cumulative_overlap = sum(overlap for _, overlap, _ in merge_candidates)

            # Relaxed cumulative-coverage requirement; a single candidate is
            # also accepted when its overlap alone is high enough
            if (len(merge_candidates) >= 2 and cumulative_overlap >= 0.4) or \
               (len(merge_candidates) == 1 and cumulative_overlap >= 0.6):
                selected_sources = []
                selected_overlap = 0

                # Take candidates in order until the accumulated coverage
                # reaches the (lowered) stop threshold
                for t0_comm_id, overlap, _ in merge_candidates:
                    if selected_overlap >= 0.7:  # lowered from 0.8 to 0.7
                        break

                    selected_sources.append(t0_comm_id)
                    selected_overlap += overlap
                    t0_assigned.add(t0_comm_id)

                if selected_sources:
                    events.append({
                        "source_date": t0_date,
                        "source_community": selected_sources,
                        "target_date": t1_date,
                        "target_community": t1_comm_id,
                        "event_type": "合并",
                        "overlap_score": selected_overlap
                    })
                    t1_assigned.add(t1_comm_id)

    # Step 4: remaining large communities become dissolved / newly formed events.
    # The len(...) >= 100 checks repeat the filter already applied when the
    # large-community dicts were built.
    for t0_comm_id, t0_nodes in t0_large_communities.items():
        if t0_comm_id not in t0_assigned and len(t0_nodes) >= 100:
            events.append({
                "source_date": t0_date,
                "source_community": t0_comm_id,
                "target_date": t1_date,
                "target_community": None,
                "event_type": "消亡",
                "similarity": 0
            })
            t0_assigned.add(t0_comm_id)

    for t1_comm_id, t1_nodes in t1_large_communities.items():
        if t1_comm_id not in t1_assigned and len(t1_nodes) >= 100:
            events.append({
                "source_date": t0_date,
                "source_community": None,
                "target_date": t1_date,
                "target_community": t1_comm_id,
                "event_type": "新生",
                "similarity": 0
            })
            t1_assigned.add(t1_comm_id)

    # Step 5: small-community events (lower priority)
    # Continuation events between small communities
    for t0_comm_id, t0_nodes in sorted(t0_small_communities.items(), key=lambda x: len(x[1]), reverse=True):
        if t0_comm_id in t0_assigned:
            continue

        best_match = None
        best_overlap = 0

        for t1_comm_id, t1_nodes in sorted(t1_small_communities.items(), key=lambda x: len(x[1]), reverse=True):
            if t1_comm_id in t1_assigned:
                continue

            # Small communities are compared with the overlap coefficient
            overlap = calculate_overlap(t0_nodes, t1_nodes)
            if overlap > best_overlap:
                best_overlap = overlap
                best_match = (t1_comm_id, t1_nodes)

        # Lowered high-overlap threshold to produce more small-community continuations
        if best_match and best_overlap >= 0.5:  # lowered from 0.6 to 0.5
            t1_comm_id, t1_nodes = best_match

            # Decide whether this is a plain continuation or one with a size change
            t0_size = len(t0_nodes)
            t1_size = len(t1_nodes)
            size_change_ratio = t1_size / t0_size if t0_size > 0 else float('inf')

            event_type = "延续"
            # Check for a noticeable growth or shrinkage
            if size_change_ratio > 1.3:  # small communities use looser bounds
                event_type = "增加"
            elif size_change_ratio < 0.7:
                event_type = "减少"

            # Note: for small communities "similarity" holds the overlap coefficient
            events.append({
                "source_date": t0_date,
                "source_community": t0_comm_id,
                "target_date": t1_date,
                "target_community": t1_comm_id,
                "event_type": event_type,
                "similarity": best_overlap,
                "size_change_ratio": size_change_ratio
            })

            t0_assigned.add(t0_comm_id)
            t1_assigned.add(t1_comm_id)

    # Step 6: split events whose source is a small community
    for t0_comm_id, t0_nodes in sorted(t0_small_communities.items(), key=lambda x: len(x[1]), reverse=True):
        if t0_comm_id in t0_assigned or len(t0_nodes) < 30:  # ignore communities that are too small
            continue

        # Look for possible split targets (small communities only)
        split_candidates = []

        for t1_comm_id, t1_nodes in sorted(t1_small_communities.items(), key=lambda x: len(x[1]), reverse=True):
            if t1_comm_id in t1_assigned:
                continue

            # Compare using the overlap coefficient
            overlap = calculate_overlap(t0_nodes, t1_nodes)
            if overlap >= 0.3:  # smaller communities use a higher overlap threshold
                split_candidates.append((t1_comm_id, overlap, len(t1_nodes)))

        # Require at least 2 candidates and a sufficiently high cumulative overlap
        if len(split_candidates) >= 2:
            cumulative_overlap = sum(overlap for _, overlap, _ in split_candidates)

            # NOTE(review): with at least two candidates each >= 0.3 this
            # check looks like it is always satisfied - confirm the intent
            if cumulative_overlap >= 0.6:  # small communities require higher coverage
                selected_targets = []

                # Unlike step 2, every candidate is selected (no early stop)
                for t1_comm_id, overlap, _ in split_candidates:
                    selected_targets.append(t1_comm_id)
                    t1_assigned.add(t1_comm_id)

                if selected_targets:
                    # overlap_score is a sum of overlap coefficients and is
                    # therefore not bounded by 1
                    events.append({
                        "source_date": t0_date,
                        "source_community": t0_comm_id,
                        "target_date": t1_date,
                        "target_community": selected_targets,
                        "event_type": "分裂",
                        "overlap_score": cumulative_overlap
                    })
                    t0_assigned.add(t0_comm_id)

    # Step 7: merge events whose target is a small community
    for t1_comm_id, t1_nodes in sorted(t1_small_communities.items(), key=lambda x: len(x[1]), reverse=True):
        if t1_comm_id in t1_assigned or len(t1_nodes) < 30:  # ignore communities that are too small
            continue

        # Look for possible merge sources (small communities only)
        merge_candidates = []

        for t0_comm_id, t0_nodes in sorted(t0_small_communities.items(), key=lambda x: len(x[1]), reverse=True):
            if t0_comm_id in t0_assigned:
                continue

            # Compare using the overlap coefficient
            overlap = calculate_overlap(t1_nodes, t0_nodes)
            if overlap >= 0.3:  # smaller communities use a higher overlap threshold
                merge_candidates.append((t0_comm_id, overlap, len(t0_nodes)))

        # Require at least 2 candidates and a sufficiently high cumulative overlap
        if len(merge_candidates) >= 2:
            cumulative_overlap = sum(overlap for _, overlap, _ in merge_candidates)

            if cumulative_overlap >= 0.6:  # small communities require higher coverage
                selected_sources = []

                # Every candidate is selected (no early stop)
                for t0_comm_id, overlap, _ in merge_candidates:
                    selected_sources.append(t0_comm_id)
                    t0_assigned.add(t0_comm_id)

                if selected_sources:
                    events.append({
                        "source_date": t0_date,
                        "source_community": selected_sources,
                        "target_date": t1_date,
                        "target_community": t1_comm_id,
                        "event_type": "合并",
                        "overlap_score": cumulative_overlap
                    })
                    t1_assigned.add(t1_comm_id)

    return events


def _json_default(value):
    """Convert values that the ``json`` encoder cannot handle natively.

    NumPy scalars (e.g. ``numpy.int64`` community IDs read from a CSV) become
    the equivalent Python scalar and sets become lists. Anything else raises
    ``TypeError``, which is what ``json`` expects from a ``default`` hook.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, (set, frozenset)):
        return list(value)
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def generate_all_events(months=None, output_dir='../visualization/assets/data/events'):
    """Generate community events for every pair of consecutive months.

    Each adjacent pair ``(months[i], months[i + 1])`` is passed to
    ``analyze_community_events``; pairs for which it returns ``None`` or an
    empty list contribute nothing. The combined events are written to
    ``community_events.json`` and ``community_events.csv`` inside
    ``output_dir`` (created if missing).

    Args:
        months: Ordered iterable of month identifiers. Defaults to
            ``202401`` through ``202410`` inclusive.
        output_dir: Directory that receives the two output files.

    Returns:
        The list of all event dicts, in month order.
    """
    if months is None:
        months = range(202401, 202411)
    months = list(months)

    # Pair each month with its successor; fewer than two months yields no pairs.
    all_events = []
    for t0, t1 in zip(months, months[1:]):
        events = analyze_community_events(t0, t1)
        if events:
            all_events.extend(events)

    # Save the event data
    os.makedirs(output_dir, exist_ok=True)

    # The default hook handles NumPy scalars and sets, so IDs coming straight
    # out of pandas no longer make json.dump raise TypeError.
    with open(f'{output_dir}/community_events.json', 'w', encoding='utf-8') as f:
        json.dump(all_events, f, indent=2, default=_json_default)

    # Also write a CSV version
    events_df = pd.DataFrame(all_events)
    events_df.to_csv(f'{output_dir}/community_events.csv', index=False)

    print(f"生成了 {len(all_events)} 个社区事件")
    return all_events


# Run the event generation for all months
all_events = generate_all_events()

分析 202401 和 202402 之间的社区事件
t0大社区数量: 8, 小社区数量: 13
t1大社区数量: 9, 小社区数量: 50
分析 202402 和 202403 之间的社区事件
t0大社区数量: 9, 小社区数量: 50
t1大社区数量: 9, 小社区数量: 12
分析 202403 和 202404 之间的社区事件
t0大社区数量: 9, 小社区数量: 12
t1大社区数量: 8, 小社区数量: 15
分析 202404 和 202405 之间的社区事件
t0大社区数量: 8, 小社区数量: 15
t1大社区数量: 7, 小社区数量: 20
分析 202405 和 202406 之间的社区事件
t0大社区数量: 7, 小社区数量: 20
t1大社区数量: 7, 小社区数量: 26
分析 202406 和 202407 之间的社区事件
t0大社区数量: 7, 小社区数量: 26
t1大社区数量: 8, 小社区数量: 22
分析 202407 和 202408 之间的社区事件
t0大社区数量: 8, 小社区数量: 22
t1大社区数量: 6, 小社区数量: 23
分析 202408 和 202409 之间的社区事件
t0大社区数量: 6, 小社区数量: 23
t1大社区数量: 10, 小社区数量: 14
分析 202409 和 202410 之间的社区事件
t0大社区数量: 10, 小社区数量: 14
t1大社区数量: 11, 小社区数量: 34
生成了 159 个社区事件
