In [4]:
import json
import random
import string
from collections import Counter

def generate_random_name(prefix=''):
    """Return ``prefix`` followed by five random alphanumeric characters.

    The suffix is drawn (with replacement) from the ASCII letters and digits
    using the module-level ``random`` generator, so two calls may return the
    same name.
    """
    alphabet = string.ascii_letters + string.digits
    suffix = ''.join(random.choices(alphabet, k=5))
    return prefix + suffix

def find_frequent_subsequences(sequences, length, removed_subsequences):
    """Return the windows of ``length`` consecutive items seen more than once.

    Args:
        sequences: Iterable of sequences whose items are strings.
        length: Number of consecutive items in each window.
        removed_subsequences: Container tested with ``in``; any item that is
            a member of it is dropped from a sequence before windows are
            taken.
            NOTE(review): ``process_sequences`` passes a set of tuples here
            while the sequence items are strings, in which case nothing is
            dropped -- confirm the intent.

    Returns:
        A list of tuples, in first-seen order, for every window whose total
        number of occurrences across all sequences is greater than one.
        Windows containing an item that starts with ``'<removed_'`` are never
        counted.
    """
    window_counts = Counter()
    for sequence in sequences:
        kept = [item for item in sequence if item not in removed_subsequences]
        last_start = len(kept) - length
        for start in range(last_start + 1):
            window = tuple(kept[start:start + length])
            # Skip windows that overlap an already inserted placeholder.
            if any(item.startswith('<removed_') for item in window):
                continue
            window_counts[window] += 1
    return [window for window, seen in window_counts.items() if seen > 1]

def replace_subsequences(seq, subsequences_map, placeholder):
    """Replace every mapped subsequence of ``seq`` with its node name.

    ``seq`` is scanned from left to right. At each position the longest key
    of ``subsequences_map`` that matches the upcoming items is replaced by
    its mapped name and the scan jumps past the matched items; if no key
    matches, the current item is copied unchanged.

    Args:
        seq: Sequence of hashable items (strings in this file).
        subsequences_map: Mapping from a tuple of items to the name that
            replaces it.
        placeholder: Unused; kept so existing callers keep working.

    Returns:
        A new list with the replacements applied. ``seq`` is not modified.
    """
    # Window sizes must come from the keys themselves. The number of entries
    # in the map says nothing about how long each key is, so deriving the
    # sizes from ``len(subsequences_map)`` missed keys longer than the map
    # size. Empty keys are skipped so the scan always advances.
    candidate_lengths = sorted(
        {len(key) for key in subsequences_map if len(key) > 0},
        reverse=True,
    )

    replaced_seq = []
    i = 0
    total = len(seq)
    while i < total:
        for length in candidate_lengths:  # longest match wins
            window = tuple(seq[i:i + length])
            # A short window means the key would run past the end of seq.
            if len(window) == length and window in subsequences_map:
                replaced_seq.append(subsequences_map[window])
                i += length
                break
        else:
            replaced_seq.append(seq[i])
            i += 1
    return replaced_seq

def process_sequences(sequences, lengths, placeholders):
    """Collapse frequent subsequences into named nodes, one length at a time.

    For each ``(length, placeholder)`` pair, taken in order, the subsequences
    of that length that occur more than once in the current sequences are
    given a random name starting with ``placeholder`` and are replaced by
    that name in every sequence.

    Args:
        sequences: List of sequences of strings. Only a shallow copy of the
            outer container is taken.
        lengths: Window sizes to process, in order.
        placeholders: Name prefixes paired positionally with ``lengths``;
            pairing stops at the shorter of the two.

    Returns:
        A ``(processed_sequences, node_map)`` tuple, where ``node_map`` maps
        each replaced subsequence (a tuple) to its generated name.
    """
    current = sequences[:]
    combined_map = {}
    seen_subsequences = set()

    for window, prefix in zip(lengths, placeholders):
        frequent = find_frequent_subsequences(current, window, seen_subsequences)

        # One freshly generated name per frequent subsequence of this size.
        step_map = {}
        for subseq in frequent:
            step_map[subseq] = generate_random_name(prefix=prefix)

        combined_map.update(step_map)
        seen_subsequences.update(step_map)

        current = [replace_subsequences(one, step_map, prefix) for one in current]

    return current, combined_map

def generate_flows(sequences, node_map):
    """Build one flow record per sequence from its placeholder items.

    Args:
        sequences: Iterable of sequences of strings.
        node_map: Mapping from a subsequence to its generated name.

    Returns:
        A list with one ``{"thru": [...], "value": 1}`` dict per sequence.
        ``thru`` holds, in order, the items that start with ``'<removed'``
        and are also a name in ``node_map``.

    Debug output is printed for every sequence, for every placeholder item,
    and once for ``node_map`` at the end.
    """
    name_to_subseq = {name: subseq for subseq, name in node_map.items()}

    flows = []
    for seq in sequences:
        print("sequences", seq)

        thru = []
        for item in seq:
            if not item.startswith('<removed'):
                # NOTE: non-placeholder items are left out of the flow; an
                # earlier draft mapped them to generated 'transaction' names.
                continue

            print("x", item)
            if item not in name_to_subseq:
                continue

            print("updated_seq", list(name_to_subseq[item]))
            thru.append(item)

        flows.append({"thru": thru, "value": 1})

    print('node_map', node_map)
    return flows

def format_nodes(node_map):
    """Turn ``node_map`` into a list of ``{"disp": ..., "name": ...}`` dicts.

    ``disp`` is the mapping key (the subsequence) and ``name`` is its mapped
    name; entries keep the iteration order of ``node_map``.
    """
    return [
        {"disp": subseq, "name": node_name}
        for subseq, node_name in node_map.items()
    ]

def separate_removed_subsequences(sequences, removed_subsequences):
    """Give every removed subsequence its own ``'<removed>'``-prefixed name.

    Args:
        sequences: Unused by this function.
        removed_subsequences: Iterable of subsequences; each is converted to
            a tuple and used as a key of the result.

    Returns:
        A dict mapping each subsequence tuple to a freshly generated random
        name.
    """
    return {
        tuple(subseq): generate_random_name(prefix='<removed>')
        for subseq in removed_subsequences
    }

# Raw input data: each inner list is one ordered sequence of item labels.
sequences = [
    ['Name1', 'Name2', 'Name3', 'Name4', 'Name5', 'Name6', 'Name7', 'Name8'],
    ['Name1', 'Name2', 'Name3', 'Name9', 'Name10', 'Name8'],
    ['Name1', 'Name2', 'Name3', 'Name9', 'Name11', 'Name8'],
    ['Name1', 'Name2', 'Name3', 'Name9', 'Name7', 'Name8'],
    ['Name1', 'Name2', 'Name3', 'Name9', 'Name12', 'Name13'],
    ['Name3', 'Name9', 'Name10', 'Name14', 'Name15', 'Name16'],
    ['Name17', 'Name4', 'Name5', 'Name6', 'Name7', 'Name8'],
    ['Name3', 'Name9', 'Name10', 'Name14', 'Name18'],
    ['Name3', 'Name9', 'Name11', 'Name19', 'Name18'],
    ['Name9', 'Name10', 'Name19', 'Name18'],
    ['Name20', 'Name6', 'Name7', 'Name8'],
    ['Name21', 'Name11', 'Name19', 'Name18'],
    ['Name22', 'Name10', 'Name8'],
    ['Name22', 'Name11', 'Name8'],
    ['Name1', 'Name14', 'Name18'],
    ['Name1', 'Name23', 'Name24'],
    ['Name25', 'Name11', 'Name8'],
    ['Name1', 'Name23', 'Name26'],
    ['Name27', 'Name23', 'Name26'],
    ['Name21', 'Name11', 'Name8']
]

# Subsequence lengths to collapse, processed in this order (longest first),
# and the name prefix paired positionally with each length.
lengths = [4, 3, 2]
placeholders = ['<removed_4>', '<removed_3>', '<removed_2>']

# Process the sequences step by step: collapse frequent subsequences into
# randomly named placeholder nodes, one configured length at a time.
processed_sequences, node_map = process_sequences(sequences, lengths, placeholders)

# Build separate nodes for map entries whose generated name contains '<removed>'.
# NOTE(review): names generated from the configured prefixes look like
# '<removed_4>xxxxx', which never contain the literal '<removed>', so this set
# appears to always be empty and the update below a no-op -- confirm intent.
removed_subsequences = {subseq for subseq, name in node_map.items() if '<removed>' in name}
individual_nodes = separate_removed_subsequences(sequences, removed_subsequences)
node_map.update(individual_nodes)

# Build the flow records (also prints debug output for every sequence).
flows = generate_flows(processed_sequences, node_map)

# Format the nodes as a list of {"disp": subsequence, "name": name} dicts.
nodes = format_nodes(node_map)

# Assemble the final result.
# NOTE(review): the "Data source" attribution looks unrelated to the Name*
# sample data used here -- confirm it is intended.
result = {
    "Data source": "[Robert J. MacG. Dawson](http://www.amstat.org/publications/jse/v3n3/datasets.dawson.html)",
    "nodes": nodes,
    "flows": flows
}

print(result)

print(node_map)

# Save as a JSON file; the tuple "disp" values are written as JSON arrays.
with open('result_with_check11.json', 'w') as f:
    json.dump(result, f, indent=4)

sequences ['Name1', 'Name2', 'Name3', '<removed_4>U7Snm', 'Name8']
x <removed_4>U7Snm
updated_seq ['Name4', 'Name5', 'Name6', 'Name7']
sequences ['<removed_4>ZeoXo', '<removed_2>ukYxH']
x <removed_4>ZeoXo
updated_seq ['Name1', 'Name2', 'Name3', 'Name9']
x <removed_2>ukYxH
updated_seq ['Name10', 'Name8']
sequences ['<removed_4>ZeoXo', '<removed_2>ZnnZu']
x <removed_4>ZeoXo
updated_seq ['Name1', 'Name2', 'Name3', 'Name9']
x <removed_2>ZnnZu
updated_seq ['Name11', 'Name8']
sequences ['<removed_4>ZeoXo', '<removed_2>9RKb7']
x <removed_4>ZeoXo
updated_seq ['Name1', 'Name2', 'Name3', 'Name9']
x <removed_2>9RKb7
updated_seq ['Name7', 'Name8']
sequences ['<removed_4>ZeoXo', 'Name12', 'Name13']
x <removed_4>ZeoXo
updated_seq ['Name1', 'Name2', 'Name3', 'Name9']
sequences ['<removed_4>8DAY8', 'Name15', 'Name16']
x <removed_4>8DAY8
updated_seq ['Name3', 'Name9', 'Name10', 'Name14']
sequences ['Name17', '<removed_4>U7Snm', 'Name8']
x <removed_4>U7Snm
updated_seq ['Name4', 'Name5', 'Name6', 'Name7'