In [25]:
import random
from faker import Faker
import json
from datetime import datetime, timedelta

# Initialise the Faker library with the Simplified Chinese locale
fake = Faker('zh_CN')

# Fixed list of trading institutions (ten fake company names)
institutions = []
while len(institutions) < 10:
    institutions.append(fake.company())

# 生成单个交易
def generate_trade(trade_id, seller, buyer, start_time, price):
    """Build one trade record with a random volume and a random execution time.

    Args:
        trade_id: Identifier stored as-is in the record.
        seller: Selling party label.
        buyer: Buying party label.
        start_time: ``datetime`` the random time offset is added to.
        price: Trade price; rounded to two decimals in the record.

    Returns:
        dict with the keys ``trade_id``, ``seller``, ``buyer``, ``volume``,
        ``price`` and ``time_dl`` (a formatted timestamp string).
    """
    lots = random.randint(1, 100)

    # Draw the offset components in a fixed order (days, hours, minutes,
    # seconds) so that seeded runs stay reproducible.
    day_shift = random.randint(0, 25)
    hour_shift = random.randint(0, 23)
    minute_shift = random.randint(0, 59)
    second_shift = random.randint(0, 59)
    executed_at = start_time + timedelta(
        days=day_shift,
        hours=hour_shift,
        minutes=minute_shift,
        seconds=second_shift,
    )

    record = {}
    record["trade_id"] = trade_id
    record["seller"] = seller
    record["buyer"] = buyer
    record["volume"] = lots
    record["price"] = round(price, 2)
    # %z renders as an empty string when the datetime carries no tzinfo.
    record["time_dl"] = executed_at.strftime("%Y-%m-%d %H:%M:%S%z")
    return record

# 生成基础子序列
def generate_subsequence(base_price, length, start_id):
    """Generate a chain of trades in which each buyer becomes the next seller.

    Party labels are an entry picked from ``institutions`` concatenated with a
    name produced by ``fake``. The price starts at ``base_price`` and is
    updated before every trade.

    Args:
        base_price: Price the chain starts from.
        length: Number of trades to generate.
        start_id: Trade id of the first trade; ids increase by one per trade.

    Returns:
        list of trade dicts as produced by ``generate_trade``.
    """
    chain = []
    current_price = base_price
    current_seller = random.choice(institutions) + fake.name()

    for trade_id in range(start_id, start_id + length):
        counterparty = random.choice(institutions) + fake.name()

        # 90% of the time the price rises by a small absolute amount;
        # the remaining 10% of the time it jumps by 3%.
        if random.random() >= 0.1:
            current_price += random.uniform(0.1, 2.0)
        else:
            current_price *= 1.03

        # Each trade's random time offset is applied to the current wall clock.
        chain.append(
            generate_trade(trade_id, current_seller, counterparty, datetime.now(), current_price)
        )

        # The buyer of this trade sells in the next one.
        current_seller = counterparty

    return chain

# 生成模拟链路数据
def generate_mock_data(num_paths, max_trades_per_path, base_subsequences):
    """Assemble mock trade paths by stitching base subsequences together.

    For every path a random number of base subsequences is sampled. The first
    one is taken whole; each following one is appended from the first trade
    whose seller equals the buyer of the path's current last trade (and is
    skipped when no such trade exists). The path is then truncated to the
    drawn length and closed into a loop by setting the last buyer to the
    first seller.

    The trades placed in a path are shallow copies, so closing the loop never
    alters ``base_subsequences`` or the trades of another path.

    Args:
        num_paths: Number of paths to generate.
        max_trades_per_path: Upper bound for the trades kept per path; the
            lower bound is 5, so this must be at least 5.
        base_subsequences: Non-empty list of subsequences, each expected to be
            a non-empty list of trade dicts with ``seller`` and ``buyer`` keys.

    Returns:
        list of dicts with the keys ``path``, ``start_time``, ``end_time``,
        ``path_trades`` and four randomly drawn anomaly scores.

    Raises:
        ValueError: If ``base_subsequences`` is empty or
            ``max_trades_per_path`` is below 5 (the latter is raised by
            ``random.randint``).
    """
    mock_data = []

    # Every path reports the same fixed 25-day window, so compute it once.
    start_time = datetime(2023, 12, 1, 0, 0, 0)
    end_time = start_time + timedelta(days=25)

    for _ in range(num_paths):
        num_trades = random.randint(5, max_trades_per_path)
        profit_anomaly_score = round(random.uniform(50, 150), 2)
        institution_counts_anomaly_score = round(random.uniform(1, 5), 2)
        price_standard_deviation_anomaly_score = round(random.uniform(0, 1), 2)
        price_change_anomaly_score = round(random.uniform(0, 1), 2)

        # Pick between one and num_trades // 2 subsequences to combine.
        max_pick = min(len(base_subsequences), num_trades // 2)
        if max_pick < 1:
            raise ValueError("base_subsequences must contain at least one subsequence")
        subsequences_to_use = random.sample(base_subsequences, k=random.randint(1, max_pick))

        # The first sampled subsequence seeds the path.
        first_subsequence, *remaining = subsequences_to_use
        path_trades = list(first_subsequence)

        # Keep the chain continuous: join each further subsequence at the
        # first trade sold by the path's current last buyer, if there is one.
        for subseq in remaining:
            last_buyer = path_trades[-1]['buyer']
            for i, trade in enumerate(subseq):
                if trade['seller'] == last_buyer:
                    path_trades.extend(subseq[i:])
                    break

        # Enforce the drawn length and copy each trade: the closing step below
        # rewrites a buyer, and the originals are shared with
        # base_subsequences and with other paths.
        path_trades = [dict(trade) for trade in path_trades[:num_trades]]

        # Close the loop so the path ends where it started.
        if path_trades and path_trades[0]['seller'] != path_trades[-1]['buyer']:
            path_trades[-1]['buyer'] = path_trades[0]['seller']

        mock_data.append({
            "path": f"{path_trades[0]['seller']} to {path_trades[-1]['buyer']}",
            "start_time": start_time.strftime("%Y-%m-%d %H:%M:%S"),
            "end_time": end_time.strftime("%Y-%m-%d %H:%M:%S"),
            "path_trades": path_trades,
            "profit_anomaly_score": profit_anomaly_score,
            "institution_counts_anomaly_score": institution_counts_anomaly_score,
            "price_standard_deviation_anomaly_score": price_standard_deviation_anomaly_score,
            "price_change_anomaly_score": price_change_anomaly_score
        })

    return mock_data

# Build the pool of base subsequences
base_subsequences = []
for _ in range(20):
    # Trade ids continue after all trades generated so far
    start_id = sum(map(len, base_subsequences)) + 1
    base_subsequences.append(
        generate_subsequence(100.0, random.randint(5, 10), start_id)
    )

# Generate the mock path data
mock_data = generate_mock_data(
    num_paths=10,
    max_trades_per_path=8,
    base_subsequences=base_subsequences,
)

# Print the data as indented JSON, keeping non-ASCII characters readable
print(json.dumps(mock_data, ensure_ascii=False, indent=4))


[
    {
        "path": "东方峻景信息有限公司张秀英 to 东方峻景信息有限公司张秀英",
        "start_time": "2023-12-01 00:00:00",
        "end_time": "2023-12-26 00:00:00",
        "path_trades": [
            {
                "trade_id": 11,
                "seller": "东方峻景信息有限公司张秀英",
                "buyer": "精芯科技有限公司杜志强",
                "volume": 57,
                "price": 101.1,
                "time_dl": "2024-08-11 03:42:48"
            },
            {
                "trade_id": 12,
                "seller": "精芯科技有限公司杜志强",
                "buyer": "凌云网络有限公司向瑜",
                "volume": 58,
                "price": 102.78,
                "time_dl": "2024-08-14 04:16:31"
            },
            {
                "trade_id": 13,
                "seller": "凌云网络有限公司向瑜",
                "buyer": "凌云网络有限公司曹凤兰",
                "volume": 29,
                "price": 103.23,
                "time_dl": "2024-08-23 02:21:41"
            },
            {
                "trade_id": 14,
                "selle

In [26]:
import json
from collections import defaultdict

# Assumes mock_data is the path data generated earlier in the notebook.
# NOTE(review): this self-assignment is a no-op; it raises NameError when
# mock_data has not been defined yet.
mock_data = mock_data

# 提取子序列
def extract_subsequences(path_trades):
    """Enumerate every contiguous run of two or more trades in a path.

    Args:
        path_trades: Sequence of trade dicts with ``seller`` and ``buyer`` keys.

    Returns:
        list of tuples; each tuple holds the ``(seller, buyer)`` pairs of one
        run. Runs are ordered by start position, then by increasing length.
    """
    total = len(path_trades)
    if total < 2:
        # No run of length two exists, so no trade needs to be inspected.
        return []

    hops = [(trade['seller'], trade['buyer']) for trade in path_trades]
    return [
        tuple(hops[first:stop])
        for first in range(total)
        for stop in range(first + 2, total + 1)
    ]

# 标识公共子序列并进行合并
def merge_common_subsequences(data):
    """Find subsequences shared by more than one path and list those paths.

    Args:
        data: Iterable of path dicts, each with a ``path`` label and a
            ``path_trades`` list accepted by ``extract_subsequences``.

    Returns:
        defaultdict(list) mapping each subsequence that occurs in at least two
        entries of ``data`` to the distinct ``path`` labels of those entries,
        in first-seen order.
    """
    subsequence_map = defaultdict(list)

    for item in data:
        # dict.fromkeys de-duplicates while keeping first-seen order, so a
        # subsequence that repeats inside one path registers that path once
        # and cannot pass the "more than one path" check on its own.
        for subseq in dict.fromkeys(extract_subsequences(item['path_trades'])):
            subsequence_map[subseq].append(item)

    # Map every shared subsequence to the labels of the paths containing it.
    merged_subsequences = defaultdict(list)
    for subseq, paths in subsequence_map.items():
        if len(paths) > 1:
            # Different paths may carry the same label; keep each label once,
            # in a deterministic order.
            merged_subsequences[subseq] = list(dict.fromkeys(path['path'] for path in paths))

    return merged_subsequences

# 将合并后的数据转换为结构化数据
def convert_to_structured_data(merged_subsequences):
    """Turn the subsequence-to-paths mapping into a list of records.

    Args:
        merged_subsequences: Mapping from a subsequence to its list of paths.

    Returns:
        list with one ``{"subsequence": ..., "paths": ...}`` dict per mapping
        entry, in the mapping's iteration order.
    """
    return [
        {"subsequence": shared_run, "paths": path_labels}
        for shared_run, path_labels in merged_subsequences.items()
    ]

# Merge the common subsequences, then convert them into structured records
merged_subsequences = merge_common_subsequences(mock_data)
structured_data = convert_to_structured_data(merged_subsequences)

# Print the structured data as indented JSON, keeping non-ASCII text readable
print(
    json.dumps(
        structured_data,
        ensure_ascii=False,
        indent=4,
    )
)


[
    {
        "subsequence": [
            [
                "东方峻景信息有限公司张秀英",
                "精芯科技有限公司杜志强"
            ],
            [
                "精芯科技有限公司杜志强",
                "凌云网络有限公司向瑜"
            ]
        ],
        "paths": [
            "东方峻景信息有限公司张秀英 to 东方峻景信息有限公司张秀英"
        ]
    },
    {
        "subsequence": [
            [
                "东方峻景信息有限公司张秀英",
                "精芯科技有限公司杜志强"
            ],
            [
                "精芯科技有限公司杜志强",
                "凌云网络有限公司向瑜"
            ],
            [
                "凌云网络有限公司向瑜",
                "凌云网络有限公司曹凤兰"
            ]
        ],
        "paths": [
            "东方峻景信息有限公司张秀英 to 东方峻景信息有限公司张秀英"
        ]
    },
    {
        "subsequence": [
            [
                "东方峻景信息有限公司张秀英",
                "精芯科技有限公司杜志强"
            ],
            [
                "精芯科技有限公司杜志强",
                "凌云网络有限公司向瑜"
            ],
            [
                "凌云网络有限公司向瑜",
                "凌云网络有限公司曹凤兰"
            ],
  