In [1]:
from chain_project.path_linking import summarize_paths, extract_database, merge_subsequences, prefix_span, restore_trades
from chain_project.main import process_transactions

def map_hashes_to_names(database):
    """Replace every hash in ``database`` with a short sequential label.

    Labels are handed out in order of first appearance: the first distinct
    hash becomes ``"Name1"``, the next ``"Name2"``, and so on.

    Args:
        database: Collection of sequences whose items are hashable values.
            It is traversed twice, so it must be re-iterable.

    Returns:
        A ``(mapped_database, hash_to_name)`` tuple: the sequences with each
        item replaced by its label (as a list of lists), and the dict that
        maps each original item to its label.
    """
    hash_to_name = {}

    # First pass: assign a label the first time each hash is seen. The next
    # label number is always one more than the number of labels handed out.
    for sequence in database:
        for item in sequence:
            hash_to_name.setdefault(item, f"Name{len(hash_to_name) + 1}")

    # Second pass: rewrite every sequence using the finished mapping.
    mapped_database = [
        [hash_to_name[item] for item in sequence]
        for sequence in database
    ]

    return mapped_database, hash_to_name

# CSV input file; the path is relative to the notebook's working directory.
file_path = 'static/bond_2005496_2006_2402.csv'

# Names passed to process_transactions as its second argument
# (presumably the institutions/traders of interest -- confirm against chain_project.main).
inst_list = [
    '长线资本基金孙姣',
    '国金证券股份严佳',
    '华创证券有限马延威',
    '潍坊银行股份王梓涵',
    '鄂尔多斯银行郭宁',
    '粤开证券股份周荃',
    '交通银行股份何嘉隆',
    '华源证券股份钱淑雯',
]

# Pipeline: process the transactions, summarize the paths, then extract the
# database of trade-hash sequences. Each intermediate is kept as its own
# notebook variable.
data = process_transactions(file_path, inst_list)
json_output = summarize_paths(data)
trade_hashes = extract_database(json_output)

In [14]:
# Raw data: the trade-hash sequences produced by extract_database in the first cell.
database = trade_hashes

# Swap each hash for a short "NameN" label so the sequences are readable;
# hash_to_name keeps the hash -> label mapping for translating results back.
sequences, hash_to_name = map_hashes_to_names(database)

# # Print the results (debugging aid, left disabled).
# # The mapped sequences are bound to `sequences` above.
# print("Mapped Database:")
# for seq in sequences:
#     print(seq)

# print("\nHash to Name Mapping:")
# for hash_value, name in hash_to_name.items():
#     print(f"{hash_value}: {name}")


In [15]:
from collections import defaultdict
def generate_subsequences(seq, min_length=1):
    """Collect every contiguous slice of ``seq`` with at least ``min_length`` items.

    Args:
        seq: A sliceable sequence (list, tuple, str, ...) of hashable items.
        min_length: Smallest slice length to include (default 1).

    Returns:
        A set of tuples, one per distinct contiguous slice. Slices that occur
        at several positions appear only once.
    """
    total = len(seq)
    return {
        tuple(seq[begin:begin + width])
        for width in range(min_length, total + 1)
        for begin in range(total - width + 1)
    }

def find_frequent_subsequences(sequences, min_occurrences, min_length=3):
    """Find contiguous subsequences shared by at least ``min_occurrences`` sequences.

    Each input sequence contributes at most 1 to a subsequence's count, however
    many times that subsequence repeats inside it, so the count is the number
    of sequences that contain the subsequence.

    Args:
        sequences: Iterable of sequences of hashable items. Any iterable works,
            including a generator; it is traversed exactly once.
        min_occurrences: Minimum number of sequences a subsequence must appear
            in to be kept.
        min_length: Minimum subsequence length to consider. Defaults to 3,
            the value that used to be hard-coded here.

    Returns:
        A dict mapping each qualifying subsequence (as a tuple) to the number
        of sequences containing it.
    """
    subseq_counts = defaultdict(int)

    for seq in sequences:
        # generate_subsequences returns a set, so each distinct subsequence is
        # already counted only once per sequence -- no extra de-duplication.
        for subseq in generate_subsequences(seq, min_length=min_length):
            subseq_counts[subseq] += 1

    # Keep only the subsequences that reach the occurrence threshold.
    return {
        subseq: count
        for subseq, count in subseq_counts.items()
        if count >= min_occurrences
    }

# Find the subsequences that appear in at least `min_occurrences` sequences
# (the threshold used for this run is 2) and print each one with its count.
min_occurrences = 2
frequent_subsequences = find_frequent_subsequences(sequences, min_occurrences)
print(f"Frequent subsequences appearing in at least {min_occurrences} sequences:")
for subseq, count in frequent_subsequences.items():
    print(f"Subsequence: {subseq}, Count: {count}")

Frequent subsequences appearing in at least 2 sequences:
Subsequence: ('Name5', 'Name6', 'Name7'), Count: 2
Subsequence: ('Name5', 'Name6', 'Name7', 'Name8'), Count: 2
Subsequence: ('Name1', 'Name2', 'Name3'), Count: 5
Subsequence: ('Name6', 'Name7', 'Name8'), Count: 3
Subsequence: ('Name4', 'Name5', 'Name6', 'Name7', 'Name8'), Count: 2
Subsequence: ('Name4', 'Name5', 'Name6'), Count: 2
Subsequence: ('Name4', 'Name5', 'Name6', 'Name7'), Count: 2
Subsequence: ('Name3', 'Name9', 'Name10'), Count: 3
Subsequence: ('Name2', 'Name3', 'Name9'), Count: 4
Subsequence: ('Name1', 'Name2', 'Name3', 'Name9'), Count: 4
Subsequence: ('Name3', 'Name9', 'Name11'), Count: 2
Subsequence: ('Name9', 'Name10', 'Name14'), Count: 2
Subsequence: ('Name3', 'Name9', 'Name10', 'Name14'), Count: 2
Subsequence: ('Name11', 'Name19', 'Name18'), Count: 2
