In [None]:
# Example: extracted features for one fragment (F001).
# NOTE(review): `chain_code` is not defined in this cell; it is presumably
# produced by an earlier cell and indexable at positions 0..3 -- confirm.
F001_features = {
    # Edge features keyed "e1".."e4", taken from chain_code[0]..chain_code[3].
    # Per the original notes, e1 is the upper-edge curvature sequence and
    # e2 is the lower-edge feature; e3/e4 were left undocumented.
    "edges": {
        edge_name: chain_code[position]
        for position, edge_name in enumerate(("e1", "e2", "e3", "e4"))
    },
    "text": {
        # Primary writing direction is downward, secondary is rightward.
        "direction": ("down", "right"),
        "content": "方諸佛難願觀察過去諸佛有扵如是五經...",
        # Placeholder string, not an actual numeric vector.
        "BERT_embedding": "[0.1, 0.2, ..., 0.5]",
    },
}

Nodes: [(0, {'node': <__main__.GraphNode object at 0x000001886FC11DC0>}), (1, {'node': <__main__.GraphNode object at 0x000001886FBF2550>})]
Edges: [(0, 1)]


In [None]:
import numpy as np
from typing import Dict, List, Optional, Tuple
import torch
from transformers import BertModel, BertTokenizer

class GraphNode:
    """
    A single node of the graph.

    Attributes:
        id: Unique identifier of the node.
        freeman_codes: Freeman chain codes per direction; the dictionary is
            expected to be keyed by 'top', 'bottom', 'left' and 'right'.
        centroid: Centroid position as an (x, y) pair.
        semantic_vector: Optional BERT-derived semantic vector.
    """

    def __init__(
        self,
        node_id: int,
        freeman_codes: Dict[str, List[int]],
        centroid: Tuple[float, float],
        semantic_vector: Optional[np.ndarray] = None
    ):
        """
        Store the node's identifier and features.

        Args:
            node_id: Unique identifier of the node.
            freeman_codes: Freeman chain codes for the four directions.
            centroid: Centroid coordinates (x, y).
            semantic_vector: Semantic vector generated by BERT (optional).
        """
        # The arguments are stored by reference; nothing is copied or validated.
        self.semantic_vector = semantic_vector
        self.centroid = centroid
        self.freeman_codes = freeman_codes
        self.id = node_id

    def __repr__(self):
        """Return a debug string; the semantic vector is not included."""
        described = (
            f"id={self.id}, "
            f"centroid={self.centroid}, "
            f"freeman_codes={self.freeman_codes}"
        )
        return f"GraphNode({described})"

class GraphEdge:
    """
    An edge connecting two nodes, referenced by their IDs.

    Attributes:
        node1: ID of the first endpoint.
        node2: ID of the second endpoint.
        weight: Optional edge weight; None when no weight was given.
    """

    def __init__(self, node1: int, node2: int, weight: Optional[float] = None):
        """
        Record the two endpoint IDs and the optional weight.

        Args:
            node1: ID of the first node.
            node2: ID of the second node.
            weight: Edge weight (optional).
        """
        self.node1, self.node2 = node1, node2
        self.weight = weight

    def __repr__(self):
        """Return a debug string of the form 'GraphEdge(a <-> b, weight=w)'."""
        endpoints = f"{self.node1} <-> {self.node2}"
        return f"GraphEdge({endpoints}, weight={self.weight})"

class Graph:
    """
    Container for graph nodes and the edges between them.

    Nodes are kept in a dict keyed by an auto-incremented integer ID and
    edges are kept in a list in insertion order.
    """

    def __init__(self):
        """Create an empty graph."""
        self.nodes: Dict[int, GraphNode] = {}
        self.edges: List[GraphEdge] = []
        self._next_node_id = 0  # ID that the next added node will receive

    def add_node(
        self,
        freeman_codes: Dict[str, List[int]],
        centroid: Tuple[float, float],
        text: Optional[str] = None,
        bert_model: Optional[BertModel] = None,
        bert_tokenizer: Optional[BertTokenizer] = None
    ) -> int:
        """
        Add a node to the graph.

        Args:
            freeman_codes: Freeman chain codes for the four directions.
            centroid: Centroid position (x, y).
            text: Text used to generate the BERT vector (optional).
            bert_model: BERT model (optional).
            bert_tokenizer: BERT tokenizer (optional).

        Returns:
            The ID of the newly added node.
        """
        # A semantic vector is generated only when text, model and tokenizer
        # are all supplied; otherwise the node is stored without one.
        semantic_vector = None
        if text is not None and bert_model is not None and bert_tokenizer is not None:
            semantic_vector = self._generate_bert_vector(text, bert_model, bert_tokenizer)

        node_id = self._next_node_id
        self.nodes[node_id] = GraphNode(node_id, freeman_codes, centroid, semantic_vector)
        self._next_node_id += 1
        return node_id

    def add_edge(self, node1: int, node2: int, weight: Optional[float] = None):
        """
        Add an edge to the graph.

        Args:
            node1: ID of the first node.
            node2: ID of the second node.
            weight: Edge weight (optional).

        Raises:
            ValueError: If either node ID is not present in the graph.
        """
        if node1 not in self.nodes or node2 not in self.nodes:
            raise ValueError("One or both nodes do not exist in the graph")
        self.edges.append(GraphEdge(node1, node2, weight))

    def _generate_bert_vector(
        self,
        text: str,
        bert_model: BertModel,
        bert_tokenizer: BertTokenizer
    ) -> np.ndarray:
        """
        Generate a semantic vector for the text with a BERT model.

        Args:
            text: Input text.
            bert_model: BERT model.
            bert_tokenizer: BERT tokenizer.

        Returns:
            The hidden state at sequence position 0 (the [CLS] token) as a
            numpy array that keeps the leading batch axis, i.e. shape
            (1, hidden_size) for a single input string.
        """
        # Tokenize and encode as PyTorch tensors.
        inputs = bert_tokenizer(text, return_tensors="pt", padding=True, truncation=True)

        # The tokenizer produces CPU tensors; move them to wherever the model
        # lives so that a model placed on an accelerator does not fail with a
        # device mismatch. Objects without a `device` attribute are left as-is.
        device = getattr(bert_model, "device", None)
        if device is not None:
            inputs = {
                name: value.to(device) if hasattr(value, "to") else value
                for name, value in inputs.items()
            }

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = bert_model(**inputs)

        # Use the hidden state of the [CLS] token as the representation of
        # the whole text.
        cls_embedding = outputs.last_hidden_state[:, 0, :]

        # Tensor.numpy() only works for CPU tensors that do not require grad,
        # so detach and copy to host memory before converting.
        return cls_embedding.detach().cpu().numpy()

    def __repr__(self):
        """Return a short summary with the node and edge counts."""
        return f"Graph(nodes={len(self.nodes)}, edges={len(self.edges)})"

# Example usage
if __name__ == "__main__":
    # The BERT model and tokenizer are optional. While both are None,
    # add_node() stores the nodes without a semantic vector.
    bert_model, bert_tokenizer = None, None
    # For real use, uncomment the lines below:
    # from transformers import BertModel, BertTokenizer
    # bert_model = BertModel.from_pretrained('bert-base-uncased')
    # bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Build an empty graph.
    graph = Graph()

    # First node (sample Freeman codes).
    node1_id = graph.add_node(
        {
            'top': [0, 1, 2, 3, 4, 5, 6, 7],
            'bottom': [7, 6, 5, 4, 3, 2, 1, 0],
            'left': [0, 7, 6, 5],
            'right': [4, 3, 2, 1],
        },
        (10.5, 20.3),
        text="This is an example text for node 1",
        bert_model=bert_model,
        bert_tokenizer=bert_tokenizer,
    )

    # Second node (sample Freeman codes).
    node2_id = graph.add_node(
        {
            'top': [7, 6, 5, 4],
            'bottom': [3, 2, 1, 0],
            'left': [0, 1, 2, 3],
            'right': [4, 5, 6, 7],
        },
        (15.2, 18.7),
        text="Another example for node 2",
        bert_model=bert_model,
        bert_tokenizer=bert_tokenizer,
    )

    # Connect the two nodes; no weight is given.
    graph.add_edge(node1_id, node2_id)

    # Print a summary of the graph, then every node and every edge.
    print(graph)
    print("Nodes:")
    for node_id, node in graph.nodes.items():
        print(f"  {node}")
        vector = node.semantic_vector
        if vector is None:
            continue
        print(f"    Semantic vector shape: {vector.shape}")

    print("Edges:")
    for edge in graph.edges:
        print(f"  {edge}")