In [2]:
import sys
sys.path.append('..')
from LLM.Info import *
# Per-organ rewrite rules that are injected into the LLM system prompt.
# Keys are organ names; values are the (Chinese) rule texts sent verbatim to the
# model, so the string contents must not be altered.
instruction_rules = dict(
    # Rules for rewriting seed-level expressions.
    seed=(
        "当前需要修改或增强种子节点(seed)。规则：\n"
        "1. 建议从 算子含义集op_info中 为 seed_ARGi 寻找同属或同种抽象分类的操作符 进行近义替换；比如M_ts_mean_mid_neighbor和M_ts_std_mid_neighbor\n"
        "2. 参数必须从以下来源选择：ARGi（i=0-9)\n"
    ),
    # Rules for rewriting root-level expressions.
    root=(
        "当前需要修改或增强根节点(root)。规则：\n"
        "1. 替换 root_ARGi为新的根节点表达式\n"
        "2. 新 root 可以 从已生成的 root_ARG 中选择\n"
        "3. 建议从 算子含义集op_info 中 寻找同属或同种抽象分类的操作符 进行近义替换。"
    ),
    # Rules for rewriting subtree-level expressions (with or without a mask).
    subtree=(
        "当前需要修改或增强子树(subtree)。规则：\n"
        "当 subtree 不需要掩码（subtree_with_nomask）时，直接替换算子或增强表达式。\n"
        "  - 必须从 ['D_Minute_std','D_Minute_mean','D_Minute_trend'] 或 ['D_Minute_corr','D_Minute_weight_mean'] 任选替换\n"
        "  - 参数必须从以下两个来源中选择：1.表达式中的root_ARG;2.从op_info分钟频的代理变量池中任意选择（ M_* 算子）\n"
        "当 subtree 需要结合代理变量和掩码时使用，生成形如 代理变量 + mask 的分支：\n"
        "  - 代理变量必须从分钟频池选择（ M_* 算子）\n"
        "  - 掩码必须从掩码池选择，即op_info中含有mask的算子（如 Mmask_* 算子）\n"
        "  - 组合格式：mask(proxy_variable)"
    ),
    # Rules for rewriting the top-level tree expression.
    tree=(
        "当前需要修改或增强树Tree。规则：\n"
        "从 ['D_ts_harmonic_mean','D_ts_mean','D_ts_std']任选"
    ),
)



In [7]:
import re
import copy
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_community.llms.moonshot import Moonshot

class InstructionProcessor:
    """Ask an LLM for operator-rewrite instructions and apply them to an expression tree.

    ``material`` is a dict mapping an organ name (``"root"``, ``"seed"``, ...) to a
    list of expression strings. For every expression of every organ listed in
    ``target_organs`` the LLM is asked for instructions of the form
    ``将 '<old expression>' 替换为 '<new expression>'``; those instructions are then
    applied to a deep copy of ``material``.
    """

    # Matches organ-qualified argument references such as ``seed_ARG10`` and
    # captures the organ name and the *complete* numeric index.
    _ARG_REF = re.compile(r"\b([A-Za-z]\w*?)_ARG(\d+)\b")

    # Matches one instruction: 将 '<target>' 替换为 '<replacement>' (single or double quotes).
    _INSTRUCTION_PATTERN = re.compile(r"将\s*['\"](.*?)['\"]\s*替换为\s*['\"](.*?)['\"]")

    def __init__(self, material, instruction_rules, llm_api_key):
        """Store the inputs and create the Moonshot LLM client.

        Args:
            material (dict): organ name -> list of expression strings.
            instruction_rules (dict): organ name -> rule text placed in the system prompt.
            llm_api_key (str): API key handed to the ``Moonshot`` client.
        """
        self.material = material
        self.instruction_rules = instruction_rules
        self.LLM = Moonshot(model="moonshot-v1-auto", api_key=llm_api_key)
        # Organs that get instructions, in the order they are generated and applied.
        self.target_organs = ["root", "seed", "subtree", "branch", "tree"]
        # Argument meanings and (initially empty) per-organ memorizers.
        self.initial_state = self.get_initial_state(material)

    def get_initial_state(self, tree):
        """Build the initial state dict that is sent to the LLM as ``args_meaning``.

        Args:
            tree: the tree structure; stored as-is under ``'abbreviation_mode'``.

        Returns:
            dict: the base argument names (``ARG0``..``ARG9``), their descriptions,
            and an empty ``*_arg_memorizer`` / ``*_meaning_memorizer`` pair per organ.
        """
        initial_state = {
            'abbreviation_mode': tree,
            'initial_arg_memorizer': {
                'ARG0': 'D_O',
                'ARG1': 'D_C',
                'ARG2': 'D_H',
                'ARG3': 'D_L',
                'ARG4': 'D_V',
                'ARG5': 'M_O',
                'ARG6': 'M_C',
                'ARG7': 'M_H',
                'ARG8': 'M_L',
                'ARG9': 'M_V'
            },
            'initial_meaning_memorizer': {
                'ARG0': '当日开盘价',
                'ARG1': '当日收盘价',
                'ARG2': '当日最高价',
                'ARG3': '当日最低价',
                'ARG4': '当日成交额',
                'ARG5': '日内每一分钟的开盘价',
                'ARG6': '日内每一分钟的收盘价',
                'ARG7': '日内每一分钟的最高价',
                'ARG8': '日内每一分钟的最低价',
                'ARG9': '日内每一分钟的成交额'
            },
            'seed_arg_memorizer': {},
            'seed_meaning_memorizer': {},
            'root_arg_memorizer': {},
            'root_meaning_memorizer': {},
            'branch_arg_memorizer': {},
            'branch_meaning_memorizer': {},
            'trunk_arg_memorizer': {},
            'trunk_meaning_memorizer': {},
            'subtree_arg_memorizer': {},
            'subtree_meaning_memorizer': {},
            'tree_arg_memorizer': {},
            'tree_meaning_memorizer': {},
        }
        return initial_state

    def _build_chain(self):
        """Build the prompt | LLM chain once; it is identical for every expression.

        The organ rules are supplied through the ``{rules}`` template variable
        rather than being spliced into the template text, so braces inside a rule
        can never be mistaken for a template placeholder.
        """
        rules_message = SystemMessagePromptTemplate.from_template(
            "根据以下规则进行算子替换与增强。\n"
            "{rules}"
        )
        frequency_message = SystemMessagePromptTemplate.from_template(
            "根据表达式开头判断输入参数是分钟频率的代理变量还是日频变量。"
            "表达式或算子如果以M开始，表示用于分钟频数据；以D开始表示用于日频数据。"
        )
        # Input variables are inferred from the template text, so no explicit
        # ``input_variables`` list is passed (the previous list also named an
        # ``op_class`` variable that was never in the template nor supplied).
        user_message = HumanMessagePromptTemplate.from_template(
            "当前表达式：{expression}\n"
            "算子含义：{op_meaning}\n"
            "参数来源：{args_source}\n"
            "参数含义及树结构：{args_meaning}\n"
            "请生成改进指令，严格按照规则，用中文直接返回指令（不要解释）。示例：\n"
            "将 'M_at_div(seed_ARG0, seed_ARG1)' 替换为 'M_cs_zscore(seed_ARG0)'"
        )
        op_template = HumanMessagePromptTemplate.from_template(
            "op_info中操作符的分类规则如下：\n"
            "1. Action抽象动作分类：{Abstraction_Action_Group}\n"
            "2. Interface接口协议分类：{Interface_Protocol_Group}\n"
        )
        chat_prompt = ChatPromptTemplate.from_messages(
            [rules_message, frequency_message, user_message, op_template]
        )
        return chat_prompt | self.LLM

    def _split_response(self, response):
        """Split an LLM response into instruction lines.

        Leading/trailing bullet dashes and whitespace are stripped; lines that are
        empty after stripping are dropped so they never reach ``_apply_instruction``.
        """
        stripped = (line.strip("- \t\r") for line in response.split("\n"))
        return [line for line in stripped if line]

    def generate_instruction_set(self):
        """Ask the LLM for rewrite instructions for every expression of every target organ.

        Relies on the module-level names ``op_info``, ``Abstraction_Action_Group``
        and ``Interface_Protocol_Group`` being defined (the notebook obtains them
        through ``from LLM.Info import *``).

        Returns:
            dict: ``{organ: [instruction1, instruction2, ...]}``; organs that
            produced no instructions are omitted.
        """
        chain = self._build_chain()
        instructions = {}
        for organ in self.target_organs:
            rules = self.instruction_rules.get(organ, '')
            organ_instructions = []
            for expr in self.material.get(organ, []):
                args = self.parse_arguments(expr)
                response = chain.invoke({
                    "rules": rules,
                    "expression": expr,
                    "op_meaning": op_info,
                    "args_source": self.get_args_source(args, self.material),
                    "args_meaning": self.initial_state,
                    "Abstraction_Action_Group": Abstraction_Action_Group,
                    "Interface_Protocol_Group": Interface_Protocol_Group
                })
                organ_instructions.extend(self._split_response(response))

            if organ_instructions:
                instructions[organ] = organ_instructions

        return instructions

    def transform_instruction_set(self, instruction_set):
        """Order the per-organ instruction lists by ``target_organs``.

        Args:
            instruction_set (dict): ``{organ: [instruction1, ...]}``.

        Returns:
            list: ``[[organ1_instructions], [organ2_instructions], ...]`` containing
            only the organs present in ``instruction_set``.
        """
        return [
            instruction_set[organ]
            for organ in self.target_organs
            if organ in instruction_set
        ]

    def make_product(self):
        """Generate all instructions, print them, and apply them to a copy of the tree.

        Lines returned by the LLM that are not parseable instructions are reported
        and skipped instead of aborting the whole run.

        Returns:
            dict: a deep copy of ``material`` with every parseable instruction applied.
        """
        instruction_set = self.generate_instruction_set()
        # Show the generated instructions per organ.
        print("生成的指令集：")
        for organ, instructions in instruction_set.items():
            print(f"{organ}:")
            for instr in instructions:
                print(f"    {instr}")
            print()

        # Show the instructions grouped in application order.
        print("转换后的指令列表：")
        grouped_instructions = self.transform_instruction_set(instruction_set)
        print(grouped_instructions)

        # Work on a copy so the original material stays untouched.
        new_tree = copy.deepcopy(self.material)

        for organ_instructions in grouped_instructions:
            for instruction in organ_instructions:
                try:
                    self._apply_instruction(new_tree, instruction)
                except ValueError:
                    # LLM output is free text; one stray line must not discard
                    # the instructions that did parse.
                    print(f"跳过无法解析的指令：{instruction}")
        print("应用指令后的树结构：")
        return new_tree

    def _apply_instruction(self, tree, instruction):
        """Apply a single instruction to the tree in place.

        Args:
            tree (dict): the tree structure to modify.
            instruction (str): one instruction with both expressions quoted, e.g.
                ``将 'M_cs_rank(seed_ARG0)' 替换为 'M_cs_scale(seed_ARG0)'``.

        Raises:
            ValueError: if ``instruction`` does not contain the quoted
                ``将 '...' 替换为 '...'`` form.
        """
        match = self._INSTRUCTION_PATTERN.search(instruction)
        if not match:
            raise ValueError(f"指令格式错误，无法解析：{instruction}")

        target, replacement = match.groups()
        self._replace_node(tree, target, replacement)

    def _replace_node(self, tree, target, replacement):
        """Recursively replace every occurrence of ``target`` in ``tree`` in place.

        A dict value is replaced when its key equals ``target``; a list item is
        replaced when the item itself equals ``target``. Nested dicts (directly or
        inside lists) are searched recursively. Every occurrence is replaced, so
        identical expressions in several places all change together.
        """
        for key in list(tree.keys()):
            value = tree[key]
            if key == target:
                tree[key] = replacement
            elif isinstance(value, dict):
                self._replace_node(value, target, replacement)
            elif isinstance(value, list):
                for i, item in enumerate(value):
                    if isinstance(item, dict):
                        self._replace_node(item, target, replacement)
                    elif item == target:
                        value[i] = replacement

    def parse_arguments(self, expression):
        """Return the top-level arguments of the outermost call in ``expression``.

        Parentheses are tracked by depth, so a nested call stays in one piece:
        ``"f(g(a, b), c)"`` yields ``["g(a, b)", "c"]``. Arguments are stripped of
        surrounding whitespace and empty arguments are dropped. An expression
        without ``(`` yields ``[]``; a missing closing ``)`` is tolerated.
        """
        start = expression.find("(")
        if start == -1:
            return []

        args = []
        current = []
        depth = 0
        for ch in expression[start + 1:]:
            if ch == "(":
                depth += 1
            elif ch == ")":
                if depth == 0:
                    break  # end of the outermost call
                depth -= 1
            elif ch == "," and depth == 0:
                # Separator between two top-level arguments.
                piece = "".join(current).strip()
                if piece:
                    args.append(piece)
                current = []
                continue
            current.append(ch)

        piece = "".join(current).strip()
        if piece:
            args.append(piece)
        return args

    def get_args_source(self, args, tree_structure):
        """Describe where every ``<organ>_ARG<n>`` reference in ``args`` comes from.

        The complete numeric index is used (``seed_ARG10`` -> item 10, not item 0),
        and references inside nested argument expressions are resolved as well.
        Arguments without an organ prefix (``ARG6``, numeric literals) are ignored.

        Args:
            args (list): argument strings, as returned by ``parse_arguments``.
            tree_structure (dict): organ name -> list of expression strings.

        Returns:
            str: descriptions joined with ``；`` (empty string when nothing matched).

        Raises:
            KeyError: if a referenced organ is missing from ``tree_structure``.
            IndexError: if a referenced index is out of range for its organ.
        """
        sources = []
        for arg in args:
            for ref in self._ARG_REF.finditer(arg):
                organ, index = ref.group(1), int(ref.group(2))
                sources.append(f"{ref.group(0)} 来自 {organ} 的 {tree_structure[organ][index]}")
        return "；".join(sources)


    def run(self):
        """Build the product tree (printing the instruction sets) and print the result."""
        print(self.make_product())

In [4]:
if __name__ == "__main__":
    import warnings
    from RPN.RPNbuilder import RPN_Parser, RPN_Producer

    # Suppress RuntimeWarning messages emitted while the expressions are produced.
    warnings.filterwarnings('ignore', category=RuntimeWarning)

    # Run the producer; its results are exposed on the `tree` attribute.
    rpn_producer = RPN_Producer()
    rpn_producer.run()
    rpns = rpn_producer.tree

    # Parse the entry at index 6 and convert it into the `material` dict that
    # InstructionProcessor consumes.
    parser = RPN_Parser(rpns[6])
    parser.get_tree_structure()
    parser.parse_tree()
    material = parser.tree2dict()




In [8]:
material

{'tree': ['D_ts_std(subtree_ARG0, 10)'],
 'subtree': ['D_Minute_weight_mean(trunk_ARG0, trunk_ARG1)'],
 'branch': [],
 'trunk': ['M_at_div(M_at_prod(M_cs_norm_spread(root_ARG0, root_ARG1), M_at_div(root_ARG2, root_ARG3)), M_cs_norm_spread(M_cs_umr(root_ARG4, root_ARG5), M_cs_norm_spread(root_ARG6, root_ARG7)))',
  'M_toD_standard(root_ARG8, root_ARG9)'],
 'root': ['M_cs_rank(seed_ARG0)',
  'M_cs_scale(seed_ARG1)',
  'M_cs_rank(seed_ARG2)',
  'M_cs_rank(seed_ARG3)',
  'M_cs_zscore(seed_ARG4)',
  'M_at_div(seed_ARG5, seed_ARG6)',
  'M_cs_rank(seed_ARG7)',
  'M_cs_scale(seed_ARG8)',
  'M_cs_scale(seed_ARG9)',
  'D_ts_norm(seed_ARG10, 3)'],
 'seed': ['M_ts_mean_right_neighbor(ARG6, 2)',
  'M_ts_mean_mid_neighbor(ARG9, 2)',
  'M_ts_mean_mid_neighbor(ARG8, 5)',
  'M_ts_mean_mid_neighbor(ARG9, 2)',
  'M_ts_mean_mid_neighbor(ARG9, 1)',
  'M_ts_mean_right_neighbor(ARG7, 1)',
  'M_ts_mean_left_neighbor(ARG7, 20)',
  'M_ts_mean_mid_neighbor(ARG9, 2)',
  'M_ts_mean_left_neighbor(ARG9, 60)',
  'M_t

In [None]:
# Wire the parsed material and the rule set into the processor.
processor = InstructionProcessor(
    material=material,
    instruction_rules=instruction_rules,
    llm_api_key='',
)

# Generate the instructions, apply them, and print the resulting tree structure.
processor.run()

生成的指令集：
root:
    将 'M_cs_rank(seed_ARG0)' 替换为 'M_cs_scale(seed_ARG0)'
    将 'M_cs_scale(seed_ARG1)' 替换为 'M_cs_zscore(seed_ARG1)'
    将 'M_cs_rank(seed_ARG2)' 替换为 'M_cs_scale(seed_ARG2)'
    将 'M_cs_rank(seed_ARG3)' 替换为 'M_cs_zscore(seed_ARG3)'
    将 'M_cs_zscore(seed_ARG4)' 替换为 'M_cs_scale(seed_ARG4)'
    将 'M_at_div(seed_ARG5, seed_ARG6)' 替换为 'M_at_prod(seed_ARG5, seed_ARG6)'
    将 'M_cs_rank(seed_ARG7)' 替换为 'M_cs_scale(seed_ARG7)'
    将 'M_cs_scale(seed_ARG8)' 替换为 'M_cs_zscore(seed_ARG8)'
    将 'M_cs_scale(seed_ARG9)' 替换为 'M_cs_zscore(seed_ARG9)'
    将 'D_ts_norm(seed_ARG10, 3)' 替换为 'D_cs_zscore(seed_ARG10)'

seed:
    将 'M_ts_mean_right_neighbor(ARG6, 2)' 替换为 'M_ts_std_right_neighbor(ARG6, 2)'
    将 'M_ts_mean_mid_neighbor(ARG9, 2)' 替换为 'M_ts_std_mid_neighbor(ARG9, 2)'
    将 'M_ts_mean_mid_neighbor(ARG8, 5)' 替换为 'M_ts_std_mid_neighbor(ARG8, 5)'
    将 'M_ts_mean_mid_neighbor(ARG9, 2)' 替换为 'M_ts_std_mid_neighbor(ARG9, 2)'
    将 'M_ts_mean_mid_neighbor(ARG9, 1)' 替换为 'M_ts_std_mid_neig