In [1]:
from pathlib import Path
from typing import Annotated, Union

import typer
from peft import AutoPeftModelForCausalLM, PeftModelForCausalLM
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    PreTrainedTokenizerFast
)

# Model types the loader below may return: a plain transformers model or a
# PEFT-wrapped causal LM (used when the checkpoint directory holds an adapter).
ModelType = Union[PreTrainedModel, PeftModelForCausalLM]
# Tokenizer types accepted as the loader's second return value.
TokenizerType = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]

def load_model_and_tokenizer(
        model_dir: Union[str, Path], trust_remote_code: bool = True
) -> tuple[ModelType, TokenizerType]:
    """Load a causal LM and its tokenizer from a checkpoint directory.

    When ``adapter_config.json`` exists in the directory, the checkpoint is
    loaded through ``AutoPeftModelForCausalLM`` and the tokenizer is read from
    the base model recorded in the ``'default'`` PEFT config. Otherwise both
    the model and the tokenizer are loaded from ``model_dir`` itself.

    Args:
        model_dir: Checkpoint directory; ``~`` is expanded and the path is
            resolved to an absolute one before use.
        trust_remote_code: Forwarded unchanged to every ``from_pretrained``
            call made here.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    checkpoint_dir = Path(model_dir).expanduser().resolve()
    has_adapter = (checkpoint_dir / 'adapter_config.json').exists()

    # Both loaders take the same arguments, so pick the class first and
    # issue a single from_pretrained call.
    model_cls = AutoPeftModelForCausalLM if has_adapter else AutoModelForCausalLM
    model = model_cls.from_pretrained(
        checkpoint_dir, trust_remote_code=trust_remote_code, device_map='auto'
    )

    if has_adapter:
        tokenizer_source = model.peft_config['default'].base_model_name_or_path
    else:
        tokenizer_source = checkpoint_dir

    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_source,
        trust_remote_code=trust_remote_code,
        encode_special_tokens=True,
        use_fast=False,
    )
    return model, tokenizer

In [3]:
# Checkpoint to evaluate. The path is relative, so it is resolved against the
# notebook's current working directory inside load_model_and_tokenizer.
model_dir = "../../output/checkpoint-1000"

# `model` and `tokenizer` are notebook-wide globals used by every cell below.
model, tokenizer = load_model_and_tokenizer(model_dir)

Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
import pandas as pd
from tools.common_utils import highlight_diff, read_jsonl

# Dev split used for the qualitative check and the accuracy loop further down.
json_file_path = 'data/dev.jsonl'
# presumably one parsed JSON record per line — confirm in tools.common_utils
data_list = read_jsonl(json_file_path)

# Later cells read row["messages"][0] (user turn) and row["messages"][1]["content"]
# (reference answer) from this frame.
df = pd.DataFrame(data_list)
# df.head()

In [57]:
%%time
index = 999
row = df.loc[index]
single_user_row = row["messages"][0]
single_assis_row = row["messages"][1]

messages = [single_user_row]
output = single_assis_row["content"]

inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_tensors="pt"
        ).to(model.device)
generate_kwargs = {
            "input_ids": inputs,
            "max_new_tokens": 1024,
            "do_sample": True,
            "top_p": 0.8,
            "temperature": 0.8,
            "repetition_penalty": 1.2,
            "eos_token_id": model.config.eos_token_id,
        }
outputs = model.generate(**generate_kwargs)
response = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True).strip()

CPU times: user 11.1 s, sys: 55.3 ms, total: 11.1 s
Wall time: 11.1 s


In [58]:
# Model reply first, then a ruler, then the reference answer.
print(response, '='*60, output, sep='\n')

<output>
    思考过程：
        首先可以确定这是一个股票查询类任务，我们可以将该问题分解为以下几个步骤：
    
    1. 确定股票名称，query中的股票标准名应该是“捷昌驱动”；
    2. 将股票名称转化为股票代码，调用 股票查询-代码(api_0) 获取“捷昌驱动”的股票代码；
    3. 根据获取到的股票代码查询该股票的总市值，调用 股票查询-总市值(api_1) 获取“捷昌驱动”的总市值；
    4. 同样根据获取到的股票代码查询该股票的静态市盈率，调用 股票查询-静态市盈率(api_2) 获取“捷昌驱动”的静态市盈率；
    5. 最后输出总市值和静态市盈率，即 api_1 和 api_2 的结果。
    于是最终标准的json格式结果为:
        {"relevant APIs": [{"api_id": "0", "api_name": "代码", "required_parameters": [["捷昌驱动"]], "rely_apis": [], "tool_name": "股票查询"}, {"api_id": "1", "api_name": "总市值", "required_parameters": ["api_0的结果"], "rely_apis": ["0"], "tool_name": "股票查询"}, {"api_id": "2", "api_name": "静态市盈率", "required_parameters": ["api_0的结果"], "rely_apis": ["0"], "tool_name": "股票查询"}], "result": ["api_1的结果", "api_2的结果"]}
</output>
<output>
    思考过程：
        1. 首先，根据问题描述和提供的标准名列表，可以确定这是一个股票查询类任务。问题中提到的产品标准名是“捷昌驱动”，这是一个股票名称。
        2. 为了查询股票的相关信息，首先需要将股票名称转换为股票代码。因此，第一步是调用股票查询类中的“代码”API（api_0），输入股票名称“捷昌驱动”，获取对应的股票代码。
        3. 获取股票代码后，接下来需要查询该股票的总市值和静态市盈率。这两个指标都属于

---

In [8]:
from tools.standard_name_utils import optimize_parameters
# Both standard-name sheets live in the same workbook. Passing a list of sheet
# names makes read_excel return a {sheet_name: DataFrame} dict, so the file is
# opened and parsed once instead of twice.
standard_name_sheets = pd.read_excel(
    'raw_data/标准名.xlsx', sheet_name=['股票标准名', '基金标准名']
)
data_stock = standard_name_sheets['股票标准名']
data_fund = standard_name_sheets['基金标准名']

# Plain Python lists, passed to optimize_parameters in the cells below.
fund_standard_name = data_fund['标准基金名称'].to_list()
stock_standard_name = data_stock['标准股票名称'].to_list()

In [9]:
from random import random

# Spot-check exact-match accuracy on a random subset of the dev set.
# Each row is skipped with probability EVAL_SKIP_PROBABILITY and the loop stops
# once MAX_EVAL_SAMPLES rows have actually been generated.
EVAL_SKIP_PROBABILITY = 0.9
MAX_EVAL_SAMPLES = 20

total_eval_count = 0
correct_count = 0

# Sampling settings do not depend on the row, so build them once.
sampling_kwargs = {
    "max_new_tokens": 1024,
    "do_sample": True,
    "top_p": 0.8,
    "temperature": 0.8,
    "repetition_penalty": 1.2,
    "eos_token_id": model.config.eos_token_id,
}

for index, row in df.iterrows():
    if random() <= EVAL_SKIP_PROBABILITY:
        continue

    single_user_row = row["messages"][0]
    single_assis_row = row["messages"][1]

    # Only the user turn is fed to the model; the assistant turn is the reference.
    messages = [single_user_row]
    output = single_assis_row["content"]

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt"
    ).to(model.device)

    outputs = model.generate(input_ids=inputs, **sampling_kwargs)
    # generate() returns prompt + continuation; decode only the new tokens.
    response = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True).strip()

    # NOTE(review): the post-processed response is computed but the comparison
    # below still uses the raw `response`. Confirm which of the two should be
    # scored against the reference.
    optimized_resp = optimize_parameters(response, fund_standard_name, stock_standard_name)

    total_eval_count += 1
    if response == output:
        correct_count += 1
    else:
        print("-----data index-----")
        print(index)
        print("-----query input-----")
        # Show the user query for this row (the bare name `input` is the
        # Python builtin, not the query).
        print(single_user_row["content"])
        print("-----output diff-----")
        print(highlight_diff(output, response))
        print(response)
        print()

    if total_eval_count == MAX_EVAL_SAMPLES:
        break

# The random skip can leave zero evaluated rows; avoid dividing by zero then.
if total_eval_count:
    print("预测正确的比例：" + f"{correct_count / total_eval_count :.2%}")
else:
    print("No samples were evaluated; accuracy is undefined.")


KeyboardInterrupt



In [None]:
# Generate one prediction per test row and write them to data/submit.txt,
# one post-processed response per line.
#
# NOTE(review): `test_df` is not defined in any cell visible in this notebook;
# it has to be loaded before this cell can run.

# Set to an integer to stop after that many rows (quick smoke test);
# None writes a prediction for every row.
SUBMIT_ROW_LIMIT = None
PROGRESS_EVERY = 10

total_eval_count = 0

# Sampling settings do not depend on the row, so build them once.
submit_sampling_kwargs = {
    "max_new_tokens": 1024,
    "do_sample": True,
    "top_p": 0.8,
    "temperature": 0.8,
    "repetition_penalty": 1.2,
    "eos_token_id": model.config.eos_token_id,
}

with open('data/submit.txt', 'w', encoding="utf-8") as submit_file:
    for index, row in test_df.iterrows():
        # Only the user turn is needed for prediction, so a reference
        # assistant message is not required on test rows.
        single_user_row = row["messages"][0]
        messages = [single_user_row]

        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_tensors="pt"
        ).to(model.device)

        outputs = model.generate(input_ids=inputs, **submit_sampling_kwargs)
        # generate() returns prompt + continuation; decode only the new tokens.
        response = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True).strip()

        # assumes optimize_parameters returns a single-line string, otherwise the
        # one-record-per-line layout of the file breaks — TODO confirm
        optimized_resp = optimize_parameters(response, fund_standard_name, stock_standard_name)
        total_eval_count += 1

        submit_file.write(optimized_resp+'\n')

        # Progress report only; it no longer ends the loop.
        if total_eval_count % PROGRESS_EVERY == 0:
            print("现在是第" + f"{total_eval_count}" + "条数据")

        if SUBMIT_ROW_LIMIT is not None and total_eval_count >= SUBMIT_ROW_LIMIT:
            break