In [9]:
from typing import List

from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader
from sentence_transformers import InputExample, SentenceTransformer
from sentence_transformers.evaluation import BinaryClassificationEvaluator
from zhipuai import ZhipuAI
from sentence_transformers import losses
import re
import json
from langchain_core.documents import Document
from tqdm import tqdm
from torch.utils.data import DataLoader
class QaPairs():
    '''存储List[dict]类型数据'''

    def __init__(self, qa_pairs: List[dict]):
        self.qa_pairs = qa_pairs
        
 
    def save_json(self, path: str):
        '''将数据存储为json格式'''

        with open(path, "w", encoding='utf-8') as f:
            json.dump(self.qa_pairs, f, ensure_ascii=False, indent=4)

    @classmethod
    def from_json(cls, path:str) -> 'QaPairs':
        '''读取json格式数据'''

        with open(path) as f:
            data = json.load(f)
        return cls(data)


llm_list = ['glm-4-flash', 'glm-4', 'glm-4v', 'glm-3-turbo', 'gpt-3.5-turbo']

PROMPT = '''
下面是上下文信息。
 
--------------------- 
{context_str} 
--------------------- 
 
给定上下文信息，没有先验知识。 
仅根据下面的查询生成问题。 
 
你是一位老师/教授。你的任务是为即将到来的测验/考试设置{num_questions_per_page}个问题以及问题涉及到的原文内容
在整个文件中，问题的性质应该是多样化的。
将问题限制在提供的上下文信息之内。
按照一下格式输出：
问题1：
问题

原文内容1：
原文内容

的形式回答
'''

def list_generate_qa_pairs(
        texts: List[str],
        num_questions_per_page: int = 2,
        model: str = 'glm-4',
) -> QaPairs:
    '''借助大模型从给定的texts里提取出问题与对应的答案'''

    if model not in llm_list:
        raise ValueError('你选择的模型暂时不被支持'
                            '''请使用'glm-4', 'glm-4v', 'glm-3-turbo' 中的一个作为model的参数''')
    elif model in llm_list[:3]:
        llm = ZhipuAI()
    qa_pairs = []

    for text in tqdm(texts):
        if len(text) > 200:
            prompt = PROMPT.format(
                context_str=text,
                num_questions_per_page=num_questions_per_page
            )
            response = llm.chat.completions.create(
                model=model,
                messages=[
                    {"role": "user", "content": prompt},
                ],
            )
            matches = re.findall(
                r'问题\d+：(.*?)原文内容\d+：(.*?)((?=问题\d+：)|$)',
                response.choices[0].message.content,
                re.DOTALL
            )
            for _, match in enumerate(matches):
                qa = {
                    'query': match[0].strip(),
                    'answer': match[1].strip()
                }
                qa_pairs.append(qa)
    return QaPairs(qa_pairs=qa_pairs)

def docs_generate_qa_pairs(
        docs: List[Document], 
        num_questions_per_page: int = 1,
        model: str = 'glm-4'
) -> QaPairs:
    '''借助大模型从给定的docs里提取出问题与对应的答案'''
    list_doc = [doc.page_content for doc in docs]
    return list_generate_qa_pairs(list_doc, num_questions_per_page, model=model)


def docs_generate_pdf_qa_pairs(
        pdf_pages: List[Document],
        num_questions_per_page: int = 1     ,
        model: str = 'glm-4-flash',
) -> QaPairs:
    '''
    借助大模型从给定的texts里提取出问题、答案
    返回结果为问题、答案、所属页码
    '''

    if model not in llm_list:
        raise ValueError('你选择的模型暂时不被支持'
                            '''请使用'glm-4', 'glm-4v', 'glm-3-turbo'中的一个作为model的参数''')
    elif model in llm_list[:3]:
        llm = ZhipuAI(
            api_key="652a160546149ef4e3ec0ff881beebfe.D3UaKuk7FmiUn9WQ"
        )

    qa_pairs = []

    for page in tqdm(pdf_pages):
        if len(page.page_content) > 200:
            prompt = PROMPT.format(
                context_str=page.page_content,
                num_questions_per_page=num_questions_per_page
            )
            response = llm.chat.completions.create(
                model=model,
                messages=[
                    {"role": "user", "content": prompt},
                ],
            )
            matches = re.findall(
                r'问题\d+：(.*?)原文内容\d+：(.*?)((?=问题\d+：)|$)',
                response.choices[0].message.content,
                re.DOTALL
            )
            for _, match in enumerate(matches):
                qa = {
                    'query': match[0].strip(),
                    'answer': match[1].strip(),
                    'page_num': page.metadata['page']
                }
                qa_pairs.append(qa)
    return QaPairs(qa_pairs=qa_pairs)



loader = PyMuPDFLoader(file_path="./24徐涛《核心考案》高清无水印PDF【公众号：薄荷考研】.pdf")

pdf_pages = loader.load()[231:238]

qa_from_pdf = docs_generate_pdf_qa_pairs(pdf_pages)
print(qa_from_pdf.qa_pairs)
qa_from_pdf.save_json("train_dataset.json")

qa_pairs = QaPairs.from_json('train_dataset.json')
examples = []
# 将单个qa对转为InputExample并存入列表
# examples = [InputExample(texts=[qa_pair['query'], qa_pair['answer']]) for qa_pair in qa_pairs.qa_pairs]
for i in range(len(qa_pairs)):
    example = qa_pairs[i]
    examples.append(InputExample(texts=[qa_pair['query'], qa_pair['answer']))
                                       
train_examples = examples[:3]
dev_examples = examples[3:]                                        

# 将数据集转换为DataLoader形式
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
model = SentenceTransformer(model_name_or_path="./AI-ModelScope/tao-8k", device="cuda", cache_folder='./', trust_remote_code=True)
train_loss = losses.ContrastiveLoss(model=model)
# 实例化评估器，将每次训练后的模型在验证集上测试性能
evaluator = BinaryClassificationEvaluator.from_input_examples(dev_examples, name='med-dev')
# 定义模型保存路径
model_save_path='./trained_tao'
# 微调模型
model.fit([(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=10,
          output_path=model_save_path,
          )


SyntaxError: closing parenthesis ')' does not match opening parenthesis '[' (1246743808.py, line 174)

In [1]:
from modelscope import snapshot_download
snapshot_download('AI-ModelScope/tao-8k', cache_dir='./')

Downloading [added_tokens.json]: 100%|██████████| 82.0/82.0 [00:00<00:00, 156B/s]
Downloading [config.json]: 100%|██████████| 871/871 [00:00<00:00, 1.27kB/s]
Downloading [configuration.json]: 100%|██████████| 47.0/47.0 [00:00<00:00, 110B/s]
Downloading [pytorch_model.bin]: 100%|██████████| 636M/636M [00:03<00:00, 212MB/s]  
Downloading [README.md]: 100%|██████████| 24.8k/24.8k [00:00<00:00, 46.7kB/s]
Downloading [special_tokens_map.json]: 100%|██████████| 125/125 [00:00<00:00, 288B/s]
Downloading [tokenizer.json]: 100%|██████████| 429k/429k [00:00<00:00, 902kB/s]
Downloading [tokenizer_config.json]: 100%|██████████| 1.08k/1.08k [00:00<00:00, 2.62kB/s]
Downloading [vocab.txt]: 100%|██████████| 107k/107k [00:00<00:00, 241kB/s]


'./AI-ModelScope/tao-8k'

In [1]:
from typing import List
from sentence_transformers import InputExample, SentenceTransformer
from sentence_transformers.evaluation import BinaryClassificationEvaluator
from zhipuai import ZhipuAI
from sentence_transformers import losses
import re
import json
from langchain_core.documents import Document
from tqdm import tqdm
from torch.utils.data import DataLoader
from zhipuai import ZhipuAI
class QaPairs():
    '''存储List[dict]类型数据'''

    def __init__(self, qa_pairs: List[dict]):
        self.qa_pairs = qa_pairs
        

    def save_json(self, path: str):
        '''将数据存储为json格式'''

        with open(path, "w", encoding='utf-8') as f:
            json.dump(self.qa_pairs, f, ensure_ascii=False, indent=4)

    @classmethod
    def from_json(cls, path:str) -> 'QaPairs':
        '''读取json格式数据'''

        with open(path) as f:
            data = json.load(f)
        return cls(data)

qa_pairs = QaPairs.from_json('./train_dataset.json')
examples = []
# 将单个qa对转为InputExample并存入列表
examples = [InputExample(texts=[qa_pair['query'], qa_pair['answer']]) for qa_pair in qa_pairs.qa_pairs]
train_examples = examples[:150]
dev_examples = examples[150:]                                        

# 将数据集转换为DataLoader形式
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
print("创建模型")
model = SentenceTransformer(model_name_or_path="AI-ModelScope/tao-8k", device='cuda', cache_folder='./', trust_remote_code=True)
model.max_seq_length = 256
train_loss = losses.MultipleNegativesRankingLoss(model=model)
print("创建评估器")
# 实例化评估器，将每次训练后的模型在验证集上测试性能
evaluator = BinaryClassificationEvaluator.from_input_examples(dev_examples, name='dev')
# 定义模型保存路径
model_save_path='./trained_tao'  
# 微调模型
print("开始微调")
model.fit([(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=10,
          output_path=model_save_path,
          )


  from tqdm.autonotebook import tqdm, trange


创建模型


No sentence-transformers model found with name AI-ModelScope/tao-8k. Creating a new one with mean pooling.
Detected kernel version 4.19.91, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


创建评估器
开始微调
[2024-10-07 20:32:45,832] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Step,Training Loss,Validation Loss,Dev Cosine Accuracy,Dev Cosine Accuracy Threshold,Dev Cosine F1,Dev Cosine F1 Threshold,Dev Cosine Precision,Dev Cosine Recall,Dev Cosine Ap,Dev Dot Accuracy,Dev Dot Accuracy Threshold,Dev Dot F1,Dev Dot F1 Threshold,Dev Dot Precision,Dev Dot Recall,Dev Dot Ap,Dev Manhattan Accuracy,Dev Manhattan Accuracy Threshold,Dev Manhattan F1,Dev Manhattan F1 Threshold,Dev Manhattan Precision,Dev Manhattan Recall,Dev Manhattan Ap,Dev Euclidean Accuracy,Dev Euclidean Accuracy Threshold,Dev Euclidean F1,Dev Euclidean F1 Threshold,Dev Euclidean Precision,Dev Euclidean Recall,Dev Euclidean Ap,Dev Max Accuracy,Dev Max Accuracy Threshold,Dev Max F1,Dev Max F1 Threshold,Dev Max Precision,Dev Max Recall,Dev Max Ap
5,No log,No log,0.988506,0.866557,0,0,0,0,-0.0,0.988506,658.337585,0,0,0,0,-0.0,0.988506,349.931885,0,0,0,0,-0.0,0.988506,14.190307,0,0,0,0,-0.0,0.988506,658.337585,0,0,0,0,-0.0
10,No log,No log,0.988506,0.866535,0,0,0,0,-0.0,0.988506,658.266541,0,0,0,0,-0.0,0.988506,349.945862,0,0,0,0,-0.0,0.988506,14.190795,0,0,0,0,-0.0,0.988506,658.266541,0,0,0,0,-0.0
15,No log,No log,0.988506,0.866497,0,0,0,0,-0.0,0.988506,658.144043,0,0,0,0,-0.0,0.988506,349.969482,0,0,0,0,-0.0,0.988506,14.191639,0,0,0,0,-0.0,0.988506,658.144043,0,0,0,0,-0.0
20,No log,No log,0.988506,0.866444,0,0,0,0,-0.0,0.988506,657.97345,0,0,0,0,-0.0,0.988506,350.00238,0,0,0,0,-0.0,0.988506,14.192799,0,0,0,0,-0.0,0.988506,657.97345,0,0,0,0,-0.0
25,No log,No log,0.988506,0.866375,0,0,0,0,-0.0,0.988506,657.753906,0,0,0,0,-0.0,0.988506,350.044434,0,0,0,0,-0.0,0.988506,14.194283,0,0,0,0,-0.0,0.988506,657.753906,0,0,0,0,-0.0
30,No log,No log,0.988506,0.866291,0,0,0,0,-0.0,0.988506,657.483643,0,0,0,0,-0.0,0.988506,350.096252,0,0,0,0,-0.0,0.988506,14.196138,0,0,0,0,-0.0,0.988506,657.483643,0,0,0,0,-0.0
35,No log,No log,0.988506,0.86619,0,0,0,0,-0.0,0.988506,657.162964,0,0,0,0,-0.0,0.988506,350.155609,0,0,0,0,-0.0,0.988506,14.198342,0,0,0,0,-0.0,0.988506,657.162964,0,0,0,0,-0.0
40,No log,No log,0.988506,0.866072,0,0,0,0,-0.0,0.988506,656.791565,0,0,0,0,-0.0,0.988506,350.227783,0,0,0,0,-0.0,0.988506,14.20097,0,0,0,0,-0.0,0.988506,656.791565,0,0,0,0,-0.0
45,No log,No log,0.988506,0.865937,0,0,0,0,-0.0,0.988506,656.37207,0,0,0,0,-0.0,0.988506,350.309998,0,0,0,0,-0.0,0.988506,14.204041,0,0,0,0,-0.0,0.988506,656.37207,0,0,0,0,-0.0
50,No log,No log,0.988506,0.865788,0,0,0,0,-0.0,0.988506,655.906738,0,0,0,0,-0.0,0.988506,350.398865,0,0,0,0,-0.0,0.988506,14.207369,0,0,0,0,-0.0,0.988506,655.906738,0,0,0,0,-0.0




Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



In [None]:
qa_pairs = QaPairs.from_json('train_dataset.json')
examples = []
# 将单个qa对转为InputExample并存入列表
examples = [InputExample(texts=[qa_pair['query'], qa_pair['answer']]) for qa_pair in qa_pairs.qa_pairs]
train_examples = examples[:150]
dev_examples = examples[150:]                                        

# 将数据集转换为DataLoader形式
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
print("创建model")
model = SentenceTransformer(model_name_or_path="AI-ModelScope/tao-8k", device="cuda", cache_folder='./', trust_remote_code=True)
train_loss = losses.ContrastiveLoss(model=model)
# 实例化评估器，将每次训练后的模型在验证集上测试性能
print("创建评估器")
evaluator = BinaryClassificationEvaluator.from_input_examples(dev_examples, name='dev')
# 定义模型保存路径
model_save_path='./trained_tao'
# 微调模型
print("开始训练")
model.fit([(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=10,
          output_path=model_save_path,
          )


In [2]:
from modelscope import snapshot_download
snapshot_download('AI-ModelScope/bge-small-zh-v1.5', cache_dir='./')

Downloading [1_Pooling/config.json]: 100%|██████████| 190/190 [00:00<00:00, 323B/s]
Downloading [config.json]: 100%|██████████| 776/776 [00:00<00:00, 1.75kB/s]
Downloading [config_sentence_transformers.json]: 100%|██████████| 124/124 [00:00<00:00, 263B/s]
Downloading [configuration.json]: 100%|██████████| 47.0/47.0 [00:00<00:00, 80.6B/s]
Downloading [model.safetensors]: 100%|██████████| 91.4M/91.4M [00:01<00:00, 75.7MB/s]
Downloading [modules.json]: 100%|██████████| 349/349 [00:00<00:00, 586B/s]
Downloading [pytorch_model.bin]: 100%|██████████| 91.4M/91.4M [00:00<00:00, 128MB/s] 
Downloading [README.md]: 100%|██████████| 27.5k/27.5k [00:00<00:00, 54.9kB/s]
Downloading [sentence_bert_config.json]: 100%|██████████| 52.0/52.0 [00:00<00:00, 73.1B/s]
Downloading [special_tokens_map.json]: 100%|██████████| 125/125 [00:00<00:00, 216B/s]
Downloading [tokenizer.json]: 100%|██████████| 429k/429k [00:00<00:00, 774kB/s]
Downloading [tokenizer_config.json]: 100%|██████████| 367/367 [00:00<00:00, 71

'./AI-ModelScope/bge-small-zh-v1___5'

In [9]:
from datasets import load_dataset
from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator
from typing import List
import sentence_transformers
from sentence_transformers import InputExample, SentenceTransformer
from sentence_transformers.evaluation import BinaryClassificationEvaluator, MSEEvaluator,RerankingEvaluator
# from zhipuai import ZhipuAI
from sentence_transformers import losses
import re
import json
from langchain_core.documents import Document
from tqdm import tqdm
from torch.utils.data import DataLoader
class QaPairs():
    '''存储List[dict]类型数据'''

    def __init__(self, qa_pairs: List[dict]):
        self.qa_pairs = qa_pairs
        

    def save_json(self, path: str):
        '''将数据存储为json格式'''

        with open(path, "w", encoding='utf-8') as f:
            json.dump(self.qa_pairs, f, ensure_ascii=False, indent=4)

    @classmethod
    def from_json(cls, path:str) -> 'QaPairs':
        '''读取json格式数据'''

        with open(path) as f:
            data = json.load(f)
        return cls(data)


qa_pairs = QaPairs.from_json('train_dataset1.json')
query=[]
answer=[]
i=0
for qa_pair in qa_pairs.qa_pairs:
            if i == 800:
                break
            i=i+1
            query.append(qa_pair['query'])
            answer.append(qa_pair['answer'])

dataset = Dataset.from_dict({
    "query": query,
    "answer": answer
})
print(dataset)
query=[]
answer=[]
i=800
for qa_pair in qa_pairs.qa_pairs:
            if i == 990:
                break
            i=i+1
            query.append(qa_pair['query'])
            answer.append(qa_pair['answer'])

eval_dataset = Dataset.from_dict({
    "query": query,
    "answer": answer
})
label = [1]*190

print(dataset)
print("创建模型")
model = SentenceTransformer(model_name_or_path="AI-ModelScope/tao-8k", device='cuda', cache_folder='./', trust_remote_code=True)
model.max_seq_length = 200
evaluator = BinaryClassificationEvaluator(query, answer, label, name='dev')
evaluator(model)
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="./",
    # Optional training parameters:
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=True,  # Set to True if you have a GPU that supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # losses that use "in-batch negatives" benefit from no duplicates
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=10,
    save_total_limit=2,
    logging_steps=10,
    run_name="mpnet-base-all-nli-triplet",  # Will be used in W&B if `wandb` is installed
)
train_loss = losses.MultipleNegativesRankingLoss(model=model)

# 定义模型保存路径
model_save_path='./trained_tao'
# 微调模型
print("开始微调")
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    loss=train_loss,
    eval_dataset=eval_dataset,
    evaluator=evaluator
)
trainer.train()
print("训练结束！")

 
model.save_pretrained(model_save_path)  

TypeError: Parameters to generic types must be types. Got {'anchor': '根据文本，党的十九大确立了哪一思想作为指导思想？', 'positive': '—（党的十九大的举行）\n—（确立习近平新时代中国特色社会主义思想为指导思想）\n坚持党的全面领.

In [3]:
from datasets import load_dataset
from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator
from typing import List
import sentence_transformers
from sentence_transformers import InputExample, SentenceTransformer
from sentence_transformers.evaluation import BinaryClassificationEvaluator, MSEEvaluator,RerankingEvaluator
# from zhipuai import ZhipuAI
from sentence_transformers import losses
import re
import json
from langchain_core.documents import Document
from tqdm import tqdm
from torch.utils.data import DataLoader

with open('data.json', 'r', encoding='utf-8') as f:
    # 使用json.load函数加载JSON数据,dataset为dict类型
    dataset = json.load(f)
    
train_anchor = []
train_positive = []
train_negative = []
eval_anchor = []
eval_positive = []
eval_negative = []
train_dataset = []
for i in range(0, 351):
    # dict = {}
    # dict['anchor'] = dataset['anchor'][i]
    # dict['positive'] = dataset['positive'][i]
    # dict['negative'] = dataset['negative'][i]
    # train_dataset.append(dict)
    train_anchor.append(dataset['anchor'][i])
    train_positive.append(dataset['positive'][i])
    train_negative.append(dataset['negative'][i])
train_dataset =  Dataset.from_dict({
    "anchor": train_anchor,
    "positive": train_positive,
    "negative": train_negative
})
eval_dataset = []
for i in range(351, 438):
    # dict = {}
    # dict['anchor'] = dataset['anchor'][i]
    # dict['positive'] = dataset['positive'][i]
    # dict['negative'] = dataset['negative'][i]
    # eval_dataset.append(dict)
    eval_anchor.append(dataset['anchor'][i])
    eval_positive.append(dataset['positive'][i])
    eval_negative.append(dataset['negative'][i])

eval_dataset =  Dataset.from_dict({
    "anchor": eval_anchor,
    "positive": eval_positive,
    "negative": eval_negative
})
print(train_dataset)
print(eval_dataset)
print("创建模型")
model = SentenceTransformer(model_name_or_path="AI-ModelScope/tao-8k", device='cuda', cache_folder='./', trust_remote_code=True)
model.max_seq_length = 200
evaluator = TripletEvaluator(
    anchors=eval_anchor,
    positives=eval_positive,
    negatives=eval_negative,
    name="dev"
)

args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="./",
    # Optional training parameters:
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # losses that use "in-batch negatives" benefit from no duplicates
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=10,
    save_total_limit=2,
    logging_steps=10,
    run_name="mpnet-base-all-nli-triplet",  # Will be used in W&B if `wandb` is installed
)

train_loss = losses.MultipleNegativesRankingLoss(model=model)

# 定义模型保存路径
model_save_path='./trained_tao1'
# 微调模型
print("开始微调")
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    eval_dataset=eval_dataset,
    evaluator=evaluator
)
trainer.train()
print("训练结束！")

model.save_pretrained(model_save_path)

Dataset({
    features: ['anchor', 'positive', 'negative'],
    num_rows: 351
})
Dataset({
    features: ['anchor', 'positive', 'negative'],
    num_rows: 87
})
创建模型


No sentence-transformers model found with name AI-ModelScope/tao-8k. Creating a new one with mean pooling.
Detected kernel version 4.19.91, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


开始微调


Step,Training Loss,Validation Loss,Dev Cosine Accuracy,Dev Dot Accuracy,Dev Manhattan Accuracy,Dev Euclidean Accuracy,Dev Max Accuracy
10,0.8806,0.191515,1.0,0.0,1.0,1.0,1.0
20,0.2872,0.126954,1.0,0.0,1.0,1.0,1.0
30,0.1698,0.146247,1.0,0.0,1.0,1.0,1.0
40,0.0344,0.147544,1.0,0.0,1.0,1.0,1.0
50,0.0397,0.139132,1.0,0.0,1.0,1.0,1.0
60,0.0151,0.133784,1.0,0.0,1.0,1.0,1.0
70,0.0126,0.136157,1.0,0.0,1.0,1.0,1.0
80,0.0043,0.140094,1.0,0.0,1.0,1.0,1.0
90,0.003,0.141698,1.0,0.0,1.0,1.0,1.0
100,0.0034,0.141937,1.0,0.0,1.0,1.0,1.0


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

训练结束！


In [1]:
from datasets import load_dataset
from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator
from typing import List
import sentence_transformers
from sentence_transformers import InputExample, SentenceTransformer
from sentence_transformers.evaluation import BinaryClassificationEvaluator, MSEEvaluator,RerankingEvaluator
# from zhipuai import ZhipuAI
from sentence_transformers import losses
import re
import json
from langchain_core.documents import Document
from tqdm import tqdm
from torch.utils.data import DataLoader
class QaPairs():
    '''存储List[dict]类型数据'''

    def __init__(self, qa_pairs: List[dict]):
        self.qa_pairs = qa_pairs
        

    def save_json(self, path: str):
        '''将数据存储为json格式'''

        with open(path, "w", encoding='utf-8') as f:
            json.dump(self.qa_pairs, f, ensure_ascii=False, indent=4)

    @classmethod
    def from_json(cls, path:str) -> 'QaPairs':
        '''读取json格式数据'''

        with open(path) as f:
            data = json.load(f)
        return cls(data)

qa_pairs = QaPairs.from_json('ttt.json')
print(qa_pairs.qa_pairs)
query=[1,2,3,4]
answer=['a','b','c','d']
dataset = Dataset.from_dict({
    "query": query,
    "answer": answer
})
print(dataset)

[{'query': '1111', 'answer': 'aaaaaa', 'page_num': 15}, {'query': '222', 'answer': 'bbbbbb。', 'page_num': 15}]
Dataset({
    features: ['query', 'answer'],
    num_rows: 4
})
