In [1]:
import json
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import BertTokenizer, BertForQuestionAnswering, TrainingArguments, Trainer
import torch

# Load local data
def load_data(file_path):
    """Read a UTF-8 encoded JSON file and return the decoded Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

# Read the three splits, in train/dev/test order, from the working directory.
train_data, dev_data, test_data = (
    load_data(split_path)
    for split_path in ('train.json', 'dev.json', 'test.json')
)

# Convert the data into the `datasets` format
def convert_to_dataset(data):
    """Wrap loaded records in a ``datasets.Dataset``.

    Args:
        data: Anything ``pandas.DataFrame`` accepts as its data argument.

    Returns:
        The ``Dataset`` built from the intermediate DataFrame.
    """
    frame = pd.DataFrame(data)
    return Dataset.from_pandas(frame)

# Convert each split, then bundle them under the split names used later on.
train_dataset, dev_dataset, test_dataset = map(
    convert_to_dataset, (train_data, dev_data, test_data)
)

datasets = DatasetDict(
    {'train': train_dataset, 'dev': dev_dataset, 'test': test_dataset}
)


  from .autonotebook import tqdm as notebook_tqdm
2024-06-10 16:48:00.653500: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-10 16:48:00.690863: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
from transformers import BertTokenizer, BertForQuestionAnswering, TrainingArguments, Trainer
import torch

# Load the pretrained BERT model and tokenizer.
# Both are created from the same checkpoint identifier.
# NOTE(review): the base checkpoint presumably has no fine-tuned QA head, which
# is why the model is trained further down — confirm.
model_name = "bert-base-chinese"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

# Data preprocessing
def _find_answer_span(input_ids, answer_ids, search_from=0):
    """Locate the first complete occurrence of ``answer_ids`` in ``input_ids``.

    Args:
        input_ids: Token ids of one encoded example (list of int).
        answer_ids: Token ids of the answer text (list of int).
        search_from: First index at which a match is allowed to start.

    Returns:
        ``(start, end)`` as inclusive token indices of the match, or ``None``
        when ``answer_ids`` is empty or does not occur at/after ``search_from``.
    """
    width = len(answer_ids)
    if width == 0:
        return None
    for start in range(search_from, len(input_ids) - width + 1):
        if input_ids[start:start + width] == answer_ids:
            return start, start + width - 1
    return None


def preprocess_function(examples):
    """Tokenize a batch of question/context pairs and attach answer span labels.

    Each question (column "问题", stripped) is encoded together with its
    context (column "条款"), truncated/padded to 512 tokens. The answer text
    (column "答案") is tokenized on its own and its full token-id sequence is
    searched for inside the context segment of the encoded pair.

    Compared with matching only the first answer token anywhere in the
    sequence, this avoids labelling a position inside the question (or an
    unrelated earlier occurrence of that token) as the answer start.

    Examples whose answer cannot be located (empty answer, answer truncated
    away, or answer tokenizing differently in isolation than in context) are
    labelled ``(0, 0)``, i.e. the [CLS] position, instead of raising.

    Args:
        examples: Batch mapping with list-valued columns "问题", "条款", "答案".

    Returns:
        The tokenizer output with two extra entries, ``start_positions`` and
        ``end_positions`` (1-D tensors, one inclusive token index per example).
    """
    questions = [q.strip() for q in examples["问题"]]
    inputs = tokenizer(
        questions,
        examples["条款"],
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    # Convert the tensors to nested lists once rather than once per example.
    input_ids_batch = inputs["input_ids"].tolist()
    token_type_batch = inputs["token_type_ids"].tolist()

    start_positions = []
    end_positions = []
    for input_ids, token_types, answer in zip(
        input_ids_batch, token_type_batch, examples["答案"]
    ):
        answer_tokens = tokenizer.tokenize(answer) if answer else []
        answer_ids = tokenizer.convert_tokens_to_ids(answer_tokens)

        # Segment id 1 marks the second sequence of the pair (the context);
        # restrict the search to it so question tokens can never match.
        if 1 in token_types:
            context_start = token_types.index(1)
        else:
            context_start = len(input_ids)

        span = _find_answer_span(input_ids, answer_ids, context_start)
        if span is None:
            # Answer not present in the encoded context: point at [CLS].
            span = (0, 0)

        start_positions.append(span[0])
        end_positions.append(span[1])

    inputs["start_positions"] = torch.tensor(start_positions)
    inputs["end_positions"] = torch.tensor(end_positions)
    return inputs

# Run preprocess_function over every split in batches and drop the raw columns,
# leaving only the fields that preprocess_function returns.
# NOTE(review): the "test" split is mapped as well, so this assumes test.json
# also provides an "答案" column — confirm.
tokenized_datasets = datasets.map(preprocess_function, batched=True, remove_columns=["ID", "产品名", "条款", "问题", "答案"])

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases appear to name this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize the Trainer with the train split for training and the dev split for evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
)

# Start training
trainer.train()


In [None]:
# Write the model and the tokenizer into the same directory; the next cell
# reloads both from "./saved_model".
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")


In [None]:
from transformers import pipeline

# Load the saved model and tokenizer.
# Note: this rebinds model_name, tokenizer and model, replacing the objects
# created in the training cell.
model_name = "./saved_model"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

# Create the question-answering pipeline from the reloaded model and tokenizer
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Define the question-answering helper
def answer_question(question, context):
    """Ask the QA pipeline one question about one context.

    Args:
        question: The question text.
        context: The passage passed to the pipeline as context.

    Returns:
        The value stored under the 'answer' key of the pipeline result.
    """
    return qa_pipeline(question=question, context=context)['answer']

# Example question and context passage
question = "保单生效前投保人申请退保并递交齐全退保资料时，保险公司将如何处理保险费？"
context = "退保 投保人要求解除合同，自本公司接到保险合同解除申请书时起，本合同效力终止。保单生效前申请退保并递交齐全退保资料的，将全额退还保险费。若投保人在保单生效后申请退保的，退保会遭受一定损失，保险公司将退还最低现金价值。退保金额支付方式如下:（1）通过支付宝平台办理退保，退保资金将由支付宝退回您缴费的账户；（2）通过保险公司办理，退保资金将由中华财险退回投保人的银行账户；本计划各保险产品的现金价值采用以下方式计算。"

# Get the answer and print it next to the question
answer = answer_question(question, context)
print(f"Question: {question}")
print(f"Answer: {answer}")
