In [None]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, get_linear_schedule_with_warmup##在这里删除了adamw
from torch.utils.data import Dataset, DataLoader
import torch
from torch.optim import AdamW#根据提示用的
from tqdm import *
from tqdm.auto import tqdm  # 使用auto以确保在不同环境下都能正常显示
from sklearn.model_selection import train_test_split
from torch.utils.tensorboard import SummaryWriter #tensorboard
import random

# Module-level TensorBoard writer; train_and_evaluate logs its per-epoch
# scalars through it and the __main__ block closes it when training ends.
writer0 = SummaryWriter(log_dir = './output/randeng/output')

class CustomDataset(Dataset):
    """Map-style dataset of (prompt, completion) text pairs, tokenized on access.

    Args:
        tokenizer: Callable tokenizer; invoked with ``max_length``,
            ``padding='max_length'``, ``truncation=True`` and
            ``return_tensors="pt"``. Its result must expose ``input_ids`` and
            ``attention_mask`` attributes.
        data: Indexable collection of ``(prompt_text, completion_text)`` pairs.
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, tokenizer, data, max_len=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        """Tokenize one text to a fixed-length encoding with a leading batch axis."""
        return self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``.

        ``labels`` holds the raw completion token ids, padding included; this
        class does not mask padding positions.
        """
        prompt_text, completion_text = self.data[idx]
        input_encoding = self._encode(prompt_text)
        target_encoding = self._encode(completion_text)
        # Drop only the leading batch axis. A bare squeeze() would also
        # collapse the sequence axis when max_len == 1, yielding 0-d tensors.
        return {
            'input_ids': input_encoding.input_ids.squeeze(0),
            'attention_mask': input_encoding.attention_mask.squeeze(0),
            'labels': target_encoding.input_ids.squeeze(0)
        }

# Load the data and build the training and test datasets
def load_and_split_data(tokenizer, file_path):
    """Read question/answer pairs from an Excel file and wrap them as datasets.

    The '问题' and '答案' columns are read row by row and converted to strings.

    NOTE: the training set is the full data; the test set is a random sample
    of one fifth of those same rows, so it overlaps with the training set.

    Args:
        tokenizer: Tokenizer handed to both CustomDataset instances.
        file_path: Path of the Excel workbook (read with the openpyxl engine).

    Returns:
        A ``(train_dataset, test_dataset)`` tuple of CustomDataset objects.
    """
    frame = pd.read_excel(file_path, engine='openpyxl')
    pairs = []
    for _, record in frame.iterrows():
        pairs.append((str(record['问题']), str(record['答案'])))
    held_out = random.sample(pairs, len(pairs) // 5)
    print("---------data loaded---------")
    train_set = CustomDataset(tokenizer, pairs)
    test_set = CustomDataset(tokenizer, held_out)
    return train_set, test_set

def train_and_evaluate(model, tokenizer, file_path, epochs=50, batch_size=1, lr=5e-5):
    """Fine-tune ``model`` on the Excel data and evaluate it after every epoch.

    Each epoch logs the mean training loss, the mean test loss and the
    exact-match test accuracy to ``writer0``. Whenever the accuracy beats the
    best seen so far, the model and tokenizer are saved to ``./output/randeng``;
    on the 10th such improvement an extra copy goes to ``./output/randeng20``.

    Args:
        model: Seq2seq model exposing ``train``/``eval``/``generate``/``device``
            and returning an object with ``.loss`` when called with labels.
        tokenizer: Matching tokenizer; provides ``pad_token_id`` and ``decode``.
        file_path: Excel workbook passed to ``load_and_split_data``.
        epochs: Number of passes over the training data.
        batch_size: Training batch size (evaluation always uses 1).
        lr: Initial AdamW learning rate.
    """
    train_dataset, test_dataset = load_and_split_data(tokenizer, file_path)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=1)  # batch size 1 keeps evaluation simple

    # Build the optimizer and scheduler once. Re-creating them every epoch
    # would discard AdamW's moment estimates and restart the linear decay,
    # so the learning rate would never follow the schedule over total_steps.
    optimizer = AdamW(model.parameters(), lr=lr)
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    best_acc = 0
    cnt = 0  # number of times a new best accuracy has been reached

    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0.0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(model.device)
            attention_mask = batch['attention_mask'].to(model.device)
            labels = batch['labels'].to(model.device)
            # Padding positions are set to -100 so the loss ignores them.
            labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            train_loss += loss.item()
        writer0.add_scalar('train_loss', train_loss / max(len(train_loader), 1), epoch)

        # Evaluation
        model.eval()
        correct_predictions = 0
        test_loss = 0.0  # kept separate from train_loss so the two are not mixed
        with torch.no_grad():
            for batch in tqdm(test_loader, desc="Evaluating"):
                input_ids = batch['input_ids'].to(model.device)
                attention_mask = batch['attention_mask'].to(model.device)
                # Use this batch's own labels (not those left over from the
                # last training step). masked_fill is out-of-place, so
                # batch['labels'] stays decodable below.
                labels = batch['labels'].to(model.device)
                labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

                outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)
                eval_outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                test_loss += eval_outputs.loss.item()

                # The first generated sequence is taken as the predicted answer.
                predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
                actual_text = tokenizer.decode(batch['labels'][0], skip_special_tokens=True)

                if predicted_text.strip().lower() == actual_text.strip().lower():
                    correct_predictions += 1

        num_test = len(test_loader)
        # Guard against an empty test set (fewer than five data rows).
        acc = correct_predictions / num_test if num_test else 0.0
        print(f"Test Accuracy: {acc:.4f}")
        writer0.add_scalar('test_acc', acc, epoch)
        writer0.add_scalar('test_loss', test_loss / max(num_test, 1), epoch)

        # Save whenever the accuracy improves.
        if best_acc < acc:
            best_acc = acc
            save_path = "./output/randeng"
            cnt += 1
            if cnt == 10:
                model.save_pretrained("./output/randeng20")
                tokenizer.save_pretrained("./output/randeng20")
            model.save_pretrained(save_path)
            tokenizer.save_pretrained(save_path)
            print("模型和分词器已保存。")

'''if __name__ == "__main__":
    tokenizer = T5Tokenizer.from_pretrained("./model/flan-t5-base/")
    model = T5ForConditionalGeneration.from_pretrained("./model/flan-t5-base/")

    file_path = "./test.xlsx"
    train_and_evaluate(model, tokenizer, file_path)'''

if __name__ == "__main__":
    # Resume fine-tuning from a previously saved checkpoint directory.
    load_path = "./output/randeng10"
    # Fall back to CPU so the script still runs on machines without a GPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    try:
        tokenizer = T5Tokenizer.from_pretrained(load_path)
        model = T5ForConditionalGeneration.from_pretrained(load_path).to(device)
        print("---------model loaded---------")
        file_path = "./data/数据全.xlsx"
        train_and_evaluate(model, tokenizer, file_path, epochs=20, batch_size=1, lr=1e-6)
        print("---------train finished---------")
    finally:
        # Close the TensorBoard writer even if loading or training fails.
        writer0.close()


In [11]:
import os
import pandas as pd
from tqdm import *
from tqdm.auto import tqdm
# os.environ["CUDA_VISIBLE_DEVICES"] = '-1'
import torch
from torch import cuda
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import Dataset, DataLoader
# Checkpoint directory the inference cells load the tokenizer and model from.
path = './output/randeng10/'
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForSeq2SeqLM.from_pretrained(path) 
# Use the GPU when one is available, otherwise run on the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)
model.to(device)

class CustomDataset(Dataset):
    """Map-style dataset of (prompt, completion) text pairs, tokenized on access.

    Args:
        tokenizer: Callable tokenizer; invoked with ``max_length``,
            ``padding='max_length'``, ``truncation=True`` and
            ``return_tensors="pt"``. Its result must expose ``input_ids`` and
            ``attention_mask`` attributes.
        data: Indexable collection of ``(prompt_text, completion_text)`` pairs.
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, tokenizer, data, max_len=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        """Tokenize one text to a fixed-length encoding with a leading batch axis."""
        return self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``.

        ``labels`` holds the raw completion token ids, padding included; this
        class does not mask padding positions.
        """
        prompt_text, completion_text = self.data[idx]
        input_encoding = self._encode(prompt_text)
        target_encoding = self._encode(completion_text)
        # Drop only the leading batch axis. A bare squeeze() would also
        # collapse the sequence axis when max_len == 1, yielding 0-d tensors.
        return {
            'input_ids': input_encoding.input_ids.squeeze(0),
            'attention_mask': input_encoding.attention_mask.squeeze(0),
            'labels': target_encoding.input_ids.squeeze(0)
        }

cuda


In [15]:
def postprocess(text):
    """Return ``text`` with every '.' removed and then every '</>' removed."""
    cleaned = text
    # Order matters: dots are removed first, then the '</>' marker.
    for unwanted in (".", '</>'):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned

def answer_fn(text, top_k=50):
    """Generate one sampled answer for ``text`` with the module-level model.

    Args:
        text: The question to answer; tokenized as a batch of one and
            truncated to 256 tokens.
        top_k: Top-k cutoff passed to ``model.generate``.

    Returns:
        The first decoded sequence after ``postprocess`` has been applied.
    """
    model_inputs = tokenizer(
        text=[text],
        truncation=True,
        padding=True,
        max_length=256,
        return_tensors="pt",
    ).to(device)
    generation_options = dict(
        return_dict_in_generate=True,
        output_scores=False,
        max_length=512,
        temperature=0.5,
        do_sample=True,
        repetition_penalty=1.4,
        top_k=top_k,
        top_p=0.95,
    )
    generated = model.generate(**model_inputs, **generation_options)
    decoded = tokenizer.batch_decode(generated["sequences"], skip_special_tokens=True)
    return postprocess(decoded[0])

def test_data(file_path):
    """Evaluate exact-match accuracy on a random sample and log the misses.

    Prompts for the number of rows to test (``-1`` means all rows), samples
    that many rows from the Excel file, generates an answer for each with the
    module-level model, and writes to ``test_data_output.txt`` the position
    (within the sampled data) of every mismatch followed by the accuracy.

    Args:
        file_path: Excel workbook with '问题' and '答案' columns.
    """
    turns = int(input('请输入轮数: '))
    df = pd.read_excel(file_path, engine='openpyxl')
    # -1 means "everything"; larger requests are clamped because
    # DataFrame.sample raises when asked for more rows than exist.
    if turns == -1 or turns > len(df):
        turns = len(df)
    df = df.sample(turns)
    data = [(str(row['问题']), str(row['答案'])) for _, row in df.iterrows()]
    print("---------data loaded---------")

    test_loader = DataLoader(CustomDataset(tokenizer, data), batch_size=1)
    model.eval()
    correct_predictions = 0
    cnt = 0
    with open('test_data_output.txt', 'w', encoding='utf-8') as w, torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(model.device)
            attention_mask = batch['attention_mask'].to(model.device)
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)

            # The first generated sequence is taken as the predicted answer.
            result = tokenizer.decode(outputs[0], skip_special_tokens=True).strip().lower()
            a = tokenizer.decode(batch['labels'][0], skip_special_tokens=True).strip().lower()

            if result == a:
                correct_predictions += 1
            else:
                w.write(f"{cnt}\n")
            cnt += 1

        # Avoid dividing by zero when no rows were evaluated.
        acc = correct_predictions / cnt if cnt else 0.0
        w.write(f"Test Accuracy: {(acc):.4f}")
    print("----ended----")
def eval_data(file_path,turns = -1):
    """Measure exact-match accuracy of ``answer_fn`` on a random sample.

    Args:
        file_path: Excel workbook with '问题' and '答案' columns.
        turns: Number of rows to sample. ``-1`` (default) prompts the user
            for the count. Values above the row count are clamped to it.

    Returns:
        The number of sampled questions answered exactly right
        (case-insensitive, surrounding whitespace ignored).
    """
    df = pd.read_excel(file_path, engine='openpyxl')
    interactive = turns == -1
    if interactive:
        turns = int(input(f'请输入测试数: {len(df)}: '))
    # Clamp in both modes: DataFrame.sample raises when asked for more rows
    # than the frame holds, e.g. turns=100 on a file with fewer rows.
    turns = min(turns, len(df))
    if interactive:
        print(f"测试数目：{turns}")
        print('*'*100)
    df = df.sample(turns)
    data = [(str(row['问题']), str(row['答案'])) for _, row in df.iterrows()]
    acc = 0
    # NOTE(review): answer_fn strips '.' from the generated text but the
    # reference answer is compared as-is — confirm this asymmetry is intended.
    for q, a in tqdm(data, desc=f"Evaluating ") :
        result=answer_fn(q, top_k=50)
        if result.strip().lower() == a.strip().lower():
            acc += 1

    # Avoid dividing by zero when no rows were sampled.
    accuracy = acc / len(data) if data else 0.0
    print(f"Test Accuracy: {accuracy:.4f}")
    return acc



# test_path = './data/webQA/me_train.xlsx'
test_path = './data/数据全.xlsx'

# Interactive prompt: 'q' quits; '2', '3' and '4' run one batch evaluation
# and then exit; any other input is answered by the model as a question.
while True:
    text = input('请输入问题:')
    if text == 'q':
        break
    if text in ('2', '3', '4'):
        if text == '2':
            test_data(test_path)
        elif text == '3':
            eval_data(test_path)
        else:
            eval_data(test_path, turns=100)
        break
    result = answer_fn(text, top_k=50)
    print("模型生成:", result)
    print('*'*100)

请输入问题: 2
请输入轮数:  -1


---------data loaded---------


Evaluating:   0%|          | 0/4927 [00:00<?, ?it/s]

----ended----


In [20]:
pip list

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Package                       Version
----------------------------- ----------------
absl-py                       0.14.1
alabaster                     0.7.12
anyio                         3.6.2
apex                          0.1
appdirs                       1.4.4
argon2-cffi                   21.1.0
asgiref                       3.4.1
attrs                         21.2.0
audioread                     2.1.9
Babel                         2.11.0
backcall                      0.2.0
backports.functools-lru-cache 1.6.4
beautifulsoup4                4.10.0
bleach                        4.1.0
blis                          0.7.4
brotlipy                      0.7.0
cachetools                    4.2.4
catalogue                     2.0.6
certifi                       2024.2.2
cffi                          1.14.6
chardet                       4.0.0
charset-normalizer            2.0.0
click                         8.0.1
cloudpickle                   2.2.1
codecov                       2.1.12
colora