In [1]:
import torch
import transformers
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
import pandas as pd
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
def load_raw_bgl(csv_path='/home/jpy/graduation_design_final/BGL/BGL_2k.log_structured.csv'):
    """Load a structured BGL log CSV as a list of ``(label, content)`` pairs.

    Args:
        csv_path: Location of the structured BGL CSV. The file must provide
            a ``Label`` and a ``Content`` column. Defaults to the path that
            used to be hard-coded here, so ``load_raw_bgl()`` behaves as
            before.

    Returns:
        A list with one ``(label, content)`` tuple per CSV row, in file
        order. ``label`` is ``0`` when the raw ``Label`` value is ``"-"``
        and ``1`` for anything else; ``content`` is the ``Content`` value
        exactly as pandas parsed it.
    """
    raw_bgl = pd.read_csv(csv_path)
    # Binarise the labels without writing back into the DataFrame.
    labels = [int(raw_label != "-") for raw_label in raw_bgl["Label"]]
    contents = raw_bgl["Content"].tolist()
    return list(zip(labels, contents))

class TestData(Dataset):
    """Dataset that tokenizes ``(label, content)`` pairs one item at a time."""

    def __init__(self, label_content_tuples, tokenizer, max_length=512):
        """Keep the samples, the tokenizer and the fixed sequence length.

        Args:
            label_content_tuples: Indexable collection of two-element
                ``(label, content)`` entries.
            tokenizer: Callable used to encode each content string.
            max_length: Length every encoded sequence is padded/truncated to.
        """
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.data = label_content_tuples

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode the sample at ``idx`` and return it as a dict.

        The returned dict holds the raw text (``input_texts``), the encoded
        ``input_ids`` and ``attention_mask`` with their leading batch
        dimension removed, and the untouched label (``labels``).
        """
        label, text = self.data[idx]

        # Encode the input content.
        tokenized = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        sample = {'input_texts': text}
        for field in ('input_ids', 'attention_mask'):
            # Drop the extra batch dimension added by return_tensors="pt".
            sample[field] = tokenized[field].squeeze(0)
        sample['labels'] = label
        return sample

In [20]:
import torch

# Show the CUDA version reported by the installed PyTorch build.
cuda_version = torch.version.cuda
print(cuda_version)

12.4


In [17]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Locations of the fine-tuned Flan-T5 checkpoint and of its tokenizer.
model_save_path = "/home/jpy/graduation_design_final/Flan_T5_base_tuning"
tokenizer_save_path = "/home/jpy/graduation_design_final/Tokenizer"
# Load the model and tokenizer.
model = T5ForConditionalGeneration.from_pretrained(model_save_path)
tokenizer = T5Tokenizer.from_pretrained(tokenizer_save_path)

# Use the GPU when one is visible; otherwise fall back to the CPU instead of
# crashing on a hard-coded 'cuda' device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Inference only: switch off dropout and other training-time behaviour.
model.eval()


test_data_tuple = load_raw_bgl()
testdata = TestData(test_data_tuple, tokenizer=tokenizer)
dataLoader = DataLoader(testdata, batch_size=10)
# Accumulates one (label, input text, generated text) tuple per log line.
ans_data = []
with torch.no_grad():
    for batch in tqdm(dataLoader, desc="Processing", unit="batch"):
        input_texts = batch['input_texts']
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # The default collate function stacks the integer labels into a
        # tensor; convert them to plain ints so the results (and the CSV
        # written from them) hold 0/1 rather than tensor(0)/tensor(1).
        labels = batch['labels'].tolist()
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=512)
        output_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ans_data.extend(zip(labels, input_texts, output_texts))
             


Processing: 100%|██████████| 200/200 [00:24<00:00,  8.03batch/s]


In [18]:
import pandas as pd

# One row per processed log line: label, input text, generated text.
result_columns = ['Label', 'Content', 'EventTemplate']
df = pd.DataFrame(ans_data, columns=result_columns)
# Save as CSV without the DataFrame index column.
df.to_csv('output.csv', index=False)