In [None]:
import json
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from transformers import DefaultDataCollator
from datasets import Dataset
import pandas as pd
import torch

def preprocess_data(json_data):
    processed_data = []

    for sample in json_data:
        sample_id = sample["id"]
        kbs = sample["kbs"]
        text = sample["text"]

        input_sequence = ""
        for kb_entry in kbs.values():
          obj = kb_entry[0]
          subject = kb_entry[2][0][1]
          predicate = kb_entry[2][0][0]

          input_text = subject + "," + predicate + "," + obj
          input_sequence += input_text + ","

        for sentence in text:
            processed_data.append({"id": sample_id, "input": input_sequence.strip(), "target": sentence})

    return processed_data

# Load JSON data
with open("C:/Users/lsong/Documents/bart/train.json", "r") as file:
    train_data = json.load(file)

with open("C:/Users/lsong/Documents/bart/val.json", "r") as file:
    val_data = json.load(file)


#batch_size = 100  # 每批处理的样本数量
#num_samples = len(processed_data)

#for i in range(0, num_samples, batch_size):
#    batch = processed_data[i:i+batch_size]
#    print(batch)

# 定义训练参数
training_args = TrainingArguments(
    output_dir="C:/Users/lsong/Documents/bart/fintune_model",  # 保存微调模型的路径
    num_train_epochs=30,  # 训练轮数
    per_device_train_batch_size=8,  # 批量大小
    learning_rate=2e-5,  # 学习率
    weight_decay=0.01,  # 权重衰减
    logging_dir="C:/Users/lsong/Documents/bart/finetune_logs",  # 日志保存路径
    logging_steps=100,  # 每隔多少步打印一次日志
)

# 加载预训练BART模型和tokenizer
model_name = "facebook/bart-base"
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

# 准备训练数据（假设你的训练数据已经预处理为input和target）
train_data = preprocess_data(train_data)
val_data = preprocess_data(val_data)

#print(train_data)

def preprocess_function(examples):
    inputs = examples['input']
    targets = examples['target']

    # Set the maximum sequence length
    max_length = 512

    # Tokenize and pad the inputs
    inputs_encodings = tokenizer.batch_encode_plus(inputs, padding='max_length', truncation=True, max_length=max_length)

    # Tokenize and pad the targets
    targets_encodings = tokenizer.batch_encode_plus(targets, padding='max_length', truncation=True, max_length=max_length)

    # Convert the lists to tensors
    inputs_encodings = {k: torch.tensor(v) for k, v in inputs_encodings.items()}
    targets_encodings = {k: torch.tensor(v) for k, v in targets_encodings.items()}

    encodings = {
        'input_ids': inputs_encodings['input_ids'],
        'attention_mask': inputs_encodings['attention_mask'],
        'labels': targets_encodings['input_ids'],
    }
    return encodings


train_df = pd.DataFrame(train_data)
val_df = pd.DataFrame(val_data)

train_data = Dataset.from_dict(train_df).map(preprocess_function, batched=True)
val_data = Dataset.from_dict(val_df).map(preprocess_function, batched=True)

data_collator = DefaultDataCollator()

# 创建Trainer对象并执行训练
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
)

trainer.train()

In [None]:
import json
def preprocess_new_data(json_data):
    input_data = []

    for data_item in json_data:
        if "object" in data_item:
            data_id = int(data_item["id"])  # Convert the 'id' to an integer
            if isinstance(data_item["object"], list):
                object_list = data_item["object"]
            else:
                object_list = [data_item["object"]]
            target_text = data_item["text"]

            input_text_list = []
            for obj in object_list:
                input_text_list.append(",".join(obj["name"]))
            
            input_text = ",".join(input_text_list)

            input_data.append({"id": data_id, "input": input_text, "target": target_text})

    return input_data


# Load JSON data
with open("C:/Users/lsong/Documents/bart/sem/train.json", "r") as file:
    train_data = json.load(file)
    
train_data = preprocess_new_data(train_data)
#val_data = preprocess_data(val_data)


print(train_data)

In [4]:
import json
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from transformers import DefaultDataCollator
from datasets import Dataset, concatenate_datasets
import pandas as pd
import torch

# 加载预训练BART模型和tokenizer
model_name = "facebook/bart-base"
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    inputs = examples['input']
    targets = examples['target']

    # Set the maximum sequence length
    max_length = 1024

    # Tokenize and pad the inputs
    inputs_encodings = tokenizer.batch_encode_plus(inputs, padding='max_length', truncation=True, max_length=max_length)

    # Tokenize and pad the targets
    targets_encodings = tokenizer.batch_encode_plus(targets, padding='max_length', truncation=True, max_length=max_length)

    # Convert the lists to tensors
    inputs_encodings = {k: torch.tensor(v) for k, v in inputs_encodings.items()}
    targets_encodings = {k: torch.tensor(v) for k, v in targets_encodings.items()}

    encodings = {
        'input_ids': inputs_encodings['input_ids'],
        'attention_mask': inputs_encodings['attention_mask'],
        'labels': targets_encodings['input_ids'],
    }
    return encodings

def preprocess_data(json_data):
    processed_data = []

    for sample in json_data:
        sample_id = sample["id"]
        kbs = sample["kbs"]
        text = sample["text"]

        input_sequence = ""
        for kb_entry in kbs.values():
          obj = kb_entry[0]
          subject = kb_entry[2][0][1]
          predicate = kb_entry[2][0][0]

          input_text = subject + "," + predicate + "," + obj
          input_sequence += input_text + ","

        for sentence in text:
            processed_data.append({"id": sample_id, "input": input_sequence.strip(), "target": sentence})

    return processed_data

# Load JSON data
with open("C:/Users/lsong/Documents/bart/train.json", "r") as file:
    train_data = json.load(file)

with open("C:/Users/lsong/Documents/bart/val.json", "r") as file:
    val_data = json.load(file)
# 准备训练数据（假设你的训练数据已经预处理为input和target）
train_data = preprocess_data(train_data)
val_data = preprocess_data(val_data)

train_df = pd.DataFrame(train_data)
val_df = pd.DataFrame(val_data)

train_data = Dataset.from_dict(train_df).map(preprocess_function, batched=True)
val_data = Dataset.from_dict(val_df).map(preprocess_function, batched=True)
    
def preprocess_new_data(json_data):
    input_data = []

    for data_item in json_data:
        if "object" in data_item:
            data_id = int(data_item["id"])  # Convert the 'id' to an integer
            if isinstance(data_item["object"], list):
                object_list = data_item["object"]
            else:
                object_list = [data_item["object"]]
            target_text = data_item["text"]

            input_text_list = []
            for obj in object_list:
                input_text_list.append(",".join(obj["name"]))
            
            input_text = ",".join(input_text_list)

            input_data.append({"id": data_id, "input": input_text, "target": target_text})

    return input_data

# Load the new JSON data
with open("C:/Users/lsong/Documents/bart/sem/train.json", "r") as file:
    new_train_data = json.load(file)

with open("C:/Users/lsong/Documents/bart/sem/val.json", "r") as file:
    new_val_data = json.load(file)
    
new_train_data = preprocess_new_data(new_train_data)
new_val_data = preprocess_new_data(new_val_data)

new_train_df = pd.DataFrame(new_train_data)
new_val_df = pd.DataFrame(new_val_data)

new_train_dataset = Dataset.from_dict(new_train_df).map(preprocess_function, batched=True)
new_val_dataset = Dataset.from_dict(new_val_df).map(preprocess_function, batched=True)

combined_train_dataset = concatenate_datasets([train_data, new_train_dataset])
combined_val_dataset = concatenate_datasets([val_data, new_val_dataset])

#combined_train_data = train_data["id"] + new_train_dataset["id"]
#combined_train_input_data = train_data["input"] + new_train_dataset["input"]
#combined_train_target_data = train_data["target"] + new_train_dataset["target"]

#combined_train_dataset = Dataset.from_dict({"id": combined_train_data, "input": combined_train_input_data, "target": combined_train_target_data})

#combined_val_data = val_data["id"] + new_val_dataset["id"]
#combined_val_input_data = val_data["input"] + new_val_dataset["input"]
#combined_val_target_data = val_data["target"] + new_val_dataset["target"]

#combined_val_dataset = Dataset.from_dict({"id": combined_val_data, "input": combined_val_input_data, "target": combined_val_target_data})


# 定义训练参数
training_args = TrainingArguments(
    output_dir="C:/Users/lsong/Documents/bart/fintune_model/web_sem",  # 保存微调模型的路径
    num_train_epochs=30,  # 训练轮数
    per_device_train_batch_size=4,  # 批量大小
    learning_rate=2e-5,  # 学习率
    weight_decay=0.01,  # 权重衰减
    logging_dir="C:/Users/lsong/Documents/bart/finetune_logs/web_sem",  # 日志保存路径
    logging_steps=100,  # 每隔多少步打印一次日志
    fp16=True
)

data_collator = DefaultDataCollator()

# 创建Trainer对象并执行训练
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=combined_train_dataset,
    eval_dataset=combined_val_dataset,
    data_collator=data_collator,
)

trainer.train()

                                                             
 99%|█████████▉| 257300/259620 [118:21:19<1:00:50,  1.57s/it]

{'loss': 0.006, 'learning_rate': 1.856559587088822e-07, 'epoch': 29.73}


                                                             
 99%|█████████▉| 257400/259620 [118:23:58<59:40,  1.61s/it]

{'loss': 0.0059, 'learning_rate': 1.7795239195747632e-07, 'epoch': 29.74}


                                                           
 99%|█████████▉| 257500/259620 [118:26:37<55:23,  1.57s/it]

{'loss': 0.0057, 'learning_rate': 1.7024882520607044e-07, 'epoch': 29.76}


                                                             
 99%|█████████▉| 257600/259620 [118:29:17<53:25,  1.59s/it]

{'loss': 0.0061, 'learning_rate': 1.6254525845466454e-07, 'epoch': 29.77}


                                                           
 99%|█████████▉| 257700/259620 [118:31:56<51:28,  1.61s/it]

{'loss': 0.0056, 'learning_rate': 1.5491872737077269e-07, 'epoch': 29.78}


                                                           
 99%|█████████▉| 257800/259620 [118:34:34<48:01,  1.58s/it]

{'loss': 0.0059, 'learning_rate': 1.4721516061936678e-07, 'epoch': 29.79}


                                                           
 99%|█████████▉| 257900/259620 [118:37:13<46:02,  1.61s/it]

{'loss': 0.006, 'learning_rate': 1.3951159386796087e-07, 'epoch': 29.8}


                                                           
 99%|█████████▉| 258000/259620 [118:39:50<41:41,  1.54s/it]

{'loss': 0.0065, 'learning_rate': 1.3180802711655497e-07, 'epoch': 29.81}


                                                           
 99%|█████████▉| 258100/259620 [118:42:29<39:27,  1.56s/it]

{'loss': 0.0057, 'learning_rate': 1.2410446036514906e-07, 'epoch': 29.82}


                                                           
 99%|█████████▉| 258200/259620 [118:45:07<36:47,  1.55s/it]

{'loss': 0.0061, 'learning_rate': 1.1640089361374318e-07, 'epoch': 29.84}


                                                           
 99%|█████████▉| 258300/259620 [118:47:39<33:01,  1.50s/it]

{'loss': 0.0057, 'learning_rate': 1.0869732686233727e-07, 'epoch': 29.85}


                                                           
100%|█████████▉| 258400/259620 [118:50:11<30:57,  1.52s/it]

{'loss': 0.0057, 'learning_rate': 1.0099376011093137e-07, 'epoch': 29.86}


                                                           
100%|█████████▉| 258500/259620 [118:52:43<28:14,  1.51s/it]

{'loss': 0.0057, 'learning_rate': 9.329019335952547e-08, 'epoch': 29.87}


                                                           
100%|█████████▉| 258600/259620 [118:55:17<26:13,  1.54s/it]

{'loss': 0.0055, 'learning_rate': 8.558662660811957e-08, 'epoch': 29.88}


                                                           
100%|█████████▉| 258700/259620 [118:57:49<23:23,  1.53s/it]

{'loss': 0.0057, 'learning_rate': 7.788305985671366e-08, 'epoch': 29.89}


                                                           
100%|█████████▉| 258800/259620 [119:00:21<20:49,  1.52s/it]

{'loss': 0.0061, 'learning_rate': 7.017949310530777e-08, 'epoch': 29.91}


                                                           
100%|█████████▉| 258900/259620 [119:02:53<18:09,  1.51s/it]

{'loss': 0.006, 'learning_rate': 6.247592635390186e-08, 'epoch': 29.92}


                                                           
100%|█████████▉| 259000/259620 [119:05:26<15:44,  1.52s/it]

{'loss': 0.0062, 'learning_rate': 5.477235960249596e-08, 'epoch': 29.93}


                                                           
100%|█████████▉| 259100/259620 [119:08:01<13:02,  1.51s/it]

{'loss': 0.0059, 'learning_rate': 4.706879285109006e-08, 'epoch': 29.94}


                                                           
100%|█████████▉| 259200/259620 [119:10:32<10:41,  1.53s/it]

{'loss': 0.0055, 'learning_rate': 3.936522609968416e-08, 'epoch': 29.95}


                                                           
100%|█████████▉| 259300/259620 [119:13:04<08:07,  1.52s/it]

{'loss': 0.006, 'learning_rate': 3.166165934827825e-08, 'epoch': 29.96}


                                                           
100%|█████████▉| 259400/259620 [119:15:36<05:31,  1.51s/it]

{'loss': 0.0053, 'learning_rate': 2.3958092596872356e-08, 'epoch': 29.97}


                                                           
100%|█████████▉| 259500/259620 [119:18:08<03:01,  1.52s/it]

{'loss': 0.0061, 'learning_rate': 1.6254525845466452e-08, 'epoch': 29.99}


                                                           
100%|█████████▉| 259600/259620 [119:20:42<00:30,  1.53s/it]

{'loss': 0.007, 'learning_rate': 8.55095909406055e-09, 'epoch': 30.0}


                                                           
100%|██████████| 259620/259620 [119:21:13<00:00,  1.66s/it]

{'train_runtime': 429673.1445, 'train_samples_per_second': 2.417, 'train_steps_per_second': 0.604, 'train_loss': 0.014765281499456105, 'epoch': 30.0}





TrainOutput(global_step=259620, training_loss=0.014765281499456105, metrics={'train_runtime': 429673.1445, 'train_samples_per_second': 2.417, 'train_steps_per_second': 0.604, 'train_loss': 0.014765281499456105, 'epoch': 30.0})

In [27]:
import json
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from transformers import DefaultDataCollator
from datasets import Dataset, concatenate_datasets
import pandas as pd
import torch

# 加载预训练BART模型和tokenizer
model_name = "C:/Users/lsong/Documents/bart/fintune_model/web/checkpoint-64000"
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

def preprocess_function(examples):
    inputs = examples['input']
    targets = examples['target']

    # Set the maximum sequence length
    max_length = 1024

    # Tokenize and pad the inputs
    inputs_encodings = tokenizer.batch_encode_plus(inputs, padding='max_length', truncation=True, max_length=max_length)

    # Tokenize and pad the targets
    targets_encodings = tokenizer.batch_encode_plus(targets, padding='max_length', truncation=True, max_length=max_length)

    # Convert the lists to tensors
    inputs_encodings = {k: torch.tensor(v) for k, v in inputs_encodings.items()}
    targets_encodings = {k: torch.tensor(v) for k, v in targets_encodings.items()}

    encodings = {
        'input_ids': inputs_encodings['input_ids'],
        'attention_mask': inputs_encodings['attention_mask'],
        'labels': targets_encodings['input_ids'],
    }
    return encodings
    
def preprocess_new_data(json_data):
    input_data = []

    for data_item in json_data:
        if "object" in data_item:
            data_id = int(data_item["id"])  # Convert the 'id' to an integer
            if isinstance(data_item["object"], list):
                object_list = data_item["object"]
            else:
                object_list = [data_item["object"]]
            target_text = data_item["text"]

            input_text_list = []
            for obj in object_list:
                input_text_list.append(",".join(obj["name"]))
            
            input_text = ",".join(input_text_list)

            input_data.append({"id": data_id, "input": input_text, "target": target_text})

    return input_data

# Load the new JSON data
with open("C:/Users/lsong/Documents/bart/sem/train.json", "r") as file:
    new_train_data = json.load(file)

with open("C:/Users/lsong/Documents/bart/sem/val.json", "r") as file:
    new_val_data = json.load(file)
    
new_train_data = preprocess_new_data(new_train_data)
new_val_data = preprocess_new_data(new_val_data)

new_train_df = pd.DataFrame(new_train_data)
new_val_df = pd.DataFrame(new_val_data)

new_train_dataset = Dataset.from_dict(new_train_df).map(preprocess_function, batched=True)
new_val_dataset = Dataset.from_dict(new_val_df).map(preprocess_function, batched=True)


# 定义训练参数
training_args = TrainingArguments(
    output_dir="C:/Users/lsong/Documents/bart/fintune_model/sem_on_web",  # 保存微调模型的路径
    num_train_epochs=30,  # 训练轮数
    per_device_train_batch_size=4,  # 批量大小
    learning_rate=2e-5,  # 学习率
    weight_decay=0.01,  # 权重衰减
    logging_dir="C:/Users/lsong/Documents/bart/finetune_logs/sem_on_web",  # 日志保存路径
    logging_steps=100,  # 每隔多少步打印一次日志
    fp16=True
)

data_collator = DefaultDataCollator()

# 创建Trainer对象并执行训练
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=new_train_dataset,
    eval_dataset=new_val_dataset,
    data_collator=data_collator,
)

trainer.train()

  5%|▍         | 100/2100 [02:00<40:18,  1.21s/it]

{'loss': 0.6173, 'learning_rate': 1.904761904761905e-05, 'epoch': 1.43}


 10%|▉         | 200/2100 [04:00<38:13,  1.21s/it]

{'loss': 0.4948, 'learning_rate': 1.8095238095238097e-05, 'epoch': 2.86}


 14%|█▍        | 300/2100 [05:59<36:16,  1.21s/it]

{'loss': 0.4548, 'learning_rate': 1.7142857142857142e-05, 'epoch': 4.29}


 19%|█▉        | 400/2100 [07:59<34:13,  1.21s/it]

{'loss': 0.3965, 'learning_rate': 1.6190476190476193e-05, 'epoch': 5.71}


 24%|██▍       | 500/2100 [09:58<32:01,  1.20s/it]

{'loss': 0.3924, 'learning_rate': 1.523809523809524e-05, 'epoch': 7.14}


 29%|██▊       | 600/2100 [12:00<30:13,  1.21s/it]

{'loss': 0.3624, 'learning_rate': 1.4285714285714287e-05, 'epoch': 8.57}


 33%|███▎      | 700/2100 [13:58<20:49,  1.12it/s]

{'loss': 0.3374, 'learning_rate': 1.3333333333333333e-05, 'epoch': 10.0}


 38%|███▊      | 800/2100 [15:58<26:11,  1.21s/it]

{'loss': 0.3282, 'learning_rate': 1.2380952380952383e-05, 'epoch': 11.43}


 43%|████▎     | 900/2100 [17:58<24:12,  1.21s/it]

{'loss': 0.2989, 'learning_rate': 1.1428571428571429e-05, 'epoch': 12.86}


 48%|████▊     | 1000/2100 [19:57<22:10,  1.21s/it]

{'loss': 0.2885, 'learning_rate': 1.0476190476190477e-05, 'epoch': 14.29}


 52%|█████▏    | 1100/2100 [21:58<20:06,  1.21s/it]

{'loss': 0.2875, 'learning_rate': 9.523809523809525e-06, 'epoch': 15.71}


 57%|█████▋    | 1200/2100 [23:57<17:56,  1.20s/it]

{'loss': 0.2663, 'learning_rate': 8.571428571428571e-06, 'epoch': 17.14}


 62%|██████▏   | 1300/2100 [25:57<16:06,  1.21s/it]

{'loss': 0.2659, 'learning_rate': 7.61904761904762e-06, 'epoch': 18.57}


 67%|██████▋   | 1400/2100 [27:55<10:25,  1.12it/s]

{'loss': 0.2488, 'learning_rate': 6.666666666666667e-06, 'epoch': 20.0}


 71%|███████▏  | 1500/2100 [29:55<12:53,  1.29s/it]

{'loss': 0.251, 'learning_rate': 5.7142857142857145e-06, 'epoch': 21.43}


 76%|███████▌  | 1600/2100 [31:56<09:59,  1.20s/it]

{'loss': 0.2313, 'learning_rate': 4.761904761904762e-06, 'epoch': 22.86}


 81%|████████  | 1700/2100 [33:54<07:59,  1.20s/it]

{'loss': 0.2412, 'learning_rate': 3.80952380952381e-06, 'epoch': 24.29}


 86%|████████▌ | 1800/2100 [35:53<05:59,  1.20s/it]

{'loss': 0.232, 'learning_rate': 2.8571428571428573e-06, 'epoch': 25.71}


 90%|█████████ | 1900/2100 [37:51<03:58,  1.19s/it]

{'loss': 0.2168, 'learning_rate': 1.904761904761905e-06, 'epoch': 27.14}


 95%|█████████▌| 2000/2100 [39:50<02:00,  1.20s/it]

{'loss': 0.2224, 'learning_rate': 9.523809523809525e-07, 'epoch': 28.57}


100%|██████████| 2100/2100 [41:49<00:00,  1.20s/it]

{'loss': 0.2219, 'learning_rate': 0.0, 'epoch': 30.0}
{'train_runtime': 2509.6344, 'train_samples_per_second': 3.323, 'train_steps_per_second': 0.837, 'train_loss': 0.3169720340910412, 'epoch': 30.0}





TrainOutput(global_step=2100, training_loss=0.3169720340910412, metrics={'train_runtime': 2509.6344, 'train_samples_per_second': 3.323, 'train_steps_per_second': 0.837, 'train_loss': 0.3169720340910412, 'epoch': 30.0})

In [3]:
import json
from transformers import BartTokenizer, BartForConditionalGeneration

model_name = "C:/Users/lsong/Documents/bart/fintune_model/web/checkpoint-64000"  # 或者是您微调后保存的模型路径

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained(model_name)

def preprocess_data(json_data):
    processed_data = []

    for sample in json_data:
        sample_id = sample["id"]
        kbs = sample["kbs"]
        text = sample["text"]

        input_sequence = ""
        for kb_entry in kbs.values():
            obj = kb_entry[0]
            subject = kb_entry[2][0][1]
            predicate = kb_entry[2][0][0]

            input_text = subject + "," + predicate + "," + obj
            input_sequence += input_text + ","

        for sentence in text:
            processed_data.append({"id": sample_id, "input": input_sequence.strip(), "target": sentence})

    return processed_data

def preprocess_new_data(json_data):
    input_data = []

    for data_item in json_data:
        if "object" in data_item:
            data_id = int(data_item["id"])  # Convert the 'id' to an integer
            if isinstance(data_item["object"], list):
                object_list = data_item["object"]
            else:
                object_list = [data_item["object"]]
            target_text = data_item["text"]

            input_text_list = []
            for obj in object_list:
                input_text_list.append(",".join(obj["name"]))
            
            input_text = ",".join(input_text_list)

            input_data.append({"id": data_id, "input": input_text, "target": target_text})

    return input_data

# 读取JSON文件
with open("C:/Users/lsong/Documents/bart/sem/test.json", "r") as f:
    input_data = json.load(f)
    
processed_data = preprocess_new_data(input_data)

generated_entries = []  # 存储生成的结果

for entry in processed_data:
    sample_id = entry["id"]
    input_text = entry["input"]

    # 将逗号分隔的三元组转换为字符串列表
    input_triplets = input_text.split(",")

    # 使用BART tokenizer将文本转换为token的索引形式
    input_ids = tokenizer.encode(input_triplets, return_tensors="pt", add_special_tokens=False)

    # 生成文本
    output_ids = model.generate(input_ids, max_length=1000, num_beams=5, no_repeat_ngram_size=2)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # 将生成结果添加到列表中
    generated_entry = {
        "id": sample_id,
        "input_text": input_text,
        "generated_text": output_text
    }
    generated_entries.append(generated_entry)

# 将生成的结果保存为JSON文件
output_file = "C:/Users/lsong/Documents/bart/sem/web_fintune_output.json"
with open(output_file, "w") as f:
    json.dump(generated_entries, f, ensure_ascii=False, indent=4)

print("生成结果已保存到文件:", output_file)


  from .autonotebook import tqdm as notebook_tqdm


生成结果已保存到文件: C:/Users/lsong/Documents/bart/sem/web_fintune_output.json


In [33]:
import json
from transformers import BartTokenizer, BartForConditionalGeneration

model_name = "facebook/bart-base"  # 或者是您微调后保存的模型路径

tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

def preprocess_data(json_data):
    processed_data = []

    for sample in json_data:
        sample_id = sample["id"]
        kbs = sample["kbs"]
        text = sample["text"]

        input_sequence = ""
        for kb_entry in kbs.values():
            obj = kb_entry[0]
            subject = kb_entry[2][0][1]
            predicate = kb_entry[2][0][0]

            input_text = subject + "," + predicate + "," + obj
            input_sequence += input_text + ","

        for sentence in text:
            processed_data.append({"id": sample_id, "input": input_sequence.strip(), "target": sentence})

    return processed_data

def preprocess_new_data(json_data):
    input_data = []

    for data_item in json_data:
        if "object" in data_item:
            data_id = int(data_item["id"])  # Convert the 'id' to an integer
            if isinstance(data_item["object"], list):
                object_list = data_item["object"]
            else:
                object_list = [data_item["object"]]
            target_text = data_item["text"]

            input_text_list = []
            for obj in object_list:
                input_text_list.append(",".join(obj["name"]))
            
            input_text = ",".join(input_text_list)

            input_data.append({"id": data_id, "input": input_text, "target": target_text})

    return input_data

# 读取JSON文件
with open("C:/Users/lsong/Documents/bart/sem/test.json", "r") as f:
    input_data = json.load(f)
    
processed_data = preprocess_new_data(input_data)

generated_entries = []  # 存储生成的结果

for entry in processed_data:
    sample_id = entry["id"]
    input_text = entry["input"]

    # 将逗号分隔的三元组转换为字符串列表
    input_triplets = input_text.split(",")

    # 使用BART tokenizer将文本转换为token的索引形式
    input_ids = tokenizer.encode(input_triplets, return_tensors="pt", add_special_tokens=False)

    # 生成文本
    output_ids = model.generate(input_ids, max_length=1000, num_beams=5, no_repeat_ngram_size=2)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # 将生成结果添加到列表中
    generated_entry = {
        "id": sample_id,
        "input_text": input_text,
        "generated_text": output_text
    }
    generated_entries.append(generated_entry)

# 将生成的结果保存为JSON文件
output_file = "C:/Users/lsong/Documents/bart/sem/nofintune_output.json"
with open(output_file, "w") as f:
    json.dump(generated_entries, f, ensure_ascii=False, indent=4)

print("生成结果已保存到文件:", output_file)


生成结果已保存到文件: C:/Users/lsong/Documents/bart/sem/nofintune_output.json


In [20]:
import pandas as pd

# 读取Excel文件
excel_file = "C:/Users/lsong/Documents/bart/sem/images1.xlsx"
df = pd.read_excel(excel_file)

# 提取"textization"列中的文本内容，并按要求处理成目标格式
target_text = "\n\n".join(df["textization"])

# 保存为.target文件
output_file = "C:/Users/lsong/Documents/bart/sem/images1.target"
with open(output_file, "w") as f:
    f.write(target_text)

print("已生成.target文件:", output_file)


已生成.target文件: C:/Users/lsong/Documents/bart/sem/images1.target


In [34]:
import pandas as pd
import json
import re


# Read Excel file
excel_file = 'C:/Users/lsong/Documents/bart/sem/images.xlsx'
df = pd.read_excel(excel_file)

# Read JSON file
json_file = 'C:/Users/lsong/Documents/bart/sem/fintune_output.json'
with open(json_file, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Extract ids from JSON data
json_ids = [item['id'] for item in json_data]

def extract_numeric(image):
    match = re.search(r'(\d+)', image)
    return int(match.group(1)) if match else None

# Apply the function to 'image' column and filter rows based on matching ids
filtered_df = df[df['image'].apply(extract_numeric).isin(json_ids)]

# Save filtered DataFrame to Excel
output_file = 'C:/Users/lsong/Documents/bart/sem/images1.xlsx'
filtered_df.to_excel(output_file, index=False)

print("Filtered data saved to", output_file)


Filtered data saved to C:/Users/lsong/Documents/bart/sem/images1.xlsx


In [4]:
import json

# 读取JSON文件
with open('C:/Users/lsong/Documents/bart/sem/web_fintune_output.json', 'r') as file:
    data = json.load(file)

# 创建字典来存储每个ID的第一个条目
filtered_data = {}

# 保留每个ID的第一个条目
for item in data:
    item['text'] = item.pop('generated_text')
    
    if item['id'] not in filtered_data:
        filtered_data[item['id']] = item
    
    if 'input_text' in item:
        del item['input_text']

# 将字典转换回列表形式
filtered_data = list(filtered_data.values())

# 保存格式化输出的JSON文件
with open('C:/Users/lsong/Documents/bart/sem/web_fintune_output.json', 'w') as file:
    json.dump(filtered_data, file, indent=4)
