In [5]:
from transformers import T5Config, T5ForConditionalGeneration, T5Tokenizer, BertTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import Trainer, TrainingArguments, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset

In [6]:
data_files = './data_ezsocket/merged_api_para_pcapdata_dataset20k.csv'
# ; is the tab character in Python
ezsocket_dataset = load_dataset("csv", data_files=data_files, delimiter=";")

In [7]:
import re

####以下函数用于讲10进制数token化#####
def format_decimal_as_hexadecimal(decimal_str):
    # Convert the decimal string to an integer
    decimal_number = int(decimal_str)
    
    # Convert the integer to a hexadecimal string
    hex_str = hex(decimal_number)[2:]  # Strip the '0x' prefix
    
    # Ensure the length of the hex string is even
    if len(hex_str) % 2 != 0:
        hex_str = '0' + hex_str
    
    # Split the hex string into pairs of characters
    hex_pairs = [hex_str[i:i+2] for i in range(0, len(hex_str), 2)]
    
    # Join the pairs with commas
    formatted_hex = ','.join(hex_pairs)
    
    return formatted_hex


def convert_number(num_str):
    num = float(num_str)
    if num.is_integer():
        num = int(num)
        sign = "-" if num < 0 else "+"
        num_str = format_decimal_as_hexadecimal(str(num).lstrip('-'))
        return f"num,{sign},{num_str},num"
    else:
        sign = "-" if num < 0 else "+"
        num_str = num_str.lstrip('-')
        integer_part, fractional_part = num_str.split('.')
        combined_num = format_decimal_as_hexadecimal(integer_part + fractional_part.rstrip('0'))
        pos_num = format_decimal_as_hexadecimal(len(fractional_part.rstrip('0')))
        return f"num,{sign},{combined_num},pos,{pos_num},num"

def process_segment(segment):
    parts = segment.split(',')
    for i, part in enumerate(parts):
        if re.match(r'^-?\d+(\.\d+)?$', part):  # Match integers and floating-point numbers
            parts[i] = convert_number(part)
    result = ','.join(parts)
    result = result.replace(",", " ")
    return result
####以上函数用于将10进制数token化：process_segment(segment)#####

####以下函数用于将payload按两位分开，用','隔开#####
def split_payload_into_pairs(text):
    # 将文本按每两个字符分割
    pairs = [text[i:i+2] for i in range(0, len(text), 2)]
    # 用逗号连接分割后的文本
    result = ' '.join(pairs)
    return result
####以上函数用于将payload按两位分开，用','隔开#####

In [8]:
#使用map+lambda清洗数据
# clear_ezsocket_dataset = ezsocket_dataset.map(lambda x: {"Function and Parameters": x["Function and Parameters"].split(',', 1)[1]})
clear_ezsocket_dataset = ezsocket_dataset.map(lambda x: {"Function and Parameters": [o.split(',', 1)[1] for o in x["Function and Parameters"]]}, batched=True) #可加速处理，删除前面的时间戳
clear_ezsocket_dataset = clear_ezsocket_dataset.map(lambda x: {"Function and Parameters": [process_segment(o) for o in x["Function and Parameters"]]}, batched=True) #可加速处理，10进制参数token化
clear_ezsocket_dataset = clear_ezsocket_dataset.map(lambda x: {"Data Segment": [split_payload_into_pairs(o) for o in x["Data Segment"]]}, batched=True) #可加速处理，10进制参数token化

Map:   0%|          | 0/20311 [00:00<?, ? examples/s]

Map:   0%|          | 0/20311 [00:00<?, ? examples/s]

Map:   0%|          | 0/20311 [00:00<?, ? examples/s]

In [9]:
#划分训练集测试集和验证集
ezsocket_dataset_tt = clear_ezsocket_dataset["train"].train_test_split(train_size=0.8, seed=42)
ezsocket_dataset_tvt = ezsocket_dataset_tt["train"].train_test_split(train_size=0.9, seed=42)
ezsocket_dataset_tvt["validation"] = ezsocket_dataset_tvt.pop("test")
ezsocket_dataset_tvt["test"] = ezsocket_dataset_tt["test"]
ezsocket_dataset_tvt
#保存数据集使用：Arrow:	Dataset.save_to_disk()  CSV:	Dataset.to_csv()    JSON:	Dataset.to_json()

DatasetDict({
    train: Dataset({
        features: ['Function and Parameters', 'Data Segment'],
        num_rows: 14623
    })
    validation: Dataset({
        features: ['Function and Parameters', 'Data Segment'],
        num_rows: 1625
    })
    test: Dataset({
        features: ['Function and Parameters', 'Data Segment'],
        num_rows: 4063
    })
})

In [10]:
tokenizer=BertTokenizer(vocab_file='./vocab.txt')

In [11]:
def tokenize_function(examples):
    inputs = tokenizer(examples["Function and Parameters"], padding="max_length", truncation=True, max_length=50)
    targets = tokenizer(examples["Data Segment"], padding="max_length", truncation=True, max_length=210)
    
    inputs["labels"] = targets["input_ids"]
    return inputs


tokenized_datasets = ezsocket_dataset_tvt.map(tokenize_function, batched=True, num_proc=8)

Map (num_proc=8):   0%|          | 0/14623 [00:00<?, ? examples/s]

  table = cls._concat_blocks(blocks, axis=0)


Map (num_proc=8):   0%|          | 0/1625 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/4063 [00:00<?, ? examples/s]

In [12]:
len(tokenizer)

358

In [13]:
config = T5Config(
    vocab_size=358,
    d_model=512,
    d_kv=64,
    d_ff=2048,
    num_layers=6,
    num_heads=8,
    relative_attention_num_buckets=32,
    dropout_rate=0.1,
    layer_norm_epsilon=1e-6,
    initializer_factor=1.0,
    feed_forward_proj="relu",
    is_encoder_decoder=True,
    use_cache=True,
    pad_token_id=261,
    eos_token_id=258,
    decoder_start_token_id=261
)

model = T5ForConditionalGeneration(config)

In [15]:
model.num_parameters()

44240384

In [36]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding="longest",
    max_length=210
)

In [37]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_pretrained",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    evaluation_strategy="steps",
    do_train=True,
    do_eval=True,
    logging_steps=100,
    save_steps=1000,
    eval_steps=1000,
    warmup_steps=1000,
    max_steps=100000,
    overwrite_output_dir=True,
    save_total_limit=7,
    fp16=True,
)


In [38]:
print(model.config.decoder_start_token_id)
print(tokenizer.pad_token_id)

261
261


In [39]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

Step,Training Loss,Validation Loss
1000,0.1713,0.133441
2000,0.0893,0.063537
3000,0.0579,0.039577
4000,0.0442,0.035238
5000,0.0377,0.031315
6000,0.0357,0.030832
7000,0.033,0.028762
8000,0.0334,0.028161
9000,0.0301,0.027273
10000,0.0307,0.027064


TrainOutput(global_step=100000, training_loss=0.026661254634857176, metrics={'train_runtime': 7529.7767, 'train_samples_per_second': 106.245, 'train_steps_per_second': 13.281, 'total_flos': 1.05728645234688e+16, 'train_loss': 0.026661254634857176, 'epoch': 54.7})