In [8]:
from datasets import load_dataset


data_files = './data_ezsocket/merged_api_para_pcapdata_dataset20k.csv'
# ; is the tab character in Python
ezsocket_dataset = load_dataset("csv", data_files=data_files, delimiter=";")

In [9]:
import re

####以下函数用于讲10进制数token化#####
def format_decimal_as_hexadecimal(decimal_str):
    # Convert the decimal string to an integer
    decimal_number = int(decimal_str)
    
    # Convert the integer to a hexadecimal string
    hex_str = hex(decimal_number)[2:]  # Strip the '0x' prefix
    
    # Ensure the length of the hex string is even
    if len(hex_str) % 2 != 0:
        hex_str = '0' + hex_str
    
    # Split the hex string into pairs of characters
    hex_pairs = [hex_str[i:i+2] for i in range(0, len(hex_str), 2)]
    
    # Join the pairs with commas
    formatted_hex = ','.join(hex_pairs)
    
    return formatted_hex


def convert_number(num_str):
    num = float(num_str)
    if num.is_integer():
        num = int(num)
        sign = "-" if num < 0 else "+"
        num_str = format_decimal_as_hexadecimal(str(num).lstrip('-'))
        return f"num,{sign},{num_str},num"
    else:
        sign = "-" if num < 0 else "+"
        num_str = num_str.lstrip('-')
        integer_part, fractional_part = num_str.split('.')
        combined_num = format_decimal_as_hexadecimal(integer_part + fractional_part.rstrip('0'))
        pos_num = format_decimal_as_hexadecimal(len(fractional_part.rstrip('0')))
        return f"num,{sign},{combined_num},pos,{pos_num},num"

def process_segment(segment):
    parts = segment.split(',')
    for i, part in enumerate(parts):
        if re.match(r'^-?\d+(\.\d+)?$', part):  # Match integers and floating-point numbers
            parts[i] = convert_number(part)
    result = ','.join(parts)
    result = result.replace(",", " ")
    return result
####以上函数用于将10进制数token化：process_segment(segment)#####

####以下函数用于将payload按两位分开，用','隔开#####
def split_payload_into_pairs(text):
    # 将文本按每两个字符分割
    pairs = [text[i:i+2] for i in range(0, len(text), 2)]
    # 用逗号连接分割后的文本
    result = ' '.join(pairs)
    return result
####以上函数用于将payload按两位分开，用','隔开#####

In [10]:
#使用map+lambda清洗数据
# clear_ezsocket_dataset = ezsocket_dataset.map(lambda x: {"Function and Parameters": x["Function and Parameters"].split(',', 1)[1]})
clear_ezsocket_dataset = ezsocket_dataset.map(lambda x: {"Function and Parameters": [o.split(',', 1)[1] for o in x["Function and Parameters"]]}, batched=True) #可加速处理，删除前面的时间戳
clear_ezsocket_dataset = clear_ezsocket_dataset.map(lambda x: {"Function and Parameters": [process_segment(o) for o in x["Function and Parameters"]]}, batched=True) #可加速处理，10进制参数token化
clear_ezsocket_dataset = clear_ezsocket_dataset.map(lambda x: {"Data Segment": [split_payload_into_pairs(o) for o in x["Data Segment"]]}, batched=True) #可加速处理，10进制参数token化

In [11]:
#划分训练集测试集和验证集
ezsocket_dataset_tt = clear_ezsocket_dataset["train"].train_test_split(train_size=0.8, seed=42)
ezsocket_dataset_tvt = ezsocket_dataset_tt["train"].train_test_split(train_size=0.9, seed=42)
ezsocket_dataset_tvt["validation"] = ezsocket_dataset_tvt.pop("test")
ezsocket_dataset_tvt["test"] = ezsocket_dataset_tt["test"]
ezsocket_dataset_tvt
#保存数据集使用：Arrow:	Dataset.save_to_disk()  CSV:	Dataset.to_csv()    JSON:	Dataset.to_json()

DatasetDict({
    train: Dataset({
        features: ['Function and Parameters', 'Data Segment'],
        num_rows: 14623
    })
    validation: Dataset({
        features: ['Function and Parameters', 'Data Segment'],
        num_rows: 1625
    })
    test: Dataset({
        features: ['Function and Parameters', 'Data Segment'],
        num_rows: 4063
    })
})

In [12]:
from transformers import BertTokenizer, BertForMaskedLM

#手动tokenizer,直接生成词汇表
word_list = set([])
for line in ezsocket_dataset_tvt["train"]["Function and Parameters"]:
    line = line.split()
    word_list.update(set(line)) # ['hello', 'how', 'are', 'you',...]
filtered_list = [item.lower() for item in word_list if len(item) >= 6]

word2idx = {f"{i:02X}".lower(): i for i in range(256)}
word2idx.update({'[MASK]' : 256, '[CLS]' : 257, '[SEP]' : 258, 'num' : 259, 'pos' : 260, '[PAD]' : 261, '+' : 262, '-' : 263})
for i, w in enumerate(filtered_list):
    word2idx[w] = i + 264
with open('vocab.txt', 'w', encoding='utf-8') as file:
    for i in word2idx:
        file.write(f"{i}\n")

# 加载手动生成的词汇表
tokenizer=BertTokenizer(vocab_file='./vocab.txt')

In [13]:
len(tokenizer)

356

In [14]:
def tokenize_function(examples):
    # Remove empty lines
    return tokenizer(
        examples["Function and Parameters"], 
        examples["Data Segment"], 
        padding="max_length", # 进行填充
        truncation=True, # 进行截断
        max_length=210, # 设置句子的长度
        # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
        # receives the `special_tokens_mask`.
        return_special_tokens_mask=True,
    )
tokenized_datasets = ezsocket_dataset_tvt.map(
    tokenize_function,
    batched=True,
    num_proc=8,
    # remove_columns=[text_column_name],
    load_from_cache_file=False,
)


Map (num_proc=8):   0%|          | 0/14623 [00:00<?, ? examples/s]

  table = cls._concat_blocks(blocks, axis=0)


Map (num_proc=8):   0%|          | 0/1625 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/4063 [00:00<?, ? examples/s]

In [15]:
from transformers import BertConfig, BertForMaskedLM, BertTokenizerFast
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [16]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [17]:
config = BertConfig(
    vocab_size=len(tokenizer),
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=768,
)
model = BertForMaskedLM(config)

In [21]:

model

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(356, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [None]:
training_args = TrainingArguments(
    output_dir="./bert_pretrained",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    # prediction_loss_only=True,
    # no_cuda=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

trainer.train()

In [None]:
#TrainOutput(global_step=256060, training_loss=0.09054873210884934, metrics={'train_runtime': 95449.4276, 'train_samples_per_second': 42.922, 'train_steps_per_second': 2.683, 'total_flos': 2.22759212644584e+17, 'train_loss': 0.09054873210884934, 'epoch': 10.0})
# for obj in trainer.state.log_history:
#     print(obj)

import csv
import pandas as pd

# Convert the log history to a pandas DataFrame
df = pd.DataFrame(trainer.state.log_history)

# Save the DataFrame as a CSV file
df.to_csv('training_log_history_bert_pretrain_500k.csv', index=False)

print("Log history saved as 'training_log_history_bert_pretrain_500k.csv'")

In [None]:
#画图
import matplotlib.pyplot as plt
import numpy as np

# Extract the data from trainer.state.log_history
training_loss = []
validation_loss = []
for entry in trainer.state.log_history:
    if 'loss' in entry:
        training_loss.append((entry['step'], entry['loss']))
    if 'eval_loss' in entry:
        validation_loss.append((entry['step'], entry['eval_loss']))

# Separate the steps and losses
train_steps, train_losses = zip(*training_loss)
val_steps, val_losses = zip(*validation_loss)

# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(train_steps, train_losses, label='Training Loss', color='blue')
plt.plot(val_steps, val_losses, label='Validation Loss', color='red')

# Add labels and title
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()

# Add grid for better readability
plt.grid(True, linestyle='--', alpha=0.7)

# Show the plot
plt.show()

# Optionally, save the plot
plt.savefig('loss_plot.png')

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Extract the data from trainer.state.log_history
training_loss = []
validation_loss = []
learning_rates = []
for entry in trainer.state.log_history:
    if 'loss' in entry:
        training_loss.append((entry['step'], entry['loss']))
    if 'eval_loss' in entry:
        validation_loss.append((entry['step'], entry['eval_loss']))
    if 'learning_rate' in entry:
        learning_rates.append((entry['step'], entry['learning_rate']))

# Separate the steps and values
train_steps, train_losses = zip(*training_loss)
val_steps, val_losses = zip(*validation_loss)
lr_steps, lr_values = zip(*learning_rates)

# Create the plot
fig, ax1 = plt.subplots(figsize=(12, 6))

# Plot losses
ax1.set_xlabel('Steps')
ax1.set_ylabel('Loss')
ax1.plot(train_steps, train_losses, label='Training Loss', color='blue')
ax1.plot(val_steps, val_losses, label='Validation Loss', color='red')
ax1.tick_params(axis='y')

# Create a second y-axis for learning rate
ax2 = ax1.twinx()
ax2.set_ylabel('Learning Rate')
ax2.plot(lr_steps, lr_values, label='Learning Rate', color='green', linestyle='--')
ax2.tick_params(axis='y')

# Add title and legend
plt.title('Training and Validation Loss with Learning Rate')
fig.legend(loc="upper right", bbox_to_anchor=(1,1), bbox_transform=ax1.transAxes)

# Add grid for better readability
ax1.grid(True, linestyle='--', alpha=0.7)

# Show the plot
plt.show()

# Optionally, save the plot
plt.savefig('loss_and_lr_plot.png')