In [2]:
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, GPT2TokenizerFast, BertTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import load_dataset

In [5]:
data_files = './data_ezsocket/merged_api_para_pcapdata_dataset20k.csv'
# ; is the tab character in Python
ezsocket_dataset = load_dataset("csv", data_files=data_files, delimiter=";")

In [6]:
import re

####以下函数用于讲10进制数token化#####
def format_decimal_as_hexadecimal(decimal_str):
    # Convert the decimal string to an integer
    decimal_number = int(decimal_str)
    
    # Convert the integer to a hexadecimal string
    hex_str = hex(decimal_number)[2:]  # Strip the '0x' prefix
    
    # Ensure the length of the hex string is even
    if len(hex_str) % 2 != 0:
        hex_str = '0' + hex_str
    
    # Split the hex string into pairs of characters
    hex_pairs = [hex_str[i:i+2] for i in range(0, len(hex_str), 2)]
    
    # Join the pairs with commas
    formatted_hex = ','.join(hex_pairs)
    
    return formatted_hex


def convert_number(num_str):
    num = float(num_str)
    if num.is_integer():
        num = int(num)
        sign = "-" if num < 0 else "+"
        num_str = format_decimal_as_hexadecimal(str(num).lstrip('-'))
        return f"num,{sign},{num_str},num"
    else:
        sign = "-" if num < 0 else "+"
        num_str = num_str.lstrip('-')
        integer_part, fractional_part = num_str.split('.')
        combined_num = format_decimal_as_hexadecimal(integer_part + fractional_part.rstrip('0'))
        pos_num = format_decimal_as_hexadecimal(len(fractional_part.rstrip('0')))
        return f"num,{sign},{combined_num},pos,{pos_num},num"

def process_segment(segment):
    parts = segment.split(',')
    for i, part in enumerate(parts):
        if re.match(r'^-?\d+(\.\d+)?$', part):  # Match integers and floating-point numbers
            parts[i] = convert_number(part)
    result = ','.join(parts)
    result = result.replace(",", " ")
    return result
####以上函数用于将10进制数token化：process_segment(segment)#####

####以下函数用于将payload按两位分开，用','隔开#####
def split_payload_into_pairs(text):
    # 将文本按每两个字符分割
    pairs = [text[i:i+2] for i in range(0, len(text), 2)]
    # 用逗号连接分割后的文本
    result = ' '.join(pairs)
    return result
####以上函数用于将payload按两位分开，用','隔开#####

In [7]:
#使用map+lambda清洗数据
# clear_ezsocket_dataset = ezsocket_dataset.map(lambda x: {"Function and Parameters": x["Function and Parameters"].split(',', 1)[1]})
clear_ezsocket_dataset = ezsocket_dataset.map(lambda x: {"Function and Parameters": [o.split(',', 1)[1] for o in x["Function and Parameters"]]}, batched=True) #可加速处理，删除前面的时间戳
clear_ezsocket_dataset = clear_ezsocket_dataset.map(lambda x: {"Function and Parameters": [process_segment(o) for o in x["Function and Parameters"]]}, batched=True) #可加速处理，10进制参数token化
clear_ezsocket_dataset = clear_ezsocket_dataset.map(lambda x: {"Data Segment": [split_payload_into_pairs(o) for o in x["Data Segment"]]}, batched=True) #可加速处理，10进制参数token化

Map:   0%|          | 0/20311 [00:00<?, ? examples/s]

Map:   0%|          | 0/20311 [00:00<?, ? examples/s]

Map:   0%|          | 0/20311 [00:00<?, ? examples/s]

In [8]:
#划分训练集测试集和验证集
ezsocket_dataset_tt = clear_ezsocket_dataset["train"].train_test_split(train_size=0.8, seed=42)
ezsocket_dataset_tvt = ezsocket_dataset_tt["train"].train_test_split(train_size=0.9, seed=42)
ezsocket_dataset_tvt["validation"] = ezsocket_dataset_tvt.pop("test")
ezsocket_dataset_tvt["test"] = ezsocket_dataset_tt["test"]
ezsocket_dataset_tvt
#保存数据集使用：Arrow:	Dataset.save_to_disk()  CSV:	Dataset.to_csv()    JSON:	Dataset.to_json()

DatasetDict({
    train: Dataset({
        features: ['Function and Parameters', 'Data Segment'],
        num_rows: 14623
    })
    validation: Dataset({
        features: ['Function and Parameters', 'Data Segment'],
        num_rows: 1625
    })
    test: Dataset({
        features: ['Function and Parameters', 'Data Segment'],
        num_rows: 4063
    })
})

In [9]:
'''
import json

#手动tokenizer,直接生成词汇表,gpt2的词汇表是json格式 
word_list = set([])
for line in ezsocket_dataset_tvt["train"]["Function and Parameters"]:
    line = line.split()
    word_list.update(set(line)) # ['hello', 'how', 'are', 'you',...]
filtered_list = [item.lower() for item in word_list if len(item) >= 6]

word2idx = {f"{i:02X}".lower(): i for i in range(256)}
word2idx.update({'[MASK]' : 256, '[CLS]' : 257, '[SEP]' : 258, 'num' : 259, 'pos' : 260, '[PAD]' : 261, '+' : 262, '-' : 263})
for i, w in enumerate(filtered_list):
    word2idx[w] = i + 264

json_str = json.dumps(word2idx)
with open('vocab.json', 'w') as json_file:
    json_file.write(json_str)
'''


'\nimport json\n\n#手动tokenizer,直接生成词汇表,gpt2的词汇表是json格式 \nword_list = set([])\nfor line in ezsocket_dataset_tvt["train"]["Function and Parameters"]:\n    line = line.split()\n    word_list.update(set(line)) # [\'hello\', \'how\', \'are\', \'you\',...]\nfiltered_list = [item.lower() for item in word_list if len(item) >= 6]\n\nword2idx = {f"{i:02X}".lower(): i for i in range(256)}\nword2idx.update({\'[MASK]\' : 256, \'[CLS]\' : 257, \'[SEP]\' : 258, \'num\' : 259, \'pos\' : 260, \'[PAD]\' : 261, \'+\' : 262, \'-\' : 263})\nfor i, w in enumerate(filtered_list):\n    word2idx[w] = i + 264\n\njson_str = json.dumps(word2idx)\nwith open(\'vocab.json\', \'w\') as json_file:\n    json_file.write(json_str)\n'

In [10]:
# tokenizer = GPT2TokenizerFast(vocab_file="./vocab.json", merges_file="./merges.txt")
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer=BertTokenizer(vocab_file='./vocab.txt')

In [11]:
# Print tokenizer vocabulary size
print(f"Tokenizer vocabulary size: {len(tokenizer)}")
tokenizer.special_tokens_map
tokenizer.encode('[UNK]')

Tokenizer vocabulary size: 358


[257, 357, 258]

In [12]:
# ezsocket_dataset_tvt["train"]["Function and Parameters"][0]
tokenizer("")
tokenizer.vocab

OrderedDict([('00', 0),
             ('01', 1),
             ('02', 2),
             ('03', 3),
             ('04', 4),
             ('05', 5),
             ('06', 6),
             ('07', 7),
             ('08', 8),
             ('09', 9),
             ('0a', 10),
             ('0b', 11),
             ('0c', 12),
             ('0d', 13),
             ('0e', 14),
             ('0f', 15),
             ('10', 16),
             ('11', 17),
             ('12', 18),
             ('13', 19),
             ('14', 20),
             ('15', 21),
             ('16', 22),
             ('17', 23),
             ('18', 24),
             ('19', 25),
             ('1a', 26),
             ('1b', 27),
             ('1c', 28),
             ('1d', 29),
             ('1e', 30),
             ('1f', 31),
             ('20', 32),
             ('21', 33),
             ('22', 34),
             ('23', 35),
             ('24', 36),
             ('25', 37),
             ('26', 38),
             ('27', 39),
          

In [13]:
def tokenize_function(examples):
    # Handle potential None values and convert to empty string if None
    input_texts = [str(text) if text is not None else "" for text in examples["Function and Parameters"]]
    output_texts = [str(text) if text is not None else "" for text in examples["Data Segment"]]
    # print(input_texts)
    # print(tokenizer.sep_token)
    # Splice the input and output strings
    spliced_texts = [input_text +  output_text 
                     for input_text, output_text in zip(input_texts, output_texts)]
    # print(spliced_texts)
    # return spliced_texts
    # Tokenize the spliced texts
    tokenized = tokenizer(spliced_texts, truncation=True, max_length=210)
    
    return tokenized

tokenized_datasets = ezsocket_dataset_tvt.map(tokenize_function, batched=True, num_proc=8)

Map (num_proc=8):   0%|          | 0/14623 [00:00<?, ? examples/s]

  table = cls._concat_blocks(blocks, axis=0)


Map (num_proc=8):   0%|          | 0/1625 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/4063 [00:00<?, ? examples/s]

In [14]:
tokenized_datasets['train'][0]

{'Function and Parameters': 'SetMGNAux num + 95 9c num ',
 'Data Segment': '47 49 4f 50 01 00 01 00 4c 00 00 00 00 00 00 00 e0 37 00 00 01 00 00 00 04 00 00 00 01 00 00 00 0d 00 00 00 6d 6f 63 68 61 53 65 74 44 61 74 61 00 3c 23 77 00 00 00 00 37 00 00 00 0c b0 01 00 00 00 00 00 00 00 00 00 00 00 00 00 03 00 00 00 04 00 00 00 9c 95 00 00',
 'input_ids': [257,
  345,
  259,
  262,
  149,
  156,
  259,
  71,
  73,
  79,
  80,
  1,
  0,
  1,
  0,
  76,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  224,
  55,
  0,
  0,
  1,
  0,
  0,
  0,
  4,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  13,
  0,
  0,
  0,
  109,
  111,
  99,
  104,
  97,
  83,
  101,
  116,
  68,
  97,
  116,
  97,
  0,
  60,
  35,
  119,
  0,
  0,
  0,
  0,
  55,
  0,
  0,
  0,
  12,
  176,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  3,
  0,
  0,
  0,
  4,
  0,
  0,
  0,
  156,
  149,
  0,
  0,
  258],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
 

In [15]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [22]:
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=1024,
    n_ctx=1024,
    n_embd=768,
    n_layer=12,
    n_head=12,
)
model = GPT2LMHeadModel(config)

In [23]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(357, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=357, bias=False)
)

In [18]:
training_args = TrainingArguments(
    output_dir="./gpt2_pretrained",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    eval_steps=400,
    save_steps=800,
    warmup_steps=500,
    save_total_limit=8,
    prediction_loss_only=True,
)

In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

trainer.train()

Step,Training Loss
500,0.2736
1000,0.1835
1500,0.1737
2000,0.1721
2500,0.17
3000,0.1687
3500,0.1675
4000,0.168
4500,0.167
5000,0.1616


TrainOutput(global_step=16005, training_loss=0.16406450695784455, metrics={'train_runtime': 12861.3419, 'train_samples_per_second': 159.27, 'train_steps_per_second': 1.244, 'total_flos': 1.1293563186432e+17, 'train_loss': 0.16406450695784455, 'epoch': 5.0})