In [1]:
from transformers import BertTokenizer, EncoderDecoderModel
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
from datasets import load_metric

# Load the tokenizers
src_tokenizer = BertTokenizer(vocab_file='./vocab.txt')
tgt_tokenizer = BertTokenizer(vocab_file='./vocab.txt')  # Or your target language tokenizer

# Load the fine-tuned model
model = T5ForConditionalGeneration.from_pretrained("./t5_pretrained/checkpoint-100000")

# Ensure the model is in evaluation mode and move it to GPU if available
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(358, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(358, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dropout(

In [2]:
import re

####以下函数用于讲10进制数token化#####
def format_decimal_as_hexadecimal(decimal_str):
    # Convert the decimal string to an integer
    decimal_number = int(decimal_str)
    
    # Convert the integer to a hexadecimal string
    hex_str = hex(decimal_number)[2:]  # Strip the '0x' prefix
    
    # Ensure the length of the hex string is even
    if len(hex_str) % 2 != 0:
        hex_str = '0' + hex_str
    
    # Split the hex string into pairs of characters
    hex_pairs = [hex_str[i:i+2] for i in range(0, len(hex_str), 2)]
    
    # Join the pairs with commas
    formatted_hex = ','.join(hex_pairs)
    
    return formatted_hex


def convert_number(num_str):
    num = float(num_str)
    if num.is_integer():
        num = int(num)
        sign = "-" if num < 0 else "+"
        num_str = format_decimal_as_hexadecimal(str(num).lstrip('-'))
        return f"num,{sign},{num_str},num"
    else:
        sign = "-" if num < 0 else "+"
        num_str = num_str.lstrip('-')
        integer_part, fractional_part = num_str.split('.')
        combined_num = format_decimal_as_hexadecimal(integer_part + fractional_part.rstrip('0'))
        pos_num = format_decimal_as_hexadecimal(len(fractional_part.rstrip('0')))
        return f"num,{sign},{combined_num},pos,{pos_num},num"

def process_segment(segment):
    parts = segment.split(',')
    for i, part in enumerate(parts):
        if re.match(r'^-?\d+(\.\d+)?$', part):  # Match integers and floating-point numbers
            parts[i] = convert_number(part)
    result = ','.join(parts)
    result = result.replace(",", " ")
    return result
####以上函数用于将10进制数token化：process_segment(segment)#####

####以下函数用于将payload按两位分开，用','隔开#####
def split_payload_into_pairs(text):
    # 将文本按每两个字符分割
    pairs = [text[i:i+2] for i in range(0, len(text), 2)]
    # 用逗号连接分割后的文本
    result = ' '.join(pairs)
    return result
####以上函数用于将payload按两位分开，用','隔开#####

In [3]:
def translate(src_text, input_max_length=50, output_max_length=220):
    # Tokenize the input text
    src_text = process_segment(src_text)
    inputs = src_tokenizer(src_text, return_tensors="pt", max_length=input_max_length, truncation=True, padding="max_length")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    # Generate translation
    outputs = model.generate(input_ids=input_ids, 
                             attention_mask=attention_mask,
                             max_length=output_max_length,
                             num_beams=4,
                             early_stopping=True)

    # Decode the generated tokens
    payload = tgt_tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return payload

In [4]:
Function_Parameters = "GetAuxAxisDiagnosis,2,2," 
# Print tokenizer vocabulary size
print(f"Tokenizer vocabulary size: {len(tgt_tokenizer)}")

# Tokenize input and print token IDs
tokens = src_tokenizer(Function_Parameters, return_tensors="pt")
print(f"Input token IDs: {tokens['input_ids']}")
print(f"Max token ID: {tokens['input_ids'].max().item()}")

# Check model's embedding layer size
embedding_size = model.get_input_embeddings().num_embeddings
print(f"Model embedding size: {embedding_size}")

# Compare max token ID with embedding size
if tokens['input_ids'].max().item() >= embedding_size:
    print("Warning: Input contains token IDs outside the model's vocabulary range")

Tokenizer vocabulary size: 358
Input token IDs: tensor([[257, 313, 357, 357, 357, 357, 357, 258]])
Max token ID: 357
Model embedding size: 358


In [5]:
from datasets import load_dataset

data_files = './data_ezsocket/merged_api_para_pcapdata_dataset20k.csv'
# ; is the tab character in Python
ezsocket_dataset = load_dataset("csv", data_files=data_files, delimiter=";")
ezsocket_dataset = ezsocket_dataset.map(lambda x: {"Function and Parameters": [o.split(',', 1)[1] for o in x["Function and Parameters"]]}, batched=True) #可加速处理，删除前面的时间戳

Map:   0%|          | 0/20311 [00:00<?, ? examples/s]

In [6]:
from datasets import load_metric

fun_para = ezsocket_dataset['train']['Function and Parameters']
references = ezsocket_dataset['train']['Data Segment']
predictions_comput = []
references_comput = []
for i in range(1000):
    prediction = translate(fun_para[i])
    reference = references[i]
    predictions_comput.append([prediction])
    references_comput.append([re.sub(r"(?<=\w)(?=(?:\w\w)+$)", " ", reference)])

print(predictions_comput)
print(references_comput)
meteor = load_metric('sacrebleu')
results = meteor.compute(predictions=predictions_comput, references=references_comput)
print(results)

[['47 49 4f 50 01 00 01 00 44 00 00 00 00 00 00 00 e0 37 00 00 01 00 17 00 04 00 00 00 01 00 00 00 0d 00 00 00 6d 6f 63 68 61 47 65 74 44 61 74 61 00 3c 23 77 00 00 00 00 43 00 00 00 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 10 00 00 00'], ['47 49 4f 50 01 00 01 00 44 00 00 00 00 00 00 00 e0 37 00 00 01 00 17 00 04 00 00 00 01 00 00 00 0d 00 00 00 6d 6f 63 68 61 47 65 74 44 61 74 61 00 3c 23 77 00 00 00 00 43 00 00 00 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 10 00 00 00'], ['47 49 4f 50 01 00 01 00 44 00 00 00 00 00 00 00 e0 37 00 00 01 00 17 00 04 00 00 00 01 00 00 00 0d 00 00 00 6d 6f 63 68 61 47 65 74 44 61 74 61 00 3c 23 77 00 00 00 00 43 00 00 00 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 10 00 00 00'], ['47 49 4f 50 01 00 01 00 44 00 00 00 00 00 00 00 e0 37 00 00 01 f9 5d 01 04 00 00 00 01 00 00 00 0d 00 00 00 6d 6f 63 68 61 47 65 74 44 61 74 61 00 00 00 00 00 00 00 00 01 00 00 00 02 00 00 00 01 00 00 00 00 00 00 00 00 00 00 00 01 00 00 00'], ['47 49 4f 50 01 00

  meteor = load_metric('sacrebleu')


{'score': 89.09197565865688, 'counts': [76638, 73846, 71191, 68522], 'totals': [82869, 81869, 80869, 79869], 'precisions': [92.4809035948304, 90.20019787709634, 88.03249700132312, 85.79298601459891], 'bp': 1.0, 'sys_len': 82869, 'ref_len': 80658}


In [7]:
fun_para = ezsocket_dataset['train']['Function and Parameters']
references = ezsocket_dataset['train']['Data Segment']
predictions_comput = []
references_comput = []
for i in range(1000):
    prediction = translate(fun_para[i])
    reference = references[i]
    predictions_comput.append(prediction)
    references_comput.append(re.sub(r"(?<=\w)(?=(?:\w\w)+$)", " ", reference))

meteor = load_metric('meteor')
results = meteor.compute(predictions=predictions_comput, references=references_comput)
print(results)

[nltk_data] Downloading package wordnet to /home/lizedong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/lizedong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/lizedong/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


{'meteor': 0.9684239574937917}


In [8]:
fun_para = ezsocket_dataset['train']['Function and Parameters']
references = ezsocket_dataset['train']['Data Segment']
predictions_comput = []
references_comput = []
for i in range(1000):
    prediction = translate(fun_para[i])
    reference = references[i]
    predictions_comput.append(prediction)
    references_comput.append(re.sub(r"(?<=\w)(?=(?:\w\w)+$)", " ", reference))

meteor = load_metric('rouge')
results = meteor.compute(predictions=predictions_comput, references=references_comput)
print(results)

{'rouge1': AggregateScore(low=Score(precision=0.9685605237062124, recall=0.9712724021199358, fmeasure=0.9699029514523668), mid=Score(precision=0.9711933672821116, recall=0.9733195982938596, fmeasure=0.9721131293683134), high=Score(precision=0.9736466015401188, recall=0.9753560651419619, fmeasure=0.9743689104154267)), 'rouge2': AggregateScore(low=Score(precision=0.9449417098963252, recall=0.947296278558594, fmeasure=0.9459695810337028), mid=Score(precision=0.9486711356836915, recall=0.9506690744956237, fmeasure=0.9495480543006618), high=Score(precision=0.9525316977848582, recall=0.9541370923944531, fmeasure=0.9532184893567062)), 'rougeL': AggregateScore(low=Score(precision=0.9654007743796128, recall=0.9679069708906253, fmeasure=0.9665370265189495), mid=Score(precision=0.9683114384457023, recall=0.9703869970715531, fmeasure=0.9692201512407441), high=Score(precision=0.9709616452626488, recall=0.9727045838107942, fmeasure=0.9715590177731117)), 'rougeLsum': AggregateScore(low=Score(precisio