# Data pre-processing

Batch-process the payload CSV files: generate labelled embeddings for each file in a folder.

In [1]:
import sys
# Add the directory containing embeddings_utils.py to the Python path
sys.path.append('./utils')
import os
import glob

import pandas as pd
import tiktoken
from utils.embeddings_utils import get_embedding
import re

In [2]:
def tokenizer_payload(hex_string, token_length = 4):
    """Break a payload string into space-separated chunks.

    The string is cut, left to right, into pieces of at most
    ``token_length`` characters; the final piece may be shorter. The pieces
    are joined with a single space.

    Because the split relies on the regex ``.``, newline characters never
    become part of a chunk: they are dropped and act as chunk boundaries.

    Args:
        hex_string: Text to split (the callers pass the ``tcp.payload``
            column values).
        token_length: Maximum number of characters per chunk. The original
            author's note says this was 6 before being lowered to 4 to stay
            within the embedding model's token limit.

    Returns:
        The chunks joined by spaces; an empty string for empty input.
    """
    chunk_pattern = re.compile(f'.{{1,{token_length}}}')
    chunks = chunk_pattern.findall(hex_string)
    return ' '.join(chunks)

In [3]:
def get_embeddings_from_payload(
    csv_file,
    benign = True,
    embedding_model = "text-embedding-ada-002",
    embedding_encoding = "cl100k_base",
    max_tokens = 8000,
    top_n = 500,
    output_dir = "./embeddings/",
):
    """Embed the TCP payloads of one capture CSV and save them with a label.

    Steps:
      1. Read ``csv_file`` and drop rows that have no ``tcp.payload`` value.
      2. Split every payload into space-separated chunks with
         ``tokenizer_payload``.
      3. Count tokens with the ``embedding_encoding`` tiktoken encoding and
         drop rows that exceed ``max_tokens``.
      4. Keep the LAST ``top_n`` remaining rows.
      5. Request one embedding per kept row through ``get_embedding``.
      6. Write the columns ``X`` (embedding) and ``y`` (label) to
         ``<output_dir>/<basename of csv_file>``.

    Args:
        csv_file: Path of the input CSV; it must contain a ``tcp.payload``
            column.
        benign: True labels every row ``y = 0``; False labels every row
            ``y = 1``.
        embedding_model: Model name forwarded to ``get_embedding``.
        embedding_encoding: Name of the tiktoken encoding used for counting
            tokens.
        max_tokens: Upper bound on tokens per payload. The default 8000 stays
            under the 8191 limit noted for text-embedding-ada-002.
        top_n: Number of rows to keep, taken from the end of the filtered
            frame.
        output_dir: Directory that receives the output CSV. It is created
            when it does not exist.

    Returns:
        Always 0 (kept so existing callers are unaffected).
    """
    filename = os.path.basename(csv_file)

    # Create the destination up front so a missing folder fails (or is fixed)
    # before any embedding request is made, not after all of them.
    os.makedirs(output_dir, exist_ok=True)

    # Force the payload column to text. Without this, a payload made only of
    # the digits 0-9 is parsed as a number: leading zeros are lost and the
    # regex-based chunking raises TypeError.
    df = pd.read_csv(csv_file, dtype={'tcp.payload': str})
    df.dropna(subset = ['tcp.payload'], inplace=True)

    df['tokenizer_content'] = df['tcp.payload'].apply(tokenizer_payload)
    encoding = tiktoken.get_encoding(embedding_encoding)

    df["n_tokens"] = df.tokenizer_content.apply(lambda x: len(encoding.encode(x)))
    # Drop over-long payloads, then keep only the last `top_n` rows.
    df = df[df.n_tokens <= max_tokens].tail(top_n)

    df_embedding = pd.DataFrame()
    df_embedding["X"] = df.tokenizer_content.apply(lambda x: get_embedding(x, model=embedding_model))
    df_embedding["y"] = 0 if benign else 1

    # NOTE(review): when this file is read back later in the notebook, the X
    # column shows dtype object, so consumers presumably have to parse each
    # cell back into a list of floats - confirm in the training code.
    df_embedding.to_csv(os.path.join(output_dir, filename), index=False)
    return 0

In [4]:
# Directory that is scanned for capture CSVs.
folder_path = './test_payload/'  # Replace with your folder path
# Files that are reported as already done and skipped by the loop below.
done_list = [
    "4Lab_cam_fw16_2.csv",
    "8webcam932_novideo_load_ip12.csv",
    "12hack3_2.csv",
]

# Visit every CSV in the folder; each file is announced before the skip check.
for csv_file in glob.glob(os.path.join(folder_path, '*.csv')):
    filename = os.path.basename(csv_file)
    print(f"Processing file: {csv_file}")
    if filename not in done_list:
        # A file name containing any of these substrings is labelled
        # non-benign (y = 1); every other file is labelled benign (y = 0).
        looks_malicious = any(tag in filename for tag in ('attack', 'hack', 'load'))
        get_embeddings_from_payload(csv_file, benign=not looks_malicious)
    else:
        print(f"Already done: {csv_file}")

Processing file: ./test_payload/12hack3_2.csv
Already done: ./test_payload/12hack3_2.csv
Processing file: ./test_payload/8webcam932_novideo_load_ip12.csv
Already done: ./test_payload/8webcam932_novideo_load_ip12.csv
Processing file: ./test_payload/4Lab_cam_fw16_2.csv
Already done: ./test_payload/4Lab_cam_fw16_2.csv
Processing file: ./test_payload/6Lab_cam_fw17_3.csv
Processing file: ./test_payload/5Lab_cam_fw16_3.csv
Processing file: ./test_payload/10webcam5020_novideo_hack_ip134.csv
Processing file: ./test_payload/9webcam5020_novideo_load_ip134.csv
Processing file: ./test_payload/3txt_sample10_trans_enc_utf8.csv
Processing file: ./test_payload/7attacked_932_loading.csv
Processing file: ./test_payload/1txt_sample5clean_enc_ip159.csv
Processing file: ./test_payload/2txt_sample9_trans_enc_utf8.csv
Processing file: ./test_payload/11webcam932_novideo_hack_ip12.csv


In [2]:
# Spot-check one generated file: reload it and display the "X" column of
# its first two rows.
df_temp = pd.read_csv("./embeddings/4Lab_cam_fw16_2.csv")
df_temp["X"].head(2)

0    [-0.0051308306865394115, -0.021122252568602562...
1    [-0.016887426376342773, -0.008056876249611378,...
Name: X, dtype: object

In [7]:
# Same spot-check against the copy under ./embeddings_2/: display the "X"
# column of its first two rows for comparison.
df_temp_2 = pd.read_csv("./embeddings_2/4Lab_cam_fw16_2.csv")
df_temp_2["X"].head(2)

0    [-0.0051308306865394115, -0.021122252568602562...
1    [-0.016887426376342773, -0.008056876249611378,...
Name: X, dtype: object