In [3]:
import json
import random
import re
from collections import Counter

import pandas as pd
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
from transformers import RobertaTokenizer

# Read the labelled dataset and discard the 'Unnamed: 0' column in one chained
# expression (presumably a stale index column from an earlier export — verify).
data = pd.read_csv("labeled_dataset.csv").drop(columns='Unnamed: 0')
data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'labeled_dataset.csv'

In [2]:
#### Step 1: Prep Work

# Function to extract text from the JSON content in the DataFrame
# TODO: goal is having a list of all domain_corpus in the jsons, duplicates are needed. removing special chars tbd
def extract_text_from_df(df):
    """Return the values of the ``log_entry`` column as a plain Python list.

    Row order is kept and duplicate entries are preserved.

    Args:
        df: DataFrame that contains a ``log_entry`` column.

    Returns:
        list: one element per row of ``df['log_entry']``.

    Raises:
        KeyError: if ``df`` has no ``log_entry`` column.
    """
    return df['log_entry'].tolist()

# Sequences stripped from the text before splitting: runs of two or more '=',
# pipe characters and single quotes. To strip more, add another alternative,
# e.g. append `|/` to also remove forward slashes.
# The pattern is a plain alternation on purpose: wrapped in [...] it would be a
# character class that additionally deletes every '(', ')', '+' and single '='.
_SPECIAL_CHARS_RE = re.compile(r"==+|\||'")


def tokenize_text(corpus, remove_special_chars = True):
    """Split every text in ``corpus`` on whitespace into one flat word list.

    Args:
        corpus: iterable of strings.
        remove_special_chars: when True (default), runs of two or more ``=``,
            pipe characters and single quotes are deleted from each text
            before it is split.

    Returns:
        list[str]: the words of all texts in input order, duplicates kept.
    """
    domain_corpus = []
    for text in corpus:
        if remove_special_chars:
            text = _SPECIAL_CHARS_RE.sub('', text)
        # str.split() without arguments splits on any whitespace run and
        # never yields empty strings.
        domain_corpus.extend(text.split())
    return domain_corpus

# Flatten the 'log_entry' column into a list of raw entries, then split those
# into the flat word list that is passed to the BPE trainer in Step 2.
corpus = extract_text_from_df(data)
domain_corpus = tokenize_text(corpus)



#### Step 2: Extract Tokens using BPE

# Reference vocabulary: the token -> id mapping of the pretrained RoBERTa tokenizer.
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta_vocab = roberta_tokenizer.get_vocab()

# Fresh byte-level BPE tokenizer that gets trained on the log corpus.
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

# The vocabulary size is set to match RoBERTa's (50265) and the same five
# special tokens are reserved.
trainer = trainers.BpeTrainer(
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    min_frequency=4,
    vocab_size=50265,
)
tokenizer.train_from_iterator(domain_corpus, trainer)

# Token -> id mapping learned from the log corpus.
log_vocab = tokenizer.get_vocab()

In [3]:
#### Step 3: Merge Token List

# Target size of the merged vocabulary.
number_total_tokens = 50265
# Fixed seed so the randomly chosen RoBERTa filler tokens are reproducible.
FILLER_SEED = 42

# vars:
roBERTa_tokens = roberta_vocab.keys()
log_tokens = log_vocab.keys()

# dict key views support set operations directly. The results are sorted
# because set iteration order of strings differs between interpreter runs,
# which would make the sampling below irreproducible even with a seed.
shared_tokens = sorted(roBERTa_tokens & log_tokens)
unique_roBERTa_tokens = sorted(roBERTa_tokens - log_tokens)
unique_log_tokens = sorted(log_tokens - roBERTa_tokens)

# Fill the vocabulary up to the target size with randomly chosen RoBERTa-only
# tokens. All fillers are drawn in one call (sampling without replacement);
# random.sample raises ValueError if more fillers are needed than RoBERTa-only
# tokens exist.
number_missing_tokens = max(0, number_total_tokens - len(shared_tokens) - len(unique_log_tokens))
used_unique_roBERTa_tokens = random.Random(FILLER_SEED).sample(unique_roBERTa_tokens, number_missing_tokens)

# Keep only the RoBERTa-only tokens that were NOT selected as fillers.
_used_lookup = set(used_unique_roBERTa_tokens)
unique_roBERTa_tokens = [token for token in unique_roBERTa_tokens if token not in _used_lookup]

all_custom_tokens = shared_tokens + unique_log_tokens + used_unique_roBERTa_tokens


# Summary
print(f" Number of tokens in merged tokens (should be 50265): {len(all_custom_tokens)}")
print("")
print(f" Number of shared tokens: {len(shared_tokens)} ({(len(shared_tokens)/ number_total_tokens *100)}%)")
print(f" Number of used unique log tokens: {len(unique_log_tokens)}({(len(unique_log_tokens) / number_total_tokens *100)}%)")
print(f" Number of used unique roBERTa tokens: {len(used_unique_roBERTa_tokens)}({(len(used_unique_roBERTa_tokens) / number_total_tokens *100)}%)")

In [None]:
#### Step 4: Assign Indices
# Build the token -> id mapping of the merged vocabulary.
# NOTE(review): the tokenizer-building cell reads "custom_vocab.json", but no
# cell shown in this notebook writes it — persist custom_vocab (e.g. with
# json.dump) before running that cell.
custom_vocab = {}

# Tokens that also exist in RoBERTa keep their original ids.
for token in shared_tokens:
    custom_vocab[token] = roberta_vocab[token]

for token in used_unique_roBERTa_tokens:
    custom_vocab[token] = roberta_vocab[token]

# Log-only tokens take over the ids of the RoBERTa tokens that were dropped.
# This keeps every id inside RoBERTa's original id range and leaves no gaps;
# starting at max(id) + 1 instead would produce ids beyond that range.
free_indices = iter(sorted(set(roberta_vocab.values()) - set(custom_vocab.values())))

# Fallback counter, only used if there are more new tokens than free ids.
current_index = max(roberta_vocab.values()) + 1
for token in unique_log_tokens:
    index = next(free_indices, None)
    if index is None:
        index = current_index
        current_index += 1
    custom_vocab[token] = index

In [None]:
import json
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

# Load the custom vocabulary (token -> id). The encoding is given explicitly
# so the result does not depend on the platform default.
# NOTE(review): no cell shown in this notebook writes "custom_vocab.json";
# make sure the vocabulary built in Step 4 is saved before this cell runs.
with open("custom_vocab.json", "r", encoding="utf-8") as f:
    custom_vocab = json.load(f)

# Load the RoBERTa merges file: one "left right" pair per line.
try:
    with open("roberta_base_merges.txt", "r", encoding="utf-8") as f:
        merges = []
        for line in f:
            parts = line.split()
            # Skip blank lines, the "#version" header line and anything that
            # is not exactly a pair of tokens.
            if len(parts) != 2 or line.startswith("#version"):
                continue
            merges.append((parts[0], parts[1]))
except IOError as e:
    print(f"Error loading RoBERTa merges file: {e}")
    # Re-raise: continuing without `merges` would only fail later with a
    # less helpful NameError.
    raise

# Keep only merges whose two parts AND whose merged result are present in the
# custom vocabulary; the BPE model rejects a merge whose result has no id.
# NOTE(review): only RoBERTa's merges are used here, so log-specific tokens
# that no remaining merge produces can presumably never be emitted — confirm
# whether the merges learned in Step 2 should be added as well.
filtered_merges = [
    (left, right)
    for left, right in merges
    if left in custom_vocab and right in custom_vocab and (left + right) in custom_vocab
]

# Create a new tokenizer with the custom vocabulary and filtered merges
try:
    tokenizer = Tokenizer(models.BPE(vocab=custom_vocab, merges=filtered_merges))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
except Exception as e:
    print(f"Error creating custom tokenizer: {e}")
    # Re-raise so the save below cannot silently write a stale `tokenizer`
    # object left over from an earlier cell.
    raise

# Save the tokenizer
try:
    tokenizer.save("custom_tokenizer.json")
except IOError as e:
    print(f"Error saving custom tokenizer: {e}")
    raise

In [None]:
# Load and use the custom tokenizer
# Reads "custom_tokenizer.json", the file name the tokenizer-building cell saves to.
custom_tokenizer = Tokenizer.from_file("custom_tokenizer.json")

# Example usage: encode one sentence and print the resulting token strings.
text_to_encode = "well this is a success"
encoded = custom_tokenizer.encode(text_to_encode)
print(encoded.tokens)

In [None]:
# Load a pretrained checkpoint with a sequence-classification head configured
# for multi-label classification; the head size is taken from len(labels).
# NOTE(review): AutoModelForSequenceClassification, labels, id2label and
# label2id are not defined or imported in any cell shown here — they must be
# provided elsewhere for this cell to run.
# NOTE(review): this loads "bert-base-uncased" while the tokenizer cells work
# with RoBERTa's vocabulary — confirm the checkpoint is the intended one.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

# Custom tokenizer (from https://arxiv.org/abs/2204.02685)

<font size="4">
For building the tokenizer, we employ a byte pair encoding (BPE) method to build a vocabulary of words and subwords from the cybersecurity corpora, as it is proven to perform better than a word-based tokenizer. The character-based encoding used in BPE allows for the learning of a small subword vocabulary that can encode any input text without introducing any "unknown" tokens. Our objective is to create a vocabulary that retains the tokens already provided in RoBERTa’s tokenizer while also incorporating additional unique cybersecurity-related tokens. In this context, we extract 50,265 tokens from the cybersecurity corpora to generate the initial token vocabulary $\Psi_{Sec}$. We intentionally make the size of $\Psi_{Sec}$ the same as that of RoBERTa’s token vocabulary $\Psi_{RoBERTa}$, as we intend to imitate the original RoBERTa’s design.
If $\Psi_{Sec}$ represents the vocabulary set of SecureBERT and $\Psi_{RoBERTa}$ denotes the vocabulary set of the original RoBERTa, both of size 50,265, then $\Psi_{Sec}$ shares 32,592 mutual tokens with $\Psi_{RoBERTa}$, leaving 17,673 tokens that contribute uniquely to the cybersecurity corpus, such as *firewall, breach, crack, ransomware, malware, phishing, mysql, kaspersky, obfuscated, and vulnerability*, which RoBERTa’s tokenizer analyzes using byte pairs: 
<br>
<br>
$V_{mutual} = \Psi_{Sec} \cap \Psi_{RoBERTa}$ → 32,592 tokens <br>
$V_{distinct} = \Psi_{Sec} - \Psi_{RoBERTa}$ → 17,673 tokens<br>
<br>
Studies show that utilizing complete words (not subwords) for terms that are common in a specific domain can enhance performance during training, since alignments may be more challenging to understand during model training, as target tokens often require attention from multiple source tokens. Hence, to build our tokenizer, we choose all mutual terms and assign them their original indices, while the remaining new tokens are assigned random indices with no conflict, where the original indices refer to the indices in RoBERTa’s tokenizer. Ultimately, we develop a customized tokenizer with a vocabulary size similar to that of the original model, which includes tokens commonly seen in cybersecurity corpora in addition to cross-domain tokens. Our tokenizer encodes mutual tokens ($V_{mutual}$) as the original model does, ensuring that the model returns the appropriate pre-trained weights, while for new terms ($V_{distinct}$) the indices, and accordingly the weights, would be random.

</font>

In [None]:
# Overview over unique chars / amount of chars in words; pre-regex

def word_count(words):
    """Count how often each distinct item occurs in ``words``.

    Args:
        words: iterable of hashable items, e.g. a list of word strings.

    Returns:
        dict: maps every distinct item to its number of occurrences.
    """
    # Counter tallies everything in a single pass over the input, instead of
    # rescanning the whole list once per distinct word.
    return dict(Counter(words))

# Call the 'word_count' function with the 'words' list and print the word counts.
#print(word_count(words)) 
# NOTE(review): `words` is not defined in any cell shown here — presumably it
# is the flat word list (domain_corpus) built in Step 1; confirm.
char_counts = Counter("".join(words))
uniques = set(char_counts)
print(uniques)

for char in uniques:
    # Number of times the character occurs across all words. Calling
    # words.count(char) on the list would only count words that consist of
    # exactly this one character.
    amount = char_counts[char]
    # print(f"Amount of { char} in words: {amount}")