In [1]:
import torch
import os

# Expose only the GPU with index 1 to this process (set before any CUDA call is made).
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Prefer CUDA when a device is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f'Using device: {device}')

Using device: cuda


In [2]:
# --- Run configuration -------------------------------------------------------
dataset_name = "ics_attack"
# Dimensionality of the embeddings (the sample embedding produced later in this
# notebook has 1600 entries).
EMBEDDING_SIZE = 1600

# Input data and fine-tuned checkpoints are organised per dataset.
data_dir = "../datasets/"+dataset_name+"/"
model_output_dir = "../model_outputs/"+dataset_name+"/llm_finetuned_models/"

# Fine-tuned model to embed with; names starting with "pt_" select a pretrained model instead.
model_name = "gpt2-xl"
model_path = model_output_dir+model_name+"/epoch_10/"
embeddings_path = "../model_outputs/embeddings/"+model_name+"/epoch_10/"
# model_name = "pt_gpt2-xl"
# model_path = model_output_dir+model_name+"/"

# Create the directories up front. exist_ok=True avoids the check-then-create race
# of a separate os.path.exists() test.
# NOTE(review): creating model_path when it is missing means a missing checkpoint
# shows up later as a load error on an empty directory - confirm this is intended.
for _required_dir in (model_path, embeddings_path):
    os.makedirs(_required_dir, exist_ok=True)

In [3]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

class GPT2PromptTuningMixin:
    """Mixin that prepends a learned "soft prompt" to the input token embeddings.

    It must come before the language-model class in the MRO (as in
    ``GPT2PromptTuningLM``). The host class has to provide
    ``self.transformer.wte`` (token embedding), ``self.config.n_embd``,
    ``self.device`` and a ``forward`` that accepts ``inputs_embeds``,
    ``attention_mask`` and ``labels``.
    """

    def set_soft_prompt_embeds(self, soft_prompt_path):
        """Load a saved soft prompt from ``soft_prompt_path`` and attach it to the model.

        The loaded object must expose ``num_embeddings`` and ``weight``
        (``initialize_soft_prompt`` creates a ``torch.nn.Embedding``).
        """
        # NOTE(review): torch.load unpickles arbitrary Python objects - only load
        # soft-prompt files from a trusted source.
        self.soft_prompt = torch.load(soft_prompt_path)
        self.n_tokens = self.soft_prompt.num_embeddings
        print(f"Set soft prompt! (n_tokens: {self.n_tokens})")

    def initialize_soft_prompt(self, n_tokens=20, initialize_from_vocab=True, random_range=0.5):
        """Create a fresh soft prompt of ``n_tokens`` embedding vectors.

        Args:
            n_tokens: number of soft-prompt positions.
            initialize_from_vocab: if True, copy the first ``n_tokens`` rows of the
                token-embedding matrix; otherwise sample uniformly.
            random_range: half-width of the uniform range ``[-random_range, random_range]``
                used when ``initialize_from_vocab`` is False.
        """
        self.n_tokens = n_tokens
        if initialize_from_vocab:
            init_prompt_value = self.transformer.wte.weight[:n_tokens].clone().detach()
        else:
            init_prompt_value = torch.empty(
                n_tokens, self.config.n_embd, dtype=torch.float32
            ).uniform_(-random_range, random_range)
        # Use the fully qualified torch.nn: a bare `nn` is not imported in this notebook.
        self.soft_prompt = torch.nn.Embedding(n_tokens, self.config.n_embd)
        self.soft_prompt.weight = torch.nn.Parameter(init_prompt_value)

    def _cat_learned_embedding_to_input(self, input_ids):
        """Embed ``input_ids`` and prepend the soft prompt to every sequence in the batch."""
        inputs_embeds = self.transformer.wte(input_ids)
        batch_size = inputs_embeds.size(0)
        # (n_tokens, n_embd) -> (batch, n_tokens, n_embd)
        learned_embeds = self.soft_prompt.weight.unsqueeze(0).expand(batch_size, -1, -1)
        return torch.cat([learned_embeds, inputs_embeds], dim=1)

    def _extend_labels(self, labels, ignore_index=-100):
        """Left-pad ``labels`` with ``ignore_index`` for the soft-prompt positions."""
        n_batches = labels.shape[0]
        # Build the prefix on the labels' own device so the concat cannot hit a device mismatch.
        prefix = torch.full((n_batches, self.n_tokens), ignore_index, device=labels.device)
        return torch.cat([prefix, labels], dim=1)

    def _extend_attention_mask(self, attention_mask):
        """Left-pad ``attention_mask`` with ones so the soft-prompt positions are attended to."""
        n_batches = attention_mask.shape[0]
        prefix = torch.full((n_batches, self.n_tokens), 1, device=attention_mask.device)
        return torch.cat([prefix, attention_mask], dim=1)

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """Run the wrapped model on ``input_ids`` with the soft prompt prepended.

        ``labels`` and ``attention_mask`` (when given) are extended by ``n_tokens``
        positions to stay aligned with the lengthened input.

        Raises:
            ValueError: if ``input_ids`` is None. The soft prompt is concatenated to the
                embedded ``input_ids``, so there is nothing to build the input from.
        """
        if input_ids is None:
            raise ValueError(
                "GPT2PromptTuningMixin.forward requires input_ids to prepend the soft prompt"
            )
        inputs_embeds = self._cat_learned_embedding_to_input(input_ids).to(self.device)
        if labels is not None:
            labels = self._extend_labels(labels).to(self.device)
        if attention_mask is not None:
            attention_mask = self._extend_attention_mask(attention_mask).to(self.device)

        return super().forward(inputs_embeds=inputs_embeds, attention_mask=attention_mask, labels=labels, **kwargs)
class GPT2PromptTuningLM(GPT2PromptTuningMixin, GPT2LMHeadModel):
    """GPT-2 LM-head model combined with the soft-prompt mixin.

    The mixin is listed first, so its ``forward`` runs before ``GPT2LMHeadModel.forward``.
    """

    def __init__(self, config):
        # No extra state is set up here; construction is delegated up the MRO.
        super().__init__(config)

In [4]:
def load_model_and_tokenizer(model_path, soft_prompt_path, device='cuda'):
    """Build a soft-prompt-tuned GPT-2 XL model and its tokenizer.

    Args:
        model_path: kept for interface compatibility; it is not used here. The
            weights always come from the "gpt2-xl" checkpoint and only the soft
            prompt is read from disk.
        soft_prompt_path: file holding the saved soft prompt.
        device: device the model is moved to before it is returned.

    Returns:
        ``(soft_model, tokenizer)``.
    """
    print("loading soft prompt")

    # The original also loaded a plain GPT2LMHeadModel into an unused variable,
    # which doubled load time and memory for gpt2-xl. That load is dropped.
    soft_model = GPT2PromptTuningLM.from_pretrained("gpt2-xl", output_hidden_states=True, return_dict=True)
    soft_model.set_soft_prompt_embeds(soft_prompt_path)

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2-xl")

    # Move the model only after the soft prompt has been attached to it.
    soft_model.to(device)

    return soft_model, tokenizer

In [5]:
# from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import RobertaTokenizer, RobertaForMaskedLM, AutoTokenizer, AutoModelForMaskedLM, GPT2Tokenizer, GPT2LMHeadModel
from peft import PeftModel, PeftConfig
from GPTPromptTuningModel import GPT2PromptTuningLM

def _ensure_pad_token(tokenizer):
    """Use the EOS token as the padding token when the tokenizer defines none."""
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id
    return tokenizer


def getModel(isPretrained, model_path):
    """Load the model/tokenizer pair selected by the module-level ``model_name``.

    Reads the notebook globals ``model_name`` and ``device``. Some fine-tuned
    variants also read ``MODEL_PATHS[model_name]``.
    NOTE(review): ``MODEL_PATHS`` is not defined in the visible part of this
    notebook - confirm it exists before selecting "lora", "SecureBert_E5" or
    "SecBert_E5".

    Args:
        isPretrained: True selects an off-the-shelf checkpoint from the hub,
            False a fine-tuned checkpoint.
        model_path: directory of the fine-tuned checkpoint. Only the "gpt2-xl"
            variant uses it as given; the other variants override it.

    Returns:
        ``(model, tokenizer)``.

    Raises:
        ValueError: if ``model_name`` is not a supported name for the requested
            mode. Previously this returned ``None`` or raised ``UnboundLocalError``.
    """
    if not isPretrained:
        if model_name == "soft":
            # The soft-prompt model uses fixed locations under ./trained_models.
            model_path = "./trained_models/SOFT_GPT2-XL_E2"
            soft_prompt_path = "./trained_models/SOFT_GPT2-XL_E2/soft_prompt.model"
            model, tokenizer = load_model_and_tokenizer(model_path, soft_prompt_path)
            return model, tokenizer

        if model_name == "lora":
            print("Load finetuned model:")
            model_path = MODEL_PATHS[model_name]
            base_model = GPT2LMHeadModel.from_pretrained("gpt2-xl")
            base_model.to(device)
            # Attach the LoRA adapter weights on top of the base model (inference only).
            lora_model = PeftModel.from_pretrained(base_model, model_path, is_trainable=False)
            lora_model.to(device)
            tokenizer = GPT2Tokenizer.from_pretrained(model_path)
            return lora_model, tokenizer

        if model_name == "gpt2-xl":
            print("Load finetuned model:")
            model = GPT2LMHeadModel.from_pretrained(model_path)
            model.to(device)
            tokenizer = GPT2Tokenizer.from_pretrained(model_path)
            return model, _ensure_pad_token(tokenizer)

        if model_name == "SecureBert_E5":
            print("Load finetuned model:")
            model_path = MODEL_PATHS[model_name]
            model = RobertaForMaskedLM.from_pretrained(model_path)
            model.to(device)
            tokenizer = RobertaTokenizer.from_pretrained(model_path)
            return model, _ensure_pad_token(tokenizer)

        if model_name == "SecBert_E5":
            print("Load finetuned model:")
            model_path = MODEL_PATHS[model_name]
            model = AutoModelForMaskedLM.from_pretrained(model_path)
            model.to(device)
            tokenizer = AutoTokenizer.from_pretrained(model_path)
            return model, _ensure_pad_token(tokenizer)

        raise ValueError(
            f"Unknown fine-tuned model_name {model_name!r}; expected one of "
            "'soft', 'lora', 'gpt2-xl', 'SecureBert_E5', 'SecBert_E5'"
        )

    if model_name == "pretrained_SecureBert":
        print("Load pretrained_SecureBert model:")
        tokenizer = RobertaTokenizer.from_pretrained("ehsanaghaei/SecureBERT")
        model = RobertaForMaskedLM.from_pretrained("ehsanaghaei/SecureBERT")
        model.to(device)
    elif model_name == "pretrained_SecBert":
        print("Load pretrained_SecBert model:")
        tokenizer = AutoTokenizer.from_pretrained("jackaduma/SecRoBERTa")
        model = AutoModelForMaskedLM.from_pretrained("jackaduma/SecRoBERTa")
        model.to(device)
    elif model_name == "pt_gpt2-xl":
        print("Load pretrained gpt2 model:")
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2-xl")
        model = GPT2LMHeadModel.from_pretrained("gpt2-xl")
        model.to(device)
    else:
        raise ValueError(
            f"Unknown pretrained model_name {model_name!r}; expected one of "
            "'pretrained_SecureBert', 'pretrained_SecBert', 'pt_gpt2-xl'"
        )
    return model, _ensure_pad_token(tokenizer)


In [6]:
# A model name whose first "_"-separated field is "pt" selects a pretrained checkpoint.
name_prefix = model_name.partition("_")[0]
isPretrained = name_prefix == "pt"
model, tokenizer = getModel(isPretrained, model_path)

Load finetuned model:


In [7]:
# # Encode some input text
# # Example1:  Monitor application logs for changes to settings and other events associated with network protocols that may be used to block communications.

# # Example2: Adversaries may cause a sustained or permanent loss of view where the ICS equipment will require local, 
# # hands-on operator intervention; for instance, a restart or manual operation. 
# # By causing a sustained reporting or visibility loss, the adversary can effectively hide the present state of operations. 
# # This loss of view can occur without affecting the physical processes themselves.
# # (Citation: Corero) (Citation: Michael J. Assante and Robert M. Lee) (Citation: Tyson Macaulay)

# # Example3: A vulnerability classified as critical has been found in Tenda TX9 22.03.02.10. 
# # This affects the function sub_42CB94 of the file /goform/SetVirtualServerCfg. 
# # The manipulation of the argument list leads to stack-based buffer overflow. 
# # It is possible to initiate the attack remotely. The exploit has been disclosed to the public and may be used. 
# # The associated identifier of this vulnerability is VDB-261855. 
# # NOTE: The vendor was contacted early about this disclosure but did not respond in any way.

# # Example4:A Path Traversal vulnerability in web component of Ivanti Avalanche before 6.4.3 allows 
# # a remote authenticated attacker to execute arbitrary commands as SYSTEM.
# # Overview. A path traversal attack (also known as directory traversal) aims to access files 
# # and directories that are stored outside the web root folder.

# # Example5: RCE vulnerabilities allow an attacker to execute arbitrary code on a remote device. 
# # An attacker can achieve RCE in a few different ways, including:
# # Injection Attacks, Deserialization Attacks , Out-of-Bounds Write

# input_ids = tokenizer.encode("A remote code execution vulnerability allow an attacker to", return_tensors='pt').to(device)

# # Generate text using the model
# output = model.generate(input_ids, max_length=50, num_return_sequences=1)

# # Decode the generated text
# generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
# print(generated_text)


In [8]:
## Create embeddings. The methods will handle long texts. That is, 
## it will do chunking of tokens and then consolidate when input_text is long.


def retrieveEmbeddings(input_text, tokenizer, model, device, max_length=None):
    """Return one mean-pooled last-layer embedding for ``input_text``.

    The text is tokenized without special tokens. Texts longer than the model
    accepts in one pass are split into consecutive chunks of at most
    ``max_length`` tokens. Each chunk is encoded separately and the chunk means
    are combined, weighted by chunk length, so the result is the mean over all
    positions. A text that fits in one chunk gives exactly the single-pass mean.

    Args:
        input_text: text to embed.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
        model: model whose ``forward(ids, output_hidden_states=True)`` returns an
            object with a ``hidden_states`` sequence.
        device: device the input ids are moved to.
        max_length: tokens per chunk. When None it is the smaller of
            ``model.config.max_position_embeddings`` and
            ``tokenizer.model_max_length`` (whichever are available), minus the
            model's ``n_tokens`` soft-prompt positions if it has any.

    Returns:
        The pooled embedding with singleton dimensions squeezed out.

    Raises:
        ValueError: if the text yields no tokens or the chunk size is not positive.
    """
    tokens = tokenizer.tokenize(input_text)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    if not input_ids:
        raise ValueError("input_text produced no tokens; cannot compute an embedding")

    if max_length is None:
        limits = []
        config_limit = getattr(getattr(model, "config", None), "max_position_embeddings", None)
        if isinstance(config_limit, int) and config_limit > 0:
            limits.append(config_limit)
        tokenizer_limit = getattr(tokenizer, "model_max_length", None)
        if isinstance(tokenizer_limit, int) and tokenizer_limit > 0:
            limits.append(tokenizer_limit)
        # With no known limit, fall back to a single pass over the whole text.
        max_length = min(limits) if limits else len(input_ids)
        # Soft-prompt models prepend n_tokens positions, so leave room for them.
        max_length -= getattr(model, "n_tokens", 0) or 0
    if max_length < 1:
        raise ValueError(f"max_length must be positive, got {max_length}")

    chunk_means = []
    chunk_sizes = []
    with torch.no_grad():
        for start in range(0, len(input_ids), max_length):
            chunk_ids = input_ids[start:start + max_length]
            input_tensors = torch.tensor([chunk_ids]).to(device)
            outputs = model.forward(input_tensors, output_hidden_states=True)
            last_layer_embeddings = outputs.hidden_states[-1]
            # Mean over the sequence dimension for this chunk.
            chunk_means.append(last_layer_embeddings.mean(dim=1))
            chunk_sizes.append(last_layer_embeddings.size(1))

    if len(chunk_means) == 1:
        mean_pooled = chunk_means[0]
    else:
        stacked = torch.stack(chunk_means, dim=0)
        weights = torch.tensor(chunk_sizes, dtype=stacked.dtype, device=stacked.device)
        # The length-weighted average of chunk means equals the mean over all positions.
        mean_pooled = (stacked * weights.view(-1, 1, 1)).sum(dim=0) / weights.sum()

    return mean_pooled.squeeze()
   


In [9]:
# Smoke test: embed one sample description with the loaded model.
text = (
    "Adversaries may cause a sustained or permanent loss of view to operators and/or engineers. "
    "This may result in a loss of control, or a crash or other incident. "
    "(Citation: Department of Homeland Security September 2016) (Citation:"
)
embd = retrieveEmbeddings(input_text=text, tokenizer=tokenizer, model=model, device=device)

In [10]:
# Sanity check: the sample embedding should have EMBEDDING_SIZE (1600) entries.
len(embd)

1600

### Now I will create embeddings for the attack + weakness nodes


In [11]:

# Load JSON file
import json
def _load_json(filename):
    """Read one JSON file from ``data_dir``.

    The file is opened as UTF-8 explicitly so decoding does not depend on the
    platform's locale default.
    """
    with open(data_dir + filename, encoding="utf-8") as f:
        return json.load(f)


# doc id -> text description, and the two directions of the doc id <-> embedding row mapping.
doc_id_to_desc = _load_json('doc_id_to_desc.json')
doc_id_to_emb_id = _load_json('doc_id_to_emb_id.json')
emb_id_to_doc_id = _load_json('emb_id_to_doc_id.json')


In [12]:
import numpy as np

# One slot per embedding id; each embedding is stored at the row given by doc_id_to_emb_id.
text_embeddings = [None for _ in range(len(emb_id_to_doc_id))]

count = 0
for i, doc_id in enumerate(doc_id_to_desc):
    text_data = doc_id_to_desc[doc_id]
    try:
        embedding = retrieveEmbeddings(text_data, tokenizer, model, device)
    except Exception as e:
        # Show the offending record and stop; the remaining rows stay None.
        print("Exception:", e)
        print("i", i)
        print("Len:", len(text_data))
        print(text_data)
        break
    count = count + 1

    text_embeddings[int(doc_id_to_emb_id[doc_id])] = embedding.detach().cpu().numpy()

    if i % 50 == 0:
        print("Procesed", i, "th object: id:", doc_id)
else:
    # Reached only when the loop was not interrupted by a failure.
    print("Processing of ", len(text_embeddings), "objects complete.")

if count < len(doc_id_to_desc):
    # Do not report success after an early stop: the table has unfilled rows.
    print("Processing aborted:", count, "of", len(doc_id_to_desc),
          "descriptions embedded; text_embeddings is incomplete.")
print(count, "objects have valid text descriptions.")

Procesed 0 th object: id: malware--a4a98eab-b691-45d9-8c48-869ef8fefd57
Procesed 50 th object: id: course-of-action--1e7ccfc0-94c8-496e-8d27-032120892291
Procesed 100 th object: id: malware--ac61f1f9-7bb1-465e-9b8a-c2ce8e88baf5
Procesed 150 th object: id: attack-pattern--fab8fc7d-f27f-4fbb-9de6-44740aade05f
Procesed 200 th object: id: x-mitre-data-component--74fa567d-bc90-425c-8a41-3c703abb221c
Procesed 250 th object: id: 1053
Procesed 300 th object: id: 595
Procesed 350 th object: id: 118
Procesed 400 th object: id: 1263
Procesed 450 th object: id: 1277
Procesed 500 th object: id: 1321
Procesed 550 th object: id: 146
Procesed 600 th object: id: 195
Procesed 650 th object: id: 25
Procesed 700 th object: id: 300
Procesed 750 th object: id: 349
Procesed 800 th object: id: 41
Procesed 850 th object: id: 466
Procesed 900 th object: id: 52
Procesed 950 th object: id: 570
Procesed 1000 th object: id: 914
Procesed 1050 th object: id: 759
Procesed 1100 th object: id: 831
Processing of  1136 ob

In [13]:
import numpy as np
# Refuse to save a partially filled table: rows left as None would give a ragged
# or object array instead of a numeric embedding matrix.
missing_rows = [row for row, emb in enumerate(text_embeddings) if emb is None]
if missing_rows:
    raise ValueError(
        f"{len(missing_rows)} of {len(text_embeddings)} embeddings are missing "
        f"(first missing rows: {missing_rows[:10]}); not saving an incomplete matrix"
    )

text_embeddings_np = np.array(text_embeddings)
np.save(embeddings_path+"text_embeddings.npy", text_embeddings_np)