In [10]:
import torch
from torch import nn
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import json
import sys
sys.path.append('../')
import config

In [2]:
def generate_text_pretrained(model, tokenizer, prompt, max_length=50):
    with torch.no_grad():
        # Prepare the prompt
        inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True, max_length=50)
        input_ids = inputs['input_ids'].to('cuda')
        attention_mask = inputs['attention_mask'].to('cuda')

        # Generate text
        outputs = model.generate(input_ids=input_ids, 
                                 attention_mask=attention_mask, 
                                 max_length=max_length, 
                                 num_return_sequences=1,
                                 pad_token_id=tokenizer.eos_token_id)
        
        # Decode the generated text
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        return generated_text

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt2_model_name = "gpt2-xl"
model = GPT2LMHeadModel.from_pretrained(gpt2_model_name)
tokenizer = GPT2Tokenizer.from_pretrained(gpt2_model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
model.to(device)
model.eval()

Using pad_token, but it is not set yet.


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1600)
    (wpe): Embedding(1024, 1600)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout)

In [4]:
# Example usage
prompt = "Can you suggest common weaknesses and vulnerabilities related to the Colonial Pipeline Attack? In May of 2021, a hacker group known as DarkSide gained access to Colonial Pipeline’s network through a compromised VPN password. This was possible, in part, because the system did not have multifactor authentication protocols in place. This made entry into the VPN easier since multiple steps were not required to verify the user’s identity. Even though the compromised password was a “complex password,” malicious actors acquired it as part of a separate data breach."
generated_text_pretrained = generate_text_pretrained(model, tokenizer, prompt, max_length=100)
print("Generated text with pretrained GPT-2:", generated_text_pretrained)

2024-08-05 15:21:09.746558: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Generated text with pretrained GPT-2: Can you suggest common weaknesses and vulnerabilities related to the Colonial Pipeline Attack? In May of 2021, a hacker group known as DarkSide gained access to Colonial Pipeline’s network through a compromised VPN password. This was possible, in part, because the company had not implemented a password policy. The DarkSide group then used the compromised VPN password to gain access to the company's network. The DarkSide group then used the company's network to launch a distributed denial of service (DDoS) attack against


In [5]:
def retrieveEmbeddings(input_text, tokenizer, model, device, chunk_size=110):
    # Tokenize the input text to find out if splitting is needed
    tokens = tokenizer.tokenize(input_text)
    
    # Initialize a list to store embeddings from each chunk
    chunk_embeddings = []
    print(chunk_size)
    # Process text in chunks that fit within the model's limit
    for i in range(0, len(tokens), chunk_size):
        if (len(tokens)>chunk_size):
            print("Chunk", i, "for", len(tokens), "tokens.")
        # Convert chunk to tokens and ensure it's within the model's max input size
        chunk_tokens = tokens[i:i + chunk_size]
        input_ids = tokenizer.convert_tokens_to_ids(chunk_tokens)
        input_tensors = torch.tensor([input_ids]).to(device)
        
        # Forward pass, get hidden states for the chunk
        with torch.no_grad():
            outputs = model(input_tensors, output_hidden_states=True)
        
        # Extract the hidden states
        hidden_states = outputs.hidden_states
        last_layer_embeddings = hidden_states[-1]
        
        # Mean pool the embeddings of the last layer across the sequence length dimension
        mean_pooled = last_layer_embeddings.mean(dim=1)
        chunk_embeddings.append(mean_pooled)
    
    # Concatenate embeddings from all chunks along the batch dimension
    # and then take the mean across the concatenated dimension to get a single embedding
    # print(len(chunk_embeddings))
    all_embeddings = torch.cat(chunk_embeddings, dim=0)
    # print(all_embeddings[0][:10])
    aggregated_embedding = all_embeddings.mean(dim=0)
    # print(aggregated_embedding[:10])
    return aggregated_embedding.squeeze()

In [8]:
prompt_embeddings = retrieveEmbeddings(prompt, tokenizer, model, device)

110


In [9]:
prompt_embeddings.shape

torch.Size([1600])

In [11]:
# Define Transformer Encoder
# Define Encoder
class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(Encoder, self).__init__()
        self.fc = nn.Linear(input_dim, hidden_dim)
        self.activation = nn.ReLU()
    
    def forward(self, x):
        x = self.activation(self.fc(x))
        return x
# Define Decoder
class Decoder(nn.Module):
    def __init__(self, hidden_dim, output_dim):
        super(Decoder, self).__init__()
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.fc(x)
        return x

In [12]:
def generate_graph_emb(org_emb):
    checkpoint = torch.load(config.ENCODER_PATH)

    encoder.load_state_dict(checkpoint['encoder_state_dict'])
    decoder.load_state_dict(checkpoint['decoder_state_dict'])
    # Ensure the model is in evaluation mode
    encoder.eval()
    decoder.eval()
    
    # Load new text embeddings
    new_text_embeddings = org_emb
    new_text_embeddings = torch.tensor(new_text_embeddings, dtype=torch.float32).to(device)
    
    # Perform inference
    with torch.no_grad():
        # Pass new text embeddings through the encoder
        encoded_text = encoder(new_text_embeddings.to(device))
        
        # Generate graph embeddings through the decoder
        generated_graph_embeddings = decoder(encoded_text)
    
    # Convert the generated embeddings back to numpy if needed
    generated_graph_embeddings = generated_graph_embeddings.cpu().detach().numpy()
    
    # print("Generated Graph Embeddings:", generated_graph_embeddings)
    return generated_graph_embeddings

In [13]:
text_embedding_dim = prompt_embeddings.shape[0]  # Example text embedding dimension
hidden_dim = 128
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = Encoder(text_embedding_dim, hidden_dim).to(device)
decoder = Decoder(hidden_dim, 128).to(device)

In [14]:
prompt_graph_embeddings = generate_graph_emb(prompt_embeddings)

  new_text_embeddings = torch.tensor(new_text_embeddings, dtype=torch.float32).to(device)


In [15]:
import json
import numpy as np
total_obj=1136
paragraphs = []
with open(config.DESCRIPTION_FILE) as fp:
    pos_to_desc = json.load(fp)
with open(config.DATA_DIR+"doc_id_to_emb_id.json") as fp:
    id_to_pos = json.load(fp)
with open(config.DATA_DIR+'emb_id_to_doc_id.json') as fp:
    pos_to_id = json.load(fp)
graph_embeddings = np.load(config.OUTPUT_DIR+"gcl_data/pt_gpt2-xl/sample_10/GAT/triplet/text_deepwalk_dual3_gm_1.0.npy")
text_embeddings = np.load(config.EMBEDDING_DIR+"pt_gpt2-xl/text_embeddings.npy")


In [16]:
from sklearn.metrics.pairwise import cosine_similarity
def get_sim(gen_emb, org_emb):
    # Compute the cosine similarity matrix
    print(gen_emb.shape)
    print(org_emb.shape)
    gen_emb = gen_emb.reshape(1,gen_emb.shape[0])
    org_emb = org_emb.reshape(1,org_emb.shape[0])
    cosine_sim_matrix = cosine_similarity(gen_emb, org_emb)
    diagonal_values = np.diagonal(cosine_sim_matrix)
    # print(cosine_sim_matrix.shape)  # Should print (203, 933)
    return diagonal_values

In [17]:
get_sim(prompt_graph_embeddings, graph_embeddings[656])

(128,)
(128,)


array([-0.16370472], dtype=float32)

In [18]:
prompt_graph_embeddings.reshape(1,-1).shape

(1, 128)

In [19]:
attack_size=203
weakness_size=933
n_nodes = 1136

In [20]:
def get_top_attack_weakness(prompt_graph_embeddings, graph_embeddings, top_k1, top_k2):
    attack_size=203
    weakness_size=933
    n_nodes = 1136
    prompt_graph_embeddings = prompt_graph_embeddings.reshape(1,-1)
    cos_sim_attack = cosine_similarity(prompt_graph_embeddings, graph_embeddings[:attack_size]).reshape(-1)
    cos_sim_weak = cosine_similarity(prompt_graph_embeddings, graph_embeddings[attack_size:]).reshape(-1)
    top_attack = np.argsort(cos_sim_attack)[::-1][:top_k1]
    top_weak = np.argsort(cos_sim_weak)[::-1][:top_k2]

    attack_pairs = list(zip(cos_sim_attack[top_attack], top_attack))
    weak_pairs = list(zip(cos_sim_weak[top_weak], top_weak+attack_size))
    
    return attack_pairs, weak_pairs

In [21]:
get_top_attack_weakness(prompt_graph_embeddings, graph_embeddings, 10, 10)

([(0.1715091, 172),
  (0.13287933, 184),
  (0.11583056, 160),
  (0.11303975, 195),
  (0.10477096, 42),
  (0.101518475, 115),
  (0.10004559, 176),
  (0.09450128, 45),
  (0.09160155, 186),
  (0.091591865, 39)],
 [(0.21997546, 404),
  (0.20372754, 791),
  (0.20244679, 649),
  (0.19950409, 829),
  (0.19440942, 230),
  (0.19415408, 303),
  (0.19164823, 979),
  (0.18940672, 368),
  (0.1878996, 405),
  (0.18762362, 788)])

In [22]:
related_attack, related_weakness = get_top_attack_weakness(prompt_graph_embeddings, graph_embeddings, 10, 10)

In [23]:
def generate_prompt_context(prompt, related_attack, related_weakness):
    attack_text = ""
    weakness_text = ""
    for _,pos in related_attack:
        attack_text = attack_text+" "+ pos_to_desc[pos_to_id[str(pos)]]
    for _,pos in related_weakness:
        weakness_text = weakness_text+" "+ pos_to_desc[pos_to_id[str(pos)]]
    augmented_prompt = prompt+ "\nRelated Att@ck Description:\n"+attack_text+ "\nRelated Weakness Description:\n"+weakness_text
    return augmented_prompt

In [24]:
augmented_prompt = generate_prompt_context(prompt, related_attack, related_weakness)

In [25]:
print(augmented_prompt)

Can you suggest common weaknesses and vulnerabilities related to the Colonial Pipeline Attack? In May of 2021, a hacker group known as DarkSide gained access to Colonial Pipeline’s network through a compromised VPN password. This was possible, in part, because the system did not have multifactor authentication protocols in place. This made entry into the VPN easier since multiple steps were not required to verify the user’s identity. Even though the compromised password was a “complex password,” malicious actors acquired it as part of a separate data breach.
Related Att@ck Description:
 Adversaries may compromise safety system functions designed to maintain safe operation of a process when unacceptable or dangerous conditions occur. Safety systems are often composed of the same elements as control systems but have the sole purpose of ensuring the process fails in a predetermined safe manner. 

Many unsafe conditions in process control happen too quickly for a human operator to react to