In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import json
import sys
sys.path.append('../')
import config

In [2]:
# Load the prompt embedding array from disk (path is relative to the notebook's working directory).
# NOTE(review): presumably the text embedding of the prompt produced by an earlier step — confirm.
prompt_embeddings = np.load("data/prompt_embeddings.npy")

In [3]:
# Encoder: a single fully connected layer followed by ReLU
class Encoder(nn.Module):
    """Project inputs of width ``input_dim`` to a ``hidden_dim``-wide, non-negative code.

    The linear layer is stored as ``fc``, so its state-dict keys are
    ``fc.weight`` and ``fc.bias``.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.fc = nn.Linear(input_dim, hidden_dim)
        self.activation = nn.ReLU()

    def forward(self, x):
        """Apply the linear projection, then the ReLU non-linearity."""
        projected = self.fc(x)
        return self.activation(projected)
# Decoder: a single fully connected layer with no activation
class Decoder(nn.Module):
    """Map a ``hidden_dim``-wide code to an ``output_dim``-wide vector.

    The linear layer is stored as ``fc``, so its state-dict keys are
    ``fc.weight`` and ``fc.bias``.
    """

    def __init__(self, hidden_dim, output_dim):
        super().__init__()
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the linear projection of ``x``."""
        return self.fc(x)

In [4]:
def generate_graph_emb(org_emb):
    """Run text embedding(s) through the trained encoder/decoder pair.

    Uses the notebook-level ``encoder``, ``decoder`` and ``device`` objects,
    which must exist before this function is called. The checkpoint weights
    are reloaded from disk on every call.

    Args:
        org_emb: Array-like of text embedding(s) convertible by
            ``torch.tensor``; its last axis must match the encoder input size.

    Returns:
        numpy.ndarray holding the decoder output for ``org_emb``.
    """
    # NOTE(review): torch.load unpickles the file, so only load checkpoints
    # from a trusted source.
    # map_location remaps tensors that were saved on another device (e.g. a
    # CUDA checkpoint) so loading also works on a CPU-only machine.
    checkpoint = torch.load(
        '../embedding_generator/data/encoder_decoder_model.pth',
        map_location=device,
    )

    encoder.load_state_dict(checkpoint['encoder_state_dict'])
    decoder.load_state_dict(checkpoint['decoder_state_dict'])
    # Inference only: switch both modules to evaluation mode.
    encoder.eval()
    decoder.eval()

    # Convert the input once and move it to the model's device.
    new_text_embeddings = torch.tensor(org_emb, dtype=torch.float32).to(device)

    # No gradients are needed for inference.
    with torch.no_grad():
        encoded_text = encoder(new_text_embeddings)
        generated_graph_embeddings = decoder(encoded_text)

    # Bring the result back to host memory as a NumPy array.
    return generated_graph_embeddings.cpu().detach().numpy()

In [5]:
# Model dimensions. The encoder input width is the size of the embedding's last
# axis, so this works for a single 1-D prompt embedding and for a 2-D batch of
# embeddings alike (shape[0] would be the batch size in the 2-D case).
text_embedding_dim = prompt_embeddings.shape[-1]
hidden_dim = 128
graph_embedding_dim = 128  # decoder output width
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = Encoder(text_embedding_dim, hidden_dim).to(device)
decoder = Decoder(hidden_dim, graph_embedding_dim).to(device)

In [6]:
# Pass the prompt embedding through the checkpointed encoder/decoder to get its generated graph embedding.
prompt_graph_embeddings = generate_graph_emb(prompt_embeddings)

In [9]:
import json
import numpy as np
# Values kept from the original cell; neither is referenced in the visible cells.
total_obj = 1136
paragraphs = []

# JSON lookup tables (file locations come from the project's config module).
# NOTE(review): judging by the file names these map between document ids and
# embedding row ids, plus a description table — confirm the exact key schema.
with open(config.DESCRIPTION_FILE) as fp:
    pos_to_desc = json.loads(fp.read())

doc_to_emb_path = config.DATA_DIR+"doc_id_to_emb_id.json"
with open(doc_to_emb_path) as fp:
    id_to_pos = json.loads(fp.read())

emb_to_doc_path = config.DATA_DIR+'emb_id_to_doc_id.json'
with open(emb_to_doc_path) as fp:
    pos_to_id = json.loads(fp.read())

# Precomputed embedding matrices stored as .npy files.
graph_embeddings_path = config.OUTPUT_DIR+"gcl_data/pt_gpt2-xl/sample_10/GAT/triplet/text_deepwalk_dual3_gm_1.0.npy"
graph_embeddings = np.load(graph_embeddings_path)
text_embeddings_path = config.EMBEDDING_DIR+"pt_gpt2-xl/text_embeddings.npy"
text_embeddings = np.load(text_embeddings_path)


In [16]:
# Look up the value stored for document id '522' in the id -> position table.
id_to_pos['522']

656

In [17]:
# Show the description stored under key '656' (the position returned for id '522').
pos_to_desc['656']

'Reliance on Security Through Obscurity. When available, use publicly-vetted algorithms and procedures, as these are more likely to undergo more extensive security analysis and testing. This is especially the case with encryption and authentication. This reliance on "security through obscurity" can produce resultant weaknesses if an attacker is able to reverse engineer the inner workings of the mechanism. Note that obscurity can be one small part of defense in depth, since it can create more work for an attacker; however, it is a significant risk if used as the primary means of protection. '

In [18]:
from sklearn.metrics.pairwise import cosine_similarity
def get_sim(gen_emb, org_emb):
    """Cosine similarity between two 1-D embedding vectors.

    Prints the shape of each input, then returns a length-1 array containing
    the similarity of ``gen_emb`` and ``org_emb``.
    """
    for emb in (gen_emb, org_emb):
        print(emb.shape)
    # cosine_similarity expects 2-D inputs, so turn each vector into a single row.
    gen_row = gen_emb.reshape(1, len(gen_emb))
    org_row = org_emb.reshape(1, len(org_emb))
    # The similarity matrix is 1x1 here; its diagonal is the single score.
    return cosine_similarity(gen_row, org_row).diagonal()

In [19]:
# Compare the generated prompt embedding with the graph embedding stored at row 656.
get_sim(prompt_graph_embeddings, graph_embeddings[656])

(128,)
(128,)


array([-0.16370472], dtype=float32)

In [20]:
# Check the shape of the prompt embedding once it is reshaped into a single row.
prompt_graph_embeddings.reshape(1,-1).shape

(1, 128)

In [21]:
# Write the generated prompt graph embedding to disk as a .npy file.
np.save("data/prompt_graph_embeddings.npy", prompt_graph_embeddings)

In [22]:
# Retrieve the attack and weakness nodes most similar to the prompt embedding

In [23]:
# Node counts. Rows [0, attack_size) of the graph embedding matrix are sliced
# as attack nodes and the remaining rows as weakness nodes.
attack_size = 203
weakness_size = 933
n_nodes = attack_size + weakness_size

In [24]:
def get_top_attack_weakness(prompt_graph_embeddings, graph_embeddings, top_k1, top_k2):
    """Rank attack and weakness rows by cosine similarity to the prompt embedding.

    The first ``attack_size`` rows of ``graph_embeddings`` are scored as attack
    candidates and the remaining rows as weakness candidates.

    Returns:
        ``(attack_pairs, weak_pairs)``: two lists of ``(similarity, row_index)``
        tuples in descending similarity, holding at most ``top_k1`` and
        ``top_k2`` entries. Weakness indices are shifted by ``attack_size`` so
        they index into the full ``graph_embeddings`` matrix.
    """
    query = prompt_graph_embeddings.reshape(1, -1)

    def _ranked(candidates, k, offset):
        # Score every candidate row, then keep the k highest-scoring ones.
        scores = cosine_similarity(query, candidates).reshape(-1)
        best = np.argsort(scores)[::-1][:k]
        return list(zip(scores[best], best + offset))

    attack_pairs = _ranked(graph_embeddings[:attack_size], top_k1, 0)
    weak_pairs = _ranked(graph_embeddings[attack_size:], top_k2, attack_size)
    return attack_pairs, weak_pairs

In [25]:
# Preview the ten best-matching attack and weakness rows as (similarity, row index) pairs.
get_top_attack_weakness(prompt_graph_embeddings, graph_embeddings, 10, 10)

([(0.1715091, 172),
  (0.13287933, 184),
  (0.11583056, 160),
  (0.11303975, 195),
  (0.10477096, 42),
  (0.101518475, 115),
  (0.10004559, 176),
  (0.09450128, 45),
  (0.09160155, 186),
  (0.091591865, 39)],
 [(0.21997546, 404),
  (0.20372754, 791),
  (0.20244679, 649),
  (0.19950409, 829),
  (0.19440942, 230),
  (0.19415408, 303),
  (0.19164823, 979),
  (0.18940672, 368),
  (0.1878996, 405),
  (0.18762362, 788)])

In [26]:
# Check the shape of the graph embedding matrix.
graph_embeddings.shape

(1136, 128)

In [27]:
# Keep the top-10 attack and top-10 weakness matches; they are used to build the augmented prompt.
related_attack, related_weakness = get_top_attack_weakness(prompt_graph_embeddings, graph_embeddings, 10, 10)

In [32]:
def generate_prompt_context(prompt, related_attack, related_weakness,
                            desc_lookup=None, id_lookup=None):
    """Append the descriptions of retrieved attack/weakness nodes to a prompt.

    Args:
        prompt: The original prompt text.
        related_attack: Iterable of ``(score, position)`` pairs for attack nodes.
        related_weakness: Iterable of ``(score, position)`` pairs for weakness nodes.
        desc_lookup: Optional mapping from id to description text. Defaults to
            the notebook-level ``pos_to_desc``.
        id_lookup: Optional mapping from ``str(position)`` to the id used as a
            key of ``desc_lookup``. Defaults to the notebook-level ``pos_to_id``.

    Returns:
        The prompt followed by a "Related Att@ck Description" section and a
        "Related Weakness Description" section. Each description is preceded
        by a single space.

    Raises:
        KeyError: If a position or id is missing from the lookup tables.
    """
    # Fall back to the notebook globals so existing three-argument calls keep working.
    if desc_lookup is None:
        desc_lookup = pos_to_desc
    if id_lookup is None:
        id_lookup = pos_to_id

    def _descriptions(pairs):
        # Positions may be NumPy integers; str() gives the key form used by id_lookup.
        return "".join(" " + desc_lookup[id_lookup[str(pos)]] for _, pos in pairs)

    attack_text = _descriptions(related_attack)
    weakness_text = _descriptions(related_weakness)
    return (
        prompt
        + "\nRelated Att@ck Description:\n" + attack_text
        + "\nRelated Weakness Description:\n" + weakness_text
    )

In [33]:
# Example user question about the Colonial Pipeline incident; this is the text that gets augmented.
prompt = "Can you suggest common weaknesses and vulnerabilities related to the Colonial Pipeline Attack? In May of 2021, a hacker group known as DarkSide gained access to Colonial Pipeline’s network through a compromised VPN password. This was possible, in part, because the system did not have multifactor authentication protocols in place. This made entry into the VPN easier since multiple steps were not required to verify the user’s identity. Even though the compromised password was a “complex password,” malicious actors acquired it as part of a separate data breach."

# Append the descriptions of the retrieved attack and weakness nodes to the prompt.
augmented_prompt = generate_prompt_context(prompt, related_attack, related_weakness)

In [34]:
# Print the augmented prompt so its embedded newlines render as line breaks.
print(augmented_prompt)

Can you suggest common weaknesses and vulnerabilities related to the Colonial Pipeline Attack? In May of 2021, a hacker group known as DarkSide gained access to Colonial Pipeline’s network through a compromised VPN password. This was possible, in part, because the system did not have multifactor authentication protocols in place. This made entry into the VPN easier since multiple steps were not required to verify the user’s identity. Even though the compromised password was a “complex password,” malicious actors acquired it as part of a separate data breach.
Related Att@ck Description:
 Adversaries may compromise safety system functions designed to maintain safe operation of a process when unacceptable or dangerous conditions occur. Safety systems are often composed of the same elements as control systems but have the sole purpose of ensuring the process fails in a predetermined safe manner. 

Many unsafe conditions in process control happen too quickly for a human operator to react to

In [None]:
# Bundle the original and augmented prompt texts under fixed keys.
prompt_dict = dict(prompt=prompt, augmented_prompt=augmented_prompt)

In [None]:
# Serialise the prompt bundle to a JSON file.
with open('data/augmented_prompt.json','w') as fp:
    fp.write(json.dumps(prompt_dict))

In [31]:
# Inspect which keys the description lookup exposes.
# NOTE(review): this cell's execution count (31) is out of order with its neighbours, and the
# recorded keys are long string ids rather than numeric strings such as '656' used earlier —
# re-run with Restart & Run All to confirm which description file is actually loaded.
pos_to_desc.keys()

dict_keys(['malware--a4a98eab-b691-45d9-8c48-869ef8fefd57', 'attack-pattern--b7e13ee8-182c-4f19-92a4-a88d7d855d54', 'course-of-action--aadac250-bcdc-44e3-a4ae-f52bd0a7a16a', 'attack-pattern--1c478716-71d9-46a4-9a53-fa5d576adb60', 'x-mitre-data-component--9c2fa0ae-7abc-485a-97f6-699e3b6cf9fa', 'attack-pattern--097924ce-a9a9-4039-8591-e0deedfb8722', 'intrusion-set--381fcf73-60f6-4ab2-9991-6af3cbc35192', 'attack-pattern--40b300ba-f553-48bf-862e-9471b220d455', 'course-of-action--97f33c84-8508-45b9-8a1d-cac921828c9e', 'attack-pattern--35392fb4-a31d-4c6a-b9f2-1c65b7f5e6b9', 'attack-pattern--be69c571-d746-4b1f-bdd0-c0c9817e9068', 'x-mitre-data-component--3d20385b-24ef-40e1-9f56-f39750379077', 'attack-pattern--fa3aa267-da22-4bdd-961f-03223322a8d5', 'course-of-action--f0f5c87a-a58d-440a-b3b5-ca679d98c6dd', 'attack-pattern--a81696ef-c106-482c-8f80-59c30f2569fb', 'course-of-action--4fa717d9-cabe-47c8-8cdd-86e9e2e37f30', 'attack-pattern--5a2610f6-9fff-41e1-bc27-575ca20383d4', 'course-of-action--66