In [1]:
import numpy as np
import json
from sklearn.metrics.pairwise import cosine_similarity
# Approach1: top and bottom similar for each weak and attack
# nodes: 1136
# anchors: 15420
# number of edges:
# anchor_edges = 2077
# positive_edges = 1356
# negative_edges = 842
# Approach2: using threshold pos=0.40, neg=0.2 with random negative selection
# nodes: 873
# anchors: 30694
# number of edges:
# anchor_edges = 1412
# positive_edges = 1425
# negative_edges = 1013
# Sample 3: using threshold pos=0.45, neg=0.2 with random negative selection
# nodes: 523
# anchors: 14350
# number of edges:
# anchor_edges = 722
# positive_edges = 703
# negative_edges = 565
# Sample 4: using threshold pos=0.40, neg=0.2; select only anchors that are present in both the positive and negative pairs.
# nodes: 873
# anchors: 12218
# number of edges:
# anchor_edges = 1412
# positive_edges = 1065
# negative_edges = 676

In [2]:
# Names of the candidate embedding models; the selected name is used as a
# sub-directory of ../../ics_cwe/ when building dir_name below.
models = [
    "pretrained_SecBert",
    "SecBert_E5",
    "pretrained_SecureBert",
    "SecureBert_E5",
    "pretrained_Gpt2",
    "Gpt2_E5",
]
# models = ["pretrained_SecBert", "SecBert", "pretrained_SecureBert",  "SecureBert", "pretrained_Gpt2","Gpt2"]
# models = ["pretrained_SecBert","pretrained_SecureBert","pretrained_Gpt2"]
current_model = models[4]  # index 4 selects "pretrained_Gpt2" in the active list

# Margin values; no cell in this part of the notebook reads them.
margin1 = 1.0
margin2 = 1.0

# Directory holding the files of the selected model.
dir_name = "../../ics_cwe/" + current_model + "/"

# Embeddings of the selected model; later cells index this array by node number.
text_embeddings = np.load(dir_name + 'data/all_embeddings.npy')

# JSON lookup tables shared by all models
# (presumably node id <-> array position -- TODO confirm; unused in the cells shown here).
with open('../../ics_cwe/id_to_pos.json') as id_to_pos_file:
    id_to_pos = json.load(id_to_pos_file)
with open('../../ics_cwe/pos_to_id.json') as pos_to_id_file:
    pos_to_id = json.load(pos_to_id_file)

In [3]:
def get_text_embeddings(text_embeddings, nodes):
    """Look up the embedding of every requested node.

    Args:
        text_embeddings: Indexable collection of embeddings.
        nodes: Iterable of indices (or keys) into ``text_embeddings``.

    Returns:
        A list containing ``text_embeddings[node]`` for each node, in the
        same order as ``nodes``.
    """
    selected = []
    for node in nodes:
        selected.append(text_embeddings[node])
    return selected

In [4]:
# (start, end) index pairs; later cells expand them with range(start, end),
# so the end value is exclusive.
attack_range = (0,203)  # indices of the attack nodes
weak_range = (203,1136)  # indices of the weakness nodes

In [5]:
# Expand each (start, end) range into an explicit list of node indices.
attack_nodes = list(range(*attack_range))
weakness_nodes = list(range(*weak_range))

# Fetch the embedding of every node, keeping the same order as the node lists.
attack_embeddings = get_text_embeddings(text_embeddings, attack_nodes)
weakness_embeddings = get_text_embeddings(text_embeddings, weakness_nodes)

In [6]:
# Row i / column j holds the cosine similarity between weakness i and attack j.
weak_attack_matrix = cosine_similarity(X=weakness_embeddings, Y=attack_embeddings)
# Same similarities with the roles swapped: row i is attack i, column j is weakness j.
attack_weak_matrix = cosine_similarity(X=attack_embeddings, Y=weakness_embeddings)

In [7]:
# Approach1: using top and bottom similar for each weak and attack

In [None]:
# Approach 1: for every anchor node, use its k most similar nodes from the
# other set as positives and its k least similar nodes as negatives.
ATTACK_ANCHOR_K = 30  # positives/negatives drawn per attack anchor
WEAK_ANCHOR_K = 10    # positives/negatives drawn per weakness anchor


def _top_bottom_triples(anchor_nodes, candidate_nodes, sim_matrix, k):
    """Build (anchor, positive, negative) triples from a similarity matrix.

    Args:
        anchor_nodes: Node ids, one per row of ``sim_matrix``.
        candidate_nodes: Node ids, one per column of ``sim_matrix``.
        sim_matrix: 2-D array where ``sim_matrix[i, j]`` is the similarity
            between ``anchor_nodes[i]`` and ``candidate_nodes[j]``.
        k: Number of most-similar and least-similar candidates used per anchor.

    Returns:
        A list of ``(anchor, positive, negative)`` tuples. For each anchor the
        j-th entry of its top-k (in ascending similarity order) is paired with
        the j-th entry of its bottom-k (also ascending).

    Raises:
        ValueError: If ``k`` is smaller than 1.

    Note:
        When a row has fewer than ``2 * k`` candidates the top and bottom
        selections overlap, so the same node can show up as both a positive
        and a negative.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    triples = []
    for row, anchor in enumerate(anchor_nodes):
        # One ascending sort per row serves both ends of the ranking.
        order = np.argsort(sim_matrix[row])
        least_similar = order[:k]
        most_similar = order[-k:]
        for pos_idx, neg_idx in zip(most_similar, least_similar):
            triples.append((anchor, candidate_nodes[pos_idx], candidate_nodes[neg_idx]))
    return triples


# Attack-anchored triples first, then weakness-anchored ones.
anchor_pos_neg_triple = (
    _top_bottom_triples(attack_nodes, weakness_nodes, attack_weak_matrix, ATTACK_ANCHOR_K)
    + _top_bottom_triples(weakness_nodes, attack_nodes, weak_attack_matrix, WEAK_ANCHOR_K)
)

In [None]:
# Number of (anchor, positive, negative) triples currently held in anchor_pos_neg_triple.
len(anchor_pos_neg_triple)

In [None]:
import pickle

# Persist the current triples next to the selected model's other data files.
triple_path = dir_name + 'data/anchor_pos_neg_triple_1.pkl'
with open(triple_path, 'wb') as out_file:
    pickle.dump(anchor_pos_neg_triple, out_file)

In [None]:
# Approach2: select positive and negative pairs using similarity thresholds.

In [14]:
import random
from collections import defaultdict, deque

positive_threshold = 0.65  # Similarity threshold for positive pairs (>=)
negative_threshold = 0.45  # Similarity threshold for negative pairs (<=)

# Collect (anchor, other, similarity) pairs in both directions:
# attack-anchored pairs first, then weakness-anchored pairs.
anchor_pos_pair = []
anchor_neg_pair = []
directions = (
    (attack_nodes, weakness_nodes, attack_weak_matrix),
    (weakness_nodes, attack_nodes, weak_attack_matrix),
)
for anchors, candidates, sim_matrix in directions:
    for i, anchor in enumerate(anchors):
        for j, candidate in enumerate(candidates):
            sim = sim_matrix[i, j]
            if sim >= positive_threshold:
                anchor_pos_pair.append((anchor, candidate, sim))
            if sim <= negative_threshold:
                anchor_neg_pair.append((anchor, candidate, sim))

# Strongest positives first, weakest (most dissimilar) negatives first.
anchor_pos_pair.sort(reverse=True, key=lambda x: x[2])
anchor_neg_pair.sort(key=lambda x: x[2])
print(len(anchor_pos_pair))
print(len(anchor_neg_pair))

# Queue each anchor's negatives in sorted order. Taking from the left of the
# queue gives the first not-yet-used negative of that anchor, without
# rescanning the whole negative list for every positive pair.
negatives_by_anchor = defaultdict(deque)
for anchor, neg, neg_sim in anchor_neg_pair:
    negatives_by_anchor[anchor].append((neg, neg_sim))

# Each positive pair consumes one negative of the same anchor; positives whose
# anchor has no unused negative left are skipped. anchor_neg_pair itself is
# left intact (it is no longer consumed while matching).
anchor_pos_neg_triple = []
pos_pair = []  # (positive node, similarity) of every emitted triple
neg_pair = []  # (negative node, similarity) of every emitted triple
for anchor, pos, pos_sim in anchor_pos_pair:
    remaining = negatives_by_anchor.get(anchor)
    if not remaining:
        continue
    neg, neg_sim = remaining.popleft()
    anchor_pos_neg_triple.append((anchor, pos, neg))
    pos_pair.append((pos, pos_sim))
    neg_pair.append((neg, neg_sim))



# anchor_dict = defaultdict(list)
# anchor_pos_neg_triple = []
# pos_pair = []
# neg_pair = []
# for anchor,neg,val in anchor_neg_pair:
#     anchor_dict[anchor].append((anchor,neg,val))
# for anchor,pos,val in anchor_pos_pair:
#     if(anchor in anchor_dict.keys()):
#         idx = random.randrange(0, len(anchor_dict[anchor]))
#         anchor_pos_neg_triple.append((anchor,pos,anchor_dict[anchor][idx][1]))
#         pos_pair.append((pos,val))
#         neg_pair.append((anchor_dict[anchor][idx][1],anchor_dict[anchor][idx][2]))
#         #anchor_dict[anchor].pop(idx)

59988
25082


In [15]:
# Distinct anchors, positives and negatives that occur in the triples.
ass = {anchor for anchor, _, _ in anchor_pos_neg_triple}
ps = {positive for _, positive, _ in anchor_pos_neg_triple}
ns = {negative for _, _, negative in anchor_pos_neg_triple}
for unique_nodes in (ass, ps, ns):
    print(len(unique_nodes))


843
572
295


In [16]:
import pickle

# Persist the current triples next to the selected model's other data files.
triple_path = dir_name + 'data/anchor_pos_neg_triple_4.pkl'
with open(triple_path, 'wb') as out_file:
    pickle.dump(anchor_pos_neg_triple, out_file)

In [None]:
# Output directory for the figures of the selected model.
dir2 = '../data1/node_similarities/'+current_model

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
def histogram2(all_pairs, title='Attack & Weak Positive', filename='histogram_pos_t_40.png'):
    """Plot, save and show a histogram of the similarity values in ``all_pairs``.

    Args:
        all_pairs: Iterable of pairs whose element at index 1 is the cosine
            similarity to plot.
        title: Title drawn above the histogram. The default is the title that
            used to be hard-coded.
        filename: Name of the PNG written inside ``dir2``. The default is the
            file name that used to be hard-coded. Pass a different name for
            each data set so one figure does not overwrite another.

    Note:
        The bins cover [0, 1] in steps of 0.1; values outside that interval
        (for example negative cosine similarities) are not counted.
    """
    cosine_sim_values = [pair[1] for pair in all_pairs]

    # Eleven evenly spaced edges -> ten bins of width 0.1 with exact end points.
    bins = np.linspace(0.0, 1.0, 11)

    # A dedicated figure keeps the plot independent of any figure left open.
    fig, ax = plt.subplots()
    ax.hist(cosine_sim_values, bins=bins, edgecolor='black')
    ax.set_xlabel('Cosine Similarity')
    ax.set_ylabel('Frequency')
    ax.set_title(title)

    # Save before showing: some backends release the figure once it is shown.
    fig.savefig(dir2 + '/' + filename, dpi=300)
    plt.show()

In [None]:
# Histogram of the similarities of the positives used in the threshold-based triples
# (pos_pair holds (positive node, similarity) tuples).
histogram2(pos_pair)

In [None]:
# Histogram of the similarities of the negatives used in the threshold-based triples.
# NOTE(review): called like this, histogram2 titles the plot 'Attack & Weak Positive'
# and saves it as histogram_pos_t_40.png, so the negative-pair figure is mislabelled
# and overwrites the file written by histogram2(pos_pair).
histogram2(neg_pair)

In [None]:
import pickle

# Persist the current triples in the pos_neg_sample directory.
sample_path = '../data/pos_neg_sample/anchor_pos_neg_triple_4.pkl'
with open(sample_path, 'wb') as out_file:
    pickle.dump(anchor_pos_neg_triple, out_file)

In [None]:
# Load a previously saved triple list into x.
# NOTE: this cell relies on `pickle` having been imported by an earlier cell.
# NOTE(review): no cell shown here writes anchor_pos_neg_triple_1.pkl to this directory
# (the Approach-1 cell saves under dir_name) -- confirm the file exists.
# SECURITY: pickle.load can execute arbitrary code; only load files you created yourself.
with open('../data/pos_neg_sample/anchor_pos_neg_triple_1.pkl', 'rb') as f:
    x=pickle.load(f)