In [1]:
import numpy as np
import json
import pickle
from sklearn.metrics.pairwise import cosine_similarity
# Optimized
from collections import defaultdict
import os
import sys
#base_dir = os.environ['AWEB_DIR']
sys.path.append("../../")
import config
# Approach1: top and bottom similar for each weak and attack
# anchor nodes: 1136
# postive nodes: 668
# negative nodes : 490

# Sample 2 pt_gpt: using thresholds pos=0.65, neg=0.45; keep only anchors that are present in both the positive and the negative pairs.
# pos pair 82554
# neg pair 30772
# anchor nodes: 893
# postive nodes: 536
# negative nodes : 212

# Sample 2 ft_gpt: using thresholds pos=0.45, neg=0.30; keep only anchors that are present in both the positive and the negative pairs.
# pos pair 70878
# neg pair 55820
# anchor nodes: 1034
# postive nodes: 765
# negative nodes : 618

In [2]:
# --- Model choice and sampling hyper-parameters ---------------------------
models = ["pt_SecRoBERTa","SecRoBERTa","pt_SecureBERT","SecureBERT","pt_gpt2-xl","gpt2-xl"]
model_name = models[4]  # index 4 selects "pt_gpt2-xl"
cwe_per_attack = 30  # weakness nodes taken per attack anchor (top-k / bottom-k sampling)
attack_per_cwe = 50  # attack nodes taken per weakness anchor (top-k / bottom-k sampling)
positive_threshold = 0.65  # similarity >= this value counts as a positive pair
negative_threshold = 0.45  # similarity <= this value counts as a negative pair

# --- Directories (taken from the project-local config module) -------------
# Paths are built by plain string concatenation, so the config values are
# expected to end with a path separator.
result_dir = config.OUTPUT_DIR
embeddings_dir = config.EMBEDDING_DIR
data_dir = config.DATA_DIR
text_emb_dir = embeddings_dir+model_name+"/"
output_dir = result_dir+"gcl_data/"+model_name+"/"

# --- Pre-computed embeddings of the selected model -------------------------
text_embeddings = np.load(text_emb_dir+'text_embeddings.npy')
hop_text_embeddings = np.load(text_emb_dir+'text_hop_embeddings.npy')

# --- Id mappings and the index ranges of attack / CWE nodes ----------------
with open(data_dir+"doc_id_to_emb_id.json") as f:
    doc_id_to_emb_id = json.load(f)
with open(data_dir+"emb_id_to_doc_id.json") as f:
    emb_id_to_doc_id = json.load(f)
with open(data_dir+'attack_weak_range.json') as fp:
    attack_weak_range = json.load(fp)

In [3]:
# # Placeholder function for generating text embeddings
# def get_text_embeddings(text_embeddings, nodes):
#     return [text_embeddings[node] for node in nodes]

In [4]:
# [start, end] index bounds of the attack nodes and of the CWE (weakness) nodes;
# the end bound is later used as an exclusive stop for range().
attack_range, weak_range = attack_weak_range['attack'], attack_weak_range['cwe']

In [5]:
# Display the loaded index ranges to check where the attack and CWE nodes start and end.
attack_weak_range

{'attack': [0, 2043], 'cwe': [2043, 2982], 'n_nodes': 2982}

In [6]:
# Node ids double as row positions in the embedding matrices: one id list per
# node type, built from the [start, end) bounds loaded above.
attack_nodes = [*range(attack_range[0], attack_range[1])]
weakness_nodes = [*range(weak_range[0], weak_range[1])]

# Select the rows of each node type from the text embeddings and from the
# hop embeddings (index-list selection, so each result is a separate array).
attack_embeddings, hop_attack_embeddings = text_embeddings[attack_nodes], hop_text_embeddings[attack_nodes]
weakness_embeddings, hop_weakness_embeddings = text_embeddings[weakness_nodes], hop_text_embeddings[weakness_nodes]

In [7]:
# Cross-type similarity on the text embeddings: rows are weaknesses, columns are attacks.
weak_attack_matrix = cosine_similarity(X=weakness_embeddings, Y=attack_embeddings)
# The same pairs seen from the attack side: rows are attacks, columns are weaknesses.
attack_weak_matrix = cosine_similarity(X=attack_embeddings, Y=weakness_embeddings)

# Same-type similarity on the hop embeddings: attack vs. attack ...
hop_attack_matrix = cosine_similarity(X=hop_attack_embeddings, Y=hop_attack_embeddings)
# ... and weakness vs. weakness.
hop_weak_matrix = cosine_similarity(X=hop_weakness_embeddings, Y=hop_weakness_embeddings)

In [8]:
# Approach1: using top and bottom similar for each weak and attack

In [9]:
def get_sample_1():
    """Build approach-1 triplets from the most and least similar cross-type nodes.

    For every attack node, the ``cwe_per_attack`` weakness nodes with the
    highest cosine similarity are used as positives and the ``cwe_per_attack``
    weakness nodes with the lowest similarity as negatives. The same is then
    done for every weakness node against the attack nodes with
    ``attack_per_cwe``.

    Reads the notebook globals ``attack_nodes``, ``weakness_nodes``,
    ``attack_weak_matrix``, ``weak_attack_matrix``, ``cwe_per_attack``,
    ``attack_per_cwe``, ``weak_range`` and ``output_dir``.

    Side effects:
        Prints summary counts and pickles the list of
        ``(anchor, positive, negative)`` tuples to
        ``output_dir + 'anchor_pos_neg_triple_1.pkl'``; ``output_dir`` is
        created when it does not exist yet.

    Returns:
        None.
    """

    def _extreme_triples(anchors, candidates, sim_matrix, k):
        # For each anchor, pair its k most similar candidates with its k least similar ones.
        triples = []
        for row, anchor in enumerate(anchors):
            # One ascending sort per row serves both ends of the ranking.
            order = np.argsort(sim_matrix[row])
            take = min(k, len(order))
            # Slice from an explicit start index: ``order[-0:]`` would return
            # the whole row instead of nothing when k == 0.
            top = order[len(order) - take:]
            bottom = order[:take]
            # Both slices are in ascending similarity order, so the j-th entry
            # of the top slice is paired with the j-th least similar candidate.
            for pos_idx, neg_idx in zip(top, bottom):
                triples.append((anchor, candidates[pos_idx], candidates[neg_idx]))
        return triples

    # Attack anchors first, then weakness anchors.
    anchor_pos_neg_triple = (
        _extreme_triples(attack_nodes, weakness_nodes, attack_weak_matrix, cwe_per_attack)
        + _extreme_triples(weakness_nodes, attack_nodes, weak_attack_matrix, attack_per_cwe)
    )

    anchor_set = set()
    positive_set = set()
    negative_set = set()
    for a, p, n in anchor_pos_neg_triple:
        anchor_set.add(a)
        positive_set.add(p)
        negative_set.add(n)
        # Sanity check: report any triple holding an id past the end of the weakness range.
        if a >= weak_range[1] or p >= weak_range[1] or n >= weak_range[1]:
            print(a, " ", p, " ", n)
    print("# anchor nodes:", len(anchor_set))
    print("# postive nodes:", len(positive_set))
    print("# negative nodes :", len(negative_set))
    print("# examples: ", len(anchor_pos_neg_triple))

    # This function runs before any other code creates the output folder.
    os.makedirs(output_dir, exist_ok=True)
    with open(output_dir+'anchor_pos_neg_triple_1.pkl', 'wb') as f:
        pickle.dump(anchor_pos_neg_triple, f)

In [None]:
def get_sample_2():
    """Build approach-2 triplets from similarity thresholds.

    Every cross-type pair with similarity ``>= positive_threshold`` is an
    (anchor, positive) pair and every pair with similarity
    ``<= negative_threshold`` is an (anchor, negative) pair. Both directions
    are scanned, so attack nodes and weakness nodes each act as anchors.

    Positive pairs are then visited from highest to lowest similarity. Each
    one is combined with the lowest-similarity negative of the same anchor
    that has not been used yet. Positive pairs whose anchor has no negative
    left are dropped.

    Reads the notebook globals ``attack_nodes``, ``weakness_nodes``,
    ``attack_weak_matrix``, ``weak_attack_matrix``, ``positive_threshold``,
    ``negative_threshold``, ``weak_range`` and ``output_dir``.

    Side effects:
        Prints the pair counts and summary counts, and pickles the list of
        ``(anchor, positive, negative)`` tuples to
        ``output_dir + '<positive_threshold>_<negative_threshold>/anchor_pos_neg_triple_2.pkl'``,
        creating that folder when needed.

    Returns:
        None.
    """
    # Function-scope import: deque is not imported at notebook level.
    from collections import defaultdict, deque

    def _pairs_where(matrix, row_nodes, col_nodes, mask):
        # (row node, column node, similarity) for every cell selected by mask, in row-major order.
        rows, cols = np.where(mask)
        return [(row_nodes[r], col_nodes[c], matrix[r, c]) for r, c in zip(rows, cols)]

    anchor_pos_pair2 = []
    anchor_neg_pair2 = []
    # Rows of each matrix are the anchors: attack anchors first, then weakness anchors.
    for matrix, row_nodes, col_nodes in (
        (attack_weak_matrix, attack_nodes, weakness_nodes),
        (weak_attack_matrix, weakness_nodes, attack_nodes),
    ):
        anchor_pos_pair2.extend(_pairs_where(matrix, row_nodes, col_nodes, matrix >= positive_threshold))
        anchor_neg_pair2.extend(_pairs_where(matrix, row_nodes, col_nodes, matrix <= negative_threshold))

    # Most similar positives first, least similar negatives first (stable sorts keep tie order).
    anchor_pos_pair2.sort(reverse=True, key=lambda x: x[2])
    anchor_neg_pair2.sort(key=lambda x: x[2])

    print(len(anchor_pos_pair2))
    print(len(anchor_neg_pair2))

    # Per-anchor queue of negatives in ascending similarity; a deque makes
    # taking the front element O(1), unlike list.pop(0).
    anchor_neg_dict = defaultdict(deque)
    for anchor, neg, val in anchor_neg_pair2:
        anchor_neg_dict[anchor].append(neg)

    # Make triplets (anchor, positive, negative); each negative is used at most once per anchor.
    anchor_pos_neg_triple = []
    for anchor, pos, pos_val in anchor_pos_pair2:
        remaining = anchor_neg_dict.get(anchor)
        if remaining:
            anchor_pos_neg_triple.append((anchor, pos, remaining.popleft()))

    anchor_set = set()
    positive_set = set()
    negative_set = set()
    for a, p, n in anchor_pos_neg_triple:
        anchor_set.add(a)
        positive_set.add(p)
        negative_set.add(n)
        # Sanity check: report any triple holding an id past the end of the weakness range.
        if a >= weak_range[1] or p >= weak_range[1] or n >= weak_range[1]:
            print(a, " ", p, " ", n)
    print("# anchor nodes:", len(anchor_set))
    print("# postive nodes:", len(positive_set))
    print("# negative nodes :", len(negative_set))
    print("# Sample:", len(anchor_pos_neg_triple))

    # One sub-folder per threshold pair; exist_ok avoids the check-then-create race.
    th_path = output_dir+"{}_{}/".format(positive_threshold, negative_threshold)
    os.makedirs(th_path, exist_ok=True)
    with open(th_path+'anchor_pos_neg_triple_2.pkl', 'wb') as f:
        pickle.dump(anchor_pos_neg_triple, f)

In [None]:
def get_sample_3():
    """Build approach-3 triplets: cross-type positives, same-type negatives.

    For every attack node, the ``cwe_per_attack`` weakness nodes with the
    highest similarity in ``attack_weak_matrix`` are the positives. The
    negatives are the ``cwe_per_attack`` attack nodes with the lowest
    similarity in ``hop_attack_matrix``. Weakness anchors are handled the
    same way with ``weak_attack_matrix``, ``hop_weak_matrix`` and
    ``attack_per_cwe``.

    Reads the notebook globals ``attack_nodes``, ``weakness_nodes``,
    ``attack_weak_matrix``, ``weak_attack_matrix``, ``hop_attack_matrix``,
    ``hop_weak_matrix``, ``cwe_per_attack``, ``attack_per_cwe``,
    ``weak_range`` and ``output_dir``.

    Side effects:
        Prints summary counts and pickles the list of
        ``(anchor, positive, negative)`` tuples to
        ``output_dir + 'anchor_pos_neg_triple_3.pkl'``; ``output_dir`` is
        created when it does not exist yet.

    Returns:
        None.
    """

    def _mixed_triples(anchors, positives, cross_matrix, same_type_matrix, k):
        # Positives come from the other node type, negatives from the anchors' own type.
        triples = []
        for row, anchor in enumerate(anchors):
            pos_order = np.argsort(cross_matrix[row])      # ascending cross-type similarity
            neg_order = np.argsort(same_type_matrix[row])  # ascending same-type similarity
            take = min(k, len(pos_order))
            # Slice from an explicit start index: ``pos_order[-0:]`` would
            # return the whole row instead of nothing when k == 0.
            top = pos_order[len(pos_order) - take:]
            bottom = neg_order[:k]
            # The two slices come from populations of different size; zip stops
            # at the shorter one instead of indexing past its end.
            for pos_idx, neg_idx in zip(top, bottom):
                triples.append((anchor, positives[pos_idx], anchors[neg_idx]))
        return triples

    # Attack anchors first, then weakness anchors.
    anchor_pos_neg_triple = (
        _mixed_triples(attack_nodes, weakness_nodes, attack_weak_matrix, hop_attack_matrix, cwe_per_attack)
        + _mixed_triples(weakness_nodes, attack_nodes, weak_attack_matrix, hop_weak_matrix, attack_per_cwe)
    )

    anchor_set = set()
    positive_set = set()
    negative_set = set()
    for a, p, n in anchor_pos_neg_triple:
        anchor_set.add(a)
        positive_set.add(p)
        negative_set.add(n)
        # Sanity check: report any triple holding an id past the end of the weakness range.
        if a >= weak_range[1] or p >= weak_range[1] or n >= weak_range[1]:
            print(a, " ", p, " ", n)
    print("# anchor nodes:", len(anchor_set))
    print("# postive nodes:", len(positive_set))
    print("# negative nodes :", len(negative_set))
    print("# examples: ", len(anchor_pos_neg_triple))

    # Do not rely on another sampler having created the output folder.
    os.makedirs(output_dir, exist_ok=True)
    with open(output_dir+'anchor_pos_neg_triple_3.pkl', 'wb') as f:
        pickle.dump(anchor_pos_neg_triple, f)

In [11]:
# Run the three triplet builders in order; each prints its statistics and writes its own pickle file.
for build_sample in (get_sample_1, get_sample_2, get_sample_3):
    build_sample()

In [12]:
# Approach3: positive from attack-weak, negative from attack-attack, weak-weak

In [16]:
# Approach 2: using thresholds.

In [23]:
# import random
# from collections import defaultdict
# anchor_pos_pair = []
# anchor_neg_pair = []
# positive_threshold = 0.70
# negative_threshold = 0.40
# # There are two group of nodes.
# # The first 2043 nodes are attack node and the embeddings = (2043, 1600) 
# # The second set 939 nodes are weakness node and the embedding shape = (939, 1600)
# # attack_embeddings.shape = (2043, 1600)
# # weakness_embeddings.shape = (939, 1600)

# # Compute cosine similarity between weakness and attack nodes
# # I calculate weak_attack_matrix a cosine similarity of shape (939,2043) where rows are weakness and columns are attack and values are cosine similarity
# weak_attack_matrix = cosine_similarity(weakness_embeddings, attack_embeddings)
# # Compute cosine similarity between attack and weakness nodes
# # Then I calculate attack_weak_matrix a cosine similarity of shape (2043,939) where rows are attack and columns are weakness and values are cosine similarity
# attack_weak_matrix = cosine_similarity(attack_embeddings, weakness_embeddings)
# # This loop find the anchor-positive pairs based on positive_threshold and anchor-negative pairs based on negative_threshold from the attack_weak_matrix 
# for i, attack_node in enumerate(attack_nodes):
#     for j, weakness_node in enumerate(weakness_nodes):
#         if attack_weak_matrix[i, j] >= positive_threshold:
#             anchor_pos_pair.append((attack_node,weakness_node, attack_weak_matrix[i, j]))
#         if attack_weak_matrix[i, j] <= negative_threshold:
#             anchor_neg_pair.append((attack_node,weakness_node, attack_weak_matrix[i, j]))

# # This loop find the anchor-positive pairs based on positive_threshold and anchor-negative pairs based on negative_threshold from the weak_attack_matrix         
# for i, weakness_node in enumerate(weakness_nodes):
#     for j, attack_node in enumerate(attack_nodes):
#         if weak_attack_matrix[i, j] >= positive_threshold:
#             anchor_pos_pair.append((weakness_node,attack_node, weak_attack_matrix[i, j]))
#         if weak_attack_matrix[i, j] <= negative_threshold:
#             anchor_neg_pair.append((weakness_node,attack_node, weak_attack_matrix[i, j]))
# # Now I sort the positive pairs in descending order and negative pairs in ascending order
# anchor_pos_pair.sort(reverse=True, key=lambda x:x[2])
# anchor_neg_pair.sort(key=lambda x:x[2])
# print(len(anchor_pos_pair))
# print(len(anchor_neg_pair))

# The main purpose of this code is to build (anchor, positive, negative) triplets from the positive and negative pairs as follows:
# for each anchor we choose the most similar positive node, and for that (anchor, positive) pair we choose the least similar (most dissimilar) negative node from the (anchor, negative) pairs.
# Once an (anchor, positive, negative) triplet has been chosen, its negative node must not be reused for the same anchor, so it is removed before the next (anchor, positive) pair is processed.
# This part of the code takes a long time because of the pop operations.

# anchor_pos_neg_triple = []
# anchor_for_neg = [pair[0] for pair in anchor_neg_pair]
# pos_pair = []
# neg_pair = []
# for anchor,pos,val in anchor_pos_pair:
#     if(anchor in anchor_for_neg):
#         idx = anchor_for_neg.index(anchor)
#         anchor_pos_neg_triple.append((anchor,pos,anchor_neg_pair[idx][1]))
#         pos_pair.append((pos,val))
#         neg_pair.append((anchor_neg_pair[idx][1],anchor_neg_pair[idx][2]))
#         anchor_for_neg.pop(idx)
#         anchor_neg_pair.pop(idx)



# anchor_dict = defaultdict(list)
# anchor_pos_neg_triple = []
# pos_pair = []
# neg_pair = []
# for anchor,neg,val in anchor_neg_pair:
#     anchor_dict[anchor].append((anchor,neg,val))
# for anchor,pos,val in anchor_pos_pair:
#     if(anchor in anchor_dict.keys()):
#         idx = random.randrange(0, len(anchor_dict[anchor]))
#         anchor_pos_neg_triple.append((anchor,pos,anchor_dict[anchor][idx][1]))
#         pos_pair.append((pos,val))
#         neg_pair.append((anchor_dict[anchor][idx][1],anchor_dict[anchor][idx][2]))
#         #anchor_dict[anchor].pop(idx)

In [24]:
# dir2 = '../data1/node_similarities/'+current_model

# import matplotlib.pyplot as plt
# import matplotlib.colors as mcolors
# def histogram2(all_pairs):
#     # print(len(cosine_sim_pairs))
#     # print(len(cosine_sim_pairs[0]))
#     # for row in cosine_sim_pairs:
#     #     print(row[0][1], row[1][1])
#     #Extract the cosine similarity values from the filtered results
#     #cosine_sim_values = [pair[1] for row in cosine_sim_pairs for pair in row[-30:]]
#     cosine_sim_values = [pair[1] for pair in all_pairs]
    
#     # Define the bins for the histogram
#     bins = np.arange(0, 1.1, 0.1)  # Bins from 0 to 1 with step size 0.1
    
#     # Create the histogram
#     plt.hist(cosine_sim_values, bins=bins, edgecolor='black')
    
#     # Set the x-axis and y-axis labels
#     plt.xlabel('Cosine Similarity')
#     plt.ylabel('Frequency')
    
#     # Set the title of the histogram
#     plt.title('Attack & Weak Positive')
#     plt.savefig(dir2+'/histogram_pos_t_40.png',dpi=300)
#     # Show the plot
#     plt.show()

In [25]:
# histogram2(pos_pair)

In [26]:
# histogram2(neg_pair)

In [27]:
# import pickle
# with open('../data/pos_neg_sample/anchor_pos_neg_triple_4.pkl', 'wb') as f:
#     pickle.dump(anchor_pos_neg_triple, f)

In [28]:
# with open('../data/pos_neg_sample/anchor_pos_neg_triple_1.pkl', 'rb') as f:
#     x=pickle.load(f)