In [1]:
import numpy as np
import os
import json

In [2]:
# Model variants that have precomputed text embeddings on disk.
# NOTE(review): the "pt_" / "ft_..._E10" prefixes presumably mean pretrained vs.
# fine-tuned for 10 epochs — confirm against the embedding-generation step.
models = [
    "pt_SecBert",
    "ft_SecBert_E10",
    "pt_SecureBert",
    "ft_SecureBert_E10",
    "pt_Gpt2",
    "ft_Gpt2_E10",
]
# Alternative model lists kept for reference:
# models = ["pretrained_SecBert", "SecBert", "pretrained_SecureBert",  "SecureBert", "pretrained_Gpt2","Gpt2"]
#models = ["pretrained_SecBert","pretrained_SecureBert","pretrained_Gpt2"]

# Change the index to process a different model; every path below depends on it.
current_model = models[0]
dir_name = f"../../ics_cwe/Text_Hop/{current_model}/"

# One embedding vector per row; the recorded run loaded an array of shape (1136, 768).
text_embeddings = np.load(f"{dir_name}data/all_embeddings.npy")

# JSON lookup tables shared by all models.
# NOTE(review): presumably they map node ids <-> row positions in the embedding
# matrix — verify against the code that wrote them.
with open('../../ics_cwe/id_to_pos.json') as id_file:
    id_to_pos = json.load(id_file)
with open('../../ics_cwe/pos_to_id.json') as pos_file:
    pos_to_id = json.load(pos_file)

In [3]:
# Sanity check: the embedding matrix and both lookup tables should describe the
# same number of nodes.
for label, value in (
    ("text_embedding shape", text_embeddings.shape),
    ("id_to_pos len", len(id_to_pos)),
    ("pos_to_id len", len(pos_to_id)),
):
    print(label, value)

text_embedding shape (1136, 768)
id_to_pos len 1136
pos_to_id len 1136


In [4]:
# Row layout of the embedding / similarity matrices, as half-open [start, stop)
# position ranges: the first 203 rows form the "attack" group and the remaining
# rows up to total_nodes form the "weak" group.
# NOTE(review): "attack" / "weak" presumably refer to attack patterns and CWE
# weaknesses — confirm against the data preparation step.
total_nodes = 1136
attack_range = (0, 203)
# The weak group starts where the attack group ends and runs to the last node.
weak_range = (attack_range[1], total_nodes)

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

# All-pairs cosine similarity between embedding rows: entry [i, j] compares
# row i of text_embeddings with row j.
cosine_sim_matrix = cosine_similarity(text_embeddings)

# Sanity check: expect a square matrix with one row and one column per embedding
# (the recorded run printed (1136, 1136)).
sim_matrix_shape = cosine_sim_matrix.shape
print(sim_matrix_shape)

(1136, 1136)


In [6]:
# Prepare the per-model output directory; the similarity matrix and the ranked
# neighbour lists are written into it by later cells.
import pickle
import os
dir1 = '../data/node_similarities/'+current_model

# exist_ok=True creates missing parent directories and is a no-op when the
# directory already exists, which avoids the race in a separate
# "exists? then create" check and keeps the cell safe to re-run.
os.makedirs(dir1, exist_ok=True)


In [7]:
# Persist the full pairwise similarity matrix for this model in NumPy .npy format.
cosine_sim_path = dir1+"/cosine_sim_matrix.npy"
np.save(cosine_sim_path, cosine_sim_matrix)

In [8]:
# Build, for every node, its cross-group neighbours ranked by cosine similarity.
#
# attack_sorted_cos_sim[k] belongs to matrix row k (rows below attack_range[1])
#   and lists only positions inside weak_range.
# weak_sorted_cos_sim[k] belongs to matrix row attack_range[1] + k
#   and lists only positions inside attack_range.
# Each list holds (position, similarity) pairs, most similar first.
attack_sorted_cos_sim = []
weak_sorted_cos_sim = []

for node_pos in range(total_nodes):
    similarities = cosine_sim_matrix[node_pos]

    # argsort of the negated row gives positions in descending similarity order.
    ranking = np.argsort(-similarities)

    # Pick the opposite group's position range and the list this row feeds.
    if node_pos < attack_range[1]:
        (low, high), bucket = weak_range, attack_sorted_cos_sim
    else:
        (low, high), bucket = attack_range, weak_sorted_cos_sim

    # Boolean masking keeps the ranking order while dropping positions that
    # fall outside the half-open range [low, high).
    kept = ranking[(ranking >= low) & (ranking < high)]
    bucket.append(list(zip(kept, similarities[kept])))

In [9]:
# Inspect the ranked (position, similarity) pairs for matrix row 0, the first
# attack-range row; every listed position lies inside weak_range.
attack_sorted_cos_sim[0]

[(786, 0.48260426019143887),
 (954, 0.4760597306317078),
 (1042, 0.47399487262186674),
 (735, 0.471611703096182),
 (511, 0.46778447882420815),
 (451, 0.46514918859062127),
 (484, 0.46452238887808345),
 (949, 0.46008592036037477),
 (408, 0.4543397878650483),
 (886, 0.4533238400315741),
 (556, 0.45313828574987297),
 (397, 0.45283934148006955),
 (513, 0.45039125952992454),
 (900, 0.4486267545358028),
 (927, 0.44824934865931504),
 (409, 0.4472712161313824),
 (647, 0.44686397859819194),
 (743, 0.4451673411024548),
 (322, 0.4444848305479097),
 (1085, 0.4440125493220355),
 (628, 0.44395218508276),
 (227, 0.4439311558333206),
 (460, 0.4427594734498736),
 (734, 0.4426056463081848),
 (956, 0.4424384604364694),
 (320, 0.4417484198095636),
 (274, 0.4409880599322887),
 (327, 0.4408317306418845),
 (329, 0.43985861527548614),
 (955, 0.43979111606716137),
 (562, 0.4395455263177688),
 (1087, 0.43919931936714285),
 (778, 0.43840430570234357),
 (540, 0.43818295223402526),
 (856, 0.4381660421339033),
 (34

In [10]:
# Inspect the ranked (position, similarity) pairs for the first weak-range row
# (matrix row attack_range[1]); every listed position lies inside attack_range.
weak_sorted_cos_sim[0]

[(103, 0.6443878879615088),
 (167, 0.6414710619420096),
 (163, 0.6379178929485558),
 (77, 0.6134646371202682),
 (1, 0.5956856494236009),
 (123, 0.5920130126095936),
 (104, 0.5915243925634115),
 (132, 0.5872649695356351),
 (137, 0.5866268034117431),
 (24, 0.5859777100668276),
 (145, 0.5858980545101728),
 (31, 0.5806862396964889),
 (126, 0.58000790285583),
 (106, 0.5787812253025693),
 (162, 0.5768999652035027),
 (177, 0.5748866146193221),
 (143, 0.5702059376331501),
 (94, 0.5672758576196133),
 (83, 0.5647805329985032),
 (19, 0.5637676031155324),
 (50, 0.5606736806791077),
 (117, 0.558433424882804),
 (5, 0.5562756499561834),
 (25, 0.554348014634201),
 (150, 0.5541896908271438),
 (148, 0.5531273529954348),
 (64, 0.551557694773157),
 (135, 0.5511985785127455),
 (35, 0.5511817413588477),
 (7, 0.5490237071556672),
 (46, 0.5471625100745332),
 (147, 0.545803644381035),
 (69, 0.5440995651641666),
 (48, 0.5417187047057463),
 (170, 0.5412359252917532),
 (14, 0.5402138752939396),
 (29, 0.5389134375

In [11]:
# Serialize the attack-side rankings (one list of (position, similarity) pairs
# per attack-range row) next to the similarity matrix.
attack_pickle_path = dir1+'/attack_text.pkl'
with open(attack_pickle_path, 'wb') as pickle_out:
    pickle.dump(attack_sorted_cos_sim, pickle_out)

In [12]:
# Serialize the weak-side rankings (one list of (position, similarity) pairs
# per weak-range row) next to the similarity matrix.
weak_pickle_path = dir1+'/weak_text.pkl'
with open(weak_pickle_path, 'wb') as pickle_out:
    pickle.dump(weak_sorted_cos_sim, pickle_out)

In [13]:
# Analysis: spot-check the ranked similarity lists built above

In [14]:
# Number of weak-range candidates ranked for row 0; expected to equal
# weak_range[1] - weak_range[0] (933 in the recorded run).
len(attack_sorted_cos_sim[0])

933

In [15]:

# Top-ranked (position, similarity) pair for row 0, i.e. its most similar
# weak-range node.
attack_sorted_cos_sim[0][0]

(786, 0.48260426019143887)

In [16]:
# Full ranked list for row 0 again (same expression as the inspection cell that
# follows the ranking loop); the output is long, 933 pairs in the recorded run.
attack_sorted_cos_sim[0]

[(786, 0.48260426019143887),
 (954, 0.4760597306317078),
 (1042, 0.47399487262186674),
 (735, 0.471611703096182),
 (511, 0.46778447882420815),
 (451, 0.46514918859062127),
 (484, 0.46452238887808345),
 (949, 0.46008592036037477),
 (408, 0.4543397878650483),
 (886, 0.4533238400315741),
 (556, 0.45313828574987297),
 (397, 0.45283934148006955),
 (513, 0.45039125952992454),
 (900, 0.4486267545358028),
 (927, 0.44824934865931504),
 (409, 0.4472712161313824),
 (647, 0.44686397859819194),
 (743, 0.4451673411024548),
 (322, 0.4444848305479097),
 (1085, 0.4440125493220355),
 (628, 0.44395218508276),
 (227, 0.4439311558333206),
 (460, 0.4427594734498736),
 (734, 0.4426056463081848),
 (956, 0.4424384604364694),
 (320, 0.4417484198095636),
 (274, 0.4409880599322887),
 (327, 0.4408317306418845),
 (329, 0.43985861527548614),
 (955, 0.43979111606716137),
 (562, 0.4395455263177688),
 (1087, 0.43919931936714285),
 (778, 0.43840430570234357),
 (540, 0.43818295223402526),
 (856, 0.4381660421339033),
 (34