In [33]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv
from torch_geometric.nn import GCNConv
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import json
import os
import pickle
from torch.optim.lr_scheduler import StepLR
# Expose only the GPU with index 1 to this process (relevant on multi-GPU hosts).
# The variable is set before any CUDA call is made in this notebook.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Prefer CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f'Using device: {device}')

Using device: cuda


In [29]:
# --- Run configuration -------------------------------------------------------
# Text-embedding variants available on disk; `current_model` picks the one used in this run.
models = ["pretrained_SecBert", "SecBert_E5", "pretrained_SecureBert",  "SecureBert_E5", "pt_Gpt2","ft_Gpt2_E5"]
# models = ["pretrained_SecBert", "SecBert", "pretrained_SecureBert",  "SecureBert", "pretrained_Gpt2","Gpt2"]
#models = ["pretrained_SecBert","pretrained_SecureBert","pretrained_Gpt2"]
current_model = models[5]
# GNN backbone selector: "GCN" or "GAT".
#gnn_model = "GAT"
gnn_model = "GCN"
# `sample` selects which anchor/positive/negative file is loaded and names the output directory.
sample = 4
# Margin handed to the contrastive loss.
margin1=1.0
# NOTE(review): margin2 is not read by any live code visible in this notebook — confirm before removing.
margin2=1.0

# --- Input data --------------------------------------------------------------
dir_name = "../../ics_cwe/Text_Hop/"+current_model+"/"
graph_path = "../../graph_network/data/"
# Per-node text embeddings produced for `current_model`.
text_embeddings = np.load(dir_name+'data/all_embeddings.npy')
# Second per-node feature matrix (node2vec features).
#feature_2 = np.load(dir_name+'data/our_embeddings.npy')
feature_2 = np.load(graph_path+'graph_features/node2vec.npy')
with open('../../ics_cwe/id_to_pos.json') as fp:
    id_to_pos = json.load(fp)
with open('../../ics_cwe/pos_to_id.json') as fp:
    pos_to_id = json.load(fp)

with open(graph_path+'combined_nodes.json') as fp:
    nodes_json = json.load(fp)
# SemiSupervised
with open(graph_path+'combined_edges.json') as fp:
    edges_json = json.load(fp)

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load triple files that were generated by a trusted pipeline.
with open(dir_name+'data/anchor_pos_neg_hop_{}.pkl'.format(sample), 'rb') as f:
    anchor_pos_neg_triple=pickle.load(f)

# with open(graph_path+'graph_features/anchor_pos_neg_triple_4_node2vec.npy', 'rb') as f:
#     anchor_pos_neg_triple=pickle.load(f)

# --- Output location ---------------------------------------------------------
gnn_dir = '../../ics_cwe/{}/sample_{}/{}/'.format(gnn_model,sample,current_model)
# exist_ok=True makes the cell idempotent and avoids the check-then-create race.
os.makedirs(gnn_dir, exist_ok=True)
# The template has a single placeholder, so only margin1 is encoded in the file name.
# (A second argument used to be passed here and was silently ignored by str.format.)
out_file = '/text_node2vec_dual_gm_{}.npy'.format(margin1)

In [30]:
# Node-index ranges written as (start, stop) pairs; the second range begins where the first ends.
# NOTE(review): presumably attack nodes occupy [0, 203) and weakness nodes [203, 1136) — confirm.
attack_range = (0, 203)
weak_range = (attack_range[1], 1136)

In [31]:
# Peek at the first training sample; entries are 5-tuples, unpacked below as (a, p, n, hp, hn).
anchor_pos_neg_triple[0]

(64, 689, 279, 18, 195)

In [32]:
# Distinct anchor node indices (first element of every 5-tuple), then how many there are.
unq = {a for a, _, _, _, _ in anchor_pos_neg_triple}
len(unq)

879

In [6]:
# Count the unique anchors on each side of the boundary between the two node ranges.
# The boundary comes from attack_range instead of repeating the literal 203.
range_boundary = attack_range[1]
print(sum(1 for x in unq if x < range_boundary))
print(sum(1 for x in unq if x >= range_boundary))


194
685


In [7]:
# Split the 5-tuples into five parallel index lists, one per tuple position.
# hp_nodes / hn_nodes used to be left empty even though hp and hn were unpacked,
# so any later indexing with them would have selected nothing.
# NOTE(review): hp / hn look like an extra positive / negative pair per anchor — confirm.
anchor_nodes = []
positive_nodes = []
negative_nodes = []
hp_nodes = []
hn_nodes = []
for a,p,n,hp,hn in anchor_pos_neg_triple:
    anchor_nodes.append(a)
    positive_nodes.append(p)
    negative_nodes.append(n)
    hp_nodes.append(hp)
    hn_nodes.append(hn)

In [8]:
# Sanity check: number of negative indices collected (one per triple).
len(negative_nodes)

24332

In [9]:
# Sanity check: number of positive indices collected (one per triple).
len(positive_nodes)

24332

In [10]:
# Sanity check: number of anchor indices collected (one per triple).
len(anchor_nodes)

24332

In [11]:
# # Convert lists to tensors for use in the model
# anchor_text_embeddings = torch.tensor(text_embeddings[anchor_nodes], dtype=torch.float).to(device)
# positive_text_embeddings = torch.tensor(text_embeddings[positive_nodes], dtype=torch.float).to(device)
# negative_text_embeddings = torch.tensor(text_embeddings[negative_nodes], dtype=torch.float).to(device)
# hp_text_embeddings = torch.tensor(text_embeddings[hp_nodes], dtype=torch.float).to(device)
# hn_text_embeddings = torch.tensor(text_embeddings[hn_nodes], dtype=torch.float).to(device)
# Materialise both per-node feature matrices as float32 tensors directly on the target device.
node_text_embeddings = torch.tensor(text_embeddings, dtype=torch.float32, device=device)
node_feature_2 = torch.tensor(feature_2, dtype=torch.float32, device=device)


In [12]:
# Inspect the last five training samples.
anchor_pos_neg_triple[-5:]

[(81, 715, 1096, 187, 180),
 (84, 215, 681, 88, 129),
 (1119, 104, 198, 1111, 745),
 (168, 390, 684, 201, 171),
 (390, 168, 136, 305, 978)]

In [13]:
# All node indices from 0 up to (but excluding) the end of the second node range.
node_list = list(range(weak_range[1]))

In [14]:
# Convert every edge record into an (int, int) pair built from its first two entries.
edge_list = [
    (int(edge[0]), int(edge[1]))
    for edge in edges_json
]

In [15]:
# Inspect the first edge as an (int, int) pair.
edge_list[0]

(0, 1)

In [16]:
# Transpose the list of (int, int) pairs into a contiguous 2 x num_edges long tensor on `device`.
# NOTE(review): edges are used exactly as listed; confirm whether both directions are present in the file.
edge_pairs = torch.tensor(edge_list, dtype=torch.long, device=device)
full_edge_index = edge_pairs.t().contiguous()

In [17]:
# GAT model definition
class GAT(nn.Module):
    """Two-layer graph attention encoder.

    The first layer uses eight attention heads; the second layer takes
    ``hid_dim * 8`` input features and uses a single head that emits
    ``out_channels`` features per node.

    Args:
        in_channels: width of the input node features.
        out_channels: width of the returned node embeddings.
        hid_dim: per-head width of the first layer.
    """

    def __init__(self, in_channels, out_channels, hid_dim):
        super().__init__()
        num_heads = 8
        self.gat1 = GATConv(in_channels, hid_dim, heads=num_heads)
        self.gat2 = GATConv(hid_dim * num_heads, out_channels, heads=1)

    def forward(self, x, edge_index):
        """Apply layer 1, a ReLU, then layer 2 over the same ``edge_index``."""
        hidden = F.relu(self.gat1(x, edge_index))
        return self.gat2(hidden, edge_index)

In [18]:
class GCN(nn.Module):
    """Two-layer graph convolutional encoder with a ReLU between the layers.

    Args:
        in_channels: width of the input node features.
        out_channels: width of the returned node embeddings.
        hid_dim: width of the intermediate representation.
    """

    def __init__(self, in_channels, out_channels, hid_dim):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hid_dim)
        self.conv2 = GCNConv(hid_dim, out_channels)

    def forward(self, x, edge_index):
        """Apply conv1, a ReLU, then conv2 over the same ``edge_index``."""
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

In [19]:
# # define tensors 
# tens_1 = torch.Tensor([[11, 12, 13], [14, 15, 16]]) 
# tens_2 = torch.Tensor([[17, 18, 19], [20, 21, 22]]) 
# tens_3 = torch.Tensor([[37, 38, 39], [30, 31, 32]]) 
# # print first tensors 
# print("tens_1 \n", tens_1) 
  
# # print second tensor 
# print("tens_2 \n", tens_2) 
  
# # call torch,cat() function 
# # join tensor in -1 dimension 
# tens = torch.cat((tens_1, tens_2,tens_3), -1) 
# print("join tensors in the -1 dimension \n", tens) 
  
# # join tensor in 0 dimension 
# tens = torch.cat((tens_1, tens_2,tens_3), dim=1) 
# print("join tensors in the 1 dimension \n", tens) 

In [20]:
class DualEncoder(nn.Module):
    """Fuse a projected text embedding with the outputs of two graph encoders.

    Each node's text embedding is linearly projected to ``out_channels``,
    concatenated with the outputs of ``graph_model1`` and ``graph_model2``,
    and mapped by a final linear layer to ``hidden_dim`` features.

    Args:
        graph_model1: module called as ``graph_model1(graph_features, edge_index)``.
            The fusion layer is sized on the assumption that it returns
            ``out_channels`` features per node.
        graph_model2: module called as ``graph_model2(feature_2, edge_index)``.
            The fusion layer is sized on the assumption that it returns
            ``feature_2_dim`` features per node.
        text_dim: width of the raw text embeddings fed to the projection.
        feature_2_dim: width contributed by ``graph_model2`` to the fusion layer.
        hidden_dim: width of the embedding returned by ``forward``.
        out_channels: width of the text projection. When omitted, the
            module-level variable ``out_channels`` is used, which is what
            earlier versions of this class always did implicitly.

    Raises:
        NameError: if ``out_channels`` is omitted and no module-level
            ``out_channels`` exists.
    """

    def __init__(self, graph_model1,graph_model2, text_dim,feature_2_dim, hidden_dim, out_channels=None):
        super(DualEncoder, self).__init__()
        if out_channels is None:
            # Backward compatibility: fall back to the notebook-level global
            # so existing call sites that do not pass the width keep working.
            module_globals = globals()
            if "out_channels" not in module_globals:
                raise NameError(
                    "out_channels was not passed to DualEncoder and no "
                    "module-level 'out_channels' is defined"
                )
            out_channels = module_globals["out_channels"]
        self.graph_model1 = graph_model1
        self.graph_model2 = graph_model2
        self.fc1 = nn.Linear(text_dim, out_channels)
        #self.fc2 = nn.Linear(feature_2_dim, feature_2_dim)
        # Input width = text projection + graph_model1 output + graph_model2 output.
        self.fc2 = nn.Linear(2*out_channels+feature_2_dim, hidden_dim)

    def forward(self, text_emb,feature_2, graph_features, edge_index):
        """Return the fused embedding for every node.

        Args:
            text_emb: text embeddings passed through the linear projection.
            feature_2: node features fed to ``graph_model2``.
            graph_features: node features fed to ``graph_model1``.
            edge_index: graph connectivity forwarded to both graph encoders.
        """
        graph_emb1 = self.graph_model1(graph_features, edge_index)
        graph_emb2 = self.graph_model2(feature_2, edge_index)
        text_emb = self.fc1(text_emb)
        # feature_2 = self.fc2(feature_2)
        combined_emb = torch.cat([text_emb, graph_emb1, graph_emb2], dim=1)
        combined_emb = self.fc2(combined_emb)
        return combined_emb

In [21]:
# Define contrastive loss
class ContrastiveLoss(nn.Module):
    """Margin-based triplet loss computed on cosine distance.

    For each row the loss is ``relu(d(anchor, positive) - d(anchor, negative) + margin)``
    with ``d = 1 - cosine_similarity``; the result is averaged over all rows.

    Args:
        margin: value added to the distance difference before the ReLU.
    """

    def __init__(self, margin=.5):
        super().__init__()
        self.margin = margin

    @staticmethod
    def _cosine_distance(left, right):
        """Row-wise cosine distance, i.e. one minus the cosine similarity."""
        return 1 - F.cosine_similarity(left, right, dim=1)

    def forward(self, anchor, positive, negative):
        """Return the mean hinge loss over the batch of (anchor, positive, negative) rows."""
        # L2-normalise every row before measuring similarity.
        anchor_unit = F.normalize(anchor, p=2, dim=1)
        positive_unit = F.normalize(positive, p=2, dim=1)
        negative_unit = F.normalize(negative, p=2, dim=1)

        pos_dist = self._cosine_distance(anchor_unit, positive_unit)
        neg_dist = self._cosine_distance(anchor_unit, negative_unit)

        # Hinge: zero once the negative is at least `margin` farther away than the positive.
        hinge = torch.relu(pos_dist - neg_dist + self.margin)
        return hinge.mean()

In [22]:
# Model construction
in_channels = node_text_embeddings.shape[1]
feature_2_dim = node_feature_2.shape[1]
out_channels = 128
hidden_dim = 128
text_dim = in_channels  # Text embedding dimension

# Pick the backbone explicitly. Previously every value other than "GCN" silently
# built a GAT, which also happened if `gnn_model` had been rebound to a non-string
# object earlier in the kernel session.
gnn_classes = {"GCN": GCN, "GAT": GAT}
if not isinstance(gnn_model, str) or gnn_model not in gnn_classes:
    raise ValueError(
        "gnn_model must be 'GCN' or 'GAT', got a value of type "
        + type(gnn_model).__name__
        + "; re-run the configuration cell"
    )
gnn_cls = gnn_classes[gnn_model]

# Encoder 1 works on the text embeddings, encoder 2 on the second feature matrix
# (keeping that matrix's width for both its hidden and output layers).
graph_model1 = gnn_cls(in_channels=in_channels, out_channels=out_channels, hid_dim=128).to(device)
graph_model2 = gnn_cls(in_channels=feature_2_dim, out_channels=feature_2_dim, hid_dim=feature_2_dim).to(device)
print(graph_model2)
model = DualEncoder(graph_model1,graph_model2, text_dim=text_dim,feature_2_dim=feature_2_dim, hidden_dim=hidden_dim).to(device)

GCN(
  (conv1): GCNConv(64, 64)
  (conv2): GCNConv(64, 64)
)


In [23]:
# Loss Function and Optimizer
# The loss uses margin1 from the configuration cell; Adam updates every parameter of `model`.
contrastive_loss1 = ContrastiveLoss(margin=margin1)
contrastive_loss1 = contrastive_loss1.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
#scheduler = StepLR(optimizer, step_size=50, gamma=0.5)


In [24]:
# Training Loop
num_epochs = 200

# Convert the index lists to device tensors once instead of on every epoch.
anchor_index = torch.tensor(anchor_nodes, dtype=torch.long, device=device)
positive_index = torch.tensor(positive_nodes, dtype=torch.long, device=device)
negative_index = torch.tensor(negative_nodes, dtype=torch.long, device=device)

# Nothing inside the loop leaves training mode, so it is enough to set it once.
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    # Full-batch forward pass over every node. The text embeddings serve both as the
    # input of the text projection and as the node features of the first graph encoder.
    # The result is bound to its own name: it used to be assigned to `gnn_model`,
    # overwriting the backbone selector string from the configuration cell.
    all_node_output = model(node_text_embeddings,node_feature_2,node_text_embeddings, full_edge_index)
    anchor_output = all_node_output[anchor_index]
    positive_output = all_node_output[positive_index]
    negative_output = all_node_output[negative_index]
    # hp_output = all_node_output[hp_nodes]
    # hn_output = all_node_output[hn_nodes]
    loss = contrastive_loss1(anchor_output, positive_output, negative_output)
    # loss2 = contrastive_loss1(anchor_output, hp_output, hn_output)
    # loss = loss1+0.00*loss2
    loss.backward()
    #torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    #scheduler.step()
    if(epoch%10==0):
        print(f'Epoch {epoch}, Loss: {loss.item()}')

Epoch 0, Loss: 0.9214301705360413
Epoch 10, Loss: 0.0831502228975296
Epoch 20, Loss: 0.060156360268592834
Epoch 30, Loss: 0.04526502639055252
Epoch 40, Loss: 0.036833591759204865
Epoch 50, Loss: 0.03131534531712532
Epoch 60, Loss: 0.027542781084775925
Epoch 70, Loss: 0.024541370570659637
Epoch 80, Loss: 0.022278528660535812
Epoch 90, Loss: 0.02049735002219677
Epoch 100, Loss: 0.019041260704398155
Epoch 110, Loss: 0.01785700023174286
Epoch 120, Loss: 0.016880078241229057
Epoch 130, Loss: 0.016047609969973564
Epoch 140, Loss: 0.015312112867832184
Epoch 150, Loss: 0.014691143296658993
Epoch 160, Loss: 0.014114088378846645
Epoch 170, Loss: 0.013577640056610107
Epoch 180, Loss: 0.01316805835813284
Epoch 190, Loss: 0.01267700083553791


In [25]:
# Extract embeddings for all nodes
# node_text_embeddings = torch.tensor(text_embeddings, dtype=torch.float).to(device)
# full_edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous().to(device)

def extract_embeddings(model, node_embeddings,node_feature_2, edge_index):
    """Run ``model`` in evaluation mode without gradients and return its output.

    Args:
        model: callable module invoked as
            ``model(text_emb, feature_2, graph_features, edge_index)``.
        node_embeddings: node features passed as both ``text_emb`` and
            ``graph_features``.
        node_feature_2: second node-feature matrix passed as ``feature_2``.
        edge_index: graph connectivity forwarded unchanged to ``model``.

    Returns:
        Whatever ``model`` returns; no gradient is tracked for it.

    Note:
        ``model`` is left in evaluation mode after the call.
    """
    model.eval()
    with torch.no_grad():
        # Use the argument for the graph features too. The previous version read the
        # notebook-level `node_text_embeddings` here, ignoring what the caller passed.
        embeddings = model(node_embeddings,node_feature_2,node_embeddings, edge_index)
    return embeddings

# Embed every node with the trained model, then report the shape of the result.
final_node_embeddings = extract_embeddings(
    model=model,
    node_embeddings=node_text_embeddings,
    node_feature_2=node_feature_2,
    edge_index=full_edge_index,
)

print(final_node_embeddings.shape)

torch.Size([1136, 128])


In [26]:
# import numpy as np
# import os
# # Assuming node_embeddings is a PyTorch tensor
# node_embeddings_np = final_node_embeddings.detach().cpu().numpy()

# nn_dir = '../data/GAT_embeddings/'+current_model
# if not os.path.exists(nn_dir):
#     os.makedirs(nn_dir)
# np.save(nn_dir+'/node_embeddings_gm_{}_tm_{}.npy'.format(margin1,margin2), np.array(node_embeddings_np)) # save

In [27]:
# import numpy as np
# import os
# # Assuming node_embeddings is a PyTorch tensor
# node_embeddings_np = final_node_embeddings.detach().cpu().numpy()

# nn_dir = '../data/sample_{}/GAN_embeddings/'.format(sample)+current_model
# if not os.path.exists(nn_dir):
#     os.makedirs(nn_dir)
# np.save(nn_dir+'/node_embeddings_gm_{}.npy'.format(margin1), np.array(node_embeddings_np)) # save

In [28]:

# Bring the final embeddings to host memory as a NumPy array and write them to
# the output directory under the configured file name.
node_embeddings_np = final_node_embeddings.detach().cpu().numpy()
output_path = gnn_dir + out_file
np.save(output_path, node_embeddings_np)