In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import json
import torch.optim as optim
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import os

In [2]:
# Expose only the GPU with index 1 to this process. The variable is set before
# the first torch.cuda call below so that it is in place when CUDA is queried.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Prefer a visible GPU; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f'Using device: {device}')

Using device: cuda


In [3]:
# Locations of the pre-computed text embeddings and of the dataset files.
embeddings_dir = "../../model_outputs/ics_attack/embeddings/"
models = ["pt_gpt2-xl","gpt2-xl/Epoch_5","gpt2-xl/epoch_10"]
data_dir = "../../datasets/ics_attack/"
# The third entry of `models` selects which embedding directory is used.
# NOTE(review): "epoch_10" is lower-case while "Epoch_5" is capitalised --
# confirm the directory name on case-sensitive filesystems.
text_emb_dir = embeddings_dir+models[2]+"/"

# Two JSON lookup tables. Presumably they are inverse maps between document
# ids and embedding row ids (inferred from the file names) -- TODO confirm.
with open(data_dir+'doc_id_to_emb_id.json') as f:
    doc_id_to_emb_id = json.load(f)
with open(data_dir+'emb_id_to_doc_id.json') as f:
    emb_id_to_doc_id = json.load(f)

training_data = np.load(data_dir+'hop_training_data.npy')
text_embeddings = np.load(text_emb_dir+'text_embeddings.npy')

# Every row is read as (first id, second id, weight). Fail early with a clear
# message instead of an unpacking error deep inside a comprehension.
assert training_data.ndim == 2 and training_data.shape[1] == 3, (
    "hop_training_data.npy must have shape (n_pairs, 3), got {}".format(training_data.shape)
)
# Column slicing replaces two Python-level passes over every row.
# list(...) keeps the weights as NumPy scalars; astype(int) truncates toward
# zero like int(), and tolist() yields plain [int, int] lists.
weights = list(training_data[:, 2])
pairs = training_data[:, :2].astype(int).tolist()

In [28]:
# Split the (first, second) index pairs into two parallel lists.
obj_set = [first for first, _ in pairs]
obj_set2 = [second for _, second in pairs]
# Displayed as the cell result: the number of pairs.
len(obj_set)

455281

In [29]:
import random
def shuffle_two_arrays(array1, array2, seed=None):
    """Shuffle two parallel sequences with one shared random permutation.

    Elements that sit at the same position in ``array1`` and ``array2`` still
    sit at the same (new) position in the two returned lists. The inputs are
    not modified.

    Args:
        array1: First sequence.
        array2: Second sequence. Items are paired with ``zip``, so if the
            lengths differ the extra items of the longer input are dropped.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            performs the shuffle so the result is reproducible and the global
            random state is left untouched. When ``None`` (default) the
            module-level ``random.shuffle`` is used, as before.

    Returns:
        A tuple ``(shuffled_array1, shuffled_array2)`` of two lists.
    """
    combined = list(zip(array1, array2))
    if not combined:
        # zip(*[]) yields nothing, so the two-way unpack below would raise.
        return [], []
    if seed is None:
        random.shuffle(combined)
    else:
        random.Random(seed).shuffle(combined)
    shuffled_array1, shuffled_array2 = zip(*combined)
    return list(shuffled_array1), list(shuffled_array2)
spairs, sweights = shuffle_two_arrays(pairs, weights)
# X holds the shuffled index pairs (data to train with), Y the matching
# weights (value to predict).
X_training = np.array(spairs)
# Flatten the targets to 1-D. reshape(-1) keeps a single-element input
# one-dimensional, whereas reshape(-1, 1).squeeze() would collapse it to a
# 0-d array.
Y_training = np.array(sweights).reshape(-1)
print(Y_training.shape)

# First and second index of every pair, as separate 1-D columns.
X1_training = X_training[:, 0]
X2_training = X_training[:, 1]
# The first label reports the shape of a single index column (X1_training).
print("X_training shape:",X1_training.shape)
print("Y_training shape:",Y_training.shape)

(455281,)
X_training shape: (455281,)
Y_training shape: (455281,)


In [30]:
# --- Sizes derived from the loaded data ---
# Number of entries in the doc-id -> embedding-id lookup table.
TOTAL_OBJECTS = len(doc_id_to_emb_id)
# Length of one row of the pre-computed text embeddings.
EMBEDDING_DIM_1 = text_embeddings.shape[1]

# --- Network widths ---
HIDDEN_DIM = 256        # output width of the first linear layer
EMBEDDING_DIM_2 = 64    # output width of the second linear layer

# --- Training configuration ---
RUN_NAME = "run 1 with 200 epoches"
training_epochs = 200   # number of passes of the training loop
learning_rate = 0.001   # learning rate handed to the optimizer
SCALE_FACTOR = 1        # multiplier applied to both predicted distances



In [31]:
class NeuralNetwork(nn.Module):
    """Projects frozen text embeddings and scores pairs of embedding ids.

    A frozen ``nn.Embedding`` holds the pre-computed text embeddings. A small
    MLP (Linear -> ReLU -> Linear -> ReLU) maps every row to a lower-dimensional
    vector. For a batch of id pairs the forward pass returns the scaled cosine
    distance between the two projected vectors and the scaled cosine distance
    between the two original text embeddings.

    Args:
        pretrained_embeddings: 2-D array-like or tensor of text embeddings, one
            row per object. Defaults to the notebook-level ``text_embeddings``.
        hidden_dim: Width of the hidden layer. Defaults to ``HIDDEN_DIM``.
        output_dim: Width of the projected embedding. Defaults to
            ``EMBEDDING_DIM_2``.
        scale_factor: Multiplier applied to both distances. When ``None`` the
            notebook-level ``SCALE_FACTOR`` is read on every forward pass.
    """

    def __init__(self, pretrained_embeddings=None, hidden_dim=None,
                 output_dim=None, scale_factor=None):
        super().__init__()
        # Fall back to the notebook-level configuration so the zero-argument
        # call NeuralNetwork() keeps working unchanged.
        if pretrained_embeddings is None:
            pretrained_embeddings = text_embeddings
        if hidden_dim is None:
            hidden_dim = HIDDEN_DIM
        if output_dim is None:
            output_dim = EMBEDDING_DIM_2
        self.scale_factor = scale_factor

        embedding_matrix = torch.as_tensor(pretrained_embeddings, dtype=torch.float32)
        # Frozen lookup table: these weights receive no gradient.
        self.text_embedding_layer = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
        # The input width comes from the matrix itself, so it cannot drift out
        # of sync with a separately maintained constant.
        self.hidden_layers = nn.Sequential(
            nn.Linear(embedding_matrix.shape[1], hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
            nn.ReLU(),
        )

    @staticmethod
    def _cosine_distance(left, right):
        """Return 1 - cosine similarity between matching rows of two 2-D tensors."""
        left = F.normalize(left, p=2, dim=1)
        right = F.normalize(right, p=2, dim=1)
        return 1 - F.cosine_similarity(left, right, dim=1, eps=1e-6)

    def forward(self, X1, X2):
        """Compute both distances for a batch of id pairs.

        Args:
            X1: 1-D integer tensor with the first id of every pair.
            X2: 1-D integer tensor with the second id of every pair.

        Returns:
            ``(hop_dist_predict, text_dist_predict)``: two 1-D tensors with one
            entry per pair. The first comes from the trainable projection, the
            second from the frozen text embeddings.
        """
        scale = SCALE_FACTOR if self.scale_factor is None else self.scale_factor

        # Project the whole embedding table once, then pick the rows needed.
        projected = self.hidden_layers(self.text_embedding_layer.weight)
        hop_dist_predict = scale * self._cosine_distance(projected[X1], projected[X2])

        text_dist_predict = scale * self._cosine_distance(
            self.text_embedding_layer(X1), self.text_embedding_layer(X2)
        )
        return hop_dist_predict, text_dist_predict

In [32]:
# Build the network and move it to the selected device in one step.
model = NeuralNetwork().to(device)

# Loss: squared errors are summed over all pairs (not averaged).
criterion = nn.MSELoss(reduction='sum')

# Optimizer: Adam over everything returned by model.parameters().
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [33]:
# X1_training, X2_training and Y_training are expected to be numpy arrays.
# The id columns become int64 tensors and the targets float32 tensors, all
# created directly on the selected device.
X1 = torch.tensor(X1_training, dtype=torch.long, device=device)
X2 = torch.tensor(X2_training, dtype=torch.long, device=device)
Y = torch.tensor(Y_training, dtype=torch.float32, device=device)

In [34]:
# Full-batch training: every epoch feeds all pairs through the model once.
# alpha weights the loss against the target Y; (1 - alpha) weights the loss
# that ties the projected distance to the text-embedding distance.
alpha = 0.5
model.train()
for epoch in range(training_epochs):
    optimizer.zero_grad()

    hop_dist_predict, text_dist_predict = model(X1, X2)

    # MSE is symmetric in its two arguments; (input, target) order is used here.
    loss1 = criterion(hop_dist_predict, Y)
    loss2 = criterion(hop_dist_predict, text_dist_predict)
    cost = (alpha)*loss1 + (1-alpha)*loss2

    cost.backward()
    optimizer.step()

    # Report the cost on every 20th epoch, starting with epoch 0.
    if not epoch % 20:
        print("Epoch: {} - Training Cost: {}".format(epoch, cost.item()))

Epoch: 0 - Training Cost: 35812.890625
Epoch: 20 - Training Cost: 7921.5458984375
Epoch: 40 - Training Cost: 6284.50341796875
Epoch: 60 - Training Cost: 5597.91796875
Epoch: 80 - Training Cost: 5192.03173828125
Epoch: 100 - Training Cost: 4921.77783203125
Epoch: 120 - Training Cost: 4996.1162109375
Epoch: 140 - Training Cost: 4630.00341796875
Epoch: 160 - Training Cost: 4500.91650390625
Epoch: 180 - Training Cost: 4470.48193359375


In [35]:
# Save the embeddings
# nn.Module.cpu() moves the module in place, so one call is enough. Note that
# `model` stays on the CPU afterwards.
model = model.cpu()
# No gradients are needed for export, so skip building the autograd graph.
with torch.no_grad():
    embeddings1_val = model.text_embedding_layer.weight.detach().numpy()
    embeddings2_val = model.hidden_layers(model.text_embedding_layer.weight).numpy()
# NOTE(review): this float64 copy is not what gets written below; the file
# receives embeddings2_val in its own dtype. Confirm which precision is wanted.
text_hop_embeddings = np.array(embeddings2_val, dtype="float64")
output_path = text_emb_dir+"text_hop_embeddings.npy"
print("save in ", output_path)
np.save(output_path, embeddings2_val)

save in  ../model_outputs/ics_attack/llm_finetuned_models/gpt2-xl/Epoch_10/text_hop_embeddings.npy


In [None]:
# from sklearn.metrics.pairwise import cosine_similarity
# def cos_sim(embeddings_val):
#     # Compute cosine similarity between all pairs of embeddings
#     cosine_distances = cosine_similarity(embeddings_val)

#     # Print cosine distances
#     print("Cosine distances between all pairs of embeddings:")
#     for i in range(TOTAL_OBJECTS):
#         for j in range(i+1, i+5):
#             if(j<TOTAL_OBJECTS):
#                 print(f"Pair ({i}, {j}): {cosine_distances[i][j]}")

In [None]:
# cos_sim(embeddings2_val)

In [None]:
# from sklearn.metrics.pairwise import cosine_similarity
# import numpy as np
# embedding_size=64
# def get_cosine_sim(all_embeddings, file):
#     f = open(file, "a")
    
#     knnDict = {}
#     all_sim = []
#     for i in range(len(all_embeddings)):
#         x = np.array(all_embeddings[i]).reshape(1,embedding_size)
#         #simList = []
#         for j in range(i+1,len(all_embeddings)):
#             if(i==j):
#                 continue
#             y = np.array(all_embeddings[j]).reshape(1,embedding_size)
            
#             cos_sim = cosine_similarity(x, y)
#             #simList.append([all_embeddings[j]['simple_id'], cos_sim])
#             all_sim.append(cos_sim[0][0])
#     #     y= sorted(simList,key=lambda l:l[1], reverse=True)
#     #     objName = all_embeddings[i]['simple_id']
#     #     #print('Cosine similarity of *', objName)
#     #     f.write(f"\n___ Cosine similarity of * {objName}____\n")
#     #     knnList = []
#     #     for i in range(0,10):
#     #         #print( y[i][0], ': ',"%.4f" % y[i][1][0][0])
#     #         f.write(f"{y[i][0]} : {y[i][1][0][0]}\n")
#     #         knnList.append(y[i][0]+" "+str(round(y[i][1][0][0],4)))
#     #     knnDict[objName]=knnList
#     # f.close()
#     np.save(file, np.array(all_sim, dtype=object), allow_pickle=True)
#     #b = np.load('a.npy', allow_pickle=True)
#     return all_sim
#     #print("#####################################\n")

In [None]:
# all_sim= get_cosine_sim(embeddings2_val, "embeddings_cos_sim.npy")

In [None]:
# import matplotlib.pyplot as plt

# def histogram2(all_sim,file):
#     # Plot histogram with 20 bins
#     plt.hist(all_sim, bins=20, edgecolor='black')
    
#     # Add labels and title
#     plt.xlabel('Values')
#     plt.ylabel('Frequency')
#     plt.title('Cosine Similarity Distribution of Generated Embeddings')
#     plt.savefig(file, dpi=300)
#     # Show plot
#     plt.show()

In [None]:
# histogram2(all_sim,"synthetic_embedding_cos_sim_distribution.png")