In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

In [2]:
# Generator half of the conditional GAN
class Generator(nn.Module):
    """Two-layer MLP that maps a text embedding to a graph embedding.

    Args:
        text_embedding_dim: size of the last dimension of the input.
        graph_embedding_dim: size of the last dimension of the output.
        hidden_dim: width of the single hidden layer.
    """

    def __init__(self, text_embedding_dim, graph_embedding_dim, hidden_dim):
        super().__init__()
        # Layers are created in a fixed order and stored under `self.model`,
        # so parameter names stay `model.0.*` and `model.2.*`.
        layers = [
            nn.Linear(text_embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, graph_embedding_dim),
            nn.Tanh(),  # squashes every output component into [-1, 1]
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, text_embedding):
        """Return the generated graph embedding for `text_embedding`."""
        return self.model(text_embedding)

In [3]:
# Discriminator half of the conditional GAN
class Discriminator(nn.Module):
    """MLP that scores a (text embedding, graph embedding) pair.

    The two embeddings are concatenated along dim 1 and reduced to a single
    value in [0, 1] by a final Sigmoid.

    Args:
        text_embedding_dim: size of dim 1 of the text embedding input.
        graph_embedding_dim: size of dim 1 of the graph embedding input.
        hidden_dim: width of the single hidden layer.
    """

    def __init__(self, text_embedding_dim, graph_embedding_dim, hidden_dim):
        super().__init__()
        joint_dim = text_embedding_dim + graph_embedding_dim
        layers = [
            nn.Linear(joint_dim, hidden_dim),
            nn.ReLU(),
            # Dropout keeps the discriminator from becoming too confident
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, text_embedding, graph_embedding):
        """Return the validity score for each (text, graph) pair in the batch."""
        pair = torch.cat((text_embedding, graph_embedding), dim=1)
        return self.model(pair)

In [4]:
# Define a custom dataset
class EmbeddingDataset(Dataset):
    """Dataset of index-aligned (text embedding, graph embedding) pairs.

    Both inputs are copied into float32 tensors. Item ``idx`` is the pair
    ``(text_embeddings[idx], graph_embeddings[idx])``.

    Args:
        text_embeddings: array-like of text embeddings, one row per item.
        graph_embeddings: array-like of graph embeddings, one row per item.

    Raises:
        ValueError: if the two inputs do not contain the same number of rows.
    """

    def __init__(self, text_embeddings, graph_embeddings):
        self.text_embeddings = torch.tensor(text_embeddings, dtype=torch.float32)
        self.graph_embeddings = torch.tensor(graph_embeddings, dtype=torch.float32)
        # __len__ is driven by the text embeddings alone, so a row-count
        # mismatch would either silently drop graph rows or raise IndexError
        # part-way through an epoch. Fail early with a clear message instead.
        if len(self.text_embeddings) != len(self.graph_embeddings):
            raise ValueError(
                "text_embeddings and graph_embeddings must have the same number "
                f"of rows, got {len(self.text_embeddings)} and {len(self.graph_embeddings)}"
            )

    def __len__(self):
        """Return the number of (text, graph) pairs."""
        return len(self.text_embeddings)

    def __getitem__(self, idx):
        """Return the ``(text_embedding, graph_embedding)`` pair at ``idx``."""
        text_embedding = self.text_embeddings[idx]
        graph_embedding = self.graph_embeddings[idx]
        return text_embedding, graph_embedding

In [5]:
import json
import numpy as np
# Number of entries read from the id -> description mapping
total_obj = 1136

with open('../../ics_cwe/id_to_desc.json') as fp:
    id_to_desc = json.load(fp)

# The JSON keys are the stringified integer ids 0 .. total_obj - 1
paragraphs = [id_to_desc[str(obj_id)] for obj_id in range(total_obj)]

# Pre-computed embeddings stored as .npy files
graph_embeddings = np.load(
    "../../ics_cwe/{}/sample_{}/{}/text_hop_dual_gm_1.0.npy".format("GCN",4,"pt_Gpt2")
)
text_embeddings = np.load(
    "../../ics_cwe/Text_Hop/{}/data/all_embeddings.npy".format("pt_Gpt2")
)

In [6]:
# Hyperparameters
# Seed torch's RNG so weight initialisation, DataLoader shuffling and dropout
# draw from a known starting state and runs can be repeated.
seed = 42
torch.manual_seed(seed)

text_embedding_dim = text_embeddings.shape[1]  # size of dim 1 of the loaded text embeddings
graph_embedding_dim = graph_embeddings.shape[1]  # size of dim 1 of the loaded graph embeddings
hidden_dim = 128
lr = 0.0002
batch_size = 64
shuffle = True
epochs = 10000
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
# Pair the text and graph embeddings in a Dataset ...
embedding_dataset = EmbeddingDataset(
    text_embeddings=text_embeddings,
    graph_embeddings=graph_embeddings,
)
# ... and serve them in (optionally shuffled) mini-batches
embedding_dataloader = DataLoader(
    dataset=embedding_dataset,
    batch_size=batch_size,
    shuffle=shuffle,
)

In [8]:
# Build both networks (generator first, then discriminator) and place them on `device`
generator = Generator(
    text_embedding_dim=text_embedding_dim,
    graph_embedding_dim=graph_embedding_dim,
    hidden_dim=hidden_dim,
).to(device)
discriminator = Discriminator(
    text_embedding_dim=text_embedding_dim,
    graph_embedding_dim=graph_embedding_dim,
    hidden_dim=hidden_dim,
).to(device)


In [9]:
# Loss and optimizers
# Binary cross-entropy on the discriminator's Sigmoid output
adversarial_loss = nn.BCELoss().to(device)

# Both optimizers take their learning rate from the `lr` hyperparameter
# (previously a hard-coded 0.0002 was repeated here and `lr` was ignored).
adam_betas = (0.5, 0.999)
optimizer_G = optim.Adam(generator.parameters(), lr=lr, betas=adam_betas)
optimizer_D = optim.Adam(discriminator.parameters(), lr=lr, betas=adam_betas)


In [None]:
# Training the cGAN
# The number of epochs comes from `epochs` in the hyperparameters cell; it is
# not redefined here so there is a single place to change it.

# Put both networks in training mode. This matters for the discriminator's
# Dropout layer and keeps the cell safe to re-run after the generator has been
# switched to eval mode for inference.
generator.train()
discriminator.train()

for epoch in range(epochs):
    for real_text_embeddings, real_graph_embeddings in embedding_dataloader:
        # Size of this batch (the last batch of an epoch may be smaller).
        # Stored under its own name so the global `batch_size` hyperparameter
        # is not overwritten by the loop.
        current_batch_size = real_text_embeddings.size(0)

        # Move data to device
        real_text_embeddings = real_text_embeddings.to(device)
        real_graph_embeddings = real_graph_embeddings.to(device)

        # Adversarial ground truths with label smoothing: "real" is 0.9, "fake" is 0
        valid = torch.full((current_batch_size, 1), 0.9, device=device)
        fake = torch.zeros((current_batch_size, 1), device=device)

        # -----------------
        #  Train Generator
        # -----------------
        optimizer_G.zero_grad()

        # Generate graph embeddings conditioned on the text embeddings
        generated_graph_embeddings = generator(real_text_embeddings)

        # Loss measures the generator's ability to fool the discriminator.
        # NOTE(review): the generator is also scored against the smoothed 0.9
        # target; one-sided label smoothing is usually applied only to the
        # discriminator's real targets - confirm this is intended.
        g_loss = adversarial_loss(
            discriminator(real_text_embeddings, generated_graph_embeddings), valid
        )

        g_loss.backward()
        optimizer_G.step()

        # ---------------------
        #  Train Discriminator
        # ---------------------
        # Also clears the discriminator gradients accumulated by g_loss.backward()
        optimizer_D.zero_grad()

        # Loss for real graph embeddings
        real_loss = adversarial_loss(
            discriminator(real_text_embeddings, real_graph_embeddings), valid
        )
        # Loss for fake graph embeddings; detach() keeps this backward pass
        # from propagating into the generator
        fake_loss = adversarial_loss(
            discriminator(real_text_embeddings, generated_graph_embeddings.detach()), fake
        )
        # Total discriminator loss
        d_loss = (real_loss + fake_loss) / 2

        d_loss.backward()
        optimizer_D.step()

    # Report every 100th epoch; the values shown are from the epoch's last batch
    if epoch % 100 == 0:
        print(f"Epoch [{epoch}/{epochs}]  D Loss: {d_loss.item()}  G Loss: {g_loss.item()}")


Epoch [0/10000]  D Loss: 0.6525412201881409  G Loss: 0.8440119028091431
Epoch [100/10000]  D Loss: 0.5759603977203369  G Loss: 1.0861787796020508
Epoch [200/10000]  D Loss: 0.4503016769886017  G Loss: 1.7135488986968994
Epoch [300/10000]  D Loss: 0.38879868388175964  G Loss: 2.5334699153900146
Epoch [400/10000]  D Loss: 0.32022085785865784  G Loss: 4.027320861816406
Epoch [500/10000]  D Loss: 0.3064548671245575  G Loss: 4.074382781982422
Epoch [600/10000]  D Loss: 0.3373211622238159  G Loss: 5.704192161560059


In [None]:
# Destination file for the trained generator
model_save_path = 'generator_model.pth'

# Only the parameters (state dict) are written, not the module object itself
generator_weights = generator.state_dict()
torch.save(generator_weights, model_save_path)

print(f"Generator model saved to {model_save_path}")

In [None]:

# Load the trained generator weights. map_location remaps the stored tensors
# onto `device`, so a checkpoint written on a GPU still loads on a CPU-only machine.
generator.load_state_dict(torch.load('generator_model.pth', map_location=device))
generator.eval()  # Set the generator to evaluation mode

# Text embeddings to translate (here: the same array used for training)
new_text_embeddings = torch.tensor(text_embeddings, dtype=torch.float32).to(device)

# Generate graph embeddings for the text embeddings
with torch.no_grad():  # Disable gradient calculation
    generated_graph_embeddings = generator(new_text_embeddings)

# Convert the generated embeddings back to a numpy array on the CPU
generated_graph_embeddings = generated_graph_embeddings.cpu().numpy()

print("Generated Graph Embeddings:", generated_graph_embeddings)

In [None]:
# Write the generated graph embeddings next to the text embeddings as a .npy file
gan_output_path = "../../ics_cwe/Text_Hop/{}/data/GAN_generated.npy".format("pt_Gpt2")
with open(gan_output_path, 'wb') as out_file:
    np.save(out_file, generated_graph_embeddings)

In [None]:
# Display the shape of the generated embeddings array as this cell's output
generated_graph_embeddings.shape

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Cosine similarity between every generated embedding (rows) and every
# reference graph embedding (columns)
cosine_sim_matrix = cosine_similarity(generated_graph_embeddings, graph_embeddings)

# Keep only the diagonal: the similarity of each generated embedding to the
# graph embedding at the same index
sim_vec = [cosine_sim_matrix[idx][idx] for idx in range(len(graph_embeddings))]

In [None]:
# Display the list of per-index cosine similarities as this cell's output
sim_vec

In [None]:
import numpy as np
import random
from matplotlib import pyplot as plt

# Histogram of the per-index cosine similarities, drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(5, 5))
ax.hist(sim_vec, bins=10, edgecolor='black')

# Title and axis labels
ax.set_title('Histogram of GAN Gen. Emb VS Graph Emb')
ax.set_xlabel('Level')
ax.set_ylabel('Frequency')

# Save to disk before showing the figure
fig.savefig("GAN_sim_vec.png", dpi=300, bbox_inches='tight')
plt.show()