In [1]:
# %load_ext autoreload
# %autoreload 2

import sys
sys.path.append("../src")

import os
os.environ['CUDA_VISIBLE_DEVICES'] = "1"

In [2]:
import pickle
import tqdm
import numpy as np
import pandas as pd
import networkx as nx
from gensim.models import KeyedVectors as word2vec

import torch
from torch_geometric.utils import from_networkx
from torch_geometric.data import DataLoader

import torch.nn.functional as F
from torch_geometric.data import DataLoader
import torch_geometric.transforms as T
from torch_geometric.nn import GATConv, SGConv
from torch_geometric.nn import GraphConv, TopKPooling
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp
from torch_geometric.nn import GCNConv, GAE, VGAE
from torch_geometric.utils import train_test_split_edges



from code_parser import *
from dataset import CloneDataset

In [27]:
batch_size = 32
num_workers = 16

In [28]:
train_dataset = CloneDataset(root="../data/train_2", functions_path="../data/functions/", pairs_path="../data/train.npz")
valid_dataset = CloneDataset(root="../data/valid_2", functions_path="../data/functions/", pairs_path="../data/valid.npz")
test_dataset = CloneDataset(root="../data/test_2", functions_path="../data/functions/", pairs_path="../data/test.npz")

In [29]:
test_loader = DataLoader(test_dataset, batch_size=batch_size)
val_loader = DataLoader(valid_dataset, batch_size=1, num_workers=1)
train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers)

In [30]:
class Encoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super(Encoder, self).__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels)
        self.conv_mu = GCNConv(2 * out_channels, out_channels)
        self.conv_logvar = GCNConv(2 * out_channels, out_channels)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        return self.conv_mu(x, edge_index), self.conv_logvar(x, edge_index)

In [31]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = VGAE(Encoder(train_dataset.num_features, 512)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
def train(epoch):
    model.train()
    loss_all = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        
        z = model.encode(data.x, data.edge_index)
        
        loss = model.recon_loss(z,  data.edge_index)
        
        print(f"loss = {loss.item()}", end="\r")
        
        loss = loss + (1 / data.num_nodes) * model.kl_loss()
        
        loss.backward()
        optimizer.step()
        loss_all += data.num_graphs * loss.item()
        
        
    return loss_all / len(train_dataset)


@torch.no_grad()
def test(loader):
    model.eval()
    correct = 0
    aucs, aps = [], []
    for data in loader:
        data = train_test_split_edges(data)
        data = data.to(device)
        with torch.no_grad():
            z = model.encode(data.x, data.train_pos_edge_index)
        auc, ap = model.test(z, data.test_pos_edge_index, data.test_neg_edge_index)
        print(f"auc = {auc}", end="\r")
        aucs.append(auc)
        aps.append(aps)
    return np.mean(aucs), np.mean(aps)

In [33]:
best_val_acc = 0
for epoch in range(1, 201):
    loss = train(epoch)
    train_acc = test(train_loader)
    val_acc,val_aps = test(valid_dataset)
    
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "../data/pg_play_sgc.pt")
        
    
    print('Epoch: {:03d}, Loss: {:.5f}, Train Acc: {:.5f}, Val Acc: {:.5f}, Best: {:.5f}'.
          format(epoch, loss, train_acc, val_acc, best_val_acc))

loss = 1.1765700578689575

AssertionError: 

In [None]:
x = test(train_dataset[0:10])

auc = 0.3935790725326991
auc = 0.39909297052154197
auc = 0.3043787824848701
auc = 0.30879999999999996
auc = 0.3700655387898186
auc = 0.5578231292517006
auc = 0.4982164090368609
auc = 0.4025974025974026
auc = 0.4022491349480969


In [None]:
x