In [1]:
import torch
from torch_geometric.data import InMemoryDataset, Data, HeteroData
from torch_geometric.utils import to_edge_index
from tqdm import tqdm
from pathlib import Path
import pickle
import networkx as nx
from word2vec.word2vec import Word2VecBuilder
from multiprocessing import Pool, cpu_count
from functools import partial
import os 
# from graphviz import Source 

import pygraphviz as pgv
import networkx as nx
from tqdm.auto import tqdm



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Embedding helper; get_graph() queries it via word2vec.get_embedding(G, node_id).
word2vec = Word2VecBuilder()

# Node label names, hard-coded here. (Disabled alternative: read the keys of
# pickle.load(f)['label'] from Path(self.root)/'label_info.pkl'.)
labels = [
    'TranslationUnitDeclaration',
    'ConstructExpression',
    'FunctionDeclaration',
    'CompoundStatement',
    'CallExpression',
    'DeclaredReferenceExpression',
    'WhileStatement',
    'BinaryOperator',
    'Literal',
    'ReturnStatement',
    'ParamVariableDeclaration',
    'VariableDeclaration',
    'DeclarationStatement',
    'UnaryOperator',
    'MemberExpression',
    'ConditionalExpression',
    'IfStatement',
    'ForStatement',
    'BreakStatement',
    'MemberCallExpression',
    'ArraySubscriptionExpression',
    'TypeIdExpression',
    'InitializerListExpression',
    'ProblemDeclaration',
    'CastExpression',
    'ContinueStatement',
    'EmptyStatement',
    'GotoStatement',
    'LabelStatement',
    'SwitchStatement',
    'CaseStatement',
    'ProblemExpression',
    'ExpressionList',
    'DoStatement',
    'DefaultStatement',
    'RecordDeclaration',
    'FieldDeclaration',
    'ConstructorDeclaration',
    'DesignatedInitializerExpression',
    'EnumDeclaration',
    'EnumConstantDeclaration',
    'MethodDeclaration',
    'ASMDeclarationStatement',
    'TypedefDeclaration',
]

# Map every label name to a one-hot vector of length len(labels); the hot
# position is the label's index in sorted(labels).
sorted_labels = sorted(labels)
label_categories = {}
for position, label in enumerate(sorted_labels):
    onehot = torch.zeros(len(labels))
    onehot[position] = 1
    label_categories[label] = onehot
def column_normalize_dense_features_torch(mx):
    """Scale every column of a dense 2-D tensor by the inverse of its column sum.

    A column whose sum is zero would get an infinite inverse; that inverse is
    replaced by 0, so such a column is multiplied by 0 instead.

    Args:
        mx: dense 2-D tensor (it is multiplied with ``mm``), rows x features.

    Returns:
        Tensor with the same shape as ``mx``, equal to
        ``mx @ diag(1 / column_sum)``.
    """
    column_totals = mx.sum(dim=0)
    inverse_totals = column_totals.pow(-1)
    # 1/0 -> inf; neutralise those entries before building the diagonal.
    inverse_totals.masked_fill_(torch.isinf(inverse_totals), 0.)
    return torch.mm(mx, torch.diag(inverse_totals))

def get_adjacency_matrices(G):
    """Build one sparse adjacency matrix per edge type of a graph.

    Nodes are ordered by the integer value of their id; row/column ``k`` of
    every matrix corresponds to ``node_list[k]``. Entry ``[s, t]`` counts the
    edges of that type going from node ``s`` to node ``t`` (parallel edges add
    up).

    Args:
        G: graph object exposing ``G.nodes`` / ``G.nodes()`` with node ids that
            are strings of integers, and ``G.edges(data=True)`` yielding
            ``(source, target, attributes)`` where ``attributes['label']`` is
            one of 'AST', 'CFG', 'CG', 'DFG' (optionally wrapped in double
            quotes).

    Returns:
        Tuple ``(node_list, adj_matrices)``: the ordered node ids (strings) and
        a dict mapping each of the four edge types to a sparse float32 tensor
        of shape (num_nodes, num_nodes).

    Raises:
        KeyError: if an edge has no 'label' attribute, its label is not one of
            the four edge types, or an endpoint is missing from the node list.
        ValueError: if a node id cannot be parsed by ``int``.
    """
    num_nodes = len(G.nodes)
    adj_matrices = {
        edge_type: torch.zeros((num_nodes, num_nodes), dtype=torch.float32, device='cpu')
        for edge_type in ['AST', 'CFG', 'CG', 'DFG']
    }
    # Sort numerically (not lexicographically) and convert back to the string ids.
    node_list = [str(i) for i in sorted(int(s) for s in G.nodes())]
    node_mapping = {node_id: index for index, node_id in enumerate(node_list)}

    # Accumulate edge counts into the matrix of the matching edge type.
    for source, target, attributes in G.edges(data=True):
        # Strip double quotes, mirroring how get_graph() cleans node labels, so
        # that a quoted label such as '"CFG"' does not raise KeyError.
        edge_type = attributes['label'].replace('"', '')
        adj_matrices[edge_type][node_mapping[source], node_mapping[target]] += 1

    return node_list, {edge_type: adj_matrix.to_sparse() for edge_type, adj_matrix in adj_matrices.items()}
    
def get_graph(file):
    """Turn a DOT graph file into edge indices and a node feature matrix.

    The graph is read with ``nx.drawing.nx_agraph.read_dot`` and split into one
    adjacency matrix per edge type by ``get_adjacency_matrices``. Each row of
    the feature matrix describes one node (in ``node_list`` order) and is the
    concatenation of:

    * column-normalised structural features: out-, in- and total degree per
      edge type, one row sum of the binarised combined adjacency matrix, a
      triangle score per edge type and the sum of those scores
      (13 + 5 columns for the four edge types);
    * the one-hot vector of the node's label from ``label_categories``;
    * the embedding returned by ``word2vec.get_embedding(G, node_id)``.

    Args:
        file: path (or handle) of the DOT file, passed straight to ``read_dot``.

    Returns:
        ``(edges, X)``. When the module-level ``is_heterogeneous`` flag is
        truthy, ``edges`` is a dict with a 'TOTAL' entry plus one entry per
        edge type, each holding an edge index; otherwise it is the single edge
        index of the combined graph. ``X`` is the float32 feature matrix.
    """
    G = nx.drawing.nx_agraph.read_dot(file)
    node_list, adj_matrices = get_adjacency_matrices(G)
    num_nodes = len(G.nodes)

    # Combine all edge types into a single sparse matrix.
    combined = torch.zeros((num_nodes, num_nodes), layout=torch.sparse_coo, dtype=torch.float32, device='cpu')
    for typed_adj in adj_matrices.values():
        combined = combined + typed_adj

    # Binarise: every stored entry becomes 1. The matrix stays directed
    # (no symmetrisation is applied).
    coalesced = combined.coalesce()
    unit_values = torch.ones(coalesced.values().shape).to('cpu')
    combined = torch.sparse_coo_tensor(coalesced.indices(), unit_values, combined.shape, device='cpu')

    # Degree features. Column layout: out-degree per type, in-degree per type,
    # out+in per type, and finally the row sums of the binarised combined matrix.
    num_types = len(adj_matrices)
    degrees = torch.zeros((num_nodes, num_types * 3 + 1), dtype=torch.float32, device='cpu')
    for col, typed_adj in enumerate(adj_matrices.values()):
        degrees[:, col] = torch.sum(typed_adj, dim=1).to_dense()
        degrees[:, col + num_types] = torch.sum(typed_adj, dim=0).to_dense()
        degrees[:, col + 2 * num_types] = degrees[:, col] + degrees[:, col + num_types]
    degrees[:, -1] = torch.sparse.sum(combined, dim=1).to_dense()

    # Unnormalised triangle score per edge type: row sums of A * (A @ A) for the
    # symmetrised matrix A = adj + adj.T. The last column totals all types.
    # Together with the degrees this would allow a clustering coefficient.
    triangle_counts = torch.zeros((num_nodes, num_types + 1), device='cpu')
    for col, typed_adj in enumerate(adj_matrices.values()):
        symmetric = (typed_adj + typed_adj.T).to_sparse()
        two_step = torch.sparse.mm(symmetric, symmetric)
        # sum(mul(A, B), dim=1) is faster than trace(mm(A, B))
        triangle_counts[:, col] = torch.sparse.sum(torch.mul(symmetric, two_step), dim=1).to_dense()
    triangle_counts[:, -1] = torch.sum(triangle_counts[:, :-1], dim=1)

    # One-hot node label, e.g. "ParamVariableDeclaration" (quotes stripped first).
    onehot_rows = [
        label_categories[G.nodes[node_id]['label'].replace('"', '')]
        for node_id in node_list
    ]
    onehot_labels = torch.stack(onehot_rows).to('cpu').to(torch.float32)

    # Possible future features: fasttext, SentenceBert, pagerank.
    structural = torch.cat([degrees, triangle_counts], dim=1)
    structural = column_normalize_dense_features_torch(mx=structural)

    # Word-vector embedding per node, as returned by word2vec.get_embedding.
    node_embeddings = torch.stack([
        torch.tensor(word2vec.get_embedding(G, node_id), dtype=torch.float32, device='cpu')
        for node_id in node_list
    ])

    X = torch.cat([structural, onehot_labels, node_embeddings], dim=1)

    if not is_heterogeneous:
        return to_edge_index(combined)[0], X  # edge_index only, not the values

    # Heterogeneous output: combined graph under 'TOTAL' plus one edge index per type.
    edge_indices = {'TOTAL': to_edge_index(combined)[0]}
    for edge_type, typed_adj in adj_matrices.items():
        edge_indices[edge_type.replace('"', '')] = to_edge_index(typed_adj)[0]
    return edge_indices, X

In [3]:
# Global switch read by get_graph(): when True it returns a dict of edge indices
# ('TOTAL' plus one per edge type) instead of a single edge index.
is_heterogeneous = True

In [None]:
# Scratch cell: repeats the first steps of get_graph() at top level, leaving
# G, node_list and adj_matrices in the notebook namespace (presumably for
# interactive inspection).
# NOTE(review): `file` is not assigned in any visible cell; set it to the path
# of a .dot file before running this cell.
G = nx.drawing.nx_agraph.read_dot(file)
# G = nx.Graph(nx.drawing.nx_pydot.read_dot(file))

# with open(file, "r", encoding="utf8") as f:
#     G = nx.Graph(pgv.AGraph(f.read()))

node_list, adj_matrices = get_adjacency_matrices(G)

In [5]:
# Check that the four dataset folders contain the same file names.
parentfolder = Path('codegraphs/diversevul/')
folder1 = parentfolder / 'v2_directed_withdegreecount'
folder2 = parentfolder / 'v2_directed_withdegreecount_heterogeneous'
folder3 = parentfolder / 'v2_undirected_withdegreecount'
folder4 = parentfolder / 'v2_undirected_withdegreecount_heterogeneous'

# One set of file names per folder, in folder1..folder4 order.
all_files = [set(os.listdir(folder)) for folder in (folder1, folder2, folder3, folder4)]

# For every unordered pair of folders, print the set difference in both directions.
folder_count = len(all_files)
index_pairs = [(i, j) for i in range(folder_count) for j in range(i + 1, folder_count)]
for i, j in index_pairs:
    print(f'Files in {i} but not in {j}: {all_files[i] - all_files[j]}')
    print(f'Files in {j} but not in {i}: {all_files[j] - all_files[i]}')
    print()

Files in 0 but not in 1: set()
Files in 1 but not in 0: set()

Files in 0 but not in 2: set()
Files in 2 but not in 0: set()

Files in 0 but not in 3: set()
Files in 3 but not in 0: set()

Files in 1 but not in 2: set()
Files in 2 but not in 1: set()

Files in 1 but not in 3: set()
Files in 3 but not in 1: set()

Files in 2 but not in 3: set()
Files in 3 but not in 2: set()



In [10]:
# Show the set of file names collected for the first folder (folder1).
all_files[0]


{'10b3ebbe61a7031a3dae97f05834442220447181_33781552796874571012475500914717371939_0.cpg.pt',
 'e18903a6b56341481a2e08469c0602010bf7bfe3_48211062023748212512062689287962220610_0.cpg.pt',
 '00e8181bd97c834fe60751b0c511d4bb97875f78_29374516506970483228471917455471401899_1.cpg.pt',
 '9ea93a2ec8f555ceed1ee27294cf94822f14f10f_200923437217388481551384501819729982405_0.cpg.pt',
 '2f7f3d9960aa6ea21358bdf3687cee5149aa35cf_133531755068952599441102945909583699727_0.cpg.pt',
 'ce9f24cccdc019229b70a5c15e2b09ad9c0ab5d1_301762575777024187502840825015472157848_0.cpg.pt',
 'd68f0f778e7f4fbd674627274267f269e40f0b04_339099610148240126214094540236288472696_0.cpg.pt',
 '37cee01784ff0df13e5209517e1b3594a5e792d1_53472854621242450163498599833719865103_0.cpg.pt',
 '321027c1fe77f892f4ea07846aeae08cefbbb290_220614792428840474255696522881433770716_0.cpg.pt',
 'bc0bdc5afaa740d782fbf936aaeebd65e5c2921d_143707781069951738838728114259263080071_0.cpg.pt',
 '9a5467bf7b6e9e02ec9c3da4e23747c05faeaac6_572243632571911718325