In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from random import random

# from SourceCodeTools.code.data.dataset.reader import load_data
from SourceCodeTools.code.data.dataset.Dataset import SourceGraphDataset, filter_dst_by_freq
from SourceCodeTools.tabular.common import compact_property
from SourceCodeTools.code.data.file_utils import unpersist

import dgl
import torch
import numpy as np

Using backend: pytorch


# Reading Data
## Reading Nodes and Edges

When using text-based storage formats with pandas, it is necessary to make sure that the data is loaded with the correct dtypes.

In [3]:
def load_data(node_path, edge_path, rename_columns=True):
    """
    Load the node and edge tables and cast their columns to explicit dtypes.

    :param node_path: location of the node table, passed to `unpersist`
    :param edge_path: location of the edge table, passed to `unpersist`
    :param rename_columns: when True, rename `serialized_name` -> `name` in nodes
        and `source_node_id` -> `src`, `target_node_id` -> `dst` in edges
    :return: tuple (nodes, edges)
    """
    # pandas dtype names are case sensitive:
    # `int64` is not nullable, `Int64` is nullable
    node_dtypes = {
        'type': 'category',
        "serialized_name": "string",
        "mentioned_in": "Int64",
        "string": "string",
        "id": "int64",
    }
    edge_dtypes = {
        'type': 'category',
        "mentioned_in": "Int64",
        "source_node_id": "int64",
        "target_node_id": "int64",
    }

    raw_nodes = unpersist(node_path)
    raw_edges = unpersist(edge_path)

    nodes = raw_nodes.astype(node_dtypes)
    edges = raw_edges.astype(edge_dtypes)

    if not rename_columns:
        return nodes, edges

    nodes = nodes.rename(columns={'serialized_name': 'name'})
    edges = edges.rename(columns={'source_node_id': 'src', 'target_node_id': 'dst'})
    return nodes, edges

In [4]:
# Load the example graph; `rename_columns` is left at its default (True),
# so the frames use the column names `name`, `src` and `dst`.
nodes, edges = load_data("small_graph/common_nodes.json", "small_graph/common_edges.json")

In [5]:
# Show the column dtypes that resulted from the casts in `load_data`
print("Nodes dtypes:\n", nodes.dtypes)
print()
print("Edges dtypes:\n", edges.dtypes)
print()
# Table sizes and the number of distinct node / edge types
print(f"Unique nodes: {len(nodes)}, node types: {len(nodes['type'].unique())}")
print(f"Unique edges: {len(edges)}, edge types: {len(edges['type'].unique())}")

Nodes dtypes:
 id                 int64
type            category
name              string
mentioned_in       Int64
string            string
dtype: object

Edges dtypes:
 id                 int64
type            category
src                int64
dst                int64
file_id          float64
mentioned_in       Int64
dtype: object

Unique nodes: 150, node types: 26
Unique edges: 415, edge types: 50


In [6]:
nodes

Unnamed: 0,id,type,name,mentioned_in,string
0,0,module,ExampleModule,,
1,1,class,ExampleModule.ExampleClass,,
2,2,class_method,ExampleModule.ExampleClass.__init__,,
3,3,non_indexed_symbol,builtins,,
4,4,class,builtins.int,,
...,...,...,...,...,...
145,152,mention,print@FunctionDef_0x16d41315c9c41f53,143,
146,153,Call,Call_0x16d41315c9148a75,143,print(a+b)
147,154,BinOp,BinOp_0x16d41315c919b5a9,143,a+b
148,155,mention,main@Module_0x16d41315c9361936,138,


In [73]:
edges

Unnamed: 0,id,type,src,dst,file_id,mentioned_in
0,0,defines,0,1,,
1,1,defines,1,2,,
2,2,defines,3,4,,
3,3,uses_type,2,4,,
4,4,defines,1,5,,
...,...,...,...,...,...,...
410,464,func_rev,156,155,32.0,138
411,465,next,143,156,32.0,138
412,466,prev,156,143,32.0,138
413,467,defined_in_module,156,138,32.0,138


In [74]:
# Sanity check: every edge endpoint (`src` and `dst`) must be the id of a node in `nodes`
assert all(edges.eval("src in @node_ids", local_dict={"node_ids": nodes["id"]}))
assert all(edges.eval("dst in @node_ids", local_dict={"node_ids": nodes["id"]}))

In [75]:
# Keep only the listed columns; every other column is dropped from here on.
# NOTE: this overwrites `nodes` / `edges`, so the cell is not meant to be re-run in isolation.
nodes = nodes[["id", "type", "name"]]
edges = edges[["id", "type", "src", "dst"]]

## Reading type annotations

In [76]:
# Load the annotation table and keep only rows whose `src` is the id of a node present in `nodes`
type_annotations = unpersist("small_graph/type_annotations.json").query("src in @node_ids", local_dict={"node_ids": nodes["id"]})
type_annotations

Unnamed: 0,src,dst
0,22,int
2,35,str
4,49,str
6,53,int
8,56,str
10,79,
12,104,int
14,128,str


# Preprocessing graph
## Removing some edges

As an exercise, we remove some edge types

In [77]:
def remove_global_edges(edges):
    """
    Drop every edge whose type is listed by `SourceGraphDataset.get_global_edges()`.

    :param edges: DataFrame with a `type` column
    :return: DataFrame with the remaining edges
    """
    excluded_types = SourceGraphDataset.get_global_edges()

    def is_ast(edge_type):
        # an edge is kept when its type is not one of the global edge types
        return edge_type not in excluded_types

    return edges.query("type.map(@is_ast)", local_dict={"is_ast": is_ast})

In [78]:
# The filtered result is stored under a separate name and displayed.
# NOTE(review): the cells visible below keep working with the unfiltered `edges` — confirm this is intended.
edges_ast = remove_global_edges(edges)
edges_ast

Unnamed: 0,id,type,src,dst
50,50,subword,14,15
51,51,arg,15,19
52,52,arg_rev,19,15
53,53,args,19,20
54,54,args_rev,20,19
...,...,...,...,...
410,464,func_rev,156,155
411,465,next,143,156
412,466,prev,156,143
413,467,defined_in_module,156,138


## Making sure no isolated nodes are present

After the graph has been edited, we need to make sure that there are no isolated nodes, because they will cause errors when training a GNN.

In [79]:
def ensure_connectedness(nodes, edges):
    """
    Filter isolated nodes, i.e. nodes that are not an endpoint of any edge.

    The edge table is returned unchanged; only `nodes` is filtered.

    :param nodes: DataFrame with an `id` column
    :param edges: DataFrame with `src` and `dst` columns
    :return: tuple (filtered nodes, edges)
    """
    # `Series.append` no longer exists in pandas 2.x, so collect the endpoint ids
    # with a plain set union instead of concatenating the two columns.
    unique_connected_nodes = set(edges['src']).union(edges['dst'])

    nodes = nodes[nodes['id'].isin(unique_connected_nodes)]

    print(f"Ending up with {len(nodes)} nodes and {len(edges)} edges")
    return nodes, edges

In [80]:
# Drop nodes that are not referenced by any edge (the edge table itself is returned as is)
nodes, edges = ensure_connectedness(nodes, edges)

Ending up with 150 nodes and 415 edges


In [81]:
# Re-check after filtering the nodes: every edge endpoint must still be a known node id
assert all(edges.eval("src in @node_ids", local_dict={"node_ids": nodes["id"]}))
assert all(edges.eval("dst in @node_ids", local_dict={"node_ids": nodes["id"]}))

## Adding extra node information

In [82]:
def format_node_types(nodes):
    """
    Return a copy of `nodes` whose `type` values carry a trailing underscore.

    DGL confuses some node types with internal objects, so the type names need to be
    changed. The unmodified values are preserved in a new `type_backup` column.

    :param nodes: DataFrame with a `type` column
    :return: new DataFrame; the input is not modified
    """
    def with_suffix(type_name):
        return f"{type_name}_"

    # `assign` works on a copy; copying is slow for large datasets,
    # prefer in-place operations there
    return nodes.assign(
        type_backup=nodes['type'],
        type=nodes['type'].apply(with_suffix),
    )

# def add_embedding_names(nodes):
#     """
#     Embedding names are used for initial embeddings (layer 0)
#     """
#     nodes = nodes.copy()
#     nodes["embeddable"] = True
#     nodes["embeddable_name"] = nodes["name"].apply(SourceGraphDataset.get_embeddable_name)
#     return nodes

def add_splits(nodes, train_frac, restricted_id_pool=None):
    """
    Randomly assign every node to the train, validation or test partition.

    One `random()` draw is made per row. A node goes to "train" with probability
    `train_frac`; the remaining probability is divided equally between "val" and "test".

    :param nodes: DataFrame with an `id` column
    :param train_frac: fraction of nodes assigned to the train partition
    :param restricted_id_pool: optional collection of node ids; when given, all three
        masks are set to False for nodes whose id is not in the collection
    :return: copy of `nodes` with boolean `train_mask`, `val_mask`, `test_mask` columns
    """
    val_upper_bound = train_frac + (1 - train_frac) / 2

    def draw_partition():
        draw = random()
        if draw < train_frac:
            return "train"
        return "val" if draw < val_upper_bound else "test"

    # define partitioning: one label per row, in row order
    labels = np.array([draw_partition() for _ in range(len(nodes))])

    # create masks
    result = nodes.copy()
    for partition in ("train", "val", "test"):
        result[f"{partition}_mask"] = labels == partition

    if restricted_id_pool is None:
        return result

    # mask all nodes not in `restricted_id_pool` negatively
    to_keep = result.eval("id in @restricted_ids", local_dict={"restricted_ids": restricted_id_pool})
    for mask_column in ("train_mask", "test_mask", "val_mask"):
        result[mask_column] = result[mask_column] & to_keep

    return result

In [83]:
# Rename node types, then draw the splits; only nodes that have a type annotation
# (ids listed in `type_annotations["src"]`) can end up in any of the three masks.
# NOTE(review): `add_splits` draws from `random()` and no seed is set in the visible cells,
# so the counts printed below may differ between runs.
nodes = add_splits(format_node_types(nodes), 0.5, restricted_id_pool=type_annotations["src"])
print("Train examples:", len(nodes.query("train_mask == True")))
print("Test examples:", len(nodes.query("test_mask == True")))
print("Validation examples:", len(nodes.query("val_mask == True")))
nodes

Train examples: 4
Test examples: 1
Validation examples: 3


Unnamed: 0,id,type,name,type_backup,train_mask,val_mask,test_mask
0,0,module_,ExampleModule,module,False,False,False
1,1,class_,ExampleModule.ExampleClass,class,False,False,False
2,2,class_method_,ExampleModule.ExampleClass.__init__,class_method,False,False,False
3,3,non_indexed_symbol_,builtins,non_indexed_symbol,False,False,False
4,4,class_,builtins.int,class,False,False,False
...,...,...,...,...,...,...,...
145,152,mention_,print@FunctionDef_0x16d41315c9c41f53,mention,False,False,False
146,153,Call_,Call_0x16d41315c9148a75,Call,False,False,False
147,154,BinOp_,BinOp_0x16d41315c919b5a9,BinOp,False,False,False
148,155,mention_,main@Module_0x16d41315c9361936,mention,False,False,False


In [84]:
def add_type_dependent_dense_ids_to_nodes(nodes):
    """
    Compute dense ids separately for each node type.

    DGL requires dense ids: https://docs.dgl.ai/en/latest/generated/dgl.heterograph.html#dgl.heterograph

    :param nodes: DataFrame with `id` and `type` columns
    :return: tuple (copy of `nodes` with an int64 `typed_id` column,
        dict node type -> mapping from original id to dense id)
    """
    result = nodes.copy()
    typed_id_map = {}

    for node_type in result['type'].unique():
        # rows that belong to the current node type
        rows_of_type = result['type'] == node_type
        ids_of_type = result.loc[rows_of_type, 'id']

        # `compact_property` will create a dense mapping
        # it is equivalent to dict(zip(node_ids, range(len(node_ids))))
        dense_ids = compact_property(ids_of_type)

        # fill the dense type-dependent ids for these rows only
        result.loc[rows_of_type, 'typed_id'] = ids_of_type.apply(lambda old_id: dense_ids[old_id])

        # store for further reference
        typed_id_map[node_type] = dense_ids

    # the column is filled incrementally, so cast it to int64 once all rows are set
    return result.astype({"typed_id": "int64"}), typed_id_map

In [85]:
# Add the per-type dense id column and keep the per-type id maps for building the graph later
nodes, typed_id_map = add_type_dependent_dense_ids_to_nodes(nodes)
nodes

Unnamed: 0,id,type,name,type_backup,train_mask,val_mask,test_mask,typed_id
0,0,module_,ExampleModule,module,False,False,False,0
1,1,class_,ExampleModule.ExampleClass,class,False,False,False,0
2,2,class_method_,ExampleModule.ExampleClass.__init__,class_method,False,False,False,0
3,3,non_indexed_symbol_,builtins,non_indexed_symbol,False,False,False,0
4,4,class_,builtins.int,class,False,False,False,1
...,...,...,...,...,...,...,...,...
145,152,mention_,print@FunctionDef_0x16d41315c9c41f53,mention,False,False,False,33
146,153,Call_,Call_0x16d41315c9148a75,Call,False,False,False,9
147,154,BinOp_,BinOp_0x16d41315c919b5a9,BinOp,False,False,False,1
148,155,mention_,main@Module_0x16d41315c9361936,mention,False,False,False,34


## Adding extra edge information

In [86]:
def format_edge_types(edges):
    """
    Return a copy of `edges` whose `type` values carry a trailing underscore.

    DGL confuses some edge types with internal objects, so the type names need to be changed.

    :param edges: DataFrame with a `type` column
    :return: new DataFrame; the input is not modified
    """
    def with_suffix(type_name):
        return f"{type_name}_"

    return edges.assign(type=edges['type'].apply(with_suffix))

In [87]:
# Append the underscore suffix to every edge type name
edges = format_edge_types(edges)
edges

Unnamed: 0,id,type,src,dst
0,0,defines_,0,1
1,1,defines_,1,2
2,2,defines_,3,4
3,3,uses_type_,2,4
4,4,defines_,1,5
...,...,...,...,...
410,464,func_rev_,156,155
411,465,next_,143,156
412,466,prev_,156,143
413,467,defined_in_module_,156,138


In [88]:
def add_node_types_to_edges(nodes, edges):
    """
    Attach the node type of both endpoints to every edge.

    Node types are needed for refining edge signatures.

    :param nodes: DataFrame with `id` and `type` columns
    :param edges: DataFrame with `src` and `dst` columns
    :return: copy of `edges` with categorical `src_type` and `dst_type` columns
    :raises KeyError: if an endpoint id is not present in `nodes`
    """
    type_of_node = dict(zip(nodes['id'], nodes['type']))

    def lookup_type(node_id):
        return type_of_node[node_id]

    with_types = edges.assign(
        src_type=edges['src'].apply(lookup_type),
        dst_type=edges['dst'].apply(lookup_type),
    )
    return with_types.astype({'src_type': 'category', 'dst_type': 'category'})

In [89]:
# Add the `src_type` / `dst_type` columns; `create_hetero_graph` groups the edges by them
edges = add_node_types_to_edges(nodes, edges)
edges

Unnamed: 0,id,type,src,dst,src_type,dst_type
0,0,defines_,0,1,module_,class_
1,1,defines_,1,2,class_,class_method_
2,2,defines_,3,4,non_indexed_symbol_,class_
3,3,uses_type_,2,4,class_method_,class_
4,4,defines_,1,5,class_,class_field_
...,...,...,...,...,...,...
410,464,func_rev_,156,155,Call_,mention_
411,465,next_,143,156,FunctionDef_,Call_
412,466,prev_,156,143,Call_,FunctionDef_
413,467,defined_in_module_,156,138,Call_,Module_


# Building graph

In [90]:
# Report table sizes and the number of distinct types right before the graph is built
print(f"Unique nodes: {len(nodes)}, node types: {len(nodes['type'].unique())}")
print(f"Unique edges: {len(edges)}, edge types: {len(edges['type'].unique())}")

Unique nodes: 150, node types: 26
Unique edges: 415, edge types: 50


In [91]:
def add_global_dense_graph_id(nodes, graph, typed_id_map):
    """
    Add dense global node ids to make it easier working with embeddings in the future.

    Global ids are assigned in the order node types appear in `graph.ntypes`: the
    global id of a node is its typed id plus the number of nodes of all preceding types.

    :param nodes: DataFrame with an `id` column
    :param graph: object exposing `ntypes` and `number_of_nodes(ntype)`
    :param typed_id_map: dict node type -> mapping from original id to typed id
    :return: copy of `nodes` with a `global_graph_id` column
    :raises KeyError: if a type from `graph.ntypes` is missing in `typed_id_map`,
        or a node id has no entry in the per-type maps
    """
    nodes = nodes.copy()

    global_map = {}
    offset = 0

    for type_ in graph.ntypes:
        # iterating over the items directly also handles a type with an empty id map
        for original_id, typed_id in typed_id_map[type_].items():
            global_map[original_id] = typed_id + offset
        offset += graph.number_of_nodes(type_)

    nodes['global_graph_id'] = nodes['id'].apply(lambda old_id: global_map[old_id])

    return nodes

def add_node_data(graph, nodes):
    """
    Store per-node fields from `nodes` as tensors on the graph, one node type at a time.

    For every type in `graph.ntypes` the matching rows are ordered by `typed_id`, the
    `id` column is exposed as `original_id`, and each field is written to
    `graph.nodes[ntype].data`.

    :param graph: graph exposing `ntypes` and `nodes[ntype].data`
    :param nodes: DataFrame with columns `id`, `type`, `typed_id`, `global_graph_id`,
        `train_mask`, `test_mask`, `val_mask`
    :return: the same `graph` object
    """
    tensor_dtypes = {
        "train_mask": torch.bool,
        "test_mask": torch.bool,
        "val_mask": torch.bool,
        "typed_id": torch.int64,
        "original_id": torch.int64,
        "global_graph_id": torch.int64,
    }

    for ntype in graph.ntypes:
        rows = nodes.query(f"type == '{ntype}'")
        # order the rows by `typed_id` before converting the columns to tensors
        rows = rows.sort_values('typed_id')
        rows = rows.rename({"id": "original_id"}, axis=1)

        storage = graph.nodes[ntype].data
        for field_name in tensor_dtypes:
            storage[field_name] = torch.tensor(rows[field_name].values, dtype=tensor_dtypes[field_name])

    return graph

def create_hetero_graph(nodes, edges, typed_id_map):
    """
    Build a DGL heterograph from the node and edge tables.

    Edges are grouped by their (src_type, type, dst_type) signature and their endpoints
    are translated from original ids to typed ids. Global dense ids and node data are
    attached after the graph has been created.

    :param nodes: DataFrame with `id` and `typed_id` columns, plus the columns
        required by `add_node_data`
    :param edges: DataFrame with `src`, `dst`, `type`, `src_type`, `dst_type` columns
    :param typed_id_map: dict node type -> mapping from original id to typed id
    :return: tuple (graph, nodes with a `global_graph_id` column, edges)
    """
    typed_node_id = dict(zip(nodes['id'], nodes['typed_id']))

    def to_typed_ids(endpoint_column):
        return endpoint_column.map(lambda old_id: typed_node_id[old_id])

    # group by in pandas is slow, use something else for large datasets
    # each key is a tuple (src_type, edge_type, dst_type)
    typed_subgraphs = {
        signature: list(zip(to_typed_ids(group['src']), to_typed_ids(group['dst'])))
        for signature, group in edges.groupby(['src_type', 'type', 'dst_type'])
    }

    print(
        f"Unique triplet types in the graph: {len(typed_subgraphs.keys())}"
    )

    graph = dgl.heterograph(typed_subgraphs)
    nodes = add_global_dense_graph_id(nodes, graph, typed_id_map)
    graph = add_node_data(graph, nodes)

    return graph, nodes, edges

In [92]:
# Build the heterograph; `nodes` comes back with the additional `global_graph_id` column
g, nodes, edges = create_hetero_graph(nodes, edges, typed_id_map)

Unique triplet types in the graph: 140


## Graph Attributes

In [93]:
g.ntypes

['#attr#_',
 'AnnAssign_',
 'Assign_',
 'Attribute_',
 'BinOp_',
 'Call_',
 'ClassDef_',
 'Constant_',
 'FunctionDef_',
 'ImportFrom_',
 'JoinedStr_',
 'Module_',
 'Op_',
 'Return_',
 'alias_',
 'arg_',
 'arguments_',
 'class_',
 'class_field_',
 'class_method_',
 'function_',
 'global_variable_',
 'mention_',
 'module_',
 'non_indexed_symbol_',
 'subword_']

In [94]:
g.nodes["module_"]

NodeSpace(data={'train_mask': tensor([False, False, False]), 'test_mask': tensor([False, False, False]), 'val_mask': tensor([False, False, False]), 'typed_id': tensor([0, 1, 2]), 'original_id': tensor([ 0, 10, 91]), 'global_graph_id': tensor([122, 123, 124])})

In [95]:
g.ndata["train_mask"]

{'#attr#_': tensor([False, False, False, False]),
 'AnnAssign_': tensor([False, False]),
 'Assign_': tensor([False, False, False, False, False]),
 'Attribute_': tensor([False, False, False, False, False, False, False]),
 'BinOp_': tensor([False, False]),
 'Call_': tensor([False, False, False, False, False, False, False, False, False, False,
         False]),
 'ClassDef_': tensor([False, False]),
 'Constant_': tensor([False]),
 'FunctionDef_': tensor([False,  True, False,  True, False, False, False, False]),
 'ImportFrom_': tensor([False, False]),
 'JoinedStr_': tensor([False]),
 'Module_': tensor([False, False, False, False]),
 'Op_': tensor([False]),
 'Return_': tensor([False, False, False, False]),
 'alias_': tensor([False, False]),
 'arg_': tensor([False, False, False, False, False, False, False, False, False]),
 'arguments_': tensor([False, False, False, False, False, False]),
 'class_': tensor([False, False, False, False]),
 'class_field_': tensor([False, False]),
 'class_method_'

In [96]:
g.canonical_etypes

[('#attr#_', 'attr_', 'Attribute_'),
 ('AnnAssign_', 'defined_in_function_', 'FunctionDef_'),
 ('AnnAssign_', 'next_', 'AnnAssign_'),
 ('AnnAssign_', 'next_', 'Return_'),
 ('AnnAssign_', 'prev_', 'AnnAssign_'),
 ('AnnAssign_', 'target_rev_', 'mention_'),
 ('AnnAssign_', 'value_rev_', 'Attribute_'),
 ('AnnAssign_', 'value_rev_', 'Call_'),
 ('Assign_', 'defined_in_function_', 'FunctionDef_'),
 ('Assign_', 'defined_in_module_', 'Module_'),
 ('Assign_', 'next_', 'Assign_'),
 ('Assign_', 'next_', 'Call_'),
 ('Assign_', 'next_', 'FunctionDef_'),
 ('Assign_', 'prev_', 'Assign_'),
 ('Assign_', 'prev_', 'ImportFrom_'),
 ('Assign_', 'targets_rev_', 'Attribute_'),
 ('Assign_', 'targets_rev_', 'mention_'),
 ('Assign_', 'value_rev_', 'Call_'),
 ('Assign_', 'value_rev_', 'mention_'),
 ('Attribute_', 'func_', 'Call_'),
 ('Attribute_', 'left_', 'BinOp_'),
 ('Attribute_', 'right_', 'BinOp_'),
 ('Attribute_', 'targets_', 'Assign_'),
 ('Attribute_', 'value_', 'AnnAssign_'),
 ('Attribute_', 'value_rev_', 

In [97]:
g.edges[('subword_', 'subword_', 'mention_')]

EdgeSpace(data={})

## Dataloaders

In [98]:
def get_train_nodes(graph):
    """
    Collect the typed ids of all training nodes, grouped by node type.

    :param graph: graph whose `ndata` provides `train_mask` and `typed_id`,
        each as a dict node type -> tensor
    :return: dict node type -> tensor of typed ids whose `train_mask` is True;
        node types without training nodes are omitted
    """
    train_nodes = {}
    # read from the `graph` argument, not from a notebook-level global
    for node_type, mask in graph.ndata["train_mask"].items():
        train_ids = graph.ndata["typed_id"][node_type][mask]
        if len(train_ids) > 0:
            train_nodes[node_type] = train_ids
    return train_nodes

# Display the training seeds per node type (typed ids)
get_train_nodes(g)

{'FunctionDef_': tensor([1, 3]), 'mention_': tensor([ 6, 19])}

In [99]:
# Full-neighbor sampler constructed with a single layer
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
# Iterate over the training seeds one at a time, in a fixed order, in the main process.
# NOTE(review): `NodeDataLoader` is presumably superseded by `dgl.dataloading.DataLoader`
# in newer DGL releases — verify against the installed version.
loader = dgl.dataloading.NodeDataLoader(
    g, get_train_nodes(g), sampler, batch_size=1, shuffle=False, num_workers=0)

In [100]:
# Print the content of every batch using the original node ids stored in `original_id`
for ind, (input_nodes, seeds, blocks) in enumerate(loader):
    print("Batch:", ind)
    
    # seeds are read from the destination side of the last block
    print("Seeds:")
    for key, val in seeds.items():
        if len(val) > 0:
            print(key, blocks[-1].dstnodes[key].data["original_id"])
            
    print()
    
    # input nodes are read from the source side of the first block
    print("Input nodes:")
    for key, val in input_nodes.items():
        if len(val) > 0:
            print(key, blocks[0].srcnodes[key].data["original_id"])
            
    print()
    
    for b_ind, block in enumerate(blocks):
        print("Layer", b_ind)
        for etype in block.canonical_etypes:
            # only edge types that have at least one edge in this block are shown
            if block[etype].num_edges() > 0:
                # for srctype, dsttype in zip(
                # print(blocks[0][etype].adj().to_dense())
                # NOTE: this prints all source nodes of type etype[0] and all destination nodes
                # of type etype[2] in the block, not only the endpoints of this edge type
                print(block.srcnodes[etype[0]].data["original_id"], "-->", etype, "-->", block.dstnodes[etype[2]].data["original_id"])  #, block[etype].num_edges())    
        print()
    print()


Batch: 0
Seeds:
FunctionDef_ tensor([35])

Input nodes:
ClassDef_ tensor([17])
FunctionDef_ tensor([35, 16, 49])
Return_ tensor([45])
arguments_ tensor([37])
mention_ tensor([46])

Layer 0
tensor([17]) --> ('ClassDef_', 'defined_in_class_rev_', 'FunctionDef_') --> tensor([35])
tensor([35, 16, 49]) --> ('FunctionDef_', 'next_', 'FunctionDef_') --> tensor([35])
tensor([35, 16, 49]) --> ('FunctionDef_', 'prev_', 'FunctionDef_') --> tensor([35])
tensor([45]) --> ('Return_', 'defined_in_function_', 'FunctionDef_') --> tensor([35])
tensor([37]) --> ('arguments_', 'args_', 'FunctionDef_') --> tensor([35])
tensor([46]) --> ('mention_', 'function_name_rev_', 'FunctionDef_') --> tensor([35])


Batch: 1
Seeds:
FunctionDef_ tensor([79])

Input nodes:
Assign_ tensor([75])
Call_ tensor([83, 90])
FunctionDef_ tensor([79])
Module_ tensor([66])
mention_ tensor([89])

Layer 0
tensor([75]) --> ('Assign_', 'next_', 'FunctionDef_') --> tensor([79])
tensor([83, 90]) --> ('Call_', 'defined_in_function_', 'Fu