In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("../src")

# import os
# os.environ['CUDA_VISIBLE_DEVICES'] = "1"

In [2]:
import os
from argparse import ArgumentParser

import dgl
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn.pytorch import GraphConv
from torch.utils.data import DataLoader

from code_parser import *
from dgl_dataset import CloneDataset

Using backend: pytorch


In [120]:
"""
Improved Semantic Representations From Tree-Structured Long Short-Term Memory Networks
https://arxiv.org/abs/1503.00075
"""
import time
import itertools
import networkx as nx
import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import dgl

class TreeLSTMCell(nn.Module):
    def __init__(self, x_size, h_size):
        super(TreeLSTMCell, self).__init__()
        self.W_iou = nn.Linear(x_size, 3 * h_size, bias=False)
        self.U_iou = nn.Linear(2 * h_size, 3 * h_size, bias=False)
        self.b_iou = nn.Parameter(th.zeros(1, 3 * h_size))
        self.U_f = nn.Linear(2 * h_size, 2 * h_size)

    def message_func(self, edges):
        return {'h': edges.src['h'], 'c': edges.src['c']}

    def reduce_func(self, nodes):
        h_cat = nodes.mailbox['h'].view(nodes.mailbox['h'].size(0), -1)
        f = th.sigmoid(self.U_f(h_cat)).view(*nodes.mailbox['h'].size())
        c = th.sum(f * nodes.mailbox['c'], 1)
        return {'iou': self.U_iou(h_cat), 'c': c}

    def apply_node_func(self, nodes):
        iou = nodes.data['iou'] + self.b_iou
        i, o, u = th.chunk(iou, 3, 1)
        i, o, u = th.sigmoid(i), th.sigmoid(o), th.tanh(u)
        c = i * u + nodes.data['c']
        h = o * th.tanh(c)
        return {'h' : h, 'c' : c}

class ChildSumTreeLSTMCell(nn.Module):
    def __init__(self, x_size, h_size):
        super(ChildSumTreeLSTMCell, self).__init__()
        self.W_iou = nn.Linear(x_size, 3 * h_size, bias=False)
        self.U_iou = nn.Linear(h_size, 3 * h_size, bias=False)
        self.b_iou = nn.Parameter(th.zeros(1, 3 * h_size))
        self.U_f = nn.Linear(h_size, h_size)

    def message_func(self, edges):
        return {'h': edges.src['h'], 'c': edges.src['c']}

    def reduce_func(self, nodes):
        h_tild = th.sum(nodes.mailbox['h'], 1)
        f = th.sigmoid(self.U_f(nodes.mailbox['h']))
        c = th.sum(f * nodes.mailbox['c'], 1)
        return {'iou': self.U_iou(h_tild), 'c': c}

    def apply_node_func(self, nodes):
        iou = nodes.data['iou'] + self.b_iou
        i, o, u = th.chunk(iou, 3, 1)
        i, o, u = th.sigmoid(i), th.sigmoid(o), th.tanh(u)
        c = i * u + nodes.data['c']
        h = o * th.tanh(c)
        return {'h': h, 'c': c}

class TreeLSTM(nn.Module):
    def __init__(self,
                 x_size,
                 h_size,
                 num_classes,
                 dropout,
                 cell_type='nary',
                 pretrained_emb=None):
        super(TreeLSTM, self).__init__()
        self.x_size = x_size
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(h_size, num_classes)
        cell = TreeLSTMCell if cell_type == 'nary' else ChildSumTreeLSTMCell
        self.cell = cell(x_size, h_size)

    def forward(self, g, h, c):
        """Compute tree-lstm prediction given a batch.
        Parameters
        ----------
        batch : dgl.data.SSTBatch
            The data batch.
        h : Tensor
            Initial hidden state.
        c : Tensor
            Initial cell state.
        Returns
        -------
        logits : Tensor
            The prediction of each node.
        """
#         g = batch.graph
        g.register_message_func(self.cell.message_func)
        g.register_reduce_func(self.cell.reduce_func)
        g.register_apply_node_func(self.cell.apply_node_func)
        
        # feed embedding
        embeds = g.ndata['data'].float()
        g.ndata['iou'] = self.cell.W_iou(embeds).float()
        g.ndata['h'] = h
        g.ndata['c'] = c
        
        # propagate
#         dgl.prop_nodes_topo(g)
        traversal_order = dgl.topological_nodes_generator(g)
        g.prop_nodes(traversal_order)
        
        # compute logits
        h = self.dropout(g.ndata.pop('h'))
        logits = self.linear(h)
        
        return logits

In [121]:
from torch.utils.data import DataLoader
import torch.nn.functional as F

device = th.device('cpu')
# hyper parameters
x_size = 128
h_size = 128
dropout = 0.5
lr = 0.05
weight_decay = 1e-4
epochs = 10

# create the model
model = TreeLSTM(x_size,
                 h_size,
                 6,
                 dropout)
print(model)

TreeLSTM(
  (dropout): Dropout(p=0.5, inplace=False)
  (linear): Linear(in_features=128, out_features=6, bias=True)
  (cell): TreeLSTMCell(
    (W_iou): Linear(in_features=128, out_features=384, bias=False)
    (U_iou): Linear(in_features=256, out_features=384, bias=False)
    (U_f): Linear(in_features=256, out_features=256, bias=True)
  )
)


In [122]:
# from gensim.models import KeyedVectors as word2vec
# code2vec = word2vec.load_word2vec_format("../data/token_vecs.txt", binary=False)

In [123]:
dataset = CloneDataset("../data/train.npz", "../data/bcb_funcs_all.tsv", "../data/java.so", code2vec)

loading functions
preparing parser
loading code2vec


In [124]:
dataset[0]

(DGLGraph(num_nodes=170, num_edges=169,
          ndata_schemes={'data': Scheme(shape=(128,), dtype=torch.float64)}
          edata_schemes={}),
 5)

In [125]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device = torch.device('cpu')
def collate(samples):
    # The input `samples` is a list of pairs
    #  (graph, label).
    graphs, labels = map(list, zip(*samples))
    batched_graph = dgl.batch(graphs)
    return batched_graph.to(device), torch.tensor(labels).to(device)

In [126]:
train_loader = DataLoader(dataset,
                          batch_size=4,
                          collate_fn=collate,
                          shuffle=True,
                          num_workers=0
                         )

In [127]:
optimizer = torch.optim.Adagrad(model.parameters(), lr=0.0001)

In [128]:
for batch in train_loader:
    break

In [129]:
g,y=batch

In [130]:
n = g.number_of_nodes()

In [131]:
h = torch.zeros((n, h_size)).to(device)
c = torch.zeros((n, h_size)).to(device)

In [132]:
logits = model(g, h, c)

RuntimeError: size mismatch, m1: [12 x 128], m2: [256 x 256] at /opt/conda/conda-bld/pytorch_1587428398394/work/aten/src/TH/generic/THTensorMath.cpp:41

In [133]:
code2vec['int']

array([-1.01240054e-02,  4.42560966e-04,  4.24054414e-02,  8.56862217e-03,
       -2.30304897e-02,  3.28675285e-02, -1.81365516e-02, -6.60097897e-02,
       -8.27029124e-02, -5.99313155e-03, -4.36425395e-03, -8.52343626e-03,
        2.87920013e-02, -3.62295774e-03,  2.27357056e-02,  1.20289177e-02,
       -1.08650681e-02,  1.41063749e-04, -4.97219451e-02, -3.72136533e-02,
       -3.92279327e-02,  1.50501123e-02, -2.88130436e-02,  1.08163403e-02,
       -9.84790735e-04,  8.59240815e-03,  1.68674830e-02, -2.80296039e-02,
       -4.87690279e-03,  1.78364813e-02,  6.07565418e-02, -2.23978199e-02,
        8.84049386e-02,  5.09566478e-02, -1.75079983e-02, -5.12673520e-02,
       -3.87850292e-02,  2.27795020e-02, -1.54022262e-01,  1.86604727e-02,
       -3.82257774e-02, -1.05149839e-02,  1.70635451e-02, -1.05162999e-02,
        4.94960807e-02,  3.24654719e-03, -1.34795859e-01, -1.13393851e-02,
        7.32898265e-02, -3.52490954e-02,  5.21207005e-02,  3.96120275e-04,
       -2.39544921e-02,  