In [9]:
from typing import Optional

import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras.layers import Input, Embedding, Dense, Dot, Activation, Lambda
from tensorflow.keras.models import Model

from collections import OrderedDict #for ordered sets of the data

In [10]:
#having functions.ndjson 
#create set of leaf_node_values
#create set of all distinct paths
#create set of all tags_vocab

value_vocab = set() #set of all leaf values
path_vocab = set() #set of all distinct paths
tags_vocab = set() #set of all distinct function tags

#vocab sizes and embedding dimensions
value_vocab_size = len(value_vocab)
path_vocab_size = len(path_vocab)
tags_vocab_size = len(tags_vocab)
y = tags_vocab_size
embedding_dim = 128 


# Dummy vocabulary sizes and dimensions
value_vocab_size = 10000  # Example vocab size for value vocab
path_vocab_size = 5000    # Example vocab size for path vocab
tags_vocab_size = 1000    # Example tag vocab size
embedding_dim = 128       # Embedding dimension
num_context = 20          # Number of contexts for a single function

# inputs for value1, path, and value2 (with num_context inputs per batch)
input_value1 = Input(shape=(num_context,), name='value1_input')
input_path = Input(shape=(num_context,), name='path_input')
input_value2 = Input(shape=(num_context,), name='value2_input')

# Embedding layers
value_embedding = Embedding(input_dim=value_vocab_size, output_dim=embedding_dim, name='value_embedding')
path_embedding = Embedding(input_dim=path_vocab_size, output_dim=embedding_dim, name='path_embedding')
tag_embedding = Embedding(input_dim=tags_vocab_size, output_dim=embedding_dim, name='tag_embedding')

# Embed the inputs
embedded_value1 = value_embedding(input_value1)  # Shape: (None, num_context, embedding_dim)
embedded_path = path_embedding(input_path)      # Shape: (None, num_context, embedding_dim)
embedded_value2 = value_embedding(input_value2)  # Shape: (None, num_context, embedding_dim)

# Concatenate along the last axis (for each context, value1, path, and value2 are concatenated)
embedded_concat = Concatenate(axis=-1)([embedded_value1, embedded_path, embedded_value2])
# Shape: (None, num_context, 3 * embedding_dim)

# Apply a dense transformation to each concatenated context (row-wise transformation)
transformed_contexts = Dense(units=y, activation='tanh')(embedded_concat)
# Shape: (None, num_context, y)

# Attention mechanism
attention_weights = Dense(1, activation='softmax')(transformed_contexts)
# Shape: (None, num_context, 1) - attention scores for each context

# apply attention weights to get the weighted sum of contexts
weighted_context = tf.reduce_sum(attention_weights * transformed_contexts, axis=1)
# shape: (None, embedding_dim) - weighted sum across contexts

# get the tag embeddings
tags_embedding_matrix = tag_embedding(tf.range(tags_vocab_size))  # Shape: (tags_vocab_size, embedding_dim)

# compute the dot product between the weighted context and all tag embeddings
tag_scores = tf.matmul(weighted_context, tags_embedding_matrix, transpose_b=True)  # Shape: (None, tags_vocab_size)

# apply softmax to get probabilities over all tags
output = Softmax()(tag_scores)
# Shape: (None, tags_vocab_size) - probabilities for each tag

# Define the model
model = Model(inputs=[input_value1, input_path, input_value2], outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Summary of the model
model.summary()



NameError: name 'Concatenate' is not defined

In [4]:
#NODE TO NODE PATHS
# Function to collect all leaf nodes iteratively using DFS
def collect_leaves_iterative(root):
    if root is None:
        return []

    stack = [(root, [])]  # Stack to store (node, path_from_root)
    leaves = []  # List to store leaf nodes and their paths

    while stack:
        node, path = stack.pop()
        current_path = path + [node.kind]  # Update the current path

        # leaf node - has no children
        if not node.children:
            leaves.append((node, current_path))

        # push the children to the stack for DFS
        children = reversed(node.children)
        for child in children:  # process children in order on the stack
            stack.append((child, current_path))

    return leaves


# Function to find the Lowest Common Ancestor (LCA) iteratively
def find_lca_iterative(n1_path, n2_path):
    length = n1_path.length if n1_path.length < n2_path.length else n2_path.length

    lca = None
    for i in range(length):
        if n1_path[i] == n2_path[i]:
            lca = n1_path[i]
        else:
            break
    return lca


def find_leaf_to_leaf_paths_iterative(root):
    leaf_nodes = collect_leaves_iterative(root)

    #list of all leaf-to-leaf paths
    leaf_to_leaf_paths = []

    # Iterate over each pair of leaf nodes
    for i in range(len(leaf_nodes)):
        for j in range(i + 1, len(leaf_nodes)):
            leaf1, path1 = leaf_nodes[i]
            leaf2, path2 = leaf_nodes[j]

            # find lca
            lca = find_lca_iterative(path1, path2)

            # find the indexes
            lca_index1 = path1.index(lca)
            lca_index2 = path2.index(lca)

            # Path from leaf1 to leaf2 via the LCA
            path_to_lca_from_leaf1 = path1[:lca_index1 + 1]
            path_to_lca_from_leaf2 = path2[:lca_index2 + 1]
            path_to_lca_from_leaf2.reverse().pop()

            #combine the paths
            complete_path = path_to_lca_from_leaf1 + path_to_lca_from_leaf2

            # Add the complete leaf-to-leaf path to the result
            leaf_to_leaf_paths.append(complete_path)

    return leaf_to_leaf_paths



In [14]:
def generate_vocabs(file_path):
    # Open the .ndjson file
    with open(file_path, 'r') as ndjson_file:
        # Load the file content
        data = ndjson.load(ndjson_file)

        value_vocab = set() #set of all leaf values
        path_vocab = set() #set of all distinct paths
        tags_vocab = set() #set of all distinct function tags
        
        for function_json in data:
            # convert each line (function) to a tree
            func_root = json_to_tree(function_json)
            func_values, func_paths = find_leaf_to_leaf_paths_iterative(root)

            # add to vocabs new values from calling find_leafs_to_leaves

        return value_vocab, path_vocab, tags_vocab
            
            
    

In [13]:
def json_to_tree(data: dict) -> Node:
    """
    Recursively builds a tree of Node objects from a JSON dictionary.
    """
    node = Node(
        b_i=None,
        kind=data.get('kind'),
        code_pos=data.get('code_pos'),
        data=data.get('data')
    )

    # Recursively add children
    for child_data in data.get('children', []):
        child_node = json_to_tree(child_data)
        child_node.set_parent(node)  # Set the parent for the child node
        node.add_child(child_node)

    return node
    

In [11]:

class Node:
    def __init__(self, b_i: Optional[int], kind: str, code_pos: str, data: str):
        self.branching_idx = b_i
        self.parent = None
        self.children = []
        self.kind = kind
        self.code_pos = code_pos
        self.data = data

    def set_parent(self, parent: 'Node'):
        self.parent = parent

    def add_child(self, child: 'Node'):
        self.children.append(child)

    def to_dict(self):
        """Convert the node and its children to a dictionary."""
        return {
            'kind': self.kind,
            'code_pos': self.code_pos,
            'data': self.data,
            'children': [child.to_dict() for child in self.children]
        }