In [2]:
from collections import OrderedDict  # for ordered sets of the data

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import (
    Activation,
    Concatenate,
    Dense,
    Dot,
    Embedding,
    Input,
    Lambda,
    Softmax,
)
from tensorflow.keras.models import Model

In [None]:
# Vocabulary and hyper-parameter configuration.
#
# Per the original notes, the three vocabularies are meant to be built from
# functions.ndjson: the set of leaf-node values, the set of all distinct
# paths, and the set of all function tags. The loading code is not written
# yet, so the sets start out empty.
value_vocab = set()  # set of all leaf values
path_vocab = set()   # set of all distinct paths
tags_vocab = set()   # set of all distinct function tags

# Placeholder sizes used only while a vocabulary above is still empty, so the
# rest of the notebook can be exercised before the data pipeline exists.
DUMMY_VALUE_VOCAB_SIZE = 10000
DUMMY_PATH_VOCAB_SIZE = 5000
DUMMY_TAGS_VOCAB_SIZE = 1000

# `len(vocab) or DUMMY` keeps the real size as soon as a vocabulary is
# populated and falls back to the placeholder only for an empty set.
# (Previously the real sizes were computed and then unconditionally
# overwritten by the dummy values.)
value_vocab_size = len(value_vocab) or DUMMY_VALUE_VOCAB_SIZE
path_vocab_size = len(path_vocab) or DUMMY_PATH_VOCAB_SIZE
tags_vocab_size = len(tags_vocab) or DUMMY_TAGS_VOCAB_SIZE

# Alias for the number of tags. Assigned after the final size is known;
# it used to be captured as 0 from the empty set before the dummy override.
y = tags_vocab_size

embedding_dim = 128  # width of every embedding vector
num_context = 20     # number of contexts for a single function

# Attention model over (value1, path, value2) contexts that outputs a
# probability distribution over function tags.
#
# Pipeline: embed the three ids of every context -> concatenate -> dense tanh
# projection -> attention-weighted sum over the contexts -> score the pooled
# vector against one learned vector per tag -> softmax.

# Inputs: one integer id per context for value1, path and value2.
# Declared int32 because they are only ever used as embedding indices.
input_value1 = Input(shape=(num_context,), dtype='int32', name='value1_input')
input_path = Input(shape=(num_context,), dtype='int32', name='path_input')
input_value2 = Input(shape=(num_context,), dtype='int32', name='value2_input')

# Embedding tables. value1 and value2 share the same table.
value_embedding = Embedding(input_dim=value_vocab_size, output_dim=embedding_dim, name='value_embedding')
path_embedding = Embedding(input_dim=path_vocab_size, output_dim=embedding_dim, name='path_embedding')

# Embed the inputs.
embedded_value1 = value_embedding(input_value1)  # Shape: (None, num_context, embedding_dim)
embedded_path = path_embedding(input_path)       # Shape: (None, num_context, embedding_dim)
embedded_value2 = value_embedding(input_value2)  # Shape: (None, num_context, embedding_dim)

# Concatenate along the last axis so each context becomes one vector.
embedded_concat = Concatenate(axis=-1, name='context_concat')(
    [embedded_value1, embedded_path, embedded_value2]
)
# Shape: (None, num_context, 3 * embedding_dim)

# Dense is applied to the last axis, i.e. independently to every context.
# The output width must be embedding_dim (not the number of tags) so that the
# pooled vector can be scored against tag vectors of the same width.
transformed_contexts = Dense(
    units=embedding_dim, activation='tanh', name='context_transform'
)(embedded_concat)
# Shape: (None, num_context, embedding_dim)

# Attention: one scalar score per context, normalised ACROSS the contexts.
# A softmax inside Dense(1) would normalise over the size-1 last axis and
# make every weight exactly 1, so the softmax is applied separately on axis 1.
attention_scores = Dense(1, name='attention_score')(transformed_contexts)
attention_weights = Softmax(axis=1, name='attention_weights')(attention_scores)
# Shape: (None, num_context, 1) - weights sum to 1 over the contexts

# Weighted sum of the contexts. Wrapped in a Lambda layer so the TensorFlow
# op is part of the functional graph rather than applied to symbolic tensors
# directly.
weighted_context = Lambda(
    lambda pair: tf.reduce_sum(pair[0] * pair[1], axis=1),
    name='weighted_context',
)([attention_weights, transformed_contexts])
# Shape: (None, embedding_dim)

# Tag embeddings as a bias-free Dense layer: its kernel has shape
# (embedding_dim, tags_vocab_size), so column j is the vector of tag j and the
# layer output is the dot product of the pooled vector with every tag vector.
# Unlike calling an Embedding on tf.range(...) outside the graph, this makes
# the tag matrix a trainable weight of the model.
tag_embedding = Dense(tags_vocab_size, use_bias=False, name='tag_embedding')
tag_scores = tag_embedding(weighted_context)  # Shape: (None, tags_vocab_size)

# Softmax over the tag axis gives a probability for each tag.
output = Softmax(name='tag_probabilities')(tag_scores)
# Shape: (None, tags_vocab_size)

# Define the model.
model = Model(inputs=[input_value1, input_path, input_value2], outputs=output)
# NOTE(review): 'categorical_crossentropy' expects one-hot targets of shape
# (batch, tags_vocab_size); if labels are integer tag ids, confirm whether
# 'sparse_categorical_crossentropy' is what is wanted.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Summary of the model.
model.summary()

