In [1]:
import sys
sys.path.append('/tf/main/ds4se/notebooks/code2vec/code2vec')

from typing import *
import tensorflow as tf
tf.executing_eagerly()
tf.config.set_visible_devices([], 'GPU')

In [2]:
config = {
    'code2vec_model_path': '/tf/main/ds4se/dvc-ds4se/models/cv/java-large-release/saved_model_iter3.release',
    'code2vec_predicter': {
        'SHOW_TOP_CONTEXTS': 10,
        'MAX_PATH_LENGTH': 8,
        'MAX_PATH_WIDTH': 2,
        'JAR_PATH': 'JavaExtractor/JPredict/target/JavaExtractor-0.0.1-SNAPSHOT.jar',
    },
    'codesearchnet_java_csv_path': '/tf/main/ds4se/dvc-ds4se/code/searchnet/[codesearchnet-java-1597073966.81902].csv',
    'spm_model_path': '/tf/main/my_model/spm',
    'spm_vocab_size': 16384,
    'spm_sentence_length': 256,
    'ae_checkpoint_path': '/tf/main/my_model/ae_checkpoint',
    'ae_embedding_dim': 256,
    'ae_batch_size': 16,
}

## Load CodeSearchNet-Java

In [3]:
import pandas as pd

df_searchnet = pd.read_csv(config['codesearchnet_java_csv_path'], header=0, index_col=0, sep='~')

In [4]:
len(df_searchnet)

496688

In [5]:
df_searchnet.head(4)

Unnamed: 0,repo,path,url,code,code_tokens,docstring,docstring_tokens,language,partition,bpe32k,code_len,bpe32_len
0,ReactiveX/RxJava,src/main/java/io/reactivex/internal/observers/...,https://github.com/ReactiveX/RxJava/blob/ac841...,protected final void fastPathOrderedEmit(U val...,"['protected', 'final', 'void', 'fastPathOrdere...",Makes sure the fast-path emits in order.\n@par...,"['Makes', 'sure', 'the', 'fast', '-', 'path', ...",java,test,"['▁protected', '▁final', '▁void', '▁fast', 'Pa...",134,138
1,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,https://github.com/ReactiveX/RxJava/blob/ac841...,@CheckReturnValue\n @NonNull\n @Schedule...,"['@', 'CheckReturnValue', '@', 'NonNull', '@',...",Mirrors the one ObservableSource in an Iterabl...,"['Mirrors', 'the', 'one', 'ObservableSource', ...",java,test,"['▁', '@', 'CheckReturnValue', '▁', '@', 'NonN...",63,71
2,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,https://github.com/ReactiveX/RxJava/blob/ac841...,"@SuppressWarnings(""unchecked"")\n @CheckRetu...","['@', 'SuppressWarnings', '(', '""unchecked""', ...",Mirrors the one ObservableSource in an array o...,"['Mirrors', 'the', 'one', 'ObservableSource', ...",java,test,"['▁', '@', 'SuppressWarnings', '(""', 'unchecke...",107,109
3,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,https://github.com/ReactiveX/RxJava/blob/ac841...,"@SuppressWarnings({ ""unchecked"", ""rawtypes"" })...","['@', 'SuppressWarnings', '(', '{', '""unchecke...",Concatenates elements of each ObservableSource...,"['Concatenates', 'elements', 'of', 'each', 'Ob...",java,test,"['▁', '@', 'SuppressWarnings', '({', '▁""', 'un...",79,83


## Running code2vec over CodeSearchNet-Java

In [None]:
import code2vec
import common as code2vec_common
import config as code2vec_config
import extractor as code2vec_extrator

code2vec_cfg = code2vec_config.Config(set_defaults=True)
code2vec_cfg.PREDICT = True
code2vec_cfg.MODEL_LOAD_PATH = config['code2vec_model_path']
code2vec_cfg.DL_FRAMEWORK = 'tensorflow'
code2vec_cfg.EXPORT_CODE_VECTORS = True
code2vec_cfg.verify()

In [None]:
code2vec_model = code2vec.load_model_dynamically(code2vec_cfg)
code2vec_path_extractor = code2vec_extrator.Extractor(
    code2vec_cfg,
    jar_path=config['code2vec_predicter']['JAR_PATH'],
    max_path_length=config['code2vec_predicter']['MAX_PATH_LENGTH'],
    max_path_width=config['code2vec_predicter']['MAX_PATH_WIDTH']
)

In [None]:
import os
import tempfile
import numpy as np

def code2vec_predict(code: str) -> List[np.ndarray]:
    os.chdir('/tf/main/ds4se/notebooks/code2vec/code2vec')
    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.java') as input_file:
        input_file.write(code)
        input_file.flush()
        input_filename = input_file.name
        try:
            predict_lines, hash_to_string_dict = code2vec_path_extractor.extract_paths(input_filename)
        except ValueError as e:
            raise
    assert len(predict_lines) == 1
    raw_prediction_results = code2vec_model.predict(predict_lines)
    assert len(raw_prediction_results) == 1
    raw_prediction = raw_prediction_results[0]
    return raw_prediction.code_vector

In [None]:
for i, row in df_searchnet.iterrows():
    if i == 4:
        break
    print('\n'.join(repr(code2vec_predict(row['code'])).split('\n')[:2]))

## Use google/sentencepiece for input tokenization

In [None]:
! pip3 install --user -U sentencepiece

### Training

In [6]:
import sentencepiece as spm

print("Training... (Use docker logs -f NAME to see the logs)", flush=True)
spm.SentencePieceTrainer.train(
    sentence_iterator=(i for i in df_searchnet['code']),
    max_sentence_length=config['spm_sentence_length'],
    model_prefix=config['spm_model_path'],
    model_type='bpe',
    vocab_size=config['spm_vocab_size']
)
print('Finished.')

Training... (Use docker logs -f NAME to see the logs)
Finished.


### Load the model

In [7]:
import sentencepiece as spm

spm_model = spm.SentencePieceProcessor(model_file=config['spm_model_path'] + '.model')

In [8]:
print(spm_model.encode(df_searchnet['code'].iloc[0], out_type='str', add_bos=True, add_eos=True))
print(spm_model.encode(df_searchnet['code'].iloc[0], add_bos=True, add_eos=True))

['<s>', '▁protected', '▁final', '▁void', '▁fast', 'Path', 'Ordered', 'Emit', '(', 'U', '▁value', ',', '▁boolean', '▁delay', 'Error', ',', '▁Disposable', '▁dis', 'posable', ')', '▁{', '▁final', '▁O', 'bserver', '<?', '▁super', '▁V', '>', '▁observer', '▁=', '▁downstream', ';', '▁final', '▁Simple', 'Plain', 'Queue', '<', 'U', '>', '▁q', '▁=', '▁queue', ';', '▁if', '▁(', 'w', 'ip', '.', 'get', '()', '▁==', '▁0', '▁&&', '▁w', 'ip', '.', 'compareAndSet', '(0,', '▁1))', '▁{', '▁if', '▁(', 'q', '.', 'isEmpty', '())', '▁{', '▁accept', '(', 'observer', ',', '▁value', ');', '▁if', '▁(', 'le', 'ave', '(', '-1)', '▁==', '▁0)', '▁{', '▁return', ';', '▁}', '▁}', '▁else', '▁{', '▁q', '.', 'offer', '(', 'value', ');', '▁}', '▁}', '▁else', '▁{', '▁q', '.', 'offer', '(', 'value', ');', '▁if', '▁(!', 'enter', '())', '▁{', '▁return', ';', '▁}', '▁}', '▁Queue', 'Dr', 'ain', 'Helper', '.', 'dr', 'ain', 'Loop', '(', 'q', ',', '▁observer', ',', '▁delay', 'Error', ',', '▁dis', 'posable', ',', '▁this', ');', '▁}

In [9]:
spm_model.vocab_size()

16384

### Preprocess

In [10]:
import itertools

def encode_and_pad(s: str) -> tf.Tensor:
    return tf.convert_to_tensor(
        tf.keras.preprocessing.sequence.pad_sequences([
            spm_model.encode(s, add_bos=True, add_eos=True)
        ], maxlen=config['spm_sentence_length'], padding='post')[0],
        dtype=tf.int32
    )

df_searchnet_tf_slices = []
for i, row in enumerate(df_searchnet['code']):
    df_searchnet_tf_slices.append(encode_and_pad(row))
    if (i % 10000) == 0:
        print(i)
        
df_searchnet_tf = tf.data.Dataset.from_tensor_slices(df_searchnet_tf_slices).shuffle(len(df_searchnet_tf_slices))
df_searchnet_tf = df_searchnet_tf.batch(config['ae_batch_size'], drop_remainder=True)
del df_searchnet_tf_slices

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000


In [11]:
df_searchnet_tf

<BatchDataset shapes: (16, 256), types: tf.int32>

### Autoencoder

In [12]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))


class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # query hidden state shape == (batch_size, hidden size)
        # query_with_time_axis shape == (batch_size, 1, hidden size)
        # values shape == (batch_size, max_len, hidden size)
        # we are doing this to broadcast addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights


class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights


optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')


def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)


@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden

        dec_input = tf.expand_dims([1] * config['ae_batch_size'], 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [13]:
encoder = Encoder(
    spm_model.vocab_size(),
    config['ae_embedding_dim'],
    config['spm_sentence_length'],
    config['ae_batch_size'])
decoder = Decoder(
    spm_model.vocab_size(),
    config['ae_embedding_dim'],
    config['spm_sentence_length'],
    config['ae_batch_size'])
checkpoint_prefix = config['ae_checkpoint_path']
checkpoint = tf.train.Checkpoint(
    optimizer=optimizer,
    encoder=encoder,
    decoder=decoder)

In [14]:
# sample input
example_input_batch = next(iter(df_searchnet_tf))
example_input_batch.shape

sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print('Encoder output shape: (batch size, sequence length, units)', sample_output.shape)
print('Encoder Hidden state shape: (batch size, units)', sample_hidden.shape)

sample_decoder_output, _, _ = decoder(tf.random.uniform((config['ae_batch_size'], 1)),
                                      sample_hidden, sample_output)
print('Decoder output shape: (batch_size, vocab size)', sample_decoder_output.shape)

Encoder output shape: (batch size, sequence length, units) (16, 256, 256)
Encoder Hidden state shape: (batch size, units) (16, 256)
Decoder output shape: (batch_size, vocab size) (16, 16384)


In [15]:
import time

EPOCHS = 10
steps_per_epoch = config['spm_sentence_length']//config['ae_batch_size']

for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, inp) in enumerate(df_searchnet_tf.take(config['ae_batch_size'])):
        batch_loss = train_step(inp, inp, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))


Epoch 1 Batch 0 Loss 3.3997
Epoch 1 Loss 3.7845
Time taken for 1 epoch 436.3125982284546 sec

Epoch 2 Batch 0 Loss 2.6036
Epoch 2 Loss 3.1126
Time taken for 1 epoch 134.10465955734253 sec

Epoch 3 Batch 0 Loss 2.4923
Epoch 3 Loss 3.1136
Time taken for 1 epoch 134.5851469039917 sec

Epoch 4 Batch 0 Loss 2.7383
Epoch 4 Loss 3.1887
Time taken for 1 epoch 133.68952465057373 sec

Epoch 5 Batch 0 Loss 2.6992
Epoch 5 Loss 3.1851
Time taken for 1 epoch 135.46811485290527 sec

Epoch 6 Batch 0 Loss 2.5103
Epoch 6 Loss 2.9542
Time taken for 1 epoch 137.94829106330872 sec

Epoch 7 Batch 0 Loss 3.3405
Epoch 7 Loss 3.1072
Time taken for 1 epoch 135.55201148986816 sec

Epoch 8 Batch 0 Loss 2.3652
Epoch 8 Loss 2.9516
Time taken for 1 epoch 135.92289400100708 sec

Epoch 9 Batch 0 Loss 2.9978
Epoch 9 Loss 2.7130
Time taken for 1 epoch 136.479975938797 sec

Epoch 10 Batch 0 Loss 3.6442
Epoch 10 Loss 2.8110
Time taken for 1 epoch 134.91528129577637 sec

