In [1]:
import sys
sys.path.append('/tf/main/notebooks/code2vec/code2vec')

from typing import *
import tensorflow as tf
tf.executing_eagerly()
tf.config.set_visible_devices([], 'GPU')

In [20]:
import pandas as pd
from pprint import pprint

## code2vec_vectorization

> This module uses Code2Vec model to perform vectorization on source code

> @Alvaro 14 April 2021

## Load model

In [3]:
config = {
    'code2vec_model_path': '/tf/main/dvc-ds4se/models/cv/java-large-release/saved_model_iter3.release',
    'code2vec_predicter': {
        'SHOW_TOP_CONTEXTS': 10,
        'MAX_PATH_LENGTH': 8,
        'MAX_PATH_WIDTH': 2,
        'JAR_PATH': 'JavaExtractor/JPredict/target/JavaExtractor-0.0.1-SNAPSHOT.jar',
    },
    'codesearchnet_java_csv_path': '/tf/main/dvc-ds4se/code/searchnet/[codesearchnet-java-1597073966.81902].csv',
    'spm_model_path': '/tf/main/my_model/spm',
    'spm_vocab_size': 16384,
    'spm_sentence_length': 256,
    'ae_checkpoint_path': '/tf/main/my_model/ae_checkpoint',
    'ae_embedding_dim': 256,
    'ae_batch_size': 16,
}

In [4]:
import code2vec
import common as code2vec_common
import config as code2vec_config
import extractor as code2vec_extrator

code2vec_cfg = code2vec_config.Config(set_defaults=True)
code2vec_cfg.PREDICT = True
code2vec_cfg.MODEL_LOAD_PATH = config['code2vec_model_path']
code2vec_cfg.DL_FRAMEWORK = 'tensorflow'
code2vec_cfg.EXPORT_CODE_VECTORS = True
code2vec_cfg.verify()

In [5]:
help(code2vec)

Help on module code2vec:

NAME
    code2vec

FUNCTIONS
    load_model_dynamically(config:config.Config) -> model_base.Code2VecModelBase

FILE
    /tf/main/notebooks/code2vec/code2vec/code2vec.py




In [6]:
code2vec_model = code2vec.load_model_dynamically(code2vec_cfg)
code2vec_path_extractor = code2vec_extrator.Extractor(
    code2vec_cfg,
    jar_path=config['code2vec_predicter']['JAR_PATH'],
    max_path_length=config['code2vec_predicter']['MAX_PATH_LENGTH'],
    max_path_width=config['code2vec_predicter']['MAX_PATH_WIDTH']
)

In [12]:
import os
import tempfile
import numpy as np

def code2vec_predict(code: str) -> List[np.ndarray]:
    os.chdir('/tf/main/notebooks/code2vec/code2vec')
    with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', suffix='.java') as input_file:
        input_file.write(code)
        input_file.flush()
        input_filename = input_file.name
        try:
            predict_lines, hash_to_string_dict = code2vec_path_extractor.extract_paths(input_filename)
        except ValueError as e:
            raise
    assert len(predict_lines) == 1
    raw_prediction_results = code2vec_model.predict(predict_lines)
    assert len(raw_prediction_results) == 1
    raw_prediction = raw_prediction_results[0]
    return raw_prediction.code_vector

## Load data (Searchnet - Java)

In [8]:
java_df = pd.read_csv(config['codesearchnet_java_csv_path'], header=0, index_col=0, sep='~')

In [9]:
java_df.head()

Unnamed: 0,repo,path,url,code,code_tokens,docstring,docstring_tokens,language,partition,bpe32k,code_len,bpe32_len
0,ReactiveX/RxJava,src/main/java/io/reactivex/internal/observers/...,https://github.com/ReactiveX/RxJava/blob/ac841...,protected final void fastPathOrderedEmit(U val...,"['protected', 'final', 'void', 'fastPathOrdere...",Makes sure the fast-path emits in order.\n@par...,"['Makes', 'sure', 'the', 'fast', '-', 'path', ...",java,test,"['▁protected', '▁final', '▁void', '▁fast', 'Pa...",134,138
1,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,https://github.com/ReactiveX/RxJava/blob/ac841...,@CheckReturnValue\n @NonNull\n @Schedule...,"['@', 'CheckReturnValue', '@', 'NonNull', '@',...",Mirrors the one ObservableSource in an Iterabl...,"['Mirrors', 'the', 'one', 'ObservableSource', ...",java,test,"['▁', '@', 'CheckReturnValue', '▁', '@', 'NonN...",63,71
2,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,https://github.com/ReactiveX/RxJava/blob/ac841...,"@SuppressWarnings(""unchecked"")\n @CheckRetu...","['@', 'SuppressWarnings', '(', '""unchecked""', ...",Mirrors the one ObservableSource in an array o...,"['Mirrors', 'the', 'one', 'ObservableSource', ...",java,test,"['▁', '@', 'SuppressWarnings', '(""', 'unchecke...",107,109
3,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,https://github.com/ReactiveX/RxJava/blob/ac841...,"@SuppressWarnings({ ""unchecked"", ""rawtypes"" })...","['@', 'SuppressWarnings', '(', '{', '""unchecke...",Concatenates elements of each ObservableSource...,"['Concatenates', 'elements', 'of', 'each', 'Ob...",java,test,"['▁', '@', 'SuppressWarnings', '({', '▁""', 'un...",79,83
4,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,https://github.com/ReactiveX/RxJava/blob/ac841...,"@SuppressWarnings({ ""unchecked"", ""rawtypes"" })...","['@', 'SuppressWarnings', '(', '{', '""unchecke...",Returns an Observable that emits the items emi...,"['Returns', 'an', 'Observable', 'that', 'emits...",java,test,"['▁', '@', 'SuppressWarnings', '({', '▁""', 'un...",91,112


## Use code2vec with Searchnet data

In [13]:
for i, row in java_df.iterrows():
    if i == 4:
        break
    print('\n'.join(repr(code2vec_predict(row['code'])).split('\n')[:2]))

INFO:tensorflow:Restoring parameters from /tf/main/dvc-ds4se/models/cv/java-large-release/saved_model_iter3.release
array([-0.02760463, -0.7521175 ,  0.28506714, -0.27454707, -0.18097347,
        0.24782339, -0.6722916 ,  0.46888486, -0.28081748,  0.8019222 ,
array([-0.78671896,  0.285906  , -0.85090244, -0.8599002 ,  0.91621435,
        0.9692382 , -0.8304114 ,  0.94401   , -0.86446977,  0.50095403,
array([-0.5689152 ,  0.03706676, -0.83483744, -0.5761111 ,  0.73942655,
        0.73533094, -0.8511594 ,  0.89340436, -0.71481305,  0.53665817,
array([ 7.55247056e-01, -5.92072904e-01, -9.10713613e-01, -8.35382640e-01,
        9.15723562e-01, -5.24400175e-01, -6.66961819e-02,  9.70486164e-01,


In [34]:
samples = java_df.sample(10)

# Limitations

Apparently Code2vec is not able to code snippets with errors, because it is unable to generate the required AST structure.

In [39]:
c2v_vecs = samples['code'].apply(lambda snippet: code2vec_predict(snippet))

ValueError: Exception in thread "main" com.github.javaparser.ParseProblemException: Encountered unexpected token: "}" "}"
    at line 103, column 2.

Was expecting one of:

    ";"
    "@"
    "\u001a"
    "abstract"
    "class"
    "enum"
    "final"
    "interface"
    "native"
    "private"
    "protected"
    "public"
    "static"
    "strictfp"
    "synchronized"
    "transient"
    "volatile"
    <EOF>


	at com.github.javaparser.JavaParser.simplifiedParse(JavaParser.java:242)
	at com.github.javaparser.JavaParser.parse(JavaParser.java:210)
	at JavaExtractor.FeatureExtractor.parseFileWithRetries(FeatureExtractor.java:70)
	at JavaExtractor.FeatureExtractor.extractFeatures(FeatureExtractor.java:40)
	at JavaExtractor.ExtractFeaturesTask.extractSingleFile(ExtractFeaturesTask.java:64)
	at JavaExtractor.ExtractFeaturesTask.processFile(ExtractFeaturesTask.java:39)
	at JavaExtractor.App.main(App.java:33)


In [37]:
c2v_vecs

29525    public static String getVcsUrl(Map<String, Str...
7962     public void write(Writer out)\n         throws...
22950    public static byte[] encodeBase64(final byte[]...
14894    public DataSet getDataSet()\n    {\n        if...
11822    @Override\n  public void close() {\n    try {\...
21986    public static int cublasGetMatrix (int rows, i...
14580    public static <T, C extends Collection<T>> C c...
8471     public static OperationBuilder gzipCompression...
29768    @Override\n    public synchronized Enumeration...
7750     public final void accept(T value) {\n\n       ...
Name: code, dtype: object

In [28]:
individual_vec = code2vec_predict(java_df['code'].values[345])

In [29]:
print(individual_vec.shape)

(384,)


In [30]:
repr(individual_vec).split('\n')[:2]

['array([ 0.90667623,  0.23809455, -0.8514606 ,  0.59510237,  0.75598276,',
 '        0.33530924, -0.89018124, -0.10340924, -0.08221343,  0.1775785 ,']

In [31]:
pprint(individual_vec)

array([ 0.90667623,  0.23809455, -0.8514606 ,  0.59510237,  0.75598276,
        0.33530924, -0.89018124, -0.10340924, -0.08221343,  0.1775785 ,
        0.68682784,  0.8717235 , -0.8824909 , -0.2163393 , -0.94581205,
       -0.60218436,  0.16467701,  0.5311251 ,  0.61102974, -0.91910887,
       -0.06518988,  0.92059714, -0.6736148 ,  0.29241586, -0.89693016,
        0.9556126 , -0.72767204, -0.6197471 , -0.77548754,  0.65452117,
       -0.8779617 , -0.89800316, -0.90010667,  0.9015    , -0.8361703 ,
        0.9941117 ,  0.83574724, -0.8061686 , -0.65711653, -0.8074735 ,
        0.78486156, -0.29897797,  0.9641091 ,  0.5523618 , -0.52341473,
        0.7395637 , -0.98014444,  0.05120218,  0.8844695 , -0.9133232 ,
        0.14564386, -0.971864  , -0.03832306, -0.24839212, -0.726976  ,
        0.6552962 ,  0.8738723 , -0.93183345,  0.9217573 , -0.47155234,
        0.23935933, -0.9609556 , -0.7694921 ,  0.00795595,  0.6573367 ,
        0.14391422,  0.77083814, -0.56578416,  0.6385719 , -0.86

In [15]:
def get_code2vec_vectors(df: pd.DataFrame, code_column: str):
    """
    """

    c2v_vectors = df[code_column]