In [65]:
# default_exp interpretability.d2v_vectorization

In [12]:
# export

import numpy as np
import gensim
import pandas as pd
import os
import sentencepiece as spm
from datetime import datetime
from pathlib import Path
import logging
from tokenizers import Tokenizer
from abc import ABC, abstractmethod

In [3]:
# Configure logging: every record is printed as "<time> : <level> : <message>" at INFO level and above.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# getLogger() without a name returns the root logger.
# NOTE(review): the visible cells call `logging.error(...)` directly and never use `logger`.
logger = logging.getLogger()

# Doc2vec usage for source code

> Use doc2vec models to get distributed representations (embedding vectors) for source code

> @Alvaro 15 April 2021

In [4]:
# utils
def check_file_existence(path):
    """
    Tell whether something exists at the given filesystem path.
    :param path: Path to look up
    :return: True when the path exists; otherwise an error is logged and False is returned
    """
    found = os.path.exists(path)
    if found:
        return True

    # Only log here; the callers decide whether a missing file is fatal.
    logging.error('Provided file cannot be found.')
    return False

The vectorizer class is defined as abstract so that alternative tokenizers (SentencePiece and HuggingFace's Tokenizers) can be plugged in.

In [20]:
# export

class Doc2VecVectorizer(ABC):
    """
    Abstract base for vectorizers that turn tokenized source code into
    Doc2Vec embedding vectors. Subclasses supply the tokenizer
    (how it is loaded and how a DataFrame column is tokenized).
    """
    def __init__(self, tkzr_path:str, d2v_path: str):
        """
        Stores both paths and eagerly loads the tokenizer and the Doc2Vec model.
        :param tkzr_path: Path to the saved tokenizer model
        :param d2v_path: Path to the saved Doc2Vec model
        :raises Exception: If the Doc2Vec model file cannot be found
        """
        self.tkzr_path = tkzr_path
        self.d2v_path = d2v_path

        self._load_tokenizer_model(self.tkzr_path)
        self._load_doc2vec_model(self.d2v_path)

    @abstractmethod
    def tokenize_df(self, df: pd.DataFrame, code_column: str):
        """
        Tokenizes the source code stored in a DataFrame column
        :param df: Pandas DataFrame containing source code
        :param code_column: Name of the column containing source code
        """
        pass

    @abstractmethod
    def _load_tokenizer_model(self, model_path: str):
        """
        Loads the tokenizer model (called from the constructor)
        :param model_path: Path to the tokenizer model file
        """
        pass

    def _load_doc2vec_model(self, model_path: str):
        """
        Loads the Doc2Vec model and stores it in self.d2v_model
        :param model_path: Path to the model file
        :return: None
        :raises Exception: If no file exists at model_path
        """
        if not check_file_existence(model_path):
            msg = 'Doc2vec model could no be loaded'
            logging.error(msg)
            raise Exception(msg)

        self.d2v_model = gensim.models.Doc2Vec.load(model_path)

    def infer_d2v(self, df: pd.DataFrame, tokenized_column: str, out_path: str, steps: int=200) -> tuple:
        """
        Performs vectorization via Doc2Vec model and saves the result to disk
        :param df: Pandas DataFrame containing tokenized source code
        :param tokenized_column: Name of the column holding the token list of each snippet
        :param out_path: Existing directory where `indices.npy` and `feat_vectors.npy` are written
        :param steps: Number of inference steps passed to the Doc2Vec model
        :return: Tuple (indices, inferred_vecs): the DataFrame index as a numpy array
                 and the array with one inferred vector per row
        """
        # Forward the caller's `steps` (previously a hard-coded 200 was always used).
        # NOTE(review): `steps` is the gensim 3.x keyword of infer_vector; gensim 4
        # renamed it to `epochs` - confirm against the installed version.
        inferred_vecs = np.array([self.d2v_model.infer_vector(tok_snippet, steps=steps)
                                  for tok_snippet in df[tokenized_column].values])

        # Keep the original index so that vectors can be mapped back to their rows.
        indices = np.array(df.index)

        # np.save appends the ".npy" extension to both file names.
        np.save(f"{out_path}/indices", indices)
        np.save(f"{out_path}/feat_vectors", inferred_vecs)

        return indices, inferred_vecs

In [48]:
# export

class Doc2VecVectorizerSP(Doc2VecVectorizer):
    """
    Doc2Vec vectorizer whose tokenization step is backed by a
    SentencePiece model.
    """
    def __init__(self, sp_path: str, d2v_path: str):
        """
        :param sp_path: Location of the saved SentencePiece model
        :param d2v_path: Location of the saved Doc2Vec model
        """
        super().__init__(sp_path, d2v_path)

    def _load_tokenizer_model(self, model_path: str):
        """
        Reads the SentencePiece model from disk and keeps the resulting
        processor in self.tokenizer
        :param model_path: Path to the model file
        :return: None
        :raises Exception: If no file exists at model_path
        """
        if check_file_existence(model_path):
            processor = spm.SentencePieceProcessor()
            processor.load(model_path)
            self.tokenizer = processor
        else:
            msg = 'Sentence piece model could no be loaded'
            logging.error(msg)
            raise Exception(msg)

    def tokenize_df(self, df: pd.DataFrame, code_column: str):
        """
        Tokenizes every snippet stored in one column of a DataFrame
        :param df: Pandas DataFrame
        :param code_column: Name of the column holding the source code
        :return: Result of applying the SentencePiece tokenization to that column
                 (one list of subword pieces per row)
        """
        return df[code_column].apply(self.__sp_encode_as_pieces)

    def __sp_encode_as_pieces(self, txt: str) -> list:
        """
        Splits the given text into subword pieces with the loaded SentencePieceProcessor
        :param txt: String to be tokenized
        :return: List of subword tokens
        """
        pieces = self.tokenizer.encode_as_pieces(txt)
        return pieces

    def __sp_encode_as_ids(self, txt: str) -> list:
        """
        Encodes the given text into subword ids with the loaded SentencePieceProcessor
        :param txt: String to be encoded
        :return: List of subword token ids
        """
        ids = self.tokenizer.encode_as_ids(txt)
        return ids

In [58]:
# export

class Doc2VecVectorizerHF(Doc2VecVectorizer):
    """
    Doc2Vec vectorizer whose tokenization step is backed by a
    HuggingFace Tokenizer.
    """
    def __init__(self, tkzr_path: str, d2v_path: str):
        """
        :param tkzr_path: Location of the saved HF Tokenizer file
        :param d2v_path: Location of the saved Doc2Vec model
        """
        super().__init__(tkzr_path, d2v_path)

    def _load_tokenizer_model(self, path: str) -> Tokenizer:
        """
        Reads a saved HuggingFace tokenizer and keeps it in self.tokenizer

        :param path: Path of the tokenizer file
        :return: Nothing is returned despite the annotation; the tokenizer is
                 only stored on the instance
        :raises Exception: If no file exists at path
        """
        if check_file_existence(path):
            self.tokenizer = Tokenizer.from_file(path)
        else:
            msg = 'HuggingFace tokenizer could no be loaded.'
            logging.error(msg)
            raise Exception(msg)

    def tokenize_df(self, df: pd.DataFrame, code_column: str):
        """
        Tokenizes every snippet stored in one column of a DataFrame
        :param df: Pandas DataFrame
        :param code_column: Name of the column holding the source code
        :return: Result of applying the HF tokenization to that column
                 (one list of tokens per row)
        """
        return df[code_column].apply(self.__encode_string_as_tokens)

    def __encode_string_as_tokens(self, txt: str) -> list:
        """
        Tokenizes a string with the loaded HF Tokenizer

        :param txt: String to be tokenized
        :return: List containing the obtained tokens
        """
        encoding = self.tokenizer.encode(txt)
        return encoding.tokens

    def __encode_string_as_ids(self, txt: str) -> list:
        """
        Encodes a string with the loaded HF Tokenizer

        :param txt: String to be encoded
        :return: List containing the ids of the obtained tokens
        """
        encoding = self.tokenizer.encode(txt)
        return encoding.ids

In [37]:
# export

def configure_dirs(base_path: str, config_name: str) -> str:
    """
    Creates a fresh, timestamped directory for storing vectors:
    <base_path>/<timestamp>/<config_name>

    :param base_path: Root directory for all runs; created (with any missing
                      parent directories) if it does not exist
    :param config_name: Name of the innermost directory identifying the configuration

    :return: Full configuration path, as a string
    """
    # The POSIX timestamp of "now" keeps the outputs of different runs apart.
    timestamp = str(datetime.now().timestamp())

    full_path = Path(base_path) / timestamp / config_name
    # parents=True creates the whole chain in one call, so a nested base_path
    # whose parents are missing no longer raises FileNotFoundError.
    full_path.mkdir(parents=True, exist_ok=True)

    return str(full_path)

### Test gensim's implementation

In [29]:
# Sanity check: load the saved Doc2Vec model directly with gensim.
# NOTE(review): `d2v_model` is not referenced by the later visible cells; the vectorizers below
# load the model themselves from params["doc2vec_java_path"].
d2v_model = gensim.models.Doc2Vec.load('../../dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model')

2021-04-15 20:40:58,357 : INFO : loading Doc2Vec object from ../../dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model
2021-04-15 20:40:58,926 : INFO : loading vocabulary recursively from ../../dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.vocabulary.* with mmap=None
2021-04-15 20:40:58,927 : INFO : loading trainables recursively from ../../dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.trainables.* with mmap=None
2021-04-15 20:40:58,928 : INFO : loading wv recursively from ../../dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.wv.* with mmap=None
2021-04-15 20:40:58,928 : INFO : loading docvecs recursively from ../../dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.docvecs.* with mmap=None
2021-04-15 20:40:58,929 : INFO : loading vectors_docs from ../../dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-15945

## Load CodeSearchNet data

In [27]:
# Read the Java dataset: '~'-separated file, first row is the header, first column becomes the index.
java_df = pd.read_csv("/tf/main/dvc-ds4se/code/searchnet/[codesearchnet-java-1597073966.81902].csv",  header=0, index_col=0, sep='~')

In [28]:
# Preview the first rows of the loaded dataset.
java_df.head()

Unnamed: 0,repo,path,url,code,code_tokens,docstring,docstring_tokens,language,partition,bpe32k,code_len,bpe32_len
0,ReactiveX/RxJava,src/main/java/io/reactivex/internal/observers/...,https://github.com/ReactiveX/RxJava/blob/ac841...,protected final void fastPathOrderedEmit(U val...,"['protected', 'final', 'void', 'fastPathOrdere...",Makes sure the fast-path emits in order.\n@par...,"['Makes', 'sure', 'the', 'fast', '-', 'path', ...",java,test,"['▁protected', '▁final', '▁void', '▁fast', 'Pa...",134,138
1,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,https://github.com/ReactiveX/RxJava/blob/ac841...,@CheckReturnValue\n @NonNull\n @Schedule...,"['@', 'CheckReturnValue', '@', 'NonNull', '@',...",Mirrors the one ObservableSource in an Iterabl...,"['Mirrors', 'the', 'one', 'ObservableSource', ...",java,test,"['▁', '@', 'CheckReturnValue', '▁', '@', 'NonN...",63,71
2,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,https://github.com/ReactiveX/RxJava/blob/ac841...,"@SuppressWarnings(""unchecked"")\n @CheckRetu...","['@', 'SuppressWarnings', '(', '""unchecked""', ...",Mirrors the one ObservableSource in an array o...,"['Mirrors', 'the', 'one', 'ObservableSource', ...",java,test,"['▁', '@', 'SuppressWarnings', '(""', 'unchecke...",107,109
3,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,https://github.com/ReactiveX/RxJava/blob/ac841...,"@SuppressWarnings({ ""unchecked"", ""rawtypes"" })...","['@', 'SuppressWarnings', '(', '{', '""unchecke...",Concatenates elements of each ObservableSource...,"['Concatenates', 'elements', 'of', 'each', 'Ob...",java,test,"['▁', '@', 'SuppressWarnings', '({', '▁""', 'un...",79,83
4,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,https://github.com/ReactiveX/RxJava/blob/ac841...,"@SuppressWarnings({ ""unchecked"", ""rawtypes"" })...","['@', 'SuppressWarnings', '(', '{', '""unchecke...",Returns an Observable that emits the items emi...,"['Returns', 'an', 'Observable', 'that', 'emits...",java,test,"['▁', '@', 'SuppressWarnings', '({', '▁""', 'un...",91,112


In [30]:
# Draw 10 random rows to keep the tests below fast.
# NOTE(review): no random_state is passed, so a different sample is drawn on every run.
java_samples = java_df.sample(10)

In [31]:
# Preview the first rows of the sample.
java_samples.head()

Unnamed: 0,repo,path,url,code,code_tokens,docstring,docstring_tokens,language,partition,bpe32k,code_len,bpe32_len
16751,OpenLiberty/open-liberty,dev/com.ibm.ws.artifact.zip/src/com/ibm/ws/art...,https://github.com/OpenLiberty/open-liberty/bl...,"@Trivial\n public static Map<String, ZipEnt...","['@', 'Trivial', 'public', 'static', 'Map', '<...",Create a table of entry data using the relativ...,"['Create', 'a', 'table', 'of', 'entry', 'data'...",java,train,"['▁', '@', 'Trivial', '▁public', '▁static', '▁...",87,99
21794,OpenLiberty/open-liberty,dev/com.ibm.ws.monitor/src/com/ibm/ws/monitor/...,https://github.com/OpenLiberty/open-liberty/bl...,public boolean isMonitorable(Class<?> clazz) {...,"['public', 'boolean', 'isMonitorable', '(', 'C...",Determine of the specified class can be monito...,"['Determine', 'of', 'the', 'specified', 'class...",java,train,"['▁public', '▁boolean', '▁is', 'Monitor', 'abl...",197,224
23928,neo4j-contrib/neo4j-apoc-procedures,src/main/java/apoc/algo/CoreGraphAlgorithms.java,https://github.com/neo4j-contrib/neo4j-apoc-pr...,"public int[] loadDegrees(String relName, Direc...","['public', 'int', '[', ']', 'loadDegrees', '('...",/*\nprivate int[] loadDegrees(ReadOperations o...,"['/', '*', 'private', 'int', '[]', 'loadDegree...",java,train,"['▁public', '▁int', '[]', '▁load', 'Deg', 'ree...",42,57
22323,isisaddons-legacy/isis-module-publishing,dom/src/main/java/org/isisaddons/module/publis...,https://github.com/isisaddons-legacy/isis-modu...,@Programmatic\n @Override\n public Objec...,"['@', 'Programmatic', '@', 'Override', 'public...",region > serialize (API),"['region', '>', 'serialize', '(', 'API', ')']",java,train,"['▁', '@', 'Pro', 'g', 'rammat', 'ic', '▁', '@...",58,88
1666,reactor/reactor-core,reactor-core/src/main/java/reactor/core/publis...,https://github.com/reactor/reactor-core/blob/d...,public final Flux<T> publishOn(Scheduler sched...,"['public', 'final', 'Flux', '<', 'T', '>', 'pu...","Run onNext, onComplete and onError on a suppli...","['Run', 'onNext', 'onComplete', 'and', 'onErro...",java,valid,"['▁public', '▁final', '▁F', 'l', 'ux', '<', 'T...",31,40


In [32]:
# Index labels of the sampled rows; infer_d2v builds its `indices` array the same way.
np.array(java_samples.index)

array([16751, 21794, 23928, 22323,  1666, 19402, 28651, 24093,  6367,
        2260])

## Parameterization

In [33]:
# Paths to the saved models used by the test cells below:
#   bpe32k_path       - SentencePiece model (passed to Doc2VecVectorizerSP)
#   doc2vec_java_path - Doc2Vec model (passed to both vectorizers)
#   hf_tokenizer      - HuggingFace tokenizer file (passed to Doc2VecVectorizerHF)
# NOTE(review): the SentencePiece path says 32k while the Doc2Vec model path says 8k -
# confirm that these two models are meant to be used together.
params = {
    "bpe32k_path": "/tf/main/dvc-ds4se/models/bpe/sentencepiece/deprecated/java_bpe_32k.model",
    "doc2vec_java_path": "/tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model",
    "hf_tokenizer": "/tf/main/nbs/tokenizer.json"
}

###  Test vectorization with Doc2Vec (based on SentencePiece)

In [38]:
# Create the output directory vectors/<timestamp>/human_trn (relative to the working directory).
config_path = configure_dirs('vectors', 'human_trn')

In [49]:
# Build the SentencePiece-based vectorizer; the constructor loads both the tokenizer and the Doc2Vec model.
vectorizer = Doc2VecVectorizerSP(params['bpe32k_path'], params["doc2vec_java_path"])

2021-05-20 21:45:30,750 : INFO : loading Doc2Vec object from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model
2021-05-20 21:45:31,351 : INFO : loading vocabulary recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.vocabulary.* with mmap=None
2021-05-20 21:45:31,352 : INFO : loading trainables recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.trainables.* with mmap=None
2021-05-20 21:45:31,353 : INFO : loading wv recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.wv.* with mmap=None
2021-05-20 21:45:31,354 : INFO : loading docvecs recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.docvecs.* with mmap=None
2021-05-20 21:45:31,354 : INFO : loading vectors_docs from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBO

In [50]:
# Tokenize the 'code' column of the sample into SentencePiece pieces (one list per row).
tokenized_df = vectorizer.tokenize_df(java_samples, 'code')

In [51]:
# Attach the tokens to the sample as a new column so that infer_d2v can read them.
java_samples['bpe32k-tokens'] = tokenized_df

In [52]:
# Infer one Doc2Vec vector per row; indices and vectors are also saved as .npy files under config_path.
indices, vectors = vectorizer.infer_d2v(java_samples, 'bpe32k-tokens', config_path)

###  Test vectorization with Doc2Vec (based on HuggingFace's Tokenizer)

In [59]:
# Build the HuggingFace-based vectorizer; this loads the same Doc2Vec model file a second time.
hf_vectorizer = Doc2VecVectorizerHF(params['hf_tokenizer'], params["doc2vec_java_path"])

2021-05-20 21:47:31,681 : INFO : loading Doc2Vec object from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model
2021-05-20 21:47:32,241 : INFO : loading vocabulary recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.vocabulary.* with mmap=None
2021-05-20 21:47:32,242 : INFO : loading trainables recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.trainables.* with mmap=None
2021-05-20 21:47:32,243 : INFO : loading wv recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.wv.* with mmap=None
2021-05-20 21:47:32,244 : INFO : loading docvecs recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.docvecs.* with mmap=None
2021-05-20 21:47:32,245 : INFO : loading vectors_docs from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBO

In [60]:
# Tokenize the 'code' column with the HF tokenizer; note that this rebinds `tokenized_df`.
tokenized_df = hf_vectorizer.tokenize_df(java_samples, 'code')

In [62]:
# Attach the HF tokens to the sample as a new column so that infer_d2v can read them.
java_samples['bpe-hf-tokens'] = tokenized_df

In [64]:
# Infer vectors from the HF tokens; this rebinds `indices` and `vectors`.
# NOTE(review): the same config_path is reused, so the .npy files written by the
# SentencePiece run above are overwritten.
indices, vectors = hf_vectorizer.infer_d2v(java_samples, 'bpe-hf-tokens', config_path)

In [None]:
# TODO: Export code as module