In [82]:
# default_exp interpretability.d2v_vectorization

In [26]:
# export

import numpy as np
import gensim
import pandas as pd
import os
import sentencepiece as spm

from datetime import datetime
from pathlib import Path
from tokenizers import Tokenizer
from abc import ABC, abstractmethod

import logging

In [2]:
# Configure the root logger once for the notebook: every record is printed as
# "<timestamp> : <level> : <message>" and INFO is the minimum level shown.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Handle on the root logger for ad-hoc use in notebook cells.
logger = logging.getLogger()

# d2v_vectorization

> Use doc2vec models to get distributed representation (embedding vectors) for source code

> @Alvaro 15 April 2021

## Note:

The Doc2Vec model is not trained in this notebook; a pre-trained model is loaded and used through gensim.

In [3]:
# export

def check_file_existence(path):
    """
    Utility that checks whether a path exists on disk.
    This cell is exported because the exported vectorizer classes call this helper.
    :param path: Path to check
    :return: True when the path exists; otherwise logs an error and returns False
    """
    if os.path.exists(path):
        return True

    logging.error('Provided file cannot be found.')
    return False

In [33]:
# export

def configure_dirs(base_path: str, config_name: str, dataset_name: str) -> str:
    """
    Performs configuration of directories for storing vectors.
    Creates (if needed) the nested directory <base_path>/<config_name>/<dataset_name>.
    :param base_path: Base location under which vectors are stored
    :param config_name: Name of the sub-directory for the configuration
    :param dataset_name: Name of the sub-directory for the dataset / sample set
    
    :return: Full configuration path (as a string)
    """
    full_path = Path(base_path) / config_name / dataset_name
    # parents=True also creates missing ancestors of base_path, so a fresh
    # storage location does not fail; exist_ok=True keeps repeated calls harmless.
    full_path.mkdir(parents=True, exist_ok=True)

    return str(full_path)

## Vectorizer classes

The Vectorizer class is defined as abstract so that alternative tokenization back-ends (SentencePiece and HuggingFace's Tokenizers) can be plugged in.

In [63]:
# export

class Doc2VecVectorizer(ABC):
    """
    Abstract base class for vectorizers that infer Doc2Vec vectors for tokenized source code.
    Subclasses supply the tokenizer (loading and DataFrame tokenization) and must set
    `self.tok_name`, which `infer_d2v` uses as a prefix for the saved file names.
    """
    def __init__(self, tkzr_path:str, d2v_path: str):
        """
        Default constructor for Vectorizer class.
        Stores both paths and loads the tokenizer and the Doc2Vec model.
        :param tkzr_path: Path to the saved tokenizer model
        :param d2v_path: Path to the saved Doc2Vec model
        """
        self.tkzr_path = tkzr_path
        self.d2v_path = d2v_path
        
        self._load_tokenizer_model(self.tkzr_path)
        self._load_doc2vec_model(d2v_path)
        
    @abstractmethod
    def tokenize_df(self, df: pd.DataFrame, code_column: str):
        """
        Tokenizes the source code stored in a DataFrame column
        :param df: Pandas DataFrame
        :param code_column: Name of the column containing source code
        """
        pass
    
    @abstractmethod
    def _load_tokenizer_model(self, model_path: str):
        """
        Loads the tokenizer stored in the given path
        :param model_path: Path to the tokenizer file
        """
        pass
    
    def _load_doc2vec_model(self, model_path: str):
        """
        Loads the gensim Doc2Vec model and stores it in `self.d2v_model`
        :param model_path: Path to the model file
        :raises Exception: If the model file does not exist
        """
        if not check_file_existence(model_path):
            msg = 'Doc2vec model could no be loaded'
            logging.error(msg)
            raise Exception(msg)
        
        self.d2v_model = gensim.models.Doc2Vec.load(model_path)
        
    def infer_d2v(self, df: pd.DataFrame, tokenized_column: str, out_path: str,
                  config_name: str, sample_set_name: str, steps: int=200) -> tuple:
        """
        Performs vectorization via Doc2Vec model and saves the results as .npy files
        :param df: Pandas DataFrame containing source code
        :param tokenized_column: Column name of the column corresponding to source code tokenized
                                 with the appropriate implementation
        :param out_path: String indicating the base location for storing vectors
        :param config_name: String indicating the model from which the samples came from
        :param sample_set_name: String indicating the base name for identifying the set of
                                 samples being processed
        :param steps: Number of steps used by the Doc2Vec inference
        :return: Tuple containing (idx of the input DF, obtained vectors)
        """
        # Forward the caller's `steps`; it was previously ignored in favour of a literal 200.
        # NOTE(review): newer gensim releases name this argument `epochs` - confirm the installed version.
        inferred_vecs = np.array([self.d2v_model.infer_vector(tok_snippet, steps=steps)
                                  for tok_snippet in df[tokenized_column].values])
        
        # Original row labels, saved alongside the vectors so they can be mapped back to df.
        indices = np.array(df.index)
        
        dest_path = configure_dirs(out_path, config_name, sample_set_name)
        
        # The timestamp keeps the files of successive runs from overwriting each other.
        ts = str(datetime.timestamp(datetime.now()))
        
        file_name = f"{dest_path}/{self.tok_name}-{ts}"
        
        np.save(f"{file_name}-idx", indices)
        np.save(f"{file_name}-ft_vecs", inferred_vecs)
        
        return indices, inferred_vecs

In [69]:
# export

class Doc2VecVectorizerSP(Doc2VecVectorizer):
    """
    Doc2Vec vectorizer whose token sequences are produced
    by a SentencePiece model.
    """
    def __init__(self, sp_path: str, d2v_path: str):
        """
        :param sp_path: Path to the SentencePiece saved model
        :param d2v_path: Path to the Doc2Vec saved model
        """
        super().__init__(sp_path, d2v_path)
        # Prefix used in the names of the files saved by infer_d2v.
        self.tok_name = "sp"
    
    def _load_tokenizer_model(self, model_path: str):
        """
        Loads the SentencePiece model found at the given path and keeps it in `self.tokenizer`
        :param model_path: Path to the model file
        :raises Exception: If the model file does not exist
        """
        if check_file_existence(model_path):
            processor = spm.SentencePieceProcessor()
            processor.load(model_path)
            self.tokenizer = processor
            return
        
        msg = 'Sentence piece model could no be loaded'
        logging.error(msg)
        raise Exception(msg)
        
    def tokenize_df(self, df: pd.DataFrame, code_column: str):
        """
        Tokenizes every snippet of the given column with SentencePiece
        :param df: Pandas DataFrame
        :param code_column: Name corresponding to the column containing source code
        :return: Result of applying the tokenizer to df[code_column] (one list of pieces per row)
        """
        code_snippets = df[code_column]
        return code_snippets.apply(self.__sp_encode_as_pieces)
    
    def __sp_encode_as_pieces(self, txt: str) -> list:
        """
        Tokenizes the given text with the loaded SentencePieceProcessor
        :param txt: String to be encoded (tokenized)
        :return: List of the subword tokens
        """
        pieces = self.tokenizer.encode_as_pieces(txt)
        return pieces
    
    def __sp_encode_as_ids(self, txt: str) -> list:
        """
        Encodes the given text with the loaded SentencePieceProcessor
        :param txt: String to be encoded
        :return: List of the subword tokens (ids)
        """
        ids = self.tokenizer.encode_as_ids(txt)
        return ids

In [68]:
# export

class Doc2VecVectorizerHF(Doc2VecVectorizer):
    """
    Class to perform vectorization via Doc2Vec model
    leveraging HF's Tokenizer
    """
    def __init__(self, tkzr_path: str, d2v_path: str):
        """
        :param tkzr_path: Path to the HF Tokenizer saved model
        :param d2v_path: Path to the Doc2Vec saved model
        """
        super().__init__(tkzr_path, d2v_path)
        # Prefix used in the names of the files saved by infer_d2v.
        self.tok_name = "hf"
        
    def _load_tokenizer_model(self, path: str) -> None:
        """
        Loads a saved HuggingFace tokenizer and stores it in `self.tokenizer`.
        Nothing is returned (the annotation previously claimed a Tokenizer).

        :param path: Path containing the tokenizer file
        :raises Exception: If the tokenizer file does not exist
        """

        if not check_file_existence(path):
            msg = 'HuggingFace tokenizer could no be loaded.'
            logging.error(msg)
            raise Exception(msg)
        
        self.tokenizer = Tokenizer.from_file(path)
    
    def tokenize_df(self, df: pd.DataFrame, code_column: str):
        """
        Performs processing for a DataFrame containing source code
        :param df: Pandas DataFrame
        :param code_column: Name corresponding to the column containing source code
        :return: Result of applying the HF tokenizer to df[code_column] (one list of tokens per row)
        """
        return df[code_column].apply(self.__encode_string_as_tokens)
    
    def __encode_string_as_tokens(self, txt: str) -> list:
        """
        Perform tokenization using HF Tokenizer
        
        :param txt: String to be tokenized
        :return: List containing obtained tokens
        """
        return self.tokenizer.encode(txt).tokens
    
    def __encode_string_as_ids(self, txt: str) -> list:
        """
        Perform encoding using HF Tokenizer
        
        :param txt: String to be encoded
        :return: List containing obtained ids (of tokens)
        """
        return self.tokenizer.encode(txt).ids

## Load CodeSearchNet data

In [8]:
# Load the CodeSearchNet Java CSV: '~' is the field separator, the first row is the
# header and the first column becomes the index.
# NOTE(review): the absolute /tf/main path is environment-specific - consider a configurable data directory.
java_df = pd.read_csv("/tf/main/dvc-ds4se/code/searchnet/[codesearchnet-java-1597073966.81902].csv",  header=0, index_col=0, sep='~')

In [9]:
# Preview the first rows to check that the columns were parsed as expected.
java_df.head()

Unnamed: 0,repo,path,url,code,code_tokens,docstring,docstring_tokens,language,partition,bpe32k,code_len,bpe32_len
0,ReactiveX/RxJava,src/main/java/io/reactivex/internal/observers/...,https://github.com/ReactiveX/RxJava/blob/ac841...,protected final void fastPathOrderedEmit(U val...,"['protected', 'final', 'void', 'fastPathOrdere...",Makes sure the fast-path emits in order.\n@par...,"['Makes', 'sure', 'the', 'fast', '-', 'path', ...",java,test,"['▁protected', '▁final', '▁void', '▁fast', 'Pa...",134,138
1,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,https://github.com/ReactiveX/RxJava/blob/ac841...,@CheckReturnValue\n @NonNull\n @Schedule...,"['@', 'CheckReturnValue', '@', 'NonNull', '@',...",Mirrors the one ObservableSource in an Iterabl...,"['Mirrors', 'the', 'one', 'ObservableSource', ...",java,test,"['▁', '@', 'CheckReturnValue', '▁', '@', 'NonN...",63,71
2,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,https://github.com/ReactiveX/RxJava/blob/ac841...,"@SuppressWarnings(""unchecked"")\n @CheckRetu...","['@', 'SuppressWarnings', '(', '""unchecked""', ...",Mirrors the one ObservableSource in an array o...,"['Mirrors', 'the', 'one', 'ObservableSource', ...",java,test,"['▁', '@', 'SuppressWarnings', '(""', 'unchecke...",107,109
3,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,https://github.com/ReactiveX/RxJava/blob/ac841...,"@SuppressWarnings({ ""unchecked"", ""rawtypes"" })...","['@', 'SuppressWarnings', '(', '{', '""unchecke...",Concatenates elements of each ObservableSource...,"['Concatenates', 'elements', 'of', 'each', 'Ob...",java,test,"['▁', '@', 'SuppressWarnings', '({', '▁""', 'un...",79,83
4,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,https://github.com/ReactiveX/RxJava/blob/ac841...,"@SuppressWarnings({ ""unchecked"", ""rawtypes"" })...","['@', 'SuppressWarnings', '(', '{', '""unchecke...",Returns an Observable that emits the items emi...,"['Returns', 'an', 'Observable', 'that', 'emits...",java,test,"['▁', '@', 'SuppressWarnings', '({', '▁""', 'un...",91,112


In [10]:
# Draw a small sample for the vectorization smoke tests below.
# A fixed random_state makes the sample (and the saved indices/vectors) reproducible across runs.
java_samples = java_df.sample(10, random_state=42)

In [11]:
# Preview the sampled rows; note the index keeps the original row labels of java_df.
java_samples.head()

Unnamed: 0,repo,path,url,code,code_tokens,docstring,docstring_tokens,language,partition,bpe32k,code_len,bpe32_len
21155,duracloud/duracloud,durastore/src/main/java/org/duracloud/durastor...,https://github.com/duracloud/duracloud/blob/dc...,"public void addSpace(String spaceID, Map<Strin...","['public', 'void', 'addSpace', '(', 'String', ...",Adds a space.\n\n@param spaceID\n@param storeID,"['Adds', 'a', 'space', '.']",java,train,"['▁public', '▁void', '▁add', 'Space', '(', 'St...",110,140
29011,JOML-CI/JOML,src/org/joml/Intersectiond.java,https://github.com/JOML-CI/JOML/blob/ce2652fc2...,public static int intersectRayAar(double origi...,"['public', 'static', 'int', 'intersectRayAar',...",Determine whether the given ray with the origi...,"['Determine', 'whether', 'the', 'given', 'ray'...",java,train,"['▁public', '▁static', '▁int', '▁intersect', '...",360,483
25379,Azure/azure-sdk-for-java,datalakeanalytics/resource-manager/v2015_10_01...,https://github.com/Azure/azure-sdk-for-java/bl...,public Observable<DataLakeAnalyticsAccountInne...,"['public', 'Observable', '<', 'DataLakeAnalyti...",Gets details of the specified Data Lake Analyt...,"['Gets', 'details', 'of', 'the', 'specified', ...",java,train,"['▁public', '▁Observable', '<', 'Data', 'L', '...",62,88
9456,googleads/googleads-java-lib,modules/ads_lib/src/main/java/com/google/api/a...,https://github.com/googleads/googleads-java-li...,ReportBodyProvider getReportBodyProvider(Repor...,"['ReportBodyProvider', 'getReportBodyProvider'...",Returns the {@link ReportBodyProvider} for the...,"['Returns', 'the', '{', '@link', 'ReportBodyPr...",java,train,"['▁Report', 'Body', 'Provider', '▁get', 'Repor...",93,113
18462,ops4j/org.ops4j.pax.logging,pax-logging-api/src/main/java/org/ops4j/pax/lo...,https://github.com/ops4j/org.ops4j.pax.logging...,"public void warn( String format, Object[] argA...","['public', 'void', 'warn', '(', 'String', 'for...",Log a message at the WARN level according to t...,"['Log', 'a', 'message', 'at', 'the', 'WARN', '...",java,train,"['▁public', '▁void', '▁warn', '(', '▁String', ...",53,51


In [32]:
# Row labels of the sample as a NumPy array - the same form infer_d2v saves next to the vectors.
np.array(java_samples.index)

array([16751, 21794, 23928, 22323,  1666, 19402, 28651, 24093,  6367,
        2260])

## Parameterization

In [38]:
# Locations of the pre-trained artifacts and of the output directory used by the cells below.
# NOTE(review): all paths are absolute under /tf/main - confirm they exist in the target environment.
params = {
    # SentencePiece BPE model (32k according to the file name).
    "bpe32k_path": "/tf/main/dvc-ds4se/models/bpe/sentencepiece/deprecated/java_bpe_32k.model",
    # Pre-trained Doc2Vec model.
    # NOTE(review): its path mentions bpe8k/8k while the tokenizer above is 32k - confirm the vocabularies match.
    "doc2vec_java_path": "/tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model",
    # HuggingFace tokenizer definition file.
    "hf_tokenizer": "/tf/main/nbs/tokenizer.json",
    # Base directory under which infer_d2v stores the obtained vectors.
    "vectors_storage_path": "/tf/main/dvc-ds4se/results/d2v_vectors"
}

The directories used to store the obtained vectors are created by `configure_dirs`, which `infer_d2v` calls with the storage path defined above.

###  Test vectorization with Doc2Vec (based on SentencePiece)

In [70]:
# Build the SentencePiece-based vectorizer; the constructor loads both the tokenizer and the Doc2Vec model.
vectorizer = Doc2VecVectorizerSP(params['bpe32k_path'], params["doc2vec_java_path"])

2021-05-20 23:35:16,033 : INFO : loading Doc2Vec object from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model
2021-05-20 23:35:16,644 : INFO : loading vocabulary recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.vocabulary.* with mmap=None
2021-05-20 23:35:16,645 : INFO : loading trainables recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.trainables.* with mmap=None
2021-05-20 23:35:16,646 : INFO : loading wv recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.wv.* with mmap=None
2021-05-20 23:35:16,648 : INFO : loading docvecs recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.docvecs.* with mmap=None
2021-05-20 23:35:16,649 : INFO : loading vectors_docs from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBO

In [71]:
# Tokenize the 'code' column of the sample with SentencePiece (one list of pieces per row).
tokenized_df = vectorizer.tokenize_df(java_samples, 'code')

In [72]:
# Attach the SentencePiece tokens as a new column so infer_d2v can read them by column name.
java_samples['bpe32k-tokens'] = tokenized_df

In [73]:
# Infer Doc2Vec vectors for the SentencePiece tokens; indices and vectors are also saved as .npy
# files under <vectors_storage_path>/human_trn/10-sample-20052021.
indices, vectors = vectorizer.infer_d2v(java_samples, 'bpe32k-tokens', params["vectors_storage_path"],
                                        "human_trn", "10-sample-20052021")

###  Test vectorization with Doc2Vec (based on HuggingFace's Tokenizer)

In [76]:
# Build the HuggingFace-based vectorizer; it loads the same Doc2Vec model as the SentencePiece one.
hf_vectorizer = Doc2VecVectorizerHF(params['hf_tokenizer'], params["doc2vec_java_path"])

2021-05-20 23:35:57,802 : INFO : loading Doc2Vec object from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model
2021-05-20 23:35:58,417 : INFO : loading vocabulary recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.vocabulary.* with mmap=None
2021-05-20 23:35:58,418 : INFO : loading trainables recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.trainables.* with mmap=None
2021-05-20 23:35:58,419 : INFO : loading wv recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.wv.* with mmap=None
2021-05-20 23:35:58,421 : INFO : loading docvecs recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.docvecs.* with mmap=None
2021-05-20 23:35:58,422 : INFO : loading vectors_docs from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBO

In [77]:
# Tokenize the 'code' column with the HF tokenizer.
# NOTE: this rebinds `tokenized_df`, which previously held the SentencePiece tokens.
tokenized_df = hf_vectorizer.tokenize_df(java_samples, 'code')

In [78]:
# Attach the HF tokens as a separate column, next to the SentencePiece ones.
java_samples['bpe-hf-tokens'] = tokenized_df

In [79]:
# Infer Doc2Vec vectors for the HF tokens; files go to the same directory as the SentencePiece run
# and are told apart by the "hf" prefix and the timestamp in their names.
# NOTE: `indices` and `vectors` from the SentencePiece run are overwritten here.
indices, vectors = hf_vectorizer.infer_d2v(java_samples, 'bpe-hf-tokens', params["vectors_storage_path"],
                                        "human_trn", "10-sample-20052021")

In [None]:
# TODO: Export code as module

In [80]:
# Export the cells marked with `# export` into the library modules.
# As the output below shows, this converts every notebook of the project, not only this one.
from nbdev.export import notebook2script
notebook2script()

Converted 0.0_mgmnt.prep.i.ipynb.
This cell doesn't have an export destination and was ignored:
 
This cell doesn't have an export destination and was ignored:
 
This cell doesn't have an export destination and was ignored:
e
This cell doesn't have an export destination and was ignored:
e
This cell doesn't have an export destination and was ignored:
 
This cell doesn't have an export destination and was ignored:
 
This cell doesn't have an export destination and was ignored:
 
This cell doesn't have an export destination and was ignored:
 
This cell doesn't have an export destination and was ignored:
 
This cell doesn't have an export destination and was ignored:
 
This cell doesn't have an export destination and was ignored:
 
This cell doesn't have an export destination and was ignored:
 
Converted 0.10_error_checker.ipynb.
This cell doesn't have an export destination and was ignored:
 
This cell doesn't have an export destination and was ignored:
 
This cell doesn't have an export d