In [1]:
# default_exp codexplainer.d2v_vectorization

In [1]:
# export

import numpy as np
import gensim
import pandas as pd
import os
import sentencepiece as spm


from tokenizers import Tokenizer
from abc import ABC, abstractmethod
from typing import Any, Optional

from datetime import datetime
from pathlib import Path

from ds4se.mgmnt.prep.bpe_tokenization import HFTokenizer, SPTokenizer

In [2]:
# export

import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger()

# d2v_vectorization

> Use doc2vec models to get distributed representations (embedding vectors) of source code

> @Alvaro 15 April 2021

## Note:

The Doc2Vec model is not trained in this notebook; a previously trained model is only loaded and used through gensim.

In [3]:
# export
# utils
def check_file_existence(path) -> bool:
    """
    Tells whether the given location exists, logging an error when it does not
    :param path: Location to look up (anything accepted by pathlib.Path)

    :return: True if the path exists, False otherwise
    """
    if Path(path).exists():
        return True

    # Missing path: report it and let the caller decide how to react
    logging.error('Provided file cannot be found.')
    return False

In [4]:
# export

def configure_dirs(base_path: str, config_name: str, dataset_name: str) -> str:
    """
    Performs configuration of directories for storing vectors.
    Makes sure the nested directory <base_path>/<config_name>/<dataset_name> exists,
    creating every missing level (including missing parents of base_path).

    :param base_path: Root directory under which the vectors are stored
    :param config_name: Name of the directory created directly under base_path
    :param dataset_name: Name of the directory created under config_name

    :return: Full configuration path (as a string)
    """
    full_path = Path(base_path) / config_name / dataset_name

    # parents=True creates all missing levels in a single call, so a base_path whose
    # own parent does not exist yet no longer fails; exist_ok=True keeps repeated
    # calls harmless.
    full_path.mkdir(parents=True, exist_ok=True)

    return str(full_path)

## Vectorizer classes

The Vectorizer class is defined as abstract so that alternative tokenization backends (SentencePiece and HuggingFace's Tokenizers) can be plugged in.

In [5]:
# export

class Doc2VecVectorizer(ABC):
    """
    Abstract base for vectorizers that turn (tokenized) source code into Doc2Vec vectors.

    Subclasses implement `_load_tokenizer_model` and are expected to set `self.tok_name`,
    which `infer_d2v` uses as prefix of the stored file names.
    """
    def __init__(self, tkzr_path:str, d2v_path: str, tokenizer: Optional[Any]=None):
        """
        Default constructor for Vectorizer class

        :param tkzr_path: Path to the saved tokenizer model
        :param d2v_path: Path to the saved Doc2Vec model
        :param tokenizer: Optional, already built tokenizer; when provided it is used as is
                          and nothing is loaded from tkzr_path
        """
        self.tkzr_path = tkzr_path
        self.d2v_path = d2v_path

        self._load_doc2vec_model(d2v_path)
        if tokenizer is None:
            self._load_tokenizer_model(self.tkzr_path)
        else:
            self.tokenizer = tokenizer

    def tokenize_df(self, df: pd.DataFrame, code_column: str) -> pd.DataFrame:
        """
        Performs tokenization of a Dataframe by delegating to the tokenizer's `tokenize_df`

        :param df: DataFrame containing code
        :param code_column: Str indicating column name of code data

        :return: Result of the tokenizer's `tokenize_df` (tokenized code)
        """

        return self.tokenizer.tokenize_df(df, code_column)

    @abstractmethod
    def _load_tokenizer_model(self, model_path: str):
        """
        Loads the tokenizer stored in model_path and stores it in `self.tokenizer`
        :param model_path: Path to the tokenizer file
        """
        pass

    def _load_doc2vec_model(self, model_path: str):
        """
        Loads the gensim Doc2Vec model and stores it in `self.d2v_model`
        :param model_path: Path to the model file
        :raises Exception: If model_path does not exist
        """
        if not check_file_existence(model_path):
            msg = 'Doc2vec model could no be loaded'
            logging.error(msg)
            raise Exception(msg)

        self.d2v_model = gensim.models.Doc2Vec.load(model_path)

    def infer_d2v(self, df: pd.DataFrame, tokenized_column: str, out_path: str,
                  config_name: str, sample_set_name: str,
                  perform_tokenization: Optional[bool]=False,
                  steps: Optional[int]=200,
                  code_column: str='code') -> tuple:
        """
        Performs vectorization via Doc2Vec model and stores the result on disk.
        Two .npy files are written under <out_path>/<config_name>/<sample_set_name>/:
        one with the DataFrame indices ("-idx") and one with the vectors ("-ft_vecs").

        :param df: Pandas DataFrame containing source code
        :param tokenized_column: Column name of the column corresponding to source code tokenized
                                 with the appropriate implementation
        :param out_path: String indicating the base location for storing vectors
        :param config_name: String indicating the model from which the samples came from
        :param sample_set_name: String indicating the base name for identifying the set of
                                 samples being processed
        :param perform_tokenization: Bool indicating whether tokenization is required or not
                                     (input df is previously tokenized or not)
        :param steps: Steps for the doc2vec inference
        :param code_column: Column holding the raw code; only read when
                            perform_tokenization is True
        :return: Tuple containing (idx of the input DF, obtained vectors)
        """

        # Work on a copy so the caller's DataFrame is never modified
        tokenized_df = df.copy()

        if perform_tokenization:
            tokenized_df[tokenized_column] = self.tokenize_df(tokenized_df, code_column)

        # Honor the `steps` argument (it used to be ignored in favor of a fixed 200).
        # NOTE(review): `steps` is the keyword of the gensim version this notebook was run
        # with; newer gensim releases name it `epochs` — confirm before upgrading.
        inferred_vecs = np.array([self.d2v_model.infer_vector(tok_snippet, steps=steps)
                                  for tok_snippet in tokenized_df[tokenized_column].values])

        indices = np.array(df.index)

        dest_path = configure_dirs(out_path, config_name, sample_set_name)

        # The timestamp keeps successive runs from overwriting each other's files
        ts = str(datetime.now().timestamp())

        # `tok_name` is set by the concrete subclasses
        file_name = f"{dest_path}/{self.tok_name}-{ts}"

        np.save(f"{file_name}-idx", indices)
        np.save(f"{file_name}-ft_vecs", inferred_vecs)

        return indices, inferred_vecs

In [6]:
# export

class Doc2VecVectorizerSP(Doc2VecVectorizer):
    """
    Class to perform vectorization via Doc2Vec model
    leveraging SentencePiece to tokenize sequences.
    """
    def __init__(self, sp_path: str, d2v_path: str, tokenizer: Optional[Any]=None):
        """
        :param sp_path: Path to the SentencePiece saved model
        :param d2v_path: Path to the Doc2Vec saved model
        :param tokenizer: Optional, already built tokenizer exposing `tokenize_df`;
                          when omitted, one is loaded from sp_path
        """

        super().__init__(sp_path, d2v_path, tokenizer)
        # Prefix of the files written by infer_d2v
        self.tok_name = "sp"

    def _load_tokenizer_model(self, model_path: str):
        """
        Loads the sentence piece model stored in the specified path
        and stores the resulting tokenizer in `self.tokenizer`
        :param model_path: Path to the model file
        :raises Exception: If model_path does not exist
        """
        if not check_file_existence(model_path):
            msg = 'Sentence piece model could no be loaded'
            logging.error(msg)
            raise Exception(msg)

        # The base class tokenizes through `self.tokenizer.tokenize_df(...)`, which a raw
        # spm.SentencePieceProcessor does not provide. Load the SPTokenizer wrapper instead,
        # i.e. the same kind of object callers pass explicitly via `tokenizer=`.
        self.tokenizer = SPTokenizer(model_path)

In [7]:
# export

class Doc2VecVectorizerHF(Doc2VecVectorizer):
    """
    Class to perform vectorization via Doc2Vec model
    leveraging HF's Tokenizer
    """
    def __init__(self, tkzr_path: str, d2v_path: str, tokenizer: Optional[Any]=None):
        """
        :param tkzr_path: Path to the HF Tokenizer saved model
        :param d2v_path: Path to the Doc2Vec saved model
        :param tokenizer: Optional, already built tokenizer exposing `tokenize_df`;
                          when omitted, one is loaded from tkzr_path
        """
        super().__init__(tkzr_path, d2v_path, tokenizer)
        # Prefix of the files written by infer_d2v
        self.tok_name = "hf"

    def _load_tokenizer_model(self, path: str) -> None:
        """
        Function to load a saved HuggingFace tokenizer and store it in `self.tokenizer`

        :param path: Path containing the tokenizer file
        :raises Exception: If path does not exist
        :return: None (the tokenizer is stored on the instance)
        """

        if not check_file_existence(path):
            msg = 'HuggingFace tokenizer could no be loaded.'
            logging.error(msg)
            raise Exception(msg)

        # The base class tokenizes through `self.tokenizer.tokenize_df(...)`, which a raw
        # tokenizers.Tokenizer does not provide. Load the HFTokenizer wrapper instead,
        # i.e. the same kind of object callers pass explicitly via `tokenizer=`.
        self.tokenizer = HFTokenizer(path)

## Load Searchnet data

In [8]:
# Load the CodeSearchNet Java dump: '~'-separated values, header on the first row,
# first column used as the DataFrame index.
java_df = pd.read_csv("/tf/main/dvc-ds4se/code/searchnet/[codesearchnet-java-1597073966.81902].csv",  header=0, index_col=0, sep='~')

In [9]:
java_df.head()

Unnamed: 0,repo,path,url,code,code_tokens,docstring,docstring_tokens,language,partition,bpe32k,code_len,bpe32_len
0,ReactiveX/RxJava,src/main/java/io/reactivex/internal/observers/...,https://github.com/ReactiveX/RxJava/blob/ac841...,protected final void fastPathOrderedEmit(U val...,"['protected', 'final', 'void', 'fastPathOrdere...",Makes sure the fast-path emits in order.\n@par...,"['Makes', 'sure', 'the', 'fast', '-', 'path', ...",java,test,"['▁protected', '▁final', '▁void', '▁fast', 'Pa...",134,138
1,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,https://github.com/ReactiveX/RxJava/blob/ac841...,@CheckReturnValue\n @NonNull\n @Schedule...,"['@', 'CheckReturnValue', '@', 'NonNull', '@',...",Mirrors the one ObservableSource in an Iterabl...,"['Mirrors', 'the', 'one', 'ObservableSource', ...",java,test,"['▁', '@', 'CheckReturnValue', '▁', '@', 'NonN...",63,71
2,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,https://github.com/ReactiveX/RxJava/blob/ac841...,"@SuppressWarnings(""unchecked"")\n @CheckRetu...","['@', 'SuppressWarnings', '(', '""unchecked""', ...",Mirrors the one ObservableSource in an array o...,"['Mirrors', 'the', 'one', 'ObservableSource', ...",java,test,"['▁', '@', 'SuppressWarnings', '(""', 'unchecke...",107,109
3,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,https://github.com/ReactiveX/RxJava/blob/ac841...,"@SuppressWarnings({ ""unchecked"", ""rawtypes"" })...","['@', 'SuppressWarnings', '(', '{', '""unchecke...",Concatenates elements of each ObservableSource...,"['Concatenates', 'elements', 'of', 'each', 'Ob...",java,test,"['▁', '@', 'SuppressWarnings', '({', '▁""', 'un...",79,83
4,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,https://github.com/ReactiveX/RxJava/blob/ac841...,"@SuppressWarnings({ ""unchecked"", ""rawtypes"" })...","['@', 'SuppressWarnings', '(', '{', '""unchecke...",Returns an Observable that emits the items emi...,"['Returns', 'an', 'Observable', 'that', 'emits...",java,test,"['▁', '@', 'SuppressWarnings', '({', '▁""', 'un...",91,112


In [10]:
# Fixed seed so that the same 10 rows are drawn on every run; otherwise the sample
# (and every vector derived from it) changes on each Restart & Run All.
java_samples = java_df.sample(10, random_state=42)

In [11]:
java_samples.head()

Unnamed: 0,repo,path,url,code,code_tokens,docstring,docstring_tokens,language,partition,bpe32k,code_len,bpe32_len
14702,wildfly/wildfly-core,controller/src/main/java/org/jboss/as/controll...,https://github.com/wildfly/wildfly-core/blob/c...,@Override\n public void validateParameter(S...,"['@', 'Override', 'public', 'void', 'validateP...",{@inheritDoc},['{'],java,train,"['▁', '@', 'Override', '▁public', '▁void', '▁v...",78,86
29105,cloudant/java-cloudant,cloudant-client/src/main/java/com/cloudant/cli...,https://github.com/cloudant/java-cloudant/blob...,"public InputStream find(String id, String rev)...","['public', 'InputStream', 'find', '(', 'String...",Finds the document with the specified document...,"['Finds', 'the', 'document', 'with', 'the', 's...",java,train,"['▁public', '▁InputStream', '▁find', '(', 'Str...",22,23
18440,ops4j/org.ops4j.pax.logging,pax-logging-service/src/main/java/org/apache/l...,https://github.com/ops4j/org.ops4j.pax.logging...,public void close() {\n /**\n * Set clo...,"['public', 'void', 'close', '(', ')', '{', '/*...",Close this <code>AsyncAppender</code> by inter...,"['Close', 'this', '<code', '>', 'AsyncAppender...",java,train,"['▁public', '▁void', '▁close', '()', '▁{', '▁/...",78,115
1869,reactor/reactor-core,reactor-core/src/main/java/reactor/core/publis...,https://github.com/reactor/reactor-core/blob/d...,public final Mono<T> doAfterTerminate(Runnable...,"['public', 'final', 'Mono', '<', 'T', '>', 'do...",Add behavior (side-effect) triggered after the...,"['Add', 'behavior', '(', 'side', '-', 'effect'...",java,valid,"['▁public', '▁final', '▁Mon', 'o', '<', 'T', '...",38,50
3541,OpenLiberty/open-liberty,dev/com.ibm.ws.messaging.runtime/src/com/ibm/w...,https://github.com/OpenLiberty/open-liberty/bl...,@Override\n public void attachLocalPtoPLoca...,"['@', 'Override', 'public', 'void', 'attachLoc...",Method attachLocalPtoPLocalisation\n\n<p> Atta...,"['Method', 'attachLocalPtoPLocalisation']",java,train,"['▁', '@', 'Override', '▁public', '▁void', '▁a...",114,222


In [12]:
np.array(java_samples.index)

array([14702, 29105, 18440,  1869,  3541,  4346,  7034,  7423, 22602,
       14870])

## Parameterization

In [14]:
# Locations of the saved tokenizer/Doc2Vec artifacts and of the directory where the
# inferred vectors are stored.
# NOTE(review): the SentencePiece model is named "32k" while the Doc2Vec model path
# mentions "bpe8k" — confirm that the tokenizer vocabulary matches the one the
# Doc2Vec model was trained with.
params = {
    "bpe32k_path": "/tf/main/dvc-ds4se/models/bpe/sentencepiece/deprecated/java_bpe_32k.model",
    "doc2vec_java_path": "/tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model",
    "hf_tokenizer": "/tf/main/nbs/tokenizer.json",
    "vectors_storage_path": "/tf/main/dvc-ds4se/results/d2v_vectors"
}

Configure directories to store obtained vectors

###  Test vectorization with Doc2Vec (based on SentencePiece)

In [9]:
sp_tokenizer = SPTokenizer(params['bpe32k_path'])

In [41]:
vectorizer = Doc2VecVectorizerSP(params['bpe32k_path'], params["doc2vec_java_path"], tokenizer=sp_tokenizer)

2021-05-25 17:53:24,968 : INFO : loading Doc2Vec object from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model
2021-05-25 17:53:25,538 : INFO : loading vocabulary recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.vocabulary.* with mmap=None
2021-05-25 17:53:25,539 : INFO : loading trainables recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.trainables.* with mmap=None
2021-05-25 17:53:25,539 : INFO : loading wv recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.wv.* with mmap=None
2021-05-25 17:53:25,540 : INFO : loading docvecs recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.docvecs.* with mmap=None
2021-05-25 17:53:25,541 : INFO : loading vectors_docs from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBO

In [44]:
tokenized_df = vectorizer.tokenize_df(java_samples, 'code')

In [46]:
tokenized_df

411      [▁public, ▁void, ▁each, Row, (, String, ▁sql, ...
1698     [▁private, ▁static, ▁int, ▁get, Block, Length,...
20430    [▁public, ▁static, ▁void, ▁incr, Compute, Spec...
25303    [▁public, ▁void, ▁info, (, User, Feed, back, E...
4869     [▁protected, ▁Reliability, ▁getRe, liability, ...
6272     [▁public, ▁static, ▁String, ▁message, Mo, ved,...
22668    [▁public, ▁List, <, Form, >, ▁getAll, By, Logg...
5547     [▁@, Worker, Thread, ▁public, ▁long, ▁insert, ...
19835    [▁@, XmlElementDecl, (, namespace, ▁=, ▁", htt...
5232     [▁public, ▁void, ▁pairs, Matching, (, final, ▁...
Name: code, dtype: object

In [42]:
# Tokenize the 'code' column into 'bpe32k-tokens', infer one Doc2Vec vector per sample
# and save indices and vectors as .npy files under
# <vectors_storage_path>/human_trn/10-sample-20052021/.
indices, vectors = vectorizer.infer_d2v(java_samples, 'bpe32k-tokens', params["vectors_storage_path"],
                                        "human_trn", "10-sample-20052021", perform_tokenization=True)

In [32]:
indices

array([  411,  1698, 20430, 25303,  4869,  6272, 22668,  5547, 19835,
        5232])

In [43]:
vectors

array([[-0.07038781, -0.58109975, -0.9463178 , ..., -2.1921134 ,
        -1.7908362 ,  0.71608573],
       [ 0.5599993 ,  0.7043637 , -0.43915233, ...,  0.483188  ,
        -0.4529718 ,  0.28490466],
       [ 0.08634362,  0.06870021,  0.11204075, ..., -1.0273234 ,
         0.50954473,  0.4812691 ],
       ...,
       [-1.6385366 ,  0.26093784,  0.58297956, ..., -0.2630522 ,
        -1.3928666 , -1.9163388 ],
       [ 1.0110482 , -0.27105835, -0.2851525 , ..., -0.5329149 ,
        -0.7664558 , -0.67246526],
       [ 0.31932476,  1.1802573 , -0.26099774, ..., -0.74102426,
        -0.9270507 ,  0.55426735]], dtype=float32)

###  Test vectorization with Doc2Vec (based on HuggingFace's Tokenizer)

In [15]:
hf_tokenizer = HFTokenizer(params['hf_tokenizer'])

In [17]:
# Reuse the already built HF tokenizer instead of loading it again from disk.
# (The "doc2vec_java_path" key used to be split across two lines inside the string
# literal, which is a syntax error.)
hf_vectorizer = Doc2VecVectorizerHF(params['hf_tokenizer'], params["doc2vec_java_path"],
                                    tokenizer=hf_tokenizer)

2021-05-25 17:57:17,865 : INFO : loading Doc2Vec object from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model
2021-05-25 17:57:18,469 : INFO : loading vocabulary recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.vocabulary.* with mmap=None
2021-05-25 17:57:18,470 : INFO : loading trainables recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.trainables.* with mmap=None
2021-05-25 17:57:18,471 : INFO : loading wv recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.wv.* with mmap=None
2021-05-25 17:57:18,472 : INFO : loading docvecs recursively from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBOW-500-20E-8k-1594569414.336389].model.docvecs.* with mmap=None
2021-05-25 17:57:18,473 : INFO : loading vectors_docs from /tf/main/dvc-ds4se/models/pv/bpe8k/[doc2vec-Java-PVDBO

In [18]:
tokenized_df = hf_vectorizer.tokenize_df(java_samples, 'code')

In [21]:
tokenized_df

14702    [@, Override, Ċ, ĠĠĠ, Ġp, ublic, Ġ, void, Ġval...
29105    [public, ĠInputStream, Ġfind, (, String, Ġid, ...
18440    [public, Ġ, void, Ġclose, (, ), Ġ, {, Ċ, ĠĠĠ, ...
1869     [public, Ġ, final, ĠMon, o, <, T, >, Ġ, do, Af...
3541     [@, Override, Ċ, ĠĠĠ, Ġp, ublic, Ġ, void, Ġatt...
4346     [@, NonNull, Ċ, ĠĠĠ, Ġp, ublic, Ġstat, ic, ĠSt...
7034     [@, Override, Ċ, ĠĠĠ, Ġp, ublic, ĠGet, Inte, g...
7423     [public, ĠProperty, Definition, [, ], ĠgetProp...
22602    [private, ĠInteger, Vector, ĠgetS, em, ant, ic...
14870    [public, Ġstat, ic, ĠPath, Address, Ġtrans, fo...
Name: code, dtype: object

In [20]:
# Tokenize the 'code' column into 'bpe-hf-tokens', infer one Doc2Vec vector per sample
# and save indices and vectors as .npy files under
# <vectors_storage_path>/human_trn/10-sample-20052021/.
indices, vectors = hf_vectorizer.infer_d2v(java_samples, 'bpe-hf-tokens', params["vectors_storage_path"],
                                        "human_trn", "10-sample-20052021", perform_tokenization=True)

In [22]:
indices

array([14702, 29105, 18440,  1869,  3541,  4346,  7034,  7423, 22602,
       14870])

In [23]:
vectors

array([[-9.4170451e-01,  1.5865822e+00, -1.5674361e-03, ...,
        -1.5819024e+00,  5.6919813e-01, -8.9502269e-01],
       [-6.8043721e-01, -2.6820338e-01,  3.9857324e-02, ...,
        -4.6697325e-01, -2.4018152e-01, -1.2825546e-02],
       [-9.2789513e-01,  2.2848961e+00,  4.6906880e-01, ...,
        -1.4570843e+00,  4.1882795e-01,  2.0528805e+00],
       ...,
       [-4.0173218e-01, -5.2009028e-01, -7.1974285e-02, ...,
         1.7002009e-01, -3.4709036e-01,  2.7916936e-02],
       [-1.2270563e+00,  2.2932885e+00,  1.0075219e+00, ...,
        -1.7897085e+00,  9.4967000e-02,  3.8890353e-01],
       [-4.5752251e-01,  1.7355688e+00,  7.0712149e-01, ...,
        -9.3595511e-01, -3.8285625e-01,  2.4485412e+00]], dtype=float32)

In [None]:
# TODO: Export code as module

In [24]:
# Run nbdev's notebook-to-module export; called without arguments it processes the
# project's notebooks (each converted notebook is reported in the cell output).
from nbdev.export import notebook2script
notebook2script()

Converted 0.0_mgmnt.prep.i.ipynb.
Converted 0.1_mgmnt.prep.conv.ipynb.
Converted 0.3_mgmnt.prep.bpe.ipynb.
Converted 0.6_mgmnt.prep.nltk.ipynb.
Converted 0.7_mgmnt.prep.files_mgmnt.ipynb.
Converted 0.8_mgmnt.prep.bpe_tokenization.ipynb.
Converted 1.0_exp.i.ipynb.
Converted 1.1_exp.info-[inspect].ipynb.
Converted 1.1_exp.info.ipynb.
Converted 1.2_exp.csnc.ipynb.
Converted 1.2_exp.gen.code.ipynb.
Converted 1.3_exp.csnc_python.ipynb.
Converted 10.0_utils.clusterization.ipynb.
Converted 10.1_utils.visualization.ipynb.
Converted 2.0_repr.codebert.ipynb.
Converted 2.0_repr.i.ipynb.
Converted 2.1_repr.codeberta.ipynb.
Converted 2.1_repr.roberta.train.ipynb.
Converted 2.2_repr.roberta.eval.ipynb.
Converted 2.3_repr.word2vec.train.ipynb.
Converted 2.6_repr.word2vec.eval.ipynb.
Converted 2.7_repr.distmetrics.ipynb.
Converted 2.8_repr.sentence_transformers.ipynb.
Converted 3.1_mining.unsupervised.traceability.eda.ipynb.
Converted 3.2_mining.unsupervised.eda.traceability.d2v.ipynb.
This cell doesn