In [17]:
# default_exp mgmnt.prep.tokenization_counting

In [7]:
# export

import pandas as pd

from tokenizers import Tokenizer
import sentencepiece as spm

from pathlib import Path
from pprint import pprint

from typing import Dict, Optional

from ds4se.mgmnt.prep.bpe_tokenization import CustomTokenizer, HFTokenizer

In [4]:
# export
import logging
# Configure logging when this cell runs: INFO level, "timestamp : level : message" format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# getLogger() without a name returns the root logger.
logger = logging.getLogger()

## tokenization_counting

> Module to obtain the vocabulary (based on BPE tokenization) from a collection of code snippets.

> @Alvaro 16 May 2021

In [18]:
# Notebook configuration.
# NOTE(review): "bpe32k_path" is not referenced by any cell visible in this section and it points
# into a "deprecated" directory — confirm whether it is still needed.
params = {
    "bpe32k_path": "/tf/main/dvc-ds4se/models/bpe/sentencepiece/deprecated/java_bpe_32k.model"
}

This method leverages tokenization from <i>ds4se.mgmnt.prep.bpe_tokenization.CustomTokenizer</i> to count the tokens of a given code dataset. These counts serve as input for computing <i>Mutual information</i>.

In [10]:
# export

def get_tokens_frequency_hf_tkzr(df: pd.DataFrame,
                                  tokenizer: CustomTokenizer,
                                  include_pad_token:Optional[bool]=False) -> Dict[str, int]:
    """
    Function to get the frequencies of a code dataset given a tokenizer.
    
    :param df: DataFrame containing the code snippets in its `code` column
    :param tokenizer: CustomTokenizer instance (HF or SP) with the corresponding implementation for tokenization;
                      its `tokenize_txt` method is called once per snippet
    :param include_pad_token: When falsy (default), the special "<pad>" token is left out of the counts
    
    :return: Dictionary containing the frequency of tokens for the given set of code snippets
             (plain dict, keys in first-seen order).
    """
    # Function-scope import so this exported cell does not depend on edits to the imports cell.
    from collections import Counter

    # A frame without rows has nothing to count (and may not even have a `code` column).
    if len(df) == 0:
        return {}

    skip_pad = not include_pad_token
    freqs = Counter()

    # Iterate the `code` column directly: iterrows() would build a full Series per row
    # only to read this single field.
    for code in df["code"]:
        tokens = tokenizer.tokenize_txt(code)
        # A generator is passed (rather than `tokens` itself) so tokens are always counted
        # one by one, and the padding token can be filtered in the same pass.
        freqs.update(tok for tok in tokens if not (skip_pad and tok == "<pad>"))

    # Hand back a plain dict so callers (e.g. pprint) see the same type as before.
    return dict(freqs)

## Usage example with CodeSearchNet data

In [11]:
# Load the Java snippets CSV; the counting function reads its `code` column.
# NOTE(review): absolute path tied to the /tf/main layout — adjust when running elsewhere.
java_df = pd.read_csv('/tf/main/dvc-ds4se/code/searchnet/clean_java.csv')

In [12]:
# Fixed seed so the 10-snippet sample (and the frequencies derived from it) is reproducible on re-run.
java_samples = java_df.sample(10, random_state=42)

In [5]:
# Load tokenizer

# NOTE(review): 'tokenizer.json' is a relative path, so it presumably resolves against the
# kernel's working directory — confirm where this file is expected to live.
hf_tokenizer = HFTokenizer('tokenizer.json')

In [14]:
# Count token occurrences over the sampled snippets; "<pad>" is skipped because
# include_pad_token is left at its default (False).
frequencies = get_tokens_frequency_hf_tkzr(java_samples, hf_tokenizer)

In [15]:
# Pretty-print the token -> count dictionary.
pprint(frequencies)

{'!': 5,
 '"': 2,
 '&': 2,
 '(': 60,
 ')': 60,
 ',': 20,
 '.': 43,
 '0': 2,
 ':': 3,
 ';': 29,
 '<': 7,
 '=': 12,
 '>': 9,
 '?': 1,
 '@': 5,
 'APPLICATION': 1,
 'ARRAY': 2,
 'ATTRIBUTE': 1,
 'All': 2,
 'Attributes': 1,
 'Begin': 2,
 'Bot': 1,
 'Builder': 2,
 'Cas': 2,
 'Char': 3,
 'Character': 1,
 'Commerce': 2,
 'Consum': 1,
 'Containers': 1,
 'DI': 1,
 'DOT': 2,
 'Discount': 7,
 'E': 2,
 'EN': 1,
 'ENTITY': 1,
 'En': 7,
 'FOUND': 1,
 'From': 1,
 'G': 1,
 'H': 1,
 'Helper': 1,
 'IAtomContainer': 2,
 'ID': 2,
 'INDEX': 1,
 'IT': 1,
 'Id': 3,
 'Ident': 1,
 'Index': 1,
 'Initial': 1,
 'JC': 1,
 'JSON': 1,
 'LET': 1,
 'Links': 3,
 'List': 1,
 'Look': 2,
 'MP': 2,
 'MediaType': 1,
 'Menu': 2,
 'NOT': 1,
 'Name': 4,
 'Names': 2,
 'New': 1,
 'Next': 1,
 'Object': 2,
 'Override': 1,
 'PH': 1,
 'PUT': 1,
 'Param': 1,
 'Path': 1,
 'Persistence': 1,
 'Pre': 2,
 'Prop': 2,
 'Property': 1,
 'R': 1,
 'RE': 1,
 'Release': 2,
 'Resources': 1,
 'Response': 1,
 'Service': 2,
 'Set': 3,
 'Status': 1,
 '

In [2]:
# Run nbdev's notebook-to-module export; the output below lists every notebook it converted.
from nbdev.export import notebook2script
notebook2script()

Converted 0.1_mgmnt.prep.ipynb.
Converted 0.2_mgmnt.prep.files_mgmnt.ipynb.
Converted 0.3_mgmnt.prep.bpe_tokenization.ipynb.
Converted 0.4_mgmnt.prep.tokenization_counting.ipynb.
Converted 0.5_mgmnt.prep.token_mgmnt.ipynb.
Converted 1.1_exp.info.ipynb.
Converted 1.2_exp.desc.metrics.java.ipynb.
Converted 1.4_exp.metrics_python.ipynb.
Converted 1.5_exp.metrics_java.ipynb.
Converted 2.0_repr.codebert.ipynb.
Converted 2.0_repr.i.ipynb.
Converted 2.1_repr.codeberta.ipynb.
Converted 2.1_repr.roberta.train.ipynb.
Converted 2.2_repr.roberta.eval.ipynb.
Converted 2.3_repr.word2vec.train.ipynb.
Converted 2.6_repr.word2vec.eval.ipynb.
Converted 2.7_repr.distmetrics.ipynb.
Converted 2.8_repr.sentence_transformers.ipynb.
Converted 3.1_traceability.unsupervised.eda.ipynb.
Converted 3.2_traceability.unsupervised.approach.d2v.ipynb.
Converted 3.2_traceability.unsupervised.approach.w2v.ipynb.
Converted 4.0_infoxplainer.ir.ipynb.
Converted 4.1_infoxplainer.ir.unsupervised.d2v.ipynb.
Converted 4.2_infox