In [64]:
# default_exp interpretability.info_theory_processing

In [61]:
# export

import pandas as pd
from tokenizers import Tokenizer
from pathlib import Path
import sentencepiece as spm
import logging
import os
from typing import Dict, Optional
from pprint import pprint

In [12]:
# Configure the root logger: INFO level, "<time> : <level> : <message>" layout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
# Handle on the root logger (getLogger() without a name returns the root).
logger = logging.getLogger()

## info_theory_processing

> Module to obtain the vocabulary (based on BPE tokenization) from a collection of code snippets.

> @Alvaro 16 May 2021

In [18]:
# Notebook-level settings; `bpe32k_path` is the location of the SentencePiece BPE model file.
params = dict(
    bpe32k_path="/tf/main/dvc-ds4se/models/bpe/sentencepiece/deprecated/java_bpe_32k.model",
)

In [13]:
# export
# utils
# NOTE: this cell must be exported because the exported tokenizer loader in this
# notebook calls check_file_existence; without the flag the generated module
# would raise NameError when that loader runs.

def check_file_existence(path) -> bool:
    """
    Checks whether the given path exists, logging an error when it does not.

    :param path: Path to check (anything accepted by os.path.exists)
    :return: True if the path exists, False otherwise
    """
    if not os.path.exists(path):
        # Include the offending path so the log entry is actionable on its own.
        logging.error('Provided file cannot be found: %s', path)
        return False
    return True

In [15]:
def __load_sp_model(model_path: str) -> spm.SentencePieceProcessor:
    """
    Loads the sentence piece model stored in the specified path

    :param model_path: Path to the model file
    :return: SentencePieceProcessor object (corresponding to loaded model)
    :raises FileNotFoundError: If nothing exists at model_path
    """
    if not check_file_existence(model_path):
        msg = f'Sentence piece model could not be loaded: {model_path}'
        logging.error(msg)
        # FileNotFoundError derives from Exception, so callers that catch the
        # generic Exception raised by earlier versions keep working.
        raise FileNotFoundError(msg)

    sp_processor = spm.SentencePieceProcessor()
    sp_processor.load(model_path)
    return sp_processor

In [29]:
# export

def __load_hf_tokenizer_model(path: str) -> Tokenizer:
    """
    Function to load a saved HuggingFace tokenizer

    :param path: Path containing the tokenizer file
    :return: Tokenizer object built from the file via Tokenizer.from_file
    :raises FileNotFoundError: If nothing exists at path
    """
    if not check_file_existence(path):
        msg = f'HuggingFace model could not be loaded: {path}'
        logging.error(msg)
        # FileNotFoundError derives from Exception, so callers that catch the
        # generic Exception raised by earlier versions keep working.
        raise FileNotFoundError(msg)
    return Tokenizer.from_file(path)

In [58]:
# export

def get_tokens_frequency_hf_tkzr(df: pd.DataFrame,
                                  tokenizer_path: str,
                                  include_pad_token:Optional[bool]=False) -> Dict[str, int]:
    """
    Function to get the frequencies given a tokenizer

    :param df: DataFrame containing the code snippets (read from its `code` column)
    :param tokenizer_path: Path containing the file for the HF Tokenizer to be loaded
    :param include_pad_token: When truthy, "<pad>" tokens are counted as well;
                              otherwise (default, also for None) they are skipped
    :return: Dictionary containing the frequency of tokens for the given set of code snippets
    """
    freqs: Dict[str, int] = {}
    tokenizer = __load_hf_tokenizer_model(tokenizer_path)

    # A frame without rows yields no tokens; return early so that a missing
    # `code` column is not an error in that case.
    if len(df) == 0:
        return freqs

    # Iterate the column directly: iterrows() would build a full Series per
    # row only to read this single field.
    for snippet in df["code"]:
        for tok in tokenizer.encode(snippet).tokens:
            if tok == "<pad>" and not include_pad_token:
                continue
            freqs[tok] = freqs.get(tok, 0) + 1

    return freqs

## Usage example with CodeSearchNet data

In [47]:
# Read the CSV at this absolute path into a DataFrame (the path is specific to
# the environment the notebook was authored in).
java_df = pd.read_csv('/tf/main/dvc-ds4se/code/searchnet/clean_java.csv')

In [48]:
# Draw 10 rows for the demo; a fixed seed keeps the sample (and therefore the
# frequencies shown below) reproducible across kernel restarts.
java_samples = java_df.sample(10, random_state=42)

In [59]:
# Count token occurrences over the sampled rows, using the tokenizer file
# expected at ./tokenizer.json (relative to the working directory).
frequencies = get_tokens_frequency_hf_tkzr(java_samples, "./tokenizer.json")

In [62]:
# Pretty-print the token -> count mapping (pprint shows dict keys in sorted order).
pprint(frequencies)

{'!': 5,
 '"': 4,
 '&': 3,
 '(': 70,
 ')': 68,
 '*': 2,
 '+': 10,
 ',': 13,
 '-': 9,
 '.': 39,
 '."': 2,
 '/': 9,
 '0': 6,
 '1': 2,
 '22': 2,
 '4': 2,
 '50': 1,
 '7': 1,
 ';': 39,
 '<': 6,
 '=': 35,
 '>': 4,
 '@': 3,
 'Allowed': 1,
 'Already': 1,
 'Api': 1,
 'App': 2,
 'ArgumentException': 1,
 'Arguments': 1,
 'Available': 1,
 'B': 1,
 'Builder': 6,
 'Cache': 3,
 'Channel': 3,
 'CharSequence': 2,
 'ClassLoader': 2,
 'Content': 3,
 'Context': 1,
 'D': 1,
 'Data': 3,
 'Desc': 1,
 'EncodingException': 2,
 'Entities': 1,
 'Entity': 1,
 'Exception': 5,
 'Exists': 1,
 'Flow': 2,
 'From': 1,
 'G': 1,
 'Height': 6,
 'HttpInfo': 1,
 'Id': 3,
 'If': 1,
 'Index': 1,
 'Initialized': 1,
 'Interfaces': 1,
 'K': 2,
 'Leg': 1,
 'Loader': 4,
 'Managed': 3,
 'NaN': 1,
 'Name': 1,
 'NonNull': 2,
 'Not': 5,
 'Null': 1,
 'Of': 11,
 'Offset': 3,
 'OutOfBoundsException': 1,
 'Over': 1,
 'Override': 1,
 'Page': 1,
 'Panel': 2,
 'Properties': 1,
 'Property': 1,
 'Reader': 1,
 'S': 10,
 'Schema': 1,
 'Sequence'

In [65]:
# nbdev export step: converts the project's notebooks into library modules.
# Kept as the last cell so it runs after every definition above.
from nbdev.export import notebook2script
notebook2script()

Converted 0.0_mgmnt.prep.i.ipynb.
This cell doesn't have an export destination and was ignored:
 
This cell doesn't have an export destination and was ignored:
 
This cell doesn't have an export destination and was ignored:
e
This cell doesn't have an export destination and was ignored:
e
This cell doesn't have an export destination and was ignored:
 
This cell doesn't have an export destination and was ignored:
 
This cell doesn't have an export destination and was ignored:
 
This cell doesn't have an export destination and was ignored:
 
This cell doesn't have an export destination and was ignored:
 
This cell doesn't have an export destination and was ignored:
 
This cell doesn't have an export destination and was ignored:
 
This cell doesn't have an export destination and was ignored:
 
Converted 0.10_error_checker.ipynb.
Converted 0.11_info_theory_processing.ipynb.
This cell doesn't have an export destination and was ignored:
 
This cell doesn't have an export destination and was 