In [None]:
# default_exp data.preprocessing

# preprocessing

> This module comprises all preprocessing techniques applied to software artifacts:
>
>> Text-based Artifacts: Classical preprocessing (stemming, lemmas, etc.) and BPE
>
>> Binary Artifacts: To Do
>
>> Vision-based Artifacts: To Do
>
>> Parsing: Techniques to control and manipulate source code (complete with deep generator project)

In [None]:
# export
# Imports
import pandas as pd
import sentencepiece as sp

from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
# export
def jsonl_list_to_dataframe(file_list, columns=None):
    """Load a list of jsonl.gz files into a single pandas DataFrame.

    Args:
        file_list: Iterable of paths to gzip-compressed JSON-lines files.
        columns: Optional list of column names to keep from every file.
            When None (the default) all columns are kept.

    Returns:
        A DataFrame with the rows of every file concatenated in the order of
        `file_list`. The per-file row index is kept as-is (not reset).

    Raises:
        ValueError: if `file_list` is empty (raised by `pd.concat`).
        KeyError: if a requested column is missing from a file.
    """
    frames = []
    for f in file_list:
        frame = pd.read_json(f,
                             orient='records',
                             compression='gzip',
                             lines=True)
        # Indexing a DataFrame with None raises KeyError, so only subset
        # when the caller actually asked for specific columns.
        if columns is not None:
            frame = frame[columns]
        frames.append(frame)
    return pd.concat(frames, sort=False)

In [None]:
# export
def get_dfs(path):
    """
        Builds one dataframe per data split ("train", "valid", "test", in that order).
        For each split, every *.gz file found recursively under `path/<split>` is loaded
        (in sorted path order) and only the "code" and "docstring" columns are kept.
        Expects format from Code Search Net Challenge.
    """
    wanted_cols = ["code", "docstring"]
    return [
        jsonl_list_to_dataframe(sorted((path/split_name).glob("**/*.gz")), wanted_cols)
        for split_name in ("train", "valid", "test")
    ]

In [None]:
# Root directory of the raw dataset (absolute path; adjust for your environment).
path = Path('/tf/data/')

In [None]:
# Load the train/valid/test splits of the Java data and preview the training split.
df_trn, df_val, df_tst = get_dfs(path/"java/final/jsonl")
df_trn.head()

Unnamed: 0,code,docstring
0,protected final void bindIndexed(Configuration...,Bind indexed elements to the supplied collecti...
1,public void setServletRegistrationBeans(\n\t\t...,Set {@link ServletRegistrationBean}s that the ...
2,public void addServletRegistrationBeans(\n\t\t...,Add {@link ServletRegistrationBean}s for the f...
3,public void setServletNames(Collection<String>...,Set servlet names that the filter will be regi...
4,public void addServletNames(String... servletN...,Add servlet names for the filter.\n@param serv...


In [None]:
# Save some test data
# A fixed seed keeps the 1% samples identical across re-runs, and the target
# directory is created up front so to_csv does not fail on a fresh checkout.
test_data_dir = Path('./test_data')
test_data_dir.mkdir(parents = True, exist_ok = True)
for split_name, split_df in [('trn', df_trn), ('val', df_val), ('tst', df_tst)]:
    split_df.sample(frac = 0.01, random_state = 42).to_csv(test_data_dir / f'{split_name}.csv', index = False)

In [None]:
# export
def df_to_txt_file(df, output, cols = None):
    """Write the selected dataframe columns to a text file that SentencePiece can use to train a BPE model.

    The values of the columns in `cols` are stacked one column after another
    and written newline-separated to `<output>/text.txt`.

    Args:
        df: pandas DataFrame holding the text data.
        output: Directory (str or Path) in which `text.txt` is written.
        cols: Column names to include; None (the default) means every column of `df`.

    Returns:
        Path of the written text file.
    """
    output = Path(output)  # accept plain string paths as well as Path objects
    if cols is None: cols = list(df.columns)
    merged_df = pd.concat([df[col] for col in cols])
    # str.join only accepts strings: drop missing values (NaN/None) and
    # stringify the rest so numeric or partially empty columns do not raise.
    lines = merged_df.dropna().astype(str)

    txt_path = output/'text.txt'
    # Explicit encoding so the file does not depend on the platform default.
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))
    return txt_path

In [None]:
# export
def gen_sp_model(df, output, model_name, cols = None):
    """Train a SentencePiece model on the text of a pandas dataframe.

    The columns `cols` of `df` are first dumped to a text file inside `output`
    (see `df_to_txt_file`); the trainer then reads that file and writes its
    model files using `output / model_name` as the prefix.
    """
    corpus_path = df_to_txt_file(df, output, cols)
    model_prefix = output / model_name
    trainer_args = ' '.join([
        f'--input={corpus_path}',
        f'--model_prefix={model_prefix}',
        '--hard_vocab_limit=false',
    ])
    sp.SentencePieceTrainer.train(trainer_args)

In [None]:
# export
def gen_hugface_model(df, output, tokenizer = None, vocab_sz = 30_000, min_freq = 3, cols = None):
    """Train a Hugging Face tokenizer on the text of a pandas dataframe.

    Args:
        df: pandas DataFrame holding the text data.
        output: Directory in which the intermediate training text file is written.
        tokenizer: Tokenizer instance to train; None (the default) creates a
            fresh `ByteLevelBPETokenizer` for this call.
        vocab_sz: Vocabulary size passed to the trainer.
        min_freq: Minimum frequency passed to the trainer.
        cols: Column names to train on; None means every column of `df`.

    Returns:
        The trained tokenizer (the same object that was passed in, if any).
    """
    # Build the default here rather than in the signature: a default argument
    # is evaluated once at definition time, so every call would otherwise
    # share (and re-train) the same tokenizer instance.
    if tokenizer is None: tokenizer = ByteLevelBPETokenizer()
    fname = df_to_txt_file(df, output, cols)
    tokenizer.train(files = [str(fname)], vocab_size = vocab_sz, min_frequency = min_freq, special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ])

    return tokenizer

In [None]:
# Directory holding the sampled test data, and the name used for the trained model files.
path = Path("./test_data")
model_name = "test"

In [None]:
# Read the sampled training split from the test data directory and preview it.
df = pd.read_csv(path / 'trn.csv')
df.head()

Unnamed: 0,code,docstring
0,private static void createCode(String packageN...,Create the Java
1,@Override\n public void flushCache() {\n ...,LI3492-2
2,"public void addRule(IntDependency dependency, ...",Add this dependency with the given count to th...
3,@Override\n public boolean removeIfEquals(K k...,Remove the object from the cache.
4,public void marshall(DatasetContentDeliveryDes...,Marshall the given parameter object.


In [None]:
# Train a tokenizer with the default settings on every column of df (cols is left as None).
tokenizer = gen_hugface_model(df, path)

In [None]:
# Attach a BERT-style post-processor built from the ids of the "</s>" and "<s>" special tokens.
# NOTE(review): per the tokenizers docs BertProcessing takes the (sep, cls) pairs in that order — confirm.
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)

In [None]:
# Sanity check: show the tokens produced for a small Java snippet.
print(tokenizer.encode("public static void main(String[] args) { getDirFromLib(); }").tokens)

['<s>', 'public', 'Ġstatic', 'Ġvoid', 'Ġmain', '(', 'String', '[]', 'Ġargs', ')', 'Ġ{', 'Ġget', 'Dir', 'From', 'Lib', '();', 'Ġ}', '</s>']


In [None]:
# Persist the trained tokenizer files under `path` using the "java_tokenizer" name.
tokenizer.save(str(path), "java_tokenizer")

['test_data/java_tokenizer-vocab.json', 'test_data/java_tokenizer-merges.txt']

In [None]:
# Small table of string values (three columns, five rows) for trying out the helpers.
dummy_data = {
        'first': ['1', '2', '6', '7', '8'],
        'second': ['K', 'M', 'O', 'Q', 'S'],
        'third': ['L', 'N', 'P', 'R', 'T']}

In [None]:
# Build the toy dataframe from `dummy_data` (the only dummy table defined in this notebook) and display it.
df = pd.DataFrame(dummy_data); df

Unnamed: 0,id,Feature1,Feature2
0,1,K,L
1,2,M,N
2,6,O,P
3,7,Q,R
4,8,S,T


In [None]:
# Dump every column of df into ./test_data/text.txt.
df_to_txt_file(df, Path('./test_data'), list(df.columns))

PosixPath('test_data/text.txt')

In [None]:
# (Re)set the working directory and model name for the SentencePiece example.
path = Path("./test_data")
model_name = "test"

In [None]:
# Train a SentencePiece model on all columns of df; model files use path / model_name as prefix.
gen_sp_model(df, path, model_name, list(df.columns))

In [None]:
# Load the trained model file into a SentencePiece processor.
spm = sp.SentencePieceProcessor()
spm.Load(str(path/f"{model_name}.model"))

True

In [None]:
# Split a sample sentence into pieces with the loaded model.
spm.EncodeAsPieces("Hello, world!")

['▁', 'Hello,', '▁', 'world!']