In [None]:
# default_exp mgmnt.prep

# Main Preprocessing

> This module comprises preprocessing techniques applied to software artifacts (TODO:cite here the papers employed for this preprocessings):
>
>This is an adapted version of Daniel McCrystal Nov 2019
>
>This version also includes BPE preprocesing and NLTK. It's the main class to execute conventional pipelines. 

>Author: @danaderp March 2020

In [None]:
#! pip install dit
#! pip install nltk
#! pip install tokenizers
#! pip install tensorflow_datasets
! pip install -U tensorflow-gpu

In [None]:
! pip install tensorflow_datasets

In [4]:
#export
from typing import List, Set, Callable, Tuple, Dict, Optional
import re
from nltk.stem.snowball import SnowballStemmer
import nltk
import pandas as pd
import glob
import os
import pathlib
from string import punctuation
import csv

In [None]:
from nltk.stem.snowball import SnowballStemmer
englishStemmer=SnowballStemmer("english")

In [None]:
#! pip install nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
#export
from tensorflow.keras.preprocessing import text
from pathlib import Path
import glob
from datetime import datetime

In [6]:
#export
# Imports
import pandas as pd
import sentencepiece as sp
import numpy as np
import json
from pathlib import Path
import sys
import sentencepiece as spm
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

In [7]:
#export
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
from zipfile import ZipFile

In [None]:
! unzip -qq cisco/CSB-CICDPipelineEdition-master.zip

## Setup

In [None]:
#hide
path_data = '../dvc-ds4se/' #dataset path

In [None]:
def libest_params():
    return {
        'system': 'libest',
        #'path_zip': Path("cisco/sacp-python-common.zip"),
        'saving_path': path_data+ 'se-benchmarking/traceability/testbeds/processed/libest_data',
        'language': 'english',
        'dataset' : path_data + ''
        #'model_prefix': path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_128k' #For BPE Analysis
        #'model_prefix': path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_32k'
        'model_prefix':path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_8k'
    }

SyntaxError: invalid syntax (<ipython-input-9-1d55101459cd>, line 10)

In [None]:
model_prefix = {
    'bpe8k' : path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_8k',
    'bpe32k' : path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_32k',
    'bpe128k' : path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_128k'
}

In [None]:
#params = default_params()
params = libest_params()

# Conventional Preprocessing Class

In [None]:
#export
class ConventionalPreprocessing():
    '''NLTK libraries for Conventional Preprocessing'''
    def __init__(self, params, bpe = False):
        self.params = params
        
        #If BPE provided, then preprocessing with BPE is allowed on CONV
        if bpe:
            self.sp_bpe = spm.SentencePieceProcessor()
            self.sp_bpe.load(params['model_prefix']+'.model')
        else:
            self.sp_bpe = None

        pass
    
    def bpe_pieces_pipeline(self, doc_list):
        '''Computes BPE preprocessing according to params'''
        encoded_str = ''
        if self.sp_bpe is None:
            logging.info('Provide a BPE Model!')
        else:
            encoded_str = [self.sp_bpe.encode_as_pieces(doc) for doc in doc_list]  
        return encoded_str
    
    #ToDo Transforme it into a For-Comprenhension
    def clean_punctuation(self, token): 
        #remove terms !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~0123456789
        return re.sub(r'[^a-zA-Z\s]', ' ', token, re.I|re.A)

    def split_camel_case_token(self, token):
        return re.sub('([a-z])([A-Z])', r'\1 \2', token)

    def remove_terms(self, filtered_tokens):
        remove_terms = punctuation + '0123456789'
        return [token for token in filtered_tokens if token not in remove_terms and len(token)>2 and len(token)<21]

    def stemmer(self, filtered_tokens):
        return [englishStemmer.stem(token) for token in filtered_tokens ]

    def stop_words(self, filtered_tokens):
        stop_words = nltk.corpus.stopwords.words(self.params['language'])
        return [token for token in filtered_tokens if token not in stop_words]
    
    def basic_pipeline(self, dict_filenames):
        '''@dict_filenames: {filename: code}'''
        pre_process = [( key.replace('.txt', '-pre.txt') , self.clean_punctuation(dict_filenames[key][0])  ) for key in dict_filenames]
        pre_process = [( doc[0] , self.split_camel_case_token(doc[1])  ) for doc in pre_process]
        pre_process = [( doc[0] , doc[1].lower()  ) for doc in pre_process]
        pre_process = [( doc[0] , doc[1].strip()) for doc in pre_process] # Leading whitepsace are removed
        pre_process_tokens = [(doc[0] , nltk.WordPunctTokenizer().tokenize(doc[1])) for doc in pre_process]
        filtered_tokens = [(doc[0], self.stop_words(doc[1]) ) for doc in pre_process_tokens] #Stop Words
        filtered_tokens = [(doc[0], self.stemmer(doc[1]) ) for doc in filtered_tokens] #Filtering Stemmings
        filtered_tokens = [(doc[0], self.remove_terms(doc[1])) for doc in filtered_tokens] #Filtering remove-terms
        pre_process = [(doc[0], ' '.join(doc[1])) for doc in filtered_tokens]
        return pre_process
    
    def fromdocs_pipeline(self, docs):
        #TODO
        """@tokenized_file: a list of tokens that represents a document/code"""
        pre_process = [ self.clean_punctuation(doc) for doc in docs]
        logging.info('fromtokens_pipeline: clean punctuation')
        pre_process = [ self.split_camel_case_token(doc) for doc in pre_process]
        logging.info('fromtokens_pipeline: camel case')
        pre_process = [ doc.lower() for doc in pre_process] 
        logging.info('fromtokens_pipeline: lowe case')
        pre_process = [ doc.strip() for doc in pre_process] # Leading whitepsace are removed
        logging.info('fromtokens_pipeline: white space removed')
        pre_process_tokens = [ nltk.WordPunctTokenizer().tokenize(doc) for doc in pre_process]
        logging.info('fromtokens_pipeline: WordPunctTokenizer')
        filtered_tokens = [ self.stop_words(doc) for doc in pre_process_tokens] #Stop Words
        logging.info('fromtokens_pipeline: Stop words')
        filtered_tokens = [ self.stemmer(doc) for doc in filtered_tokens] #Filtering Stemmings
        logging.info('fromtokens_pipeline: Stemmings')
        filtered_tokens = [ self.remove_terms(doc) for doc in filtered_tokens] #Filtering remove-terms
        logging.info('fromtokens_pipeline: Removed Special Terns')
        pre_process = [ ' '.join(doc) for doc in filtered_tokens]
        logging.info('fromtokens_pipeline END')
        return pre_process
    
    def frombatch_pipeline(self, batch):
        #TODO
        """@batch: a TensorFlow Dataset Batch"""
        pre_process = [ self.clean_punctuation( doc.decode("utf-8") ) for doc in batch]
        logging.info('frombatch_pipeline: clean punctuation')
        pre_process = [ self.split_camel_case_token(doc) for doc in pre_process]
        logging.info('frombatch_pipeline: camel case')
        pre_process = [ doc.lower() for doc in pre_process] 
        logging.info('frombatch_pipeline: lowe case')
        pre_process = [ doc.strip() for doc in pre_process] # Leading whitepsace are removed
        logging.info('frombatch_pipeline: white space removed')
        pre_process_tokens = [ nltk.WordPunctTokenizer().tokenize(doc) for doc in pre_process]
        logging.info('frombatch_pipeline: WordPunctTokenizer')
        filtered_tokens = [ self.stop_words(doc) for doc in pre_process_tokens] #Stop Words
        logging.info('frombatch_pipeline: Stop words')
        filtered_tokens = [ self.stemmer(doc) for doc in filtered_tokens] #Filtering Stemmings
        logging.info('frombatch_pipeline: Stemmings')
        filtered_tokens = [ self.remove_terms(doc) for doc in filtered_tokens] #Filtering remove-terms
        logging.info('frombatch_pipeline: Removed Special Terns')
        #pre_process = [ ' '.join(doc) for doc in filtered_tokens]
        logging.info('frombatch_pipeline [END]')
        return filtered_tokens
    
    def fromtensor_pipeline(self, ts_x):
        """@ts_x: es un elemento del tensor"""
        #TODO
        pre_process = self.clean_punctuation(ts_x)
        pre_process = self.split_camel_case_token(pre_process)
        pre_process = pre_process.lower()
        pre_process = pre_process.strip()
        pre_process = nltk.WordPunctTokenizer().tokenize(pre_process)
        filtered_tokens = self.stop_words(pre_process)
        filtered_tokens = self.stemmer(filtered_tokens)
        filtered_tokens = self.remove_terms(filtered_tokens)
        pre_process = ' '.join(filtered_tokens)
        logging.info('fromtokens_pipeline END')
        return pre_process
    
    def SaveCorpus(self, df, language='js', sep=',', mode='a'):
        timestamp = datetime.timestamp(datetime.now())
        path_to_link = self.params['saving_path'] + '['+ self.params['system']  + '-' + language + '-{}].csv'.format(timestamp)

        df.to_csv(path_to_link, header=True, index=True, sep=sep, mode=mode)     
        logging.info('Saving in...' + path_to_link)
        pass
    
    def LoadCorpus(self, timestamp, language='js', sep=',', mode='a'):
        path_to_link = self.params['saving_path'] + '['+ self.params['system']  + '-' + language + '-{}].csv'.format(timestamp)
        return pd.read_csv(path_to_link, header=0, index_col=0, sep=sep)
        

In [None]:
#export
def open_file(f, encoding='utf-8'):
    try:
        #return open(filename, 'r', encoding="ISO-8859-1").read()
        return open(f, 'r', encoding = encoding).read()
    except:
        print("Exception: ", sys.exc_info()[0])

In [None]:
#export
def get_files(system, ends):
    path = Path("cisco/CSB-CICDPipelineEdition-master/")
    names = [entry for entry in path.glob('**/*' +ends)]
    filenames = [(filename, os.path.basename(filename), open_file(filename) ) for filename in names]
    return pd.DataFrame( filenames ,columns = ['names','filenames','content'])

# 2. Utils 
> From @Nathan

In [1]:
# export
def jsonl_list_to_dataframe(file_list, columns=None):
    """Load a list of jsonl.gz files into a pandas DataFrame."""
    return pd.concat([pd.read_json(f,
                                   orient='records', 
                                   compression='gzip',
                                   lines=True)[columns] 
                      for f in file_list], sort=False)

In [10]:
# export
#This if for SearchNet Dataset
def get_dfs(path):
    """
        Grabs the different data splits and converts them into dataframes.
        Expects format from Code Search Net Challenge.
        SearchNetDataset
    """
    dfs = []
    for split in ["train", "valid", "test"]:
        files = sorted((path/split).glob("**/*.gz"))
        df = jsonl_list_to_dataframe(files, ["code", "docstring"])
        dfs.append(df)
        
    return dfs

In [27]:
# export
def df_to_txt_file(df, output, cols):
    """Converts a dataframe and converts it into a text file that SentencePiece can use to train a BPE model"""
    if cols is None: cols = list(df.columns)
    merged_df = pd.concat([df[col] for col in cols])
    
    with open(output/'text.txt', 'w') as f:
        f.write('\n'.join(list(merged_df)))
    return output/'text.txt'

In [28]:
# export
def sp_model_from_df(df, output, model_name, cols = None):
    """Trains a SentencePiece BPE model from a pandas dataframe"""
    fname = df_to_txt_file(df, output, cols)
    sp.SentencePieceTrainer.train(f'--input={fname} --model_prefix={output / model_name} --hard_vocab_limit=false')

In [29]:
# export
def sp_model_from_glob(path, glob, model_name):
    fns = list(path.glob(glob))
    fns = ",".join(map(str, fns))
    sp.SentencePieceTrainer.train(f'--input={fns} --model_prefix={path / model_name} --hard_vocab_limit=false')

In [30]:
# export
def gen_hugface_model(df, output, tokenizer = ByteLevelBPETokenizer(), vocab_sz = 30_000, min_freq = 3, cols = None):
    fname = df_to_txt_file(df, output, cols)
    tokenizer.train(files = [str(fname)], vocab_size = vocab_sz, min_frequency = min_freq, special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ])
    
    return tokenizer

In [40]:
# export
def tokenize_fns(fns, tokenizer, exts, output, data_type):
    docs = []
    for fn in fns:
        system = fn.parent.name
        output_path = output/system/data_type
        output_path.mkdir(parents=True, exist_ok=True)
        files = []
        for ext in exts:
            files.extend(fn.glob(f'**/*.{ext}'))
        for file in files:
            if 'README' not in file.name:
                with open(file, encoding='ISO-8859-1') as f:
                    docs.append(tokenizer.EncodeAsPieces(f.read()))
                with open((output_path/file.name).with_suffix('.bpe'), 'w') as f:
                    f.write(' '.join(docs[-1]))
            
    return docs

In [41]:
# export
def read_bpe_files(path):
    bpe_files = []
    for file in path.glob('**/*.bpe'):
        with open(file) as f:
            bpe_files.append(f.read().split(' '))
    
    return bpe_files

In [42]:
# export 
#This implementation was oriented to traceability datasets
def split_lines_to_files(lines, fn_pattern, output_path, tokenizer):
    for line in lines:
        fn, content = line.split(fn_pattern)
        fn = fn.replace('"', '')
        fn = fn.replace(' Test ', '')
        content = tokenizer.EncodeAsPieces(content)
        with open((output_path/fn).with_suffix('.bpe'), 'w') as f:
                    f.write(' '.join(content))

### Testing Utils

In [15]:
#hide
path_data = Path('../dvc-ds4se/') #dataset path

In [19]:
def utils_params():
    return {
        'system': 'searchnet',
        'path_data': path_data / 'code/searchnet/java/final/jsonl',
        'path_test_out': path_data /'nbs_experiments'
    }

In [20]:
#path = Path('/tf/data/')
params = utils_params()

In [18]:
df_trn, df_val, df_tst = get_dfs( params['path_data'] )
df_trn.head()

Unnamed: 0,code,docstring
0,protected final void bindIndexed(Configuration...,Bind indexed elements to the supplied collecti...
1,public void setServletRegistrationBeans(\n\t\t...,Set {@link ServletRegistrationBean}s that the ...
2,public void addServletRegistrationBeans(\n\t\t...,Add {@link ServletRegistrationBean}s for the f...
3,public void setServletNames(Collection<String>...,Set servlet names that the filter will be regi...
4,public void addServletNames(String... servletN...,Add servlet names for the filter.\n@param serv...


In [23]:
params['path_test_out'] / 'trn.csv'

PosixPath('../dvc-ds4se/nbs_experiments/trn.csv')

In [25]:
#Sampling some data
df_trn.sample(frac = 0.01).to_json(params['path_test_out'] /'trn.jsonl', index = False)
df_val.sample(frac = 0.01).to_csv(params['path_test_out'] /'val.csv', index = False)
df_tst.sample(frac = 0.01).to_csv(params['path_test_out'] /'tst.csv', index = False)

In [44]:
df_trn.sample(frac = 0.01).to_json(params['path_test_out'] /'trn.jsonl')

ValueError: DataFrame index must be unique for orient='columns'.

In [26]:
df = pd.read_csv(params['path_test_out'] / 'trn.csv')
df.head()

Unnamed: 0,code,docstring
0,public static DiscountCurveInterface createDis...,Create a discount curve from forwards given by...
1,public static MaskFormat maskFormat(final Stri...,<p>\nReturns a {@link MaskFormat} instance for...
2,"public void init(RecordOwnerParent parent, Rec...",Initialize the RecordOwner.\n@param parentSess...
3,public Content getLink(LinkInfo linkInfo) {\n ...,Constructs a link from the given link informat...
4,public static String separatorsToSystem(String...,Converts all separators to the system separato...


### tst tokenizer hugface

In [31]:
tokenizer = gen_hugface_model(df, path)

In [32]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)

In [33]:
print(tokenizer.encode("public static void main(String[] args) { getDirFromLib(); }").tokens)

['<s>', 'public', 'Ġstatic', 'Ġvoid', 'Ġmain', '(', 'String', '[]', 'Ġargs', ')', 'Ġ{', 'Ġget', 'Dir', 'From', 'Lib', '();', 'Ġ}', '</s>']


In [36]:
tokenizer.save(str( params['path_test_out'] ), "java_tokenizer")

TypeError: Can't convert 'java_tokenizer' to PyBool

In [37]:
dummy_data = {
        'first': ['1', '2', '6', '7', '8'],
        'second': ['K', 'M', 'O', 'Q', 'S'],
        'third': ['L', 'N', 'P', 'R', 'T']}

In [39]:
df = pd.DataFrame(dummy_data); df

Unnamed: 0,first,second,third
0,1,K,L
1,2,M,N
2,6,O,P
3,7,Q,R
4,8,S,T
