In [None]:
# default_exp mgmnt.prep

# Main Preprocessing

> This module comprises preprocessing techniques applied to software artifacts (TODO: cite the papers on which these preprocessing steps are based).
>
>This is an adapted version of the work by Daniel McCrystal (Nov 2019).
>
>This version also includes BPE preprocessing and NLTK. It is the main class for executing conventional pipelines. 

>Author: @danaderp March 2020

In [None]:
#! pip install dit
#! pip install nltk
#! pip install tokenizers
#! pip install tensorflow_datasets
! pip install -U tensorflow-gpu

In [None]:
! pip install tensorflow_datasets

In [None]:
#export
from typing import List, Set, Callable, Tuple, Dict, Optional
import re
from nltk.stem.snowball import SnowballStemmer
import nltk
import pandas as pd
import glob
import os
import pathlib
from string import punctuation
import csv

In [None]:
from nltk.stem.snowball import SnowballStemmer
englishStemmer=SnowballStemmer("english")

In [None]:
#! pip install nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
#export
from tensorflow.keras.preprocessing import text
from pathlib import Path
import glob
from datetime import datetime

In [None]:
#export
# Imports
import pandas as pd
import sentencepiece as sp
import numpy as np
import json
from pathlib import Path
import sys
import sentencepiece as spm
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

In [None]:
#export
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
from zipfile import ZipFile

In [None]:
! unzip -qq cisco/CSB-CICDPipelineEdition-master.zip

## Setup

In [None]:
#hide
path_data = '../dvc-ds4se/' #dataset path

In [None]:
def libest_params():
    """Return the preprocessing parameters for the LibEST system.

    All paths are built from the module-level `path_data` prefix.

    Returns:
        dict with the keys 'system', 'saving_path', 'language', 'dataset'
        and 'model_prefix' (the 8k SentencePiece BPE model prefix).
    """
    return {
        'system': 'libest',
        #'path_zip': Path("cisco/sacp-python-common.zip"),
        'saving_path': path_data+ 'se-benchmarking/traceability/testbeds/processed/libest_data',
        'language': 'english',
        # The trailing comma below was missing, which made the whole cell a SyntaxError.
        'dataset' : path_data + '',
        #'model_prefix': path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_128k' #For BPE Analysis
        #'model_prefix': path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_32k'
        'model_prefix':path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_8k'
    }

SyntaxError: invalid syntax (<ipython-input-9-1d55101459cd>, line 10)

In [None]:
model_prefix = {
    'bpe8k' : path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_8k',
    'bpe32k' : path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_32k',
    'bpe128k' : path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_128k'
}

In [None]:
#params = default_params()
params = libest_params()

# Conventional Preprocessing Class

In [None]:
#export
class ConventionalPreprocessing():
    '''NLTK-based conventional preprocessing, with optional SentencePiece BPE encoding.

    The conventional pipeline applies, in order: punctuation/digit removal,
    camel-case splitting, lower-casing, stripping, `WordPunctTokenizer`
    tokenization, stop-word removal, Snowball stemming and short/long token
    filtering.

    params: dict read by the methods below. Keys used here:
        'language'     -> NLTK stop-word language (used by `stop_words`)
        'model_prefix' -> SentencePiece model path without '.model' (only when bpe=True)
        'saving_path', 'system' -> used by `SaveCorpus` / `LoadCorpus`
    bpe: when True, a SentencePiece model is loaded from params['model_prefix'].
    '''
    def __init__(self, params, bpe = False):
        self.params = params
        self._stemmer = None          # Snowball stemmer, built on first use
        self._stop_word_cache = {}    # language -> frozenset of stop words

        #If BPE provided, then preprocessing with BPE is allowed on CONV
        if bpe:
            self.sp_bpe = spm.SentencePieceProcessor()
            self.sp_bpe.load(params['model_prefix']+'.model')
        else:
            self.sp_bpe = None

    def bpe_pieces_pipeline(self, doc_list):
        '''Encode every document in `doc_list` into BPE pieces.

        Returns a list with one list of pieces per document. When no BPE
        model was loaded, logs a message and returns the empty string
        (kept for backward compatibility with existing callers).
        '''
        encoded_str = ''
        if self.sp_bpe is None:
            logging.info('Provide a BPE Model!')
        else:
            encoded_str = [self.sp_bpe.encode_as_pieces(doc) for doc in doc_list]
        return encoded_str

    def clean_punctuation(self, token):
        '''Replace every character that is not an ASCII letter or whitespace with a space.'''
        #remove terms !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~0123456789
        # `flags=` must be a keyword: the 4th positional argument of re.sub is
        # `count`, so passing re.I|re.A positionally capped the substitution
        # at 258 replacements and left the rest of the text untouched.
        return re.sub(r'[^a-zA-Z\s]', ' ', token, flags=re.I|re.A)

    def split_camel_case_token(self, token):
        '''Insert a space at every lower-case -> upper-case boundary ("uploadBom" -> "upload Bom").'''
        return re.sub('([a-z])([A-Z])', r'\1 \2', token)

    def remove_terms(self, filtered_tokens):
        '''Keep tokens of 3 to 20 characters that are not punctuation/digit runs.

        `remove_terms` is a string, so the membership test is a substring
        test against punctuation followed by '0123456789'.
        '''
        remove_terms = punctuation + '0123456789'
        return [token for token in filtered_tokens if token not in remove_terms and len(token)>2 and len(token)<21]

    def stemmer(self, filtered_tokens):
        '''Stem every token with the English Snowball stemmer.'''
        # The stemmer is owned by the instance so the class does not depend on
        # a notebook-level global being defined.
        # NOTE(review): the stemmer is always English, while stop words follow
        # params['language'] — confirm this is intended for non-English corpora.
        if getattr(self, '_stemmer', None) is None:
            self._stemmer = SnowballStemmer("english")
        return [self._stemmer.stem(token) for token in filtered_tokens ]

    def stop_words(self, filtered_tokens):
        '''Drop the NLTK stop words of params['language'] from the token list.'''
        language = self.params['language']
        # Cache the corpus as a set: it used to be reloaded and scanned as a
        # list for every single document.
        cache = self.__dict__.setdefault('_stop_word_cache', {})
        if language not in cache:
            cache[language] = frozenset(nltk.corpus.stopwords.words(language))
        stop_words = cache[language]
        return [token for token in filtered_tokens if token not in stop_words]

    def _staged_tokens(self, docs, log_prefix=None):
        '''Run the shared stage sequence over `docs` and return one token list per document.

        When `log_prefix` is given, a progress line is logged after each stage.
        '''
        def _log(stage):
            if log_prefix is not None:
                logging.info(log_prefix + ': ' + stage)

        texts = [ self.clean_punctuation(doc) for doc in docs]
        _log('clean punctuation')
        texts = [ self.split_camel_case_token(doc) for doc in texts]
        _log('camel case')
        texts = [ doc.lower() for doc in texts]
        _log('lowe case')
        texts = [ doc.strip() for doc in texts] # Leading/trailing whitespace is removed
        _log('white space removed')
        tokenizer = nltk.WordPunctTokenizer()
        tokens = [ tokenizer.tokenize(doc) for doc in texts]
        _log('WordPunctTokenizer')
        tokens = [ self.stop_words(doc) for doc in tokens] #Stop Words
        _log('Stop words')
        tokens = [ self.stemmer(doc) for doc in tokens] #Filtering Stemmings
        _log('Stemmings')
        tokens = [ self.remove_terms(doc) for doc in tokens] #Filtering remove-terms
        _log('Removed Special Terns')
        return tokens

    def basic_pipeline(self, dict_filenames):
        '''Preprocess a {filename: value} mapping.

        The first element of each value (`value[0]`) is used as the document
        text. Returns a list of (name, text) tuples where '.txt' in the name
        is replaced by '-pre.txt' and the text is the space-joined tokens.
        '''
        keys = list(dict_filenames)
        names = [key.replace('.txt', '-pre.txt') for key in keys]
        tokens = self._staged_tokens([dict_filenames[key][0] for key in keys])
        return [(name, ' '.join(doc)) for name, doc in zip(names, tokens)]

    def fromdocs_pipeline(self, docs):
        #TODO
        """Preprocess an iterable of document strings; returns one space-joined string per document."""
        filtered_tokens = self._staged_tokens(docs, log_prefix='fromtokens_pipeline')
        pre_process = [ ' '.join(doc) for doc in filtered_tokens]
        logging.info('fromtokens_pipeline END')
        return pre_process

    def frombatch_pipeline(self, batch):
        #TODO
        """Preprocess a batch of UTF-8 encoded byte strings; returns one token list per document.

        @batch: a TensorFlow Dataset Batch
        """
        decoded = [ doc.decode("utf-8") for doc in batch]
        filtered_tokens = self._staged_tokens(decoded, log_prefix='frombatch_pipeline')
        #pre_process = [ ' '.join(doc) for doc in filtered_tokens]
        logging.info('frombatch_pipeline [END]')
        return filtered_tokens

    def fromtensor_pipeline(self, ts_x):
        """Preprocess a single document string; returns the space-joined tokens.

        @ts_x: a single (already decoded) element of the tensor
        """
        #TODO
        filtered_tokens = self._staged_tokens([ts_x])[0]
        pre_process = ' '.join(filtered_tokens)
        logging.info('fromtokens_pipeline END')
        return pre_process

    def _corpus_path(self, language, timestamp):
        '''Build the CSV path "<saving_path>[<system>-<language>-<timestamp>].csv".'''
        return self.params['saving_path'] + '['+ self.params['system']  + '-' + language + '-{}].csv'.format(timestamp)

    def SaveCorpus(self, df, language='js', sep=',', mode='a'):
        '''Write `df` (with header and index) to a CSV named after the current timestamp.'''
        timestamp = datetime.timestamp(datetime.now())
        path_to_link = self._corpus_path(language, timestamp)

        df.to_csv(path_to_link, header=True, index=True, sep=sep, mode=mode)
        logging.info('Saving in...' + path_to_link)

    def LoadCorpus(self, timestamp, language='js', sep=',', mode='a'):
        '''Read back the CSV written by `SaveCorpus` for `timestamp`.

        `mode` is accepted for signature symmetry with `SaveCorpus` and is unused.
        '''
        path_to_link = self._corpus_path(language, timestamp)
        return pd.read_csv(path_to_link, header=0, index_col=0, sep=sep)
        

In [None]:
#export
def open_file(f, encoding='utf-8'):
    """Read the whole text file `f` and return its content.

    Args:
        f: path of the file to read.
        encoding: text encoding used to decode the file.

    Returns:
        The file content as a string, or None when the file cannot be read
        or decoded (best effort: the failure is printed, not raised).
    """
    try:
        #return open(filename, 'r', encoding="ISO-8859-1").read()
        # The context manager closes the handle even when read() fails.
        with open(f, 'r', encoding = encoding) as handle:
            return handle.read()
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C / SystemExit still propagate.
        print("Exception: ", sys.exc_info()[0])
        return None

In [None]:
#export
def get_files(system, ends, path=Path("cisco/CSB-CICDPipelineEdition-master/")):
    """Recursively collect the files under `path` whose names end with `ends`.

    Args:
        system: system name; currently unused, kept for existing callers.
        ends: filename suffix to match (e.g. '.py').
        path: root directory to search. Defaults to the previously
            hard-coded Cisco repository folder.

    Returns:
        DataFrame with columns 'names' (full path), 'filenames' (base name)
        and 'content' (text returned by `open_file`, None when unreadable).
    """
    root = Path(path)
    names = list(root.glob('**/*' + ends))
    filenames = [(filename, os.path.basename(filename), open_file(filename) ) for filename in names]
    return pd.DataFrame( filenames ,columns = ['names','filenames','content'])

## 1. Processing Software Corpora from GitHub
> Cisco Repositories

In [None]:
path = Path("cisco/CSB-CICDPipelineEdition-master/")
path

PosixPath('cisco/CSB-CICDPipelineEdition-master')

In [None]:
#hide
def sacp_params(model_prefix = path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_8k'):
    """Return the preprocessing parameters for the sacp-python-common system.

    model_prefix: SentencePiece model prefix used for the BPE analysis.
    """
    # NOTE(review): 'path_zip' is absolute ('/tf/...') while 'dataset' is
    # relative ('tf/...') — confirm both are intended.
    settings = dict(
        system='sacp-python-common',
        path_zip=Path("/tf/data/cisco/sacp_data/sacp-python-common.zip"),
        dataset='tf/data/cisco/sacp_data/',
        saving_path='../../'+'data/cisco/sacp_data/',
        language='english',
        model_prefix=model_prefix, #For BPE Analysis
    )
    return settings

In [None]:
params = sacp_params()
params

{'system': 'sacp-python-common',
 'path_zip': PosixPath('/tf/data/cisco/sacp_data/sacp-python-common.zip'),
 'dataset': 'tf/data/cisco/sacp_data/',
 'saving_path': '../../data/cisco/sacp_data/',
 'language': 'english',
 'model_prefix': '../dvc-ds4se/models/bpe/sentencepiece/wiki_py_java_bpe_8k'}

In [None]:
prep = ConventionalPreprocessing(params, bpe = True)

In [None]:
########################################################

In [None]:
#explore
archive = ZipFile(params['path_zip'], 'r')
files = archive.namelist()

In [None]:
#explore
files = [name for name in archive.namelist() if name.endswith('.py')] #recursively finds files

In [None]:
len(files)

97

In [None]:
#export
def get_file_zip(params, ends):
    """Read every archive member whose name ends with `ends` from params['path_zip'].

    Args:
        params: dict providing 'path_zip', the path of the zip archive.
        ends: member-name suffix to match (e.g. '.py').

    Returns:
        DataFrame with columns 'names' (member path inside the archive),
        'filenames' (base name) and 'content' (raw bytes of the member).
    """
    # Imported locally so this exported function does not depend on a
    # non-exported notebook cell for `ZipFile`.
    from zipfile import ZipFile

    # The context manager closes the archive; it used to stay open.
    with ZipFile( params['path_zip'], 'r') as archive:
        names = [name for name in archive.namelist() if name.endswith(ends)]
        filenames = [(filename, os.path.basename(filename), archive.read(filename) ) for filename in names]
    return pd.DataFrame( filenames ,columns = ['names','filenames','content'])

In [None]:
#tst
df_sampling = get_file_zip(params = params, ends='.py')

In [None]:
#tst
df_sampling.head()

Unnamed: 0,names,filenames,content
0,sacp-python-common/sacp_python_common/__init__.py,__init__.py,b''
1,sacp-python-common/sacp_python_common/auth_uti...,auth_utility.py,"b'""""""\r\nCreated on Aug 12, 2019\r\n\r\n@autho..."
2,sacp-python-common/sacp_python_common/bandit/_...,__init__.py,b''
3,sacp-python-common/sacp_python_common/bandit/b...,bandit.py,b'#!/usr/bin/env python3\r\nimport argparse\r\...
4,sacp-python-common/sacp_python_common/bandit/b...,banditReport.py,b'import json\r\n\r\nfrom jinja2 import Enviro...


In [None]:
df_sampling[df_sampling['names'].str.contains('auth_utility.py')]

Unnamed: 0,names,filenames,content
1,sacp-python-common/sacp_python_common/auth_uti...,auth_utility.py,"b'""""""\r\nCreated on Aug 12, 2019\r\n\r\n@autho..."
79,sacp-python-common/test/python/test_auth_utili...,test_auth_utility.py,b'import unittest\r\nimport mock\r\nimport req...


In [None]:
#tst
prep.SaveCorpus(df_sampling, language='py')

In [None]:
#tst
df_sampling = prep.LoadCorpus(1595859280.080238, language='py')
df_sampling.head()

### Creating standard dataframe for issues and pull-request (cisco)

In [None]:
pr_data = pd.read_csv('cisco/sacp-pullrequest-01.csv', sep = '~', header = 0, encoding='UTF-8')

In [None]:
pr_data

Merging all the system artifacts in one file

In [None]:
pr_all_sys = pr_data.copy()
pr_all_sys = pr_all_sys.replace(np.nan, ' ', regex=True)

In [None]:
pr_all_sys['text'] = pr_all_sys['title'].astype(str) + pr_all_sys['labels'].astype(str) + pr_all_sys['body'].astype(str)#merging tree columns for the text
pr_all_sys = pr_all_sys[['id-pr','text']]
pr_all_sys = pr_all_sys.rename(columns={'id-pr': 'ids'})
pr_all_sys['type'] = 'pr' #<------- File Type Standard for Target or Source
pr_all_sys.head()

In [None]:
import ast

pr_all_code = df_sampling.copy()
#pr_all_code['text'] =  pr_all_code.apply(lambda row: row['content'].decode("utf-8"), axis = 1)
# 'content' holds the textual repr of a bytes literal here (presumably because
# df_sampling was reloaded from CSV — verify). literal_eval parses that literal
# without executing arbitrary code, unlike eval.
pr_all_code['content'] =  pr_all_code['content'].apply(ast.literal_eval)
pr_all_code['text'] =  pr_all_code['content'].apply(lambda x: x.decode("utf-8"))
pr_all_code = pr_all_code[['names','text']]
pr_all_code = pr_all_code.rename(columns={'names': 'ids'})
pr_all_code['type'] = 'py'

In [None]:
pr_all_sys = pd.concat([pr_all_sys, pr_all_code])

In [None]:
pr_all_sys['conv'] = prep.fromdocs_pipeline( pr_all_sys['text'].values ) #Conventional Preprocessing

In [None]:
pr_all_sys

In [None]:
prep.SaveCorpus(pr_all_sys, language='all-corpus', sep='~')

Loading for Preprocessing

In [None]:
df_sacp = prep.LoadCorpus(1595953540.866044, language='all-corpus', sep='~')

In [None]:
df_sacp.head()

Unnamed: 0,ids,text,type,conv
0,295,Production Merge * Feed release name through t...,pr,product merg feed releas name upload bom allow...
1,294,Add test fields for DARE push * Added test dat...,pr,add test field dare push test data json sent d...
2,293,"Allow passing a release to uploadBom by name, ...",pr,allow pass releas upload bom name rather chang...
3,287,Allow append images #363 - Changed how image n...,pr,allow append imag chang imag name creat send c...
4,274,Move docker/blackduck test to slave 4,pr,move docker blackduck test slave


In [None]:
df_sacp.dropna( inplace = True ) #empty files are not considered

In [None]:
#Iterating All Possible BPEs configs
for bpe in model_prefix.keys():
    mpr = model_prefix[bpe]
    prep = ConventionalPreprocessing(sacp_params(model_prefix = mpr), bpe = True)
    df_sacp[bpe] = prep.bpe_pieces_pipeline( df_sacp['text'].values ) #BPE Preprocessing

In [None]:
df_sacp.head()

Unnamed: 0,ids,text,type,conv,bpe8k,bpe32k,bpe128k
0,295,Production Merge * Feed release name through t...,pr,product merg feed releas name upload bom allow...,"[▁production, ▁mer, ge, ▁*, ▁feed, ▁release, ▁...","[▁production, ▁merge, ▁*, ▁feed, ▁release, ▁na...","[▁production, ▁merge, ▁*, ▁feed, ▁release, ▁na..."
1,294,Add test fields for DARE push * Added test dat...,pr,add test field dare push test data json sent d...,"[▁add, ▁test, ▁fields, ▁for, ▁d, are, ▁p, ush,...","[▁add, ▁test, ▁fields, ▁for, ▁dare, ▁push, ▁*,...","[▁add, ▁test, ▁fields, ▁for, ▁dare, ▁push, ▁*,..."
2,293,"Allow passing a release to uploadBom by name, ...",pr,allow pass releas upload bom name rather chang...,"[▁allow, ▁passing, ▁a, ▁release, ▁to, ▁up, loa...","[▁allow, ▁passing, ▁a, ▁release, ▁to, ▁up, loa...","[▁allow, ▁passing, ▁a, ▁release, ▁to, ▁upload,..."
3,287,Allow append images #363 - Changed how image n...,pr,allow append imag chang imag name creat send c...,"[▁allow, ▁app, end, ▁images, ▁#, 3, 63, ▁-, ▁c...","[▁allow, ▁append, ▁images, ▁#3, 63, ▁-, ▁chang...","[▁allow, ▁append, ▁images, ▁#3, 63, ▁-, ▁chang..."
4,274,Move docker/blackduck test to slave 4,pr,move docker blackduck test slave,"[▁move, ▁d, ock, er, /, black, d, uck, ▁test, ...","[▁move, ▁dock, er, /, black, d, uck, ▁test, ▁t...","[▁move, ▁docker, /, black, duck, ▁test, ▁to, ▁..."


In [None]:
prep.SaveCorpus(df_sacp, language='all-corpus', sep='~')

2020-12-29 06:52:58,801 : INFO : Saving in...../../data/cisco/sacp_data/[sacp-python-common-all-corpus-1609224778.517111].csv


Old code below [be careful]

In [None]:
#debugging
path = Path("cisco/CSB-CICDPipelineEdition-master/")
names = [entry for entry in path.glob('**/*.py')]

In [None]:
#hide
# Looking for a file with encoding problems: every file in `names` is read
# as ISO-8859-1 and the files that cannot be read are reported.
temp_list=[]
for filename in names:
    print(filename)
    try:
        # Context manager so the handles are closed instead of leaked.
        with open(filename, 'r', encoding="ISO-8859-1") as handle:
            temp_list.append(handle.read())
    except FileNotFoundError as err:
        print('lookattheerr' + str(err))
    except Exception:
        # Not a bare except, so interrupting the kernel still works.
        print('bydefault')

In [None]:
len(temp_list)

In [None]:
df_java = get_files(system = params['system'], ends='.py')

In [None]:
df_java.head()

In [None]:
df_java.shape

In [None]:
# SaveCorpus is only defined as a ConventionalPreprocessing method in this
# notebook; the bare call raised NameError.
# NOTE(review): assumes the `prep` instance is the intended saver — confirm.
prep.SaveCorpus(df_java, language='py')

In [None]:
# LoadCorpus is only defined as a ConventionalPreprocessing method in this
# notebook; the bare call raised NameError.
# NOTE(review): assumes the `prep` instance is the intended loader — confirm.
df_test = prep.LoadCorpus(1592266849.29903,language='py')

In [None]:
df_test.head()

# 2. Processing Software Corpora from CodeSearchNet

In [None]:
#CodeSearchNet Parameters
params = {
    'system':'codesearchnet',
    'saving_path': 'test_data/',
    'language': 'english'
}

In [None]:
#[step1] Create Preprocesser <----------
preprocess_pipeline = ConventionalPreprocessing(params= params)

In [None]:
python_files = sorted(Path('codesearch/python/').glob('**/*.gz'))
java_files = sorted(Path('codesearch/java/').glob('**/*.gz'))

In [None]:
java_files

In [None]:
columns_long_list = ['repo', 'path', 'url', 'code', 
                     'code_tokens', 'docstring', 'docstring_tokens', 
                     'language', 'partition']

columns_short_list = ['code_tokens', 'docstring_tokens', 
                      'language', 'partition']

def jsonl_list_to_dataframe(file_list, columns=columns_long_list):
    """Load a list of jsonl.gz files into a pandas DataFrame.

    Args:
        file_list: iterable of paths to gzip-compressed JSON-lines files.
        columns: columns to keep from every file.

    Returns:
        The per-file frames concatenated in order (each file keeps its own
        row index). An empty `file_list` yields an empty DataFrame with the
        requested columns instead of raising from `pd.concat`.
    """
    frames = [pd.read_json(f, 
                           orient='records', 
                           compression='gzip',
                           lines=True)[columns] 
              for f in file_list]
    if not frames:
        return pd.DataFrame(columns=list(columns))
    return pd.concat(frames, sort=False)

In [None]:
python_searchnet_df = jsonl_list_to_dataframe(python_files)

In [None]:
java_searchnet_df = jsonl_list_to_dataframe(java_files)

In [None]:
java_searchnet_df.head()

In [None]:
java_searchnet_df.shape

In [None]:
# The Python training split must come from the Python dataframe; it was
# previously filtered from java_searchnet_df, duplicating `javatrain`.
pytrain = python_searchnet_df[python_searchnet_df.partition.eq('train')].copy()

In [None]:
javatrain = java_searchnet_df[java_searchnet_df.partition.eq('train')].copy()

In [None]:
pytrain.shape

In [None]:
javatrain.shape

In [None]:
preprocess_pipeline.SaveCorpus(javatrain, language='java') #Saving codesearchnet only training samples

### Testing Preprocessing for CodeSearchNet

In [None]:
java_searchnet_df = preprocess_pipeline.LoadCorpus(1592409554.097457, language='java')

In [None]:
java_searchnet_df.head()

In [None]:
java_searchnet_df['code'].values[0]

In [None]:
preprocessed_df = preprocess_pipeline.fromdocs_pipeline(java_searchnet_df['code'].values)

In [None]:
preprocessed_df[0]

In [None]:
df_preprocessed = java_searchnet_df.copy()
df_preprocessed['preprocessed'] = preprocessed_df

In [None]:
df_preprocessed.head()

In [None]:
# SaveCorpus is a method of the preprocessing object; the bare call raised NameError.
preprocess_pipeline.SaveCorpus(df_preprocessed, language='preprocessed-java') #Saving codesearchnet

# 3. Processing from Wikipedia 
>Inspired by [KD](https://www.kdnuggets.com/2017/11/building-wikipedia-text-corpus-nlp.html)
>
>Dump Wiki File [here](https://dumps.wikimedia.org/enwiki/latest/)

In [None]:
#export
import tensorflow_datasets as tfds

In [None]:
#Config description: Wikipedia dataset for en, parsed from 20190301 dump.
#Download size: 15.72 GiB
#Dataset size: Unknown size
#Examples: train 5,824596
dataset_name = 'wikipedia/20200301.en' #'wikipedia/20190301.en'

In [None]:
#Download the dataset and create a tf.data.Dataset
ds, info = tfds.load(dataset_name, split='train', with_info=True)

In [None]:
#Accessing Metadata with DatasetInfo
print(info.splits['train'].num_examples)

In [None]:
info

In [None]:
dataset_wiki = []
#dataset_wiki = ds.map(lambda ex_text, ex_title: preprocess_pipeline.fromtensor_pipeline( ex_text.decode("utf-8") ))

In [None]:
dataset_wiki = [preprocess_pipeline.fromtensor_pipeline( ex['text'].decode("utf-8") ) for ex in  tfds.as_numpy(ds)]

In [None]:
df_dataset_wiki = pd.DataFrame( dataset_wiki ,columns = ['text'])

In [None]:
# Build your input pipeline
ds = ds.batch(2)

In [None]:
# Get Numpy Arrays
for ex in tfds.as_numpy(ds):
    #print( preprocess_pipeline.fromtensor_pipeline( ex['text'].decode("utf-8") ) )
    #print("NEXT!!!")
    #print(ex['text'].decode("utf-8"))
    #print(ex)
    #np_text, np_title = ex['text'], ex['title']
    print(preprocess_pipeline.frombatch_pipeline( ex['text'] ))

In [None]:
ds = ds.unbatch()

In [None]:
# NOTE(review): `np_text` is never assigned in the notebook as shown (its only
# mention is a commented-out line in the batch loop), so this cell is expected
# to raise NameError on a fresh kernel — confirm and remove or re-create it.
np_text[90]

In [None]:
# NOTE(review): `np_text` is never assigned in the notebook as shown (its only
# mention is a commented-out line in the batch loop), so this cell is expected
# to raise NameError on a fresh kernel — confirm and remove or re-create it.
len(np_text)

In [None]:
for ex in ds.take(4):
    print(ex)

In [None]:
ds

In [None]:
params = {
    'system':'wiki',
    'saving_path': 'test_data/',
    'language': 'english'
}

In [None]:
import functools

# 4. Processing from Semeru Format and Converting into Mappings
> @danaderp July 29'20

In [None]:
import functools 

In [None]:
#hide
semeru_format =  path_data + 'se-benchmarking/traceability/datasets/formatted/semeru_format/'

In [None]:
#Setting Up SemeruFormat
def libest_params(model_prefix = path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_8k'):
    """Return the preprocessing parameters for LibEST in Semeru format.

    model_prefix: SentencePiece model prefix stored under 'model_prefix'.
    """
    # One folder per artifact type: requirements, source code and tests.
    artifact_dirs = {
        'req':pathlib.Path( semeru_format + 'LibEST_semeru_format/requirements'),
        'src':pathlib.Path( semeru_format + 'LibEST_semeru_format/source_code'),
        'tc':pathlib.Path( semeru_format + 'LibEST_semeru_format/test')
    }
    return dict(
        system='libest',
        saving_path=path_data+ 'se-benchmarking/traceability/testbeds/processed/',
        language='english',
        dataset=artifact_dirs,
        ends=['.txt','.c','.h'],
        model_prefix=model_prefix,
        encoding='utf-8',
    )

In [None]:
def ebt_params(model_prefix = path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_8k'):
    """Return the preprocessing parameters for EBT in Semeru format.

    model_prefix: SentencePiece model prefix stored under 'model_prefix'.
    """
    # One folder per artifact type: requirements, test cases and source code.
    artifact_dirs = {
        'req':pathlib.Path( semeru_format + 'EBT_semeru_format/requirements'),
        'tc': pathlib.Path( semeru_format + 'EBT_semeru_format/test_cases'),
        'src':pathlib.Path( semeru_format + 'EBT_semeru_format/source_code')
    }
    return dict(
        system='ebt',
        saving_path=path_data+ 'se-benchmarking/traceability/testbeds/processed/',
        language='english',
        dataset=artifact_dirs,
        ends=['.txt','.java','.c','.h','.TXT'],
        model_prefix=model_prefix,
        #encoding='ISO-8859-1'
        encoding='utf-8', #english encoding
    )

In [None]:
def itrust_params(model_prefix = path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_8k'):
    """Return the preprocessing parameters for iTrust in Semeru format.

    model_prefix: SentencePiece model prefix stored under 'model_prefix'.
    """
    # One folder per artifact type: use cases and source code.
    artifact_dirs = {
        'uc':pathlib.Path( semeru_format + 'iTrust_semeru_format/use_cases'),
        'src':pathlib.Path( semeru_format + 'iTrust_semeru_format/source_code')
    }
    return dict(
        system='itrust',
        saving_path=path_data+ 'se-benchmarking/traceability/testbeds/processed/',
        language='english',
        dataset=artifact_dirs,
        ends=['.txt','.java','.c','.h','.TXT','.jsp'],
        model_prefix=model_prefix,
        encoding='ISO-8859-1',
        #encoding='utf-8' #english encoding
    )

In [None]:
def smos_params(model_prefix = path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_8k'):
    """Return the preprocessing parameters for SMOS in Semeru format.

    model_prefix: SentencePiece model prefix stored under 'model_prefix'.
    """
    # One folder per artifact type: use cases and source code.
    artifact_dirs = {
        'uc':pathlib.Path( semeru_format + 'SMOS_semeru_format/use_cases'),
        'src':pathlib.Path( semeru_format + 'SMOS_semeru_format/source_code')
    }
    return dict(
        system='smos',
        saving_path=path_data+ 'se-benchmarking/traceability/testbeds/processed/',
        language='italian',
        dataset=artifact_dirs,
        ends=['.txt','.java','.c','.h','.TXT','.jsp'],
        model_prefix=model_prefix,
        encoding='ISO-8859-1',
        #encoding='utf-8' #english encoding
    )

In [None]:
#parameters = libest_params(model_prefix=model_prefix['bpe8k'])
parameters = smos_params()
parameters

{'system': 'smos',
 'saving_path': '../dvc-ds4se/se-benchmarking/traceability/testbeds/processed/',
 'language': 'italian',
 'dataset': {'uc': PosixPath('../dvc-ds4se/se-benchmarking/traceability/datasets/formatted/semeru_format/SMOS_semeru_format/use_cases'),
  'src': PosixPath('../dvc-ds4se/se-benchmarking/traceability/datasets/formatted/semeru_format/SMOS_semeru_format/source_code')},
 'ends': ['.txt', '.java', '.c', '.h', '.TXT', '.jsp'],
 'model_prefix': '../dvc-ds4se/models/bpe/sentencepiece/wiki_py_java_bpe_8k',
 'encoding': 'ISO-8859-1'}

In [None]:
parameters['dataset'].keys()

dict_keys(['uc', 'src'])

In [None]:
logging.info("artifacts: "  +  str(parameters['dataset'].keys()) )
logging.info("artifacts: "  +  str(parameters['dataset']['uc'] ) )

2021-01-11 23:50:05,499 : INFO : artifacts: dict_keys(['uc', 'src'])
2021-01-11 23:50:05,500 : INFO : artifacts: ../dvc-ds4se/se-benchmarking/traceability/datasets/formatted/semeru_format/SMOS_semeru_format/use_cases


In [None]:
lst = [entry for entry in parameters['dataset']['uc'].glob('**/*' + ".txt" )]
lst[0]

PosixPath('../dvc-ds4se/se-benchmarking/traceability/datasets/formatted/semeru_format/SMOS_semeru_format/use_cases/SMOS37.txt')

In [None]:
tmp = [(filename, os.path.basename(filename), open_file(filename, encoding=parameters['encoding']) ) for filename in lst]

In [None]:
tmp[0]

(PosixPath('../dvc-ds4se/se-benchmarking/traceability/datasets/formatted/semeru_format/SMOS_semeru_format/use_cases/SMOS37.txt'),
 'SMOS37.txt',
 "Nome: InserisciNota\nAttori: Amministratore\nDescrizione: Inserimento Note Disciplinari\nPrecondizioni:\nâ\x80¢ â\x80¢ â\x80¢ â\x80¢\nL'utente deve essere loggato al sistema come Amministratore L'utente ha svolto il caso d'uso â\x80\x9cVisualizzaDettagliSingoloRegistroâ\x80\x9d L'utente ha giÃ\xa0 svolto il caso d'uso â\x80\x9cVisualizzaElencoNoteâ\x80\x9d L'utente clicca sul pulsante â\x80\x9cNuova notaâ\x80\x9d\nSequenza degli eventi\nUtente\nSistema\n2. Compila il form 3. Clicca su â\x80\x9cSalvaâ\x80\x9d\n1. Mostra un form con i campi della nota (studente, data, docente , descrizione).\n4. Salva la nota e invia una notifica via e-mail al genitore\nPostcondizioni:\nâ\x80¢\nâ\x80¢ â\x80¢\nI dati della nota sono stati inseriti nel sistema, ed il sistema ha inviato la notifica ai genitori. Il sistema ritorna alla schermata del registro. Lâ\x

In [None]:
#export
def loading_artifacts( params ):
    """Load every artifact file described by `params` into one DataFrame.

    Args:
        params: dict providing
            'dataset'  -> {artifact type: pathlib.Path of its folder}
            'ends'     -> list of filename suffixes to collect
            'encoding' -> text encoding passed to `open_file`

    Returns:
        DataFrame with columns 'ids' (file path), 'filenames' (base name),
        'text' (file content, None when unreadable) and 'type' (artifact
        type), in dataset order and then suffix order.
    """
    #Creating the mappings
    rows = []
    # Read from the `params` argument: the body previously used the notebook
    # global `parameters`, so the argument was silently ignored.
    for art, art_path in params['dataset'].items():
        # Flattened with a nested comprehension, so `functools` is not needed.
        sys_names = [entry for ex in params['ends'] for entry in art_path.glob('**/*' + ex )]
        logging.info("artifacts: "  +  str( len(sys_names) ) )
        rows.extend(
            (filename, os.path.basename(filename), open_file(filename, encoding=params['encoding']), art)
            for filename in sys_names
        )

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame( rows ,columns = ['ids','filenames','text','type'])

In [None]:
df_test = loading_artifacts( params = parameters )

2021-01-11 23:50:14,606 : INFO : artifacts: 67
2021-01-11 23:50:14,619 : INFO : artifacts: 100


In [None]:
df_test[df_test['type']=='src'].head()

Unnamed: 0,ids,filenames,text,type
67,../dvc-ds4se/se-benchmarking/traceability/data...,ManagerClassroom.java,package smos.storage;\n\n\nimport java.sql.Con...,src
68,../dvc-ds4se/se-benchmarking/traceability/data...,ServletUpdateUser.java,package smos.application.userManagement;\n\nim...,src
69,../dvc-ds4se/se-benchmarking/traceability/data...,ServletComputateStatistics.java,package smos.application.registerManagement;\n...,src
70,../dvc-ds4se/se-benchmarking/traceability/data...,ServletProva.java,package smos.application.teachingManagement;\n...,src
71,../dvc-ds4se/se-benchmarking/traceability/data...,ServletInitialize.java,package smos.application;\n\nimport javax.serv...,src


In [None]:
#export
def processing_artifacts( model_prefix, df_sys_all, funct_params ):
    """Add BPE and conventional preprocessing columns to a copy of `df_sys_all`.

    `model_prefix` maps a column name to a BPE model prefix. For each entry,
    `funct_params(prefix)` builds the parameters of a `ConventionalPreprocessing`
    object (created with `bpe = True`) whose `bpe_pieces_pipeline` output over
    the `text` column is stored under that column name. The `conv` column is
    then produced by `fromdocs_pipeline` of the last object created.

    Returns the tuple `(dataframe, prep)`, where `prep` is that last
    preprocessing object. The input dataframe is not modified.

    Raises ValueError if `model_prefix` is empty, because no preprocessing
    object would exist for the conventional pipeline.
    """
    if not model_prefix:
        raise ValueError("model_prefix must contain at least one BPE model prefix")

    df_sys_all = df_sys_all.copy()
    prep = None
    for bpe, prefix in model_prefix.items(): #BPE Preprocessing
        prep = ConventionalPreprocessing( funct_params( prefix ) , bpe = True) #Creating the Preprocessing Object
        df_sys_all[ bpe ] = prep.bpe_pieces_pipeline( df_sys_all['text'].values )

    df_sys_all['conv'] = prep.fromdocs_pipeline( df_sys_all['text'].values ) #Conventional Preprocessing
    return df_sys_all, prep

In [None]:
df_test_sys,r_prep = processing_artifacts( model_prefix = model_prefix, 
                                          df_sys_all = df_test, 
                                          funct_params = smos_params #itrust_params#ebt_params
                                         )

2021-01-11 23:51:45,591 : INFO : fromtokens_pipeline: clean punctuation
2021-01-11 23:51:45,627 : INFO : fromtokens_pipeline: camel case
2021-01-11 23:51:45,630 : INFO : fromtokens_pipeline: lowe case
2021-01-11 23:51:45,631 : INFO : fromtokens_pipeline: white space removed
2021-01-11 23:51:45,663 : INFO : fromtokens_pipeline: WordPunctTokenizer
2021-01-11 23:51:45,922 : INFO : fromtokens_pipeline: Stop words
2021-01-11 23:51:46,655 : INFO : fromtokens_pipeline: Stemmings
2021-01-11 23:51:46,670 : INFO : fromtokens_pipeline: Removed Special Terns
2021-01-11 23:51:46,672 : INFO : fromtokens_pipeline END


In [None]:
df_test_sys.head()

Unnamed: 0,ids,filenames,text,type,bpe8k,bpe32k,bpe128k,conv
0,../dvc-ds4se/se-benchmarking/traceability/data...,SMOS37.txt,Nome: InserisciNota\nAttori: Amministratore\nD...,uc,"[▁n, ome, :, ▁ins, er, isc, in, ota, \n, att, ...","[▁n, ome, :, ▁ins, er, isc, in, ota, \n, att, ...","[▁nome, :, ▁ins, er, isc, in, ota, \n, att, or...",nome inserisci nota attori amministrator descr...
1,../dvc-ds4se/se-benchmarking/traceability/data...,SMOS16.txt,Nome: EliminaClasse\nAttori: Amministratore\nD...,uc,"[▁n, ome, :, ▁elimin, ac, las, se, \n, att, or...","[▁n, ome, :, ▁elimin, ac, las, se, \n, att, or...","[▁nome, :, ▁elimin, ac, lasse, \n, att, ori, :...",nome elimina class attori amministrator descri...
2,../dvc-ds4se/se-benchmarking/traceability/data...,SMOS24.txt,Nome:VisualizzaDettagliInsegnamento\nAttori: A...,uc,"[▁n, ome, :, vis, ual, iz, z, ad, ett, ag, li,...","[▁n, ome, :, vis, ual, izz, ad, ett, ag, li, i...","[▁nome, :, visual, izz, ad, ett, agli, in, se,...",nome visualizza dettag insegnamento attori amm...
3,../dvc-ds4se/se-benchmarking/traceability/data...,SMOS36.txt,Nome: InserisciGiustifica\nAttori: Amministrat...,uc,"[▁n, ome, :, ▁ins, er, isc, ig, i, ust, ific, ...","[▁n, ome, :, ▁ins, er, isc, igi, ust, ific, a,...","[▁nome, :, ▁ins, er, isc, igi, ust, ific, a, \...",nome inserisci giustifica attori amministrator...
4,../dvc-ds4se/se-benchmarking/traceability/data...,SMOS26.txt,Nome: EliminaInsegnamento\nAttori: Amministrat...,uc,"[▁n, ome, :, ▁elimin, ain, se, gn, ament, o, \...","[▁n, ome, :, ▁elimin, ain, se, gn, amento, \n,...","[▁nome, :, ▁elimin, ain, se, gn, amento, \n, a...",nome elimina insegnamento attori amministrator...


In [None]:
df_test_sys['ids'].values[0]

PosixPath('../dvc-ds4se/se-benchmarking/traceability/datasets/formatted/semeru_format/SMOS_semeru_format/use_cases/SMOS37.txt')

In [None]:
df_test_sys[df_test_sys['filenames'].str.contains('.java', regex=False)]

Unnamed: 0,ids,filenames,text,type,bpe8k,bpe32k,bpe128k,conv
67,../dvc-ds4se/se-benchmarking/traceability/data...,ManagerClassroom.java,package smos.storage;\n\n\nimport java.sql.Con...,src,"[▁pack, age, ▁sm, os, ., st, or, age, ;, \n\n\...","[▁package, ▁sm, os, ., stor, age, ;, \n\n\n, i...","[▁package, ▁sm, os, ., storage, ;, \n\n\n, imp...",packag smos storag import java sql connect imp...
68,../dvc-ds4se/se-benchmarking/traceability/data...,ServletUpdateUser.java,package smos.application.userManagement;\n\nim...,src,"[▁pack, age, ▁sm, os, ., ap, pl, ication, ., u...","[▁package, ▁sm, os, ., appl, ication, ., us, e...","[▁package, ▁sm, os, ., application, ., us, erm...",packag smos applic user manag import smos envi...
69,../dvc-ds4se/se-benchmarking/traceability/data...,ServletComputateStatistics.java,package smos.application.registerManagement;\n...,src,"[▁pack, age, ▁sm, os, ., ap, pl, ication, ., r...","[▁package, ▁sm, os, ., appl, ication, ., reg, ...","[▁package, ▁sm, os, ., application, ., registe...",packag smos applic regist manag import java io...
70,../dvc-ds4se/se-benchmarking/traceability/data...,ServletProva.java,package smos.application.teachingManagement;\n...,src,"[▁pack, age, ▁sm, os, ., ap, pl, ication, ., t...","[▁package, ▁sm, os, ., appl, ication, ., te, a...","[▁package, ▁sm, os, ., application, ., teachin...",packag smos applic teach manag import javax se...
71,../dvc-ds4se/se-benchmarking/traceability/data...,ServletInitialize.java,package smos.application;\n\nimport javax.serv...,src,"[▁pack, age, ▁sm, os, ., ap, pl, ication, ;, \...","[▁package, ▁sm, os, ., appl, ication, ;, \n\n,...","[▁package, ▁sm, os, ., application, ;, \n\n, i...",packag smos applic import javax servlet servle...
...,...,...,...,...,...,...,...,...
162,../dvc-ds4se/se-benchmarking/traceability/data...,ServletReportTeachings.java,package smos.application.userManagement;\n\nim...,src,"[▁pack, age, ▁sm, os, ., ap, pl, ication, ., u...","[▁package, ▁sm, os, ., appl, ication, ., us, e...","[▁package, ▁sm, os, ., application, ., us, erm...",packag smos applic user manag import smos envi...
163,../dvc-ds4se/se-benchmarking/traceability/data...,ServletAssignRole.java,package smos.application.userManagement;\n\nim...,src,"[▁pack, age, ▁sm, os, ., ap, pl, ication, ., u...","[▁package, ▁sm, os, ., appl, ication, ., us, e...","[▁package, ▁sm, os, ., application, ., us, erm...",packag smos applic user manag import smos envi...
164,../dvc-ds4se/se-benchmarking/traceability/data...,display.java,\n\n/****************** displaytag stylesheet ...,src,"[▁, \n\n, /, *, *, *, *, *, *, *, *, *, *, *, ...","[▁, \n\n, /, *, *, *, *, *, *, *, *, *, *, *, ...","[▁, \n\n, /, **, **, **, **, **, **, **, **, *...",displaytag stylesheet tabl datat border color ...
165,../dvc-ds4se/se-benchmarking/traceability/data...,ServletInsertReport.java,package smos.application.reportManagement;\n\n...,src,"[▁pack, age, ▁sm, os, ., ap, pl, ication, ., r...","[▁package, ▁sm, os, ., appl, ication, ., re, p...","[▁package, ▁sm, os, ., application, ., report,...",packag smos applic report manag import smos en...


In [None]:
r_prep.SaveCorpus(df_test_sys, language='all-corpus', sep='~')

2021-01-11 23:46:32,071 : INFO : Saving in...../dvc-ds4se/se-benchmarking/traceability/testbeds/processed/[itrust-all-corpus-1610408791.737875].csv


In [None]:
r_prep.LoadCorpus(1609221582.171744,language='all-corpus', sep='~')

### Pre-processing from Semeru Format

In [None]:
#Special Case EBT To Create Separate Files [Only one implementation]

In [None]:
#Canonical EBT
def ebt_params(model_prefix = path_data+'models/bpe/sentencepiece/wiki_py_java_bpe_8k'):
    """Return the configuration dictionary for the EBT system.

    `model_prefix` is stored unchanged under the 'model_prefix' key. The
    'dataset' entry maps each artifact type to a path built from the
    module-level `semeru_format` prefix.
    """
    artifact_locations = {
        'req':pathlib.Path( semeru_format + 'EBT_semeru_format/requirements.txt'),
        'tc': pathlib.Path( semeru_format + 'EBT_semeru_format/test_cases.txt'),
        'src':pathlib.Path( semeru_format + 'EBT_semeru_format/source_code')
    }
    accepted_suffixes = ['.txt','.java','.c','.h','.TXT']

    config = {}
    config['system'] = 'ebt'
    config['saving_path'] = path_data+ 'se-benchmarking/traceability/testbeds/processed/'
    config['language'] = 'english'
    config['dataset'] = artifact_locations
    config['ends'] = accepted_suffixes
    config['model_prefix'] = model_prefix
    # Alternative encoding kept for reference: 'ISO-8859-1'
    config['encoding'] = 'utf-8' #english encoding
    return config

In [None]:
params = ebt_params()

In [None]:
params['dataset']['req']

In [None]:
# Read the EBT requirements file as a fixed-width table without a header row.
# NOTE(review): sep="/t" looks like a typo for the tab escape "\t"; the cell
# below splits the sibling test-case file on "\t". Confirm whether read_fwf
# honours `sep` at all, or whether a tab-separated read_csv was intended.
pd_ebt = pd.read_fwf(params['dataset']['req'],header=None,sep="/t")

In [None]:
# Split the tab-separated test-case file into one text file per test case:
# the first field names the output file, the second field is its content.
with open(params['dataset']['tc'], encoding=params['encoding']) as fp:
    for line in fp:
        fields = line.split("\t")
        print(fields)
        if len(fields) < 2:
            # Blank or malformed line: there is no id/text pair to write.
            continue
        tc_path = semeru_format + 'EBT_semeru_format/test_cases/'+fields[0]+'.txt'
        with open(tc_path, "w", encoding=params['encoding']) as wp:
            wp.write(fields[1])

In [None]:
dict_filenames = {}

In [None]:
#creating the function
#base_dir = os.path.abspath(os.getcwd())
test_dir = pathlib.Path('test_data/LibEST_semeru_format/test')
#path = os.path.join(base_dir, test_dir)

In [None]:
# Load every .txt file directly under test_dir, keyed by the path glob returns.
txt_pattern = os.path.join(test_dir, '*.txt')
for filename in glob.glob(txt_pattern):
    with open(filename, 'r') as f: # read-only text mode
        contents = f.read()
    # Each entry is a one-element list holding the whole file content.
    dict_filenames[filename] = [contents]

In [None]:
[os.path.join(test_dir,filename) for filename in os.listdir(test_dir)]

In [None]:
#reading all files in a folder
for filename in (os.path.join(test_dir, entry) for entry in os.listdir(test_dir)):
    # os.listdir also returns sub-directories (e.g. Jupyter's .ipynb_checkpoints),
    # which open() cannot read; only regular files are loaded.
    if not os.path.isfile(filename):
        continue
    with open(filename, 'r') as f: # open in readonly mode
        dict_filenames[filename] = [f.read()]

In [None]:
os.path.basename('test_data/LibEST_semeru_format/requirements/RQ17.txt').replace('.txt', '-pre.txt')

In [None]:
dict_filenames

In [None]:
# Flags must be passed by keyword: the fourth positional argument of re.sub is
# `count`, so the original call capped the number of replacements instead of
# setting flags. The backslash before "o" is written out explicitly; the string
# value is unchanged.
re.sub(r'[^a-zA-Z\s]', ' ', "Ho:;<le_C$%&\\oMe_estTa?@[\\is34~", flags=re.I|re.A).split()

In [None]:
remove_terms(clean_punctuation("their corresponding URIs:\n\n\n   +------------------------+-----------------+-------------------+\n   | Operation              |Operation path   | Details           |\n   +========================+=================+===================+\n   | Distribution of CA     | /cacerts        | Section 4.1       |\n   | Certificates (MUST)    |                 |                   |\n   +------------------------+-----------------+-------------------+\n   | Enrollment of          | /simpleenroll   | Section 4.2       |\n   | Clients (MUST)         |                 |                   |\n   +------------------------+-----------------+-------------------+\n   | Re-enrollment of       | /simplereenroll | Section 4.2.2     |\n   | Clients (MUST)         |                 |                   |\n   +------------------------+-----------------+-------------------+\n   | Full CMC (OPTIONAL)    | /fullcmc        | Section 4.3       |\n   +------------------------+-----------------+-------------------+\n   | Server-Side Key        | /serverkeygen   | Section 4.4       |\n   | Generation (OPTIONAL)  |                 |                   |\n   +------------------------+-----------------+-------------------+\n   | CSR Attributes         | /csrattrs       | Section 4.5       |\n   | (OPTIONAL)             |                 |                   |\n   +------------------------+-----------------+-------------------+\n\n  "))

In [None]:
remove_terms(split_camel_case_token(dict_filenames['test_data/LibEST_semeru_format/requirements/RQ17.txt'][0]))

In [None]:
pre_process = preprocess_pipeline.basic_pipeline(dict_filenames)

In [None]:
pre_process[0]

In [None]:
#Writing Into A File
df_pre_processed = pd.DataFrame(pre_process, columns =['filename', 'text']) 
#/.../benchmarking/traceability/testbeds/nltk

In [None]:
df_pre_processed

In [None]:
base_dir = os.path.abspath(os.getcwd())
pre_path = '/tf/main/benchmarking/traceability/testbeds/nltk/[libest-pre-tc].csv'
final_path = '/tf/main/benchmarking/traceability/testbeds/nltk/[libest-vocab-tc].csv'

In [None]:
df_pre_processed.to_csv(pre_path, header=None, index=None, sep=' ', mode='a')

In [None]:
def save_dict(a_dict, path):
    """Write `a_dict` to `path` as a CSV file with one `key,value` row per item.

    An existing file at `path` is overwritten. An empty dictionary produces an
    empty file.
    """
    # The context manager closes the file even if a row fails to serialize.
    # newline='' leaves line endings to the csv module (no blank rows on
    # Windows); UTF-8 matches the default encoding of pandas.read_csv.
    with open(path, "w", newline='', encoding='utf-8') as a_file:
        csv.writer(a_file).writerows(a_dict.items())

In [None]:
#1-Building the corpus vocabulary
# Fit a Keras tokenizer on the second element of every preprocessed entry.
corpus_texts = [doc[1] for doc in pre_process]
tokenizer_corpora = text.Tokenizer()
tokenizer_corpora.fit_on_texts(corpus_texts)

# word -> id mapping exposed by the tokenizer, and its inverse id -> word.
word2id = tokenizer_corpora.word_index
id2word = {idx: word for word, idx in word2id.items()}

In [None]:
save_dict(id2word,final_path)

In [None]:
id2word

In [None]:
#Merging All the Vocabulary
vocab_path_tc = '/tf/main/benchmarking/traceability/testbeds/nltk/[libest-vocab-tc].csv'
df_read_vocab_tc = pd.read_csv(vocab_path_tc, names=['ids', 'text'], header=None)  

In [None]:
df_read_vocab_tc.shape

In [None]:
vocab_path_src = '/tf/main/benchmarking/traceability/testbeds/nltk/[libest-vocab-src].csv'
df_read_vocab_src = pd.read_csv(vocab_path_src, names=['ids', 'text'], header=None)  

In [None]:
df_read_vocab_src.shape

In [None]:
vocab_path_req = '/tf/main/benchmarking/traceability/testbeds/nltk/[libest-vocab-req].csv'
df_read_vocab_req = pd.read_csv(vocab_path_req, names=['ids', 'text'], header=None)  

In [None]:
df_read_vocab_req.shape

In [None]:
super_vocab_set = set(df_read_vocab_tc['text']) | set(df_read_vocab_src['text']) | set(df_read_vocab_req['text'])

In [None]:
len(super_vocab_set)

In [None]:
df_all_vocab = pd.DataFrame(list(super_vocab_set))
print(df_all_vocab)

In [None]:
df_all_vocab.to_csv('/tf/main/benchmarking/traceability/testbeds/nltk/[libest-vocab-all].csv', 
                    header=None, index=None, sep=' ', mode='a')

In [None]:
#Merging all the corpora
pre_doc_path_tc = '/tf/main/benchmarking/traceability/testbeds/nltk/[libest-pre-tc].csv'
pre_doc_path_req = '/tf/main/benchmarking/traceability/testbeds/nltk/[libest-pre-req].csv'
pre_doc_path_src = '/tf/main/benchmarking/traceability/testbeds/nltk/[libest-pre-src].csv'

#df_read_pre_tc = pd.read_csv(pre_doc_path_tc, header=None, sep=' ') #Need to include sep
pre_doc_path = [pre_doc_path_tc, pre_doc_path_req, pre_doc_path_src]
# Read each space-separated file and keep its column 1, one list per file.
corpus_frames = (pd.read_csv(csv_path, header=None, sep=' ') for csv_path in pre_doc_path)
lis = [list(frame[1]) for frame in corpus_frames]
# Number of documents per file, in the order tc, req, src.
print(*(len(column) for column in lis))

In [None]:
lis = functools.reduce(lambda a,b : a+b,lis)

In [None]:
lis

In [None]:
df_reduced_pre = pd.DataFrame(lis) 
df_reduced_pre

In [None]:
df_reduced_pre.to_csv('/tf/main/benchmarking/traceability/testbeds/nltk/[libest-pre-all].csv')

#########################################################

In [None]:
# hide
from nbdev.showdoc import *

In [None]:
! nbdev_build_docs

In [None]:
from nbdev.export import *

In [None]:
notebook2script()