In [None]:
# default_exp mgmnt.prep.bpe

# BPE Preprocessing

> This module comprises all preprocessing techniques applied to software artifacts:
>
>> Text-based Artifacts: Classical preprocessing (stemming, lemmas, etc.) and BPE
>
>> Binary Artifacts: To Do
>
>> Vision-based Artifacts: To Do
>
>> Parsing: Techniques to control and manipulate source code (complete with deep generator project)
>
> Author: @danaderp Jun 2020

In [None]:
#! pip install dit
#! pip install fastprogress
#! pip install nltk
! pip install tokenizers

Collecting tokenizers
  Downloading tokenizers-0.8.1-cp36-cp36m-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 4.0 MB/s eta 0:00:01
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.8.1


In [None]:
#Just for testing
!wget https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt

--2020-07-01 14:30:59--  https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.200.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.200.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 278779 (272K) [text/plain]
Saving to: ‘botchan.txt’


2020-07-01 14:31:00 (5.91 MB/s) - ‘botchan.txt’ saved [278779/278779]



In [None]:
#export
import sentencepiece as spm
import tensorflow_datasets as tfds

In [None]:
#export
from tensorflow.keras.preprocessing import text
from pathlib import Path
import glob
from datetime import datetime

In [None]:
#export
from typing import List, Set, Callable, Tuple, Dict, Optional
import re
from nltk.stem.snowball import SnowballStemmer
import nltk
import pandas as pd
import numpy as np
import glob
import os
import pathlib
from string import punctuation
import csv

In [None]:
#export
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# Notebook-level configuration shared by the data-preparation cells.
# NOTE(review): 'system' and 'language' are not read by any cell visible in this notebook.
params = {
    'system':'codesearchnet',
    'saving_path': 'test_data/sentencepiece/',  # only referenced from a commented-out np.load call
    'language': 'english',
    'wiki_size': 60000,  # number of Wikipedia examples taken from the tfds dataset
    'bpe_filename':'test_data/sentencepiece/py_java_bpe_training.txt',  # corpus file written by saving_bpe_training
    'model_prefix':'test_data/sentencepiece/wiki_py_java_bpe_128k'  # only referenced from a commented-out assignment
}

In [None]:
def saving_bpe_training(np_str, mode='w', filename=None):
    """Write text samples to the BPE training corpus file.

    Every sample is terminated with a newline, so a later call in append
    mode starts on a fresh line instead of being glued to the last sample
    of the previous call.

    Args:
        np_str: iterable of text samples (e.g. a list of code snippets).
        mode: file mode passed to `open`; 'w' overwrites, 'a' appends.
        filename: target file; defaults to `params['bpe_filename']`.
    """
    if filename is None:
        filename = params['bpe_filename']
    # explicit UTF-8 so the corpus does not depend on the platform's locale encoding
    with open(filename, mode, encoding='utf-8') as f:
        f.writelines(f'{sample}\n' for sample in np_str)

### Testing SentencePiece

In [None]:
# train sentencepiece model from `botchan.txt` and makes `m.model` and `m.vocab`
# `m.vocab` is just a reference. not used in the segmentation.
spm.SentencePieceTrainer.train('--input=test_data/botchan.txt --model_prefix=m --vocab_size=2000')

True

In [None]:
# makes segmenter instance and loads the model file (m.model)
sp = spm.SentencePieceProcessor()
sp.load('m.model')

True

In [None]:
# encode: text => id
print(sp.encode_as_pieces('This is a test'))
print(sp.encode_as_ids('This is a test'))

['▁This', '▁is', '▁a', '▁t', 'est']
[209, 31, 9, 375, 586]


In [None]:
# decode: id => text
print(sp.decode_pieces(['▁This', '▁is', '▁a', '▁t', 'est']))
print(sp.decode_ids([209, 31, 9, 375, 586]))

This is a test
This is a test


In [None]:
# returns vocab size
print(sp.get_piece_size())

# id <=> piece conversion
print(sp.id_to_piece(209))
print(sp.piece_to_id('▁This'))

# returns 0 for unknown tokens (we can change the id for UNK)
print(sp.piece_to_id('__MUST_BE_UNKNOWN__'))

# <unk>, <s>, </s> are defined by default. Their ids are (0, 1, 2)
# <s> and </s> are defined as 'control' symbol.
for id in range(3):
    print(sp.id_to_piece(id), sp.is_control(id))

2000
▁This
209
0
<unk> False
<s> True
</s> True


### Setting Up Software Corpora from CodeSearchNet

In [None]:
python_files = sorted(Path('codesearch/python/').glob('**/*.gz'))
java_files = sorted(Path('codesearch/java/').glob('**/*.gz'))

In [None]:
# Every column kept when loading the CodeSearchNet jsonl files in this section.
columns_long_list = [
    'repo', 'path', 'url', 'code', 'code_tokens',
    'docstring', 'docstring_tokens', 'language', 'partition',
]

# Reduced column selection: token lists plus language/partition only.
columns_short_list = ['code_tokens', 'docstring_tokens', 'language', 'partition']

def jsonl_list_to_dataframe(file_list, columns=columns_long_list):
    """Read gzip-compressed JSON-lines files and stack them into one DataFrame.

    Each file is parsed with one JSON record per line and reduced to
    `columns`; the per-file frames are concatenated in the order given,
    keeping each file's own row index.
    """
    frames = []
    for jsonl_path in file_list:
        frame = pd.read_json(jsonl_path, orient='records', compression='gzip', lines=True)
        frames.append(frame[columns])
    return pd.concat(frames, sort=False)

In [None]:
python_searchnet_df = jsonl_list_to_dataframe(python_files)
java_searchnet_df = jsonl_list_to_dataframe(java_files)

In [None]:
java_searchnet_df.head()

Unnamed: 0,repo,path,url,code,code_tokens,docstring,docstring_tokens,language,partition
0,ReactiveX/RxJava,src/main/java/io/reactivex/internal/observers/...,https://github.com/ReactiveX/RxJava/blob/ac841...,protected final void fastPathOrderedEmit(U val...,"[protected, final, void, fastPathOrderedEmit, ...",Makes sure the fast-path emits in order.\n@par...,"[Makes, sure, the, fast, -, path, emits, in, o...",java,test
1,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,https://github.com/ReactiveX/RxJava/blob/ac841...,@CheckReturnValue\n @NonNull\n @Schedule...,"[@, CheckReturnValue, @, NonNull, @, Scheduler...",Mirrors the one ObservableSource in an Iterabl...,"[Mirrors, the, one, ObservableSource, in, an, ...",java,test
2,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,https://github.com/ReactiveX/RxJava/blob/ac841...,"@SuppressWarnings(""unchecked"")\n @CheckRetu...","[@, SuppressWarnings, (, ""unchecked"", ), @, Ch...",Mirrors the one ObservableSource in an array o...,"[Mirrors, the, one, ObservableSource, in, an, ...",java,test
3,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,https://github.com/ReactiveX/RxJava/blob/ac841...,"@SuppressWarnings({ ""unchecked"", ""rawtypes"" })...","[@, SuppressWarnings, (, {, ""unchecked"", ,, ""r...",Concatenates elements of each ObservableSource...,"[Concatenates, elements, of, each, ObservableS...",java,test
4,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,https://github.com/ReactiveX/RxJava/blob/ac841...,"@SuppressWarnings({ ""unchecked"", ""rawtypes"" })...","[@, SuppressWarnings, (, {, ""unchecked"", ,, ""r...",Returns an Observable that emits the items emi...,"[Returns, an, Observable, that, emits, the, it...",java,test


In [None]:
java_searchnet_df.shape

(496688, 9)

In [None]:
pytrain = python_searchnet_df[python_searchnet_df.partition.eq('train')].copy()

In [None]:
pytrain.shape

(412178, 9)

In [None]:
javatrain = java_searchnet_df[java_searchnet_df.partition.eq('train')].copy()

In [None]:
javatrain.shape

(454451, 9)

In [None]:
pytrain.head()

Unnamed: 0,repo,path,url,code,code_tokens,docstring,docstring_tokens,language,partition
0,ageitgey/face_recognition,examples/face_recognition_knn.py,https://github.com/ageitgey/face_recognition/b...,"def train(train_dir, model_save_path=None, n_n...","[def, train, (, train_dir, ,, model_save_path,...",Trains a k-nearest neighbors classifier for fa...,"[Trains, a, k, -, nearest, neighbors, classifi...",python,train
1,ageitgey/face_recognition,examples/face_recognition_knn.py,https://github.com/ageitgey/face_recognition/b...,"def predict(X_img_path, knn_clf=None, model_pa...","[def, predict, (, X_img_path, ,, knn_clf, =, N...",Recognizes faces in given image using a traine...,"[Recognizes, faces, in, given, image, using, a...",python,train
2,ageitgey/face_recognition,examples/face_recognition_knn.py,https://github.com/ageitgey/face_recognition/b...,"def show_prediction_labels_on_image(img_path, ...","[def, show_prediction_labels_on_image, (, img_...",Shows the face recognition results visually.\n...,"[Shows, the, face, recognition, results, visua...",python,train
3,ageitgey/face_recognition,face_recognition/api.py,https://github.com/ageitgey/face_recognition/b...,"def _rect_to_css(rect):\n """"""\n Convert ...","[def, _rect_to_css, (, rect, ), :, return, rec...",Convert a dlib 'rect' object to a plain tuple ...,"[Convert, a, dlib, rect, object, to, a, plain,...",python,train
4,ageitgey/face_recognition,face_recognition/api.py,https://github.com/ageitgey/face_recognition/b...,"def _trim_css_to_bounds(css, image_shape):\n ...","[def, _trim_css_to_bounds, (, css, ,, image_sh...","Make sure a tuple in (top, right, bottom, left...","[Make, sure, a, tuple, in, (, top, right, bott...",python,train


In [None]:
len(pytrain.sample(frac=0.01, replace=False, random_state=1))

222

In [None]:
a = pytrain['code'].sample(frac=0.01, replace=False, random_state=1).values 

In [None]:
saving_bpe_training(list(a)) #writing mode

In [None]:
b = javatrain['code'].sample(frac=0.01, replace=False, random_state=1).values

In [None]:
saving_bpe_training(list(b), mode='a') #append mode: 'w' here would erase the Python sample written above

In [None]:
print(len(a),len(b))

222 269


### Setting Up Software Corpora from Wikipedia

In [None]:
#np_bpe_text = np.load(params['saving_path']+'data_np_bpe_text.npy', allow_pickle=True)

In [None]:
# The statistics below describe the earlier 'wikipedia/20190301.en' config;
# they are kept for reference and may differ for the 20200301 dump loaded here.
#   Config description: Wikipedia dataset for en, parsed from 20190301 dump.
#   Download size: 15.72 GiB
#   Dataset size: Unknown size
#   Examples: train 5,824,596
dataset_name = 'wikipedia/20200301.en' # previously 'wikipedia/20190301.en'

In [None]:
#Download the dataset and create a tf.data.Dataset
ds, info = tfds.load(dataset_name, split='train', with_info=True)

2020-07-06 22:11:38,428 : INFO : Load dataset info from /root/tensorflow_datasets/wikipedia/20200301.en/1.0.0
2020-07-06 22:11:38,444 : INFO : Reusing dataset wikipedia (/root/tensorflow_datasets/wikipedia/20200301.en/1.0.0)
2020-07-06 22:11:38,445 : INFO : Constructing tf.data.Dataset for split train, from /root/tensorflow_datasets/wikipedia/20200301.en/1.0.0


In [None]:
#Accessing Metadata with DatasetInfo
print(info.splits['train'].num_examples)

6033151


In [None]:
# Build your input pipeline
#ds = ds.shuffle(60000) #.batch(32).prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
list_wiki = [tfds.as_numpy(ex)['text'].decode("utf-8") for ex in ds.take(params['wiki_size'])]

In [None]:
len(list_wiki)

60000

In [None]:
saving_bpe_training(list_wiki, mode='a') #append mode

### BPE (Byte pair encoding) model

In [None]:
#BPE Training
#file_bpe = params['bpe_filename']
file_bpe = 'test_data/sentencepiece/py_java_bpe_training_c.txt'
#m_prefix = params['model_prefix']
m_prefix = 'test_data/sentencepiece/py_java_bpe_32k_c'
text_norm = 'nfkc_cf' #nfkc_cf: nfkc + Unicode case folding
#vocab_size = '128000'
vocab_size = '32000' #approx by word2vec statistics 32K
#vocab_size = '8000'
symbols = '<n>,<t>,<@>'

In [None]:
spm.SentencePieceTrainer.train(f'--input={file_bpe} --model_prefix={m_prefix} --vocab_size={vocab_size} --model_type=bpe --normalization_rule_name={text_norm}')

True

In [None]:
#Without normalization
spm.SentencePieceTrainer.train(f'--input={file_bpe} --model_prefix={m_prefix} --vocab_size={vocab_size} --model_type=bpe')

True

In [None]:
#Use this for Generators
spm.SentencePieceTrainer.train(f'--input={file_bpe} --user_defined_symbols={symbols} --model_prefix={m_prefix} --vocab_size={vocab_size} --model_type=bpe')

True

In [None]:
sp_bpe = spm.SentencePieceProcessor()
sp_bpe.load(m_prefix +'.model')

True

In [None]:
print('*** BPE ***')
print(sp_bpe.encode_as_pieces('this is a test hello world for{ int i: 21}'))
#print(sp_bpe.nbest_encode_as_pieces('hello world', 5))  # returns an empty list.

*** BPE ***
['▁this', '▁is', '▁a', '▁test', '▁hello', '▁world', '▁for', '{', '▁int', '▁i', ':', '▁2', '1}']


In [None]:
print(sp_bpe.encode_as_pieces('this is a test \n hello world for{ int i: 21}'))
print(sp_bpe.encode_as_pieces('this is a test <n>\n hello world for{ int i: 21}\n'))

['▁this', '▁is', '▁a', '▁test', '▁hello', '▁world', '▁for', '{', '▁int', '▁i', ':', '▁2', '1}']
['▁this', '▁is', '▁a', '▁test', '▁', '<n>', '▁hello', '▁world', '▁for', '{', '▁int', '▁i', ':', '▁2', '1}']


In [None]:
print(sp_bpe.encode_as_pieces('this is a test \t hello world for{ int i: 21}'))
print(sp_bpe.encode_as_pieces('this is a test <t>\t\t\t<t> hello world for{ int i: 21}'))

['▁this', '▁is', '▁a', '▁test', '▁hello', '▁world', '▁for', '{', '▁int', '▁i', ':', '▁2', '1}']
['▁this', '▁is', '▁a', '▁test', '▁', '<t>', '▁', '<t>', '▁hello', '▁world', '▁for', '{', '▁int', '▁i', ':', '▁2', '1}']


In [None]:
print(sp_bpe.encode_as_pieces('this is a test @ @  hello world for{ int @i: 21}'))
print(sp_bpe.encode_as_pieces('this is a test <@>@@<@> hello world for{ int <@>i: 21}'))

['▁this', '▁is', '▁a', '▁test', '▁', '@', '▁', '@', '▁hello', '▁world', '▁for', '{', '▁int', '▁', '@', 'i', ':', '▁2', '1}']
['▁this', '▁is', '▁a', '▁test', '▁', '<@>', '@@', '<@>', '▁hello', '▁world', '▁for', '{', '▁int', '▁', '<@>', 'i', ':', '▁2', '1}']


In [None]:
print(sp_bpe.encode_as_ids('this is a test @ @  hello world for{ int @i: 21}'))

[207, 249, 213, 3196, 31894, 0, 31894, 0, 198, 13409, 125, 8925, 160, 31925, 161, 31894, 0, 31899, 31954, 691, 18712]


In [None]:
# encode: text => id
print(sp_bpe.encode_as_pieces('private Hi_Altituted(float j = 00; j++)'))
print(sp_bpe.encode_as_ids('private Hi_Altituted(float j = 00; j++)'))

['▁private', '▁H', 'i', '_', 'Alt', 'it', 'uted', '(', 'float', '▁j', '▁=', '▁0', '0;', '▁j', '++)']
[423, 481, 31901, 31940, 6458, 35, 6814, 31909, 2523, 343, 15, 160, 1519, 343, 714]


In [None]:
vocab = {sp_bpe.id_to_piece(id): 0 for id in range(sp_bpe.get_piece_size())}

In [None]:
'@' in vocab  # membership test: vocab['@'] raises KeyError and stops "Run All" at this cell

False

In [None]:
# <unk>, <s>, </s> are defined by default. Their ids are (0, 1, 2)
# <s> and </s> are defined as 'control' symbol.
for id in range(3):
    print(sp_bpe.id_to_piece(id), sp_bpe.is_control(id))

<unk> False
<s> True
</s> True


In [None]:
##############################################################################

In [None]:
# export
# Imports
import pandas as pd
import random
import sentencepiece as sp

from fastprogress.fastprogress import master_bar
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
# export
def jsonl_list_to_dataframe(file_list, columns=None):
    """Load a list of jsonl.gz files into a pandas DataFrame.

    Args:
        file_list: paths of gzip-compressed JSON-lines files (one record per line).
        columns: column names to keep from every file; `None` keeps all columns.

    Returns:
        The per-file frames concatenated in the order given (row indices
        are not reset).

    Raises:
        ValueError: from `pd.concat` when `file_list` is empty.
    """
    frames = []
    for f in file_list:
        frame = pd.read_json(f, orient='records', compression='gzip', lines=True)
        # frame[None] raises a KeyError, so only subset when columns were requested
        frames.append(frame if columns is None else frame[columns])
    return pd.concat(frames, sort=False)

In [None]:
# export
def get_dfs(path):
    """
        Builds one dataframe per data split found under `path`.
        Expects the Code Search Net Challenge layout: `path/train`, `path/valid`
        and `path/test`, each holding `.gz` files at any depth.
        Returns `[train, valid, test]`, each limited to the "code" and
        "docstring" columns.
    """
    wanted_cols = ["code", "docstring"]
    return [
        jsonl_list_to_dataframe(sorted((path/split_name).glob("**/*.gz")), wanted_cols)
        for split_name in ("train", "valid", "test")
    ]

In [None]:
path = Path('/tf/data/')

In [None]:
df_trn, df_val, df_tst = get_dfs(path/"java/final/jsonl")
df_trn.head()

Unnamed: 0,code,docstring
0,protected final void bindIndexed(Configuration...,Bind indexed elements to the supplied collecti...
1,public void setServletRegistrationBeans(\n\t\t...,Set {@link ServletRegistrationBean}s that the ...
2,public void addServletRegistrationBeans(\n\t\t...,Add {@link ServletRegistrationBean}s for the f...
3,public void setServletNames(Collection<String>...,Set servlet names that the filter will be regi...
4,public void addServletNames(String... servletN...,Add servlet names for the filter.\n@param serv...


In [None]:
# Save some test data (fixed seed so the sampled files are the same on every run)
df_trn.sample(frac = 0.01, random_state = 1).to_csv('./test_data/trn.csv', index = False)
df_val.sample(frac = 0.01, random_state = 1).to_csv('./test_data/val.csv', index = False)
df_tst.sample(frac = 0.01, random_state = 1).to_csv('./test_data/tst.csv', index = False)

In [None]:
# export
def df_to_txt_file(df, output, cols=None):
    """Dump dataframe columns into `output/text.txt` for SentencePiece/tokenizer training.

    The selected columns are stacked one after another (all values of the
    first column, then all values of the second, ...) and written joined by
    newlines.

    Args:
        df: source dataframe.
        output: existing directory (str or Path) that receives `text.txt`.
        cols: columns to export; `None` exports every column of `df`.

    Returns:
        Path of the written text file.
    """
    if cols is None: cols = list(df.columns)
    output = Path(output)
    merged_df = pd.concat([df[col] for col in cols])
    # str.join rejects NaN/non-string cells (e.g. a missing docstring), so drop
    # missing values and cast the rest to str before joining
    lines = merged_df.dropna().astype(str)

    fname = output/'text.txt'
    with open(fname, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))
    return fname

In [None]:
# export
def sp_model_from_df(df, output, model_name, cols = None, model_type = None):
    """Trains a SentencePiece model from a pandas dataframe.

    The dataframe is first dumped to `output/text.txt` via `df_to_txt_file`,
    then used as the training input. The model files are written with the
    prefix `output/model_name`.

    Args:
        df: dataframe holding the training text.
        output: directory (Path) for the text dump and the model files.
        model_name: file name prefix of the trained model.
        cols: columns of `df` to train on; `None` uses all of them.
        model_type: optional SentencePiece `--model_type` (e.g. 'bpe').
            When `None` the flag is not passed, so SentencePiece applies its
            own default (unigram according to the SentencePiece docs), not BPE.
    """
    fname = df_to_txt_file(df, output, cols)
    cmd = f'--input={fname} --model_prefix={output / model_name} --hard_vocab_limit=false'
    if model_type is not None:
        cmd += f' --model_type={model_type}'
    sp.SentencePieceTrainer.train(cmd)

In [None]:
# export
def sp_model_from_glob(path, glob, model_name):
    """Train a SentencePiece model on every file under `path` that matches `glob`.

    The matching file names are passed to the trainer as one comma-separated
    `--input` list; the model is written with the prefix `path/model_name`.
    """
    matched = [str(fn) for fn in path.glob(glob)]
    input_arg = ",".join(matched)
    prefix = path / model_name
    sp.SentencePieceTrainer.train(f'--input={input_arg} --model_prefix={prefix} --hard_vocab_limit=false')

In [None]:
# export
def gen_hugface_model(df, output, tokenizer = None, vocab_sz = 30_000, min_freq = 3, cols = None):
    """Train a HuggingFace tokenizer on the text of a dataframe.

    Args:
        df: dataframe holding the training text.
        output: directory (Path) that receives the intermediate `text.txt`.
        tokenizer: tokenizer to train; a fresh `ByteLevelBPETokenizer` is
            created per call when omitted.
        vocab_sz: target vocabulary size.
        min_freq: minimum frequency passed to the trainer as `min_frequency`.
        cols: columns of `df` to train on; `None` uses all of them.

    Returns:
        The trained tokenizer.
    """
    # A default of `ByteLevelBPETokenizer()` in the signature is evaluated once at
    # definition time, so every call would keep training the same shared instance.
    if tokenizer is None:
        tokenizer = ByteLevelBPETokenizer()
    fname = df_to_txt_file(df, output, cols)
    tokenizer.train(files = [str(fname)], vocab_size = vocab_sz, min_frequency = min_freq, special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ])

    return tokenizer

In [None]:
path = Path("./test_data")
model_name = "test"

In [None]:
df = pd.read_csv(path / 'trn.csv')
df.head()

Unnamed: 0,code,docstring
0,private static void createCode(String packageN...,Create the Java
1,@Override\n public void flushCache() {\n ...,LI3492-2
2,"public void addRule(IntDependency dependency, ...",Add this dependency with the given count to th...
3,@Override\n public boolean removeIfEquals(K k...,Remove the object from the cache.
4,public void marshall(DatasetContentDeliveryDes...,Marshall the given parameter object.


In [None]:
tokenizer = gen_hugface_model(df, path)

In [None]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)

In [None]:
print(tokenizer.encode("public static void main(String[] args) { getDirFromLib(); }").tokens)

['<s>', 'public', 'Ġstatic', 'Ġvoid', 'Ġmain', '(', 'String', '[]', 'Ġargs', ')', 'Ġ{', 'Ġget', 'Dir', 'From', 'Lib', '();', 'Ġ}', '</s>']


In [None]:
tokenizer.save(str(path), "java_tokenizer")

['test_data/java_tokenizer-vocab.json', 'test_data/java_tokenizer-merges.txt']

In [None]:
dummy_data = {
        'first': ['1', '2', '6', '7', '8'],
        'second': ['K', 'M', 'O', 'Q', 'S'],
        'third': ['L', 'N', 'P', 'R', 'T']}

In [None]:
df = pd.DataFrame(dummy_data); df

Unnamed: 0,id,Feature1,Feature2
0,1,K,L
1,2,M,N
2,6,O,P
3,7,Q,R
4,8,S,T


In [None]:
df_to_txt_file(df, Path('./test_data'), list(df.columns))

PosixPath('test_data/text.txt')

In [None]:
path = Path("./test_data")
model_name = "test"

In [None]:
sp_model_from_df(df, path, model_name, list(df.columns))

In [None]:
spm = sp.SentencePieceProcessor()
spm.Load(str(path/f"{model_name}.model"))

True

In [None]:
spm.EncodeAsPieces("Hello, world!")

['▁', 'Hello,', '▁', 'world!']

In [None]:
# export
def tokenize_fns(fns, tokenizer, exts, output, data_type):
    """Tokenize every matching file under the given directories and store the pieces.

    For each directory in `fns`, files with one of the extensions in `exts`
    are searched recursively, skipping any whose name contains 'README'.
    Each file is encoded with `tokenizer.EncodeAsPieces` and its pieces are
    written, space-separated, to
    `output/<name of the directory's parent>/<data_type>/<file stem>.bpe`.

    Args:
        fns: iterable of directory Paths to scan.
        tokenizer: object exposing `EncodeAsPieces(text)`.
        exts: file extensions (without the dot) to include.
        output: root directory (Path) for the `.bpe` files.
        data_type: sub-directory name under each system directory.

    Returns:
        List with the pieces of every tokenized file, in processing order.
    """
    docs = []
    for fn in fns:
        system = fn.parent.name
        output_path = output/system/data_type
        output_path.mkdir(parents=True, exist_ok=True)
        files = []
        for ext in exts:
            files.extend(fn.glob(f'**/*.{ext}'))
        for file in files:
            if 'README' in file.name: continue
            with open(file, encoding='ISO-8859-1') as f:
                pieces = tokenizer.EncodeAsPieces(f.read())
            docs.append(pieces)
            # SentencePiece pieces carry the U+2581 marker, which many locale
            # encodings cannot represent, so always write UTF-8
            with open((output_path/file.name).with_suffix('.bpe'), 'w', encoding='utf-8') as f:
                f.write(' '.join(pieces))

    return docs

In [None]:
# export
def read_bpe_files(path):
    """Load every `.bpe` file found recursively under `path`.

    Args:
        path: root directory (Path) to search.

    Returns:
        One list of tokens per file (file content split on single spaces),
        ordered by sorted file path so the result is reproducible.
    """
    bpe_files = []
    # glob order depends on the file system; sort for a stable result
    for file in sorted(path.glob('**/*.bpe')):
        # the pieces contain U+2581, so decode explicitly as UTF-8
        with open(file, encoding='utf-8') as f:
            bpe_files.append(f.read().split(' '))

    return bpe_files

In [None]:
# export
def split_lines_to_files(lines, fn_pattern, output_path, tokenizer):
    """Split "<file name><fn_pattern><content>" lines into one `.bpe` file each.

    The part before the first occurrence of `fn_pattern` becomes the file
    name (double quotes and the substring ' Test ' removed); the rest is
    encoded with `tokenizer.EncodeAsPieces` and written, space-separated, to
    `output_path/<file name>.bpe`.

    Args:
        lines: iterable of raw text lines.
        fn_pattern: separator between file name and content.
        output_path: directory (Path) for the `.bpe` files; created if missing.
        tokenizer: object exposing `EncodeAsPieces(text)`.

    Raises:
        ValueError: if a line does not contain `fn_pattern`.
    """
    output_path.mkdir(parents=True, exist_ok=True)
    for line in lines:
        # split only once: the content itself may contain the separator again
        fn, content = line.split(fn_pattern, 1)
        fn = fn.replace('"', '')
        fn = fn.replace(' Test ', '')
        content = tokenizer.EncodeAsPieces(content)
        # the pieces contain U+2581, so write UTF-8 regardless of the locale
        with open((output_path/fn).with_suffix('.bpe'), 'w', encoding='utf-8') as f:
            f.write(' '.join(content))

In [None]:
path = Path('../benchmarking/traceability/')

In [None]:
spm = sp.SentencePieceProcessor()
spm.Load(str(path/'datasets/italian/italian_bpe.model'))

True

In [None]:
ebt_path = path/'datasets/italian/ebt'; ebt_path

PosixPath('../benchmarking/traceability/datasets/italian/ebt')

In [None]:
with open(ebt_path/'[ebt-raw-req].txt') as f:
    split_lines_to_files(f.read().split('\n')[:-1], '\t', path/'testbeds/bpe/italian/ebt/req', spm)

In [None]:
with open(ebt_path/'[ebt-raw-tc].txt') as f:
    split_lines_to_files(f.read().split('\n')[:-1], 'case:', path/'testbeds/bpe/italian/ebt/tc', spm)

In [None]:
# export
def get_ground_truth(path, language):
    """Build the dataframe of ground-truth links described by the `*.txt` files in `path`.

    File names are expected to look like `[<sys>-<x>-<from_type>-<y>-<to_type>].txt`:
    the first and last character of the stem are stripped and the
    dash-separated fields 0, 2 and 4 are used. Every non-blank line of a file
    is `<root> <child> <child> ...` (space-separated). The tokenized documents
    are read from `path.parent.parent/'bpe'/language/<sys>/<type>/<name>.bpe`.

    Args:
        path: directory (Path) holding the ground-truth `.txt` files.
        language: name of the language sub-directory under `bpe`.

    Returns:
        DataFrame with one row per (root, child) link and the columns
        sys, from_type, to_type, from_file, to_file, from_doc, to_doc.
    """
    columns = ['sys', 'from_type', 'to_type', 'from_file', 'to_file', 'from_doc', 'to_doc']
    bpe_root = path.parent.parent/'bpe'/language
    # rows are collected in a list and turned into a frame once: DataFrame.append
    # copies the whole frame on every call and no longer exists in pandas 2
    records = []
    for fn in path.glob('*.txt'):
        content = str(fn.name).split('.')[0][1:-1]
        content = content.split('-')

        system, from_type, to_type = content[0], content[2], content[4]

        with open(fn) as f:
            # splitlines keeps the last link even without a trailing newline;
            # blank lines carry no link and are skipped
            links = [line for line in f.read().splitlines() if line.strip()]

        for link in links:
            link = link.split(' ')
            root, children = link[0], link[1:]
            root = Path(root).with_suffix('.bpe').name
            with open(bpe_root/system/from_type/root, encoding='utf-8') as f:
                root_content = f.read().split(' ')
            children = [Path(child).with_suffix('.bpe').name for child in children]
            # keep only the last two dot-separated parts, e.g. 'a.b.c.bpe' -> 'c.bpe'
            children = [Path('.'.join(str(child).split('.')[-2:])) for child in children]
            for child in children:
                with open(bpe_root/system/to_type/child, encoding='utf-8') as f:
                    child_content = f.read().split(' ')
                records.append({'sys': system,
                                'from_type': from_type,
                                'to_type': to_type,
                                'from_file': root,
                                'to_file': str(child),
                                'from_doc': root_content,
                                'to_doc': child_content})

    return pd.DataFrame(records, columns=columns)

In [None]:
# export
def get_non_ground_truth(path, language, gt, max_files=500):
    """Sample (req, non-req) document pairs that are NOT linked in the ground truth.

    All `.bpe` files under `path` are shuffled; the first `max_files` are
    candidate sources, of which only those inside a `req` directory are kept.
    For every source the file list is shuffled again and the first
    `max_files` entries are candidate targets, skipping the source itself,
    other `req` files and any pair present in `gt`.

    Args:
        path: root directory (Path) of the `.bpe` files, laid out as
            `<sys>/<type>/<name>.bpe`.
        language: unused; kept for interface compatibility.
        gt: ground-truth DataFrame with `from_file` and `to_file` columns.
        max_files: cap on the candidate sources and on the candidate targets
            per source (`None` disables the cap).

    Returns:
        DataFrame with the columns sys, from_type, to_type, from_file,
        to_file, from_doc, to_doc. The pairs depend on the state of the
        global `random` generator.
    """
    all_non_links = []

    # a set gives constant-time membership tests inside the nested loop
    existing_links = {'->'.join(link) for link in zip(gt['from_file'].to_list(), gt['to_file'].to_list())}
    bpe_files = list(path.glob('**/*.bpe'))
    random.shuffle(bpe_files)
    # the slice is a copy, so reshuffling bpe_files below does not disturb this loop
    for i in bpe_files[:max_files]:
        system = i.parent.parent.name
        from_type = i.parent.name
        if str(from_type) != 'req': continue
        with open(i, encoding='utf-8') as f:
            i_content = f.read().split(' ')
        random.shuffle(bpe_files)
        for j in bpe_files[:max_files]:
            if i == j: continue
            if '->'.join([i.name, j.name]) in existing_links: continue
            to_type = j.parent.name
            if str(to_type) == 'req': continue
            with open(j, encoding='utf-8') as f:
                j_content = f.read().split(' ')
            all_non_links.append([system, from_type, to_type, i.name, j.name, i_content, j_content])

    all_non_links = pd.DataFrame(all_non_links, columns = [
        'sys', 'from_type', 'to_type', 'from_file', 'to_file', 'from_doc', 'to_doc'
    ])
    return all_non_links

In [None]:
# export
def gen_gt_ngt(path, lang):
    """Return `(ground_truth, non_ground_truth)` link dataframes for one language.

    Ground-truth links are read from `path/groundtruth/lang`; the non-links
    are then sampled from the `.bpe` files under `path/bpe/lang`, excluding
    the ground-truth pairs.
    """
    gt_dir = path/'groundtruth'/lang
    bpe_dir = path/'bpe'/lang
    gt = get_ground_truth(gt_dir, lang)
    return gt, get_non_ground_truth(bpe_dir, lang, gt)

In [None]:
from nbdev.export import notebook2script
notebook2script()

Converted 00_mgmnt.prep.i.ipynb.
Converted 01_exp.i.ipynb.
Converted 02_mgmnt.db.mongo.ipynb.
Converted 03_repr.i.ipynb.
Converted 04_mining.ir.model.ipynb.
Converted 05_mining.ir.i.ipynb.
Converted 06_benchmark.traceability.ipynb.
Converted 07_repr.roberta.train.ipynb.
Converted 08_exp.info.ipynb.
Converted 09_desc.stats.ipynb.
Converted 10_vis.ipynb.
Converted 11_mgmnt.prep.nltk.ipynb.
Converted 12_repr.roberta.eval.ipynb.
Converted 14_mgmnt.prep.bpe.ipynb.
Converted 15_desc.metrics.se.ipynb.
Converted 16_repr.word2vec.train.ipynb.
Converted 17_repr.doc2vec.train.ipynb.
Converted 18_repr.doc2vec.eval.ipynb.
Converted 19_repr.word2vec.eval.ipynb.
Converted 20_benchmark.codegen.ipynb.
Converted 21_inf.i.ipynb.
Converted 22_inf.bayesian.ipynb.
Converted 23_inf.causal.ipynb.
Converted aa_blog.example.ipynb.
Converted ab_templates.example.ipynb.
Converted ac_emp.eval.pp1.rq1.ipynb.
Converted ad_emp.eval.pp1.rq2.ipynb.
Converted ae_emp.eval.pp1.rq3.ipynb.
Converted af_emp.eval.pp1.rq4.ipyn