In [None]:
# default_exp benchmark.codegen

In [None]:
# hide
%load_ext autoreload
%autoreload 2

# CodeGen Benchmark

> This module is dedicated to benchmarking.

In [None]:
#export
# Imports
import matplotlib.pyplot as plt
import pandas as pd
import sentencepiece as sp

# ds4se
from ds4se.data.preprocessing import *
from ds4se.data.exploratory.information import *
from ds4se.data.exploratory.stats import *

from pathlib import Path

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
# Root folder of the traceability benchmark; later cells build their
# `datasets/...` and `testbeds/...` paths relative to it.
path = Path('../benchmarking/traceability/')

## BPE Testbed

In [None]:
# Names passed as the third argument of sp_model_from_glob, one per language.
# output_bpe_tokenization loads f"{language}_bpe.model", i.e. these values plus ".model".
english_bpe = 'english_bpe'
italian_bpe = 'italian_bpe'

In [None]:
# One call per language; arguments are the language's dataset folder, a glob pattern and the model name.
# NOTE(review): presumably this trains the SentencePiece model that output_bpe_tokenization later
# loads from datasets/<language>/<language>_bpe.model — confirm where the model file is written.
sp_model_from_glob(path/'datasets/english','*/*all*', english_bpe)
sp_model_from_glob(path/'datasets/italian','*/*all*', italian_bpe)

In [None]:
path/'datasets'

PosixPath('../benchmarking/traceability/datasets')

In [None]:
def output_bpe_tokenization(path, languages):
    """Run `tokenize_fns` over the raw artifact paths of each language.

    For every entry of `languages` this:
      1. collects the paths under `path/'datasets'/language` whose names end in
         `req]`, `src]` and `tc]` (recursive glob),
      2. loads the SentencePiece model `path/'datasets'/language/'<language>_bpe.model'`,
      3. calls `tokenize_fns` once per artifact type with the collected paths, the
         model, the accepted file extensions, the output folder
         `path/'testbeds'/'bpe'/language` and the artifact-type label.

    Parameters
    ----------
    path : pathlib.Path
        Benchmark root containing the `datasets` folder.
    languages : iterable of str
        Language folder names; an empty iterable makes the call a no-op.

    Returns
    -------
    None
        The return values of `tokenize_fns` are not used.
    """
    # (label, accepted extensions); the label doubles as the glob suffix and is
    # passed to tokenize_fns as its last argument.
    artifact_types = (
        ('req', ('txt', 'TXT')),
        ('src', ('c', 'java')),
        ('tc', ('c', 'java')),
    )

    for language in languages:
        dataset_dir = path/'datasets'/language

        # Resolve all three globs before anything is tokenized.
        # NOTE(review): names such as '[ebt-raw-req].txt' do not end in ']' and are
        # therefore not matched by these patterns — confirm that is intended.
        fns_by_type = {
            label: list(dataset_dir.glob(f'**/*{label}]'))
            for label, _ in artifact_types
        }

        spm = sp.SentencePieceProcessor()
        spm.Load(str(dataset_dir/f"{language}_bpe.model"))

        output = path/'testbeds'/'bpe'/language
        for label, extensions in artifact_types:
            # A fresh list per call, so tokenize_fns never shares state between calls.
            tokenize_fns(fns_by_type[label], spm, list(extensions), output, label)

In [None]:
# Language folder names under path/'datasets'; output_bpe_tokenization loads
# '<language>_bpe.model' from each of those folders.
languages = ['english', 'italian']
output_bpe_tokenization(path, languages)

# Entropy Benchmark

## Read in the data

In [None]:
# System name -> artifact types ('req', 'src', 'tc') to read for that system.
# calc_entropy_benchmark uses both as path components:
# testbeds/bpe/<language>/<system>/<type>.
english_systems = {'itrust': ['req', 'src'], 'libest': ['req', 'src', 'tc']}
italian_systems = {'albergate': ['req', 'src'], 'ebt': ['req', 'src', 'tc'], 'etour': ['req', 'src'], 'smos': ['req', 'src']}

In [None]:
def flatten(l):
    """Concatenate the items of every iterable in `l` into one new list.

    Only one level of nesting is removed, and every element of `l` must itself
    be iterable (a string element contributes its characters).
    """
    return [item for sublist in l for item in sublist]

In [None]:
def calc_entropy_benchmark(systems, language):
    """Print entropy figures for every system of one language.

    `systems` maps a system name to the list of artifact types to read. For each
    type the documents are read from
    `path/'testbeds/bpe'/language/<system>/<type>` (`path` is the notebook-level
    benchmark root), the per-document entropies are summarised with
    `report_stats`, and the entropy of that corpus is printed. After all types of
    a system, the entropy over all of its documents and the shared entropy across
    its artifact types are printed. Nothing is returned.
    """
    for system_name, data_types in systems.items():
        docs_per_type = []
        print('System:', system_name)

        for data_type in data_types:
            type_dir = path/'testbeds/bpe'/language/system_name/data_type
            type_docs = read_bpe_files(type_dir)
            docs_per_type.append(type_docs)
            doc_entropies = get_entropies_from_docs(type_docs)

            print('Data Type:', data_type)
            report_stats(doc_entropies)

            print('Total Corpus Entropy:', get_entropy_from_docs(type_docs))
            print()

        # System-level figures: all documents pooled, then the shared entropy
        # computed from the per-type document lists.
        print('Total System Entropy:', get_entropy_from_docs(flatten(docs_per_type)))
        print('Shared Entropy:', shared_entropy_from_docs(docs_per_type))
        print()

In [None]:
calc_entropy_benchmark(english_systems, 'english')

System: itrust
Data Type: req
Max: 6.655947403853904
Min: 3.6464393446710157
Average: 5.125309432616202
Median: 5.238901256602631
Standard Deviation: 0.7675282320547024
Median Absolute Deviation: 0.9497244658563296
95% of the data fall within 4.992640720488694 and 5.25797814474371
Total Corpus Entropy: 8.138886303909846

Data Type: src
Max: 7.6191109926622875
Min: 4.881336276904696
Average: 6.522153794169928
Median: 6.456654661625311
Standard Deviation: 0.47046257540776115
Median Absolute Deviation: 0.42263165151349985
95% of the data fall within 6.460067634640814 and 6.584239953699043
Total Corpus Entropy: 8.562837202994778

Total System Entropy: 8.68235305057625
shared counts...
Shared Entropy: 6.675375899716576

System: libest
Data Type: req
Max: 8.133644403908326
Min: 4.694019357121934
Average: 6.543663643429754
Median: 6.5960839256764
Standard Deviation: 0.7998515650224866
Median Absolute Deviation: 0.8070430386925508
95% of the data fall within 6.3209835459644115 and 6.7663437408

In [None]:
calc_entropy_benchmark(english_systems, 'english')

System: itrust
Data Type: req
Max: 6.655947403853904
Min: 3.6464393446710157
Average: 5.125309432616202
Median: 5.238901256602631
Standard Deviation: 0.7675282320547024
Median Absolute Deviation: 0.9497244658563296
95% of the data fall within 4.992640720488694 and 5.25797814474371
Total Corpus Entropy: 8.138886303909846

Data Type: src
Max: 7.6191109926622875
Min: 4.881336276904696
Average: 6.522153794169928
Median: 6.456654661625311
Standard Deviation: 0.47046257540776115
Median Absolute Deviation: 0.42263165151349985
95% of the data fall within 6.460067634640814 and 6.584239953699043
Total Corpus Entropy: 8.562837202994778

Total System Entropy: 8.68235305057625
Shared Entropy: 6.675375899716576

System: libest
Data Type: req
Max: 8.133644403908326
Min: 4.694019357121934
Average: 6.543663643429754
Median: 6.5960839256764
Standard Deviation: 0.7998515650224866
Median Absolute Deviation: 0.8070430386925508
95% of the data fall within 6.3209835459644115 and 6.766343740895097
Total Corpu

In [None]:
calc_entropy_benchmark(italian_systems, 'italian')

System: albergate
Data Type: req
Max: 7.451061154959707
Min: 6.7760271692033065
Average: 7.114662232676978
Median: 7.136128583124726
Standard Deviation: 0.18028670118666462
Median Absolute Deviation: 0.19215087521795557
95% of the data fall within 7.021967364311235 and 7.207357101042722
Total Corpus Entropy: 8.333064635075106

Data Type: src
Max: 7.632003142360007
Min: 5.694455777930451
Average: 6.698395952158591
Median: 6.585514345171939
Standard Deviation: 0.47533702601616096
Median Absolute Deviation: 0.55637410510539
95% of the data fall within 6.569894354034409 and 6.826897550282773
Total Corpus Entropy: 8.02635009717346

Total System Entropy: 8.284551907349753
Shared Entropy: 5.704935783592468

System: ebt
Data Type: req
Max: 4.85798099512757
Min: 3.169925001442312
Average: 4.036522483018428
Median: 4.037401197654112
Standard Deviation: 0.4423825943264807
Median Absolute Deviation: 0.49957908952600216
95% of the data fall within 3.896889307383322 and 4.176155658653533
Total Corpu

# SCRATCH WORK

In [None]:

# Scratch: tokenize every raw artifact path of each language in a single call.
for language in languages:
    req_fns = list((path/'datasets'/language).glob('**/*req]'))
    src_fns = list((path/'datasets'/language).glob('**/*src]'))
    tst_fns = list((path/'datasets'/language).glob('**/*tc]'))

    spm = sp.SentencePieceProcessor()
    # `model_name` was never defined; output_bpe_tokenization loads the model
    # by language name, so the same file name is used here.
    spm.Load(str(path/'datasets'/language/f"{language}_bpe.model"))

    # The glob results are already flat lists of paths, so concatenation is
    # enough; flatten() would try to iterate each Path and raise TypeError.
    all_fns = req_fns + src_fns + tst_fns
    # NOTE(review): the tokenize_fns calls in output_bpe_tokenization also pass
    # extensions, an output folder and a label — confirm the two-argument form is supported.
    all_docs = tokenize_fns(all_fns, spm)

In [None]:
# Collect the English requirement, source and test-case paths, then preview up to five of each.
req_fns, src_fns, tst_fns = [
    list((path/'datasets/english').glob(pattern))
    for pattern in ('**/*req]', '**/*src]', '**/*tc]')
]
(req_fns[:5], src_fns[:5], tst_fns[:5])

([PosixPath('../benchmarking/traceability/datasets/english/libest/[libest-raw-req]'),
  PosixPath('../benchmarking/traceability/datasets/english/itrust/[itrust-raw-req]')],
 [PosixPath('../benchmarking/traceability/datasets/english/libest/[libest-raw-src]'),
  PosixPath('../benchmarking/traceability/datasets/english/itrust/[itrust-raw-src]')],
 [PosixPath('../benchmarking/traceability/datasets/english/libest/[libest-raw-tc]')])

In [None]:
# req_fns / src_fns / tst_fns are already flat lists of paths, so plain
# concatenation gives the combined list; flatten() would try to iterate each
# Path and raise TypeError.
all_fns = req_fns + src_fns + tst_fns
# NOTE(review): relies on `spm` left over from an earlier cell, and the tokenize_fns
# calls in output_bpe_tokenization also pass extensions, an output folder and a
# label — confirm the two-argument form is supported.
all_docs = tokenize_fns(all_fns, spm)

In [None]:
# TODO: unfinished scratch cell — this was `req_docs =` with no right-hand side,
# which is a SyntaxError; supply the intended expression or remove the cell.

In [None]:
src_fns[0].parent.name

'libest'

In [None]:
list(path.glob('datasets/italian/*/*'))

[PosixPath('../benchmarking/traceability/datasets/italian/smos/[smos-raw-src]'),
 PosixPath('../benchmarking/traceability/datasets/italian/smos/[smos-raw-req]'),
 PosixPath('../benchmarking/traceability/datasets/italian/smos/[smos-all].txt'),
 PosixPath('../benchmarking/traceability/datasets/italian/albergate/[albergate-all].txt'),
 PosixPath('../benchmarking/traceability/datasets/italian/albergate/[albergate-raw-src]'),
 PosixPath('../benchmarking/traceability/datasets/italian/albergate/[albergate-raw-req]'),
 PosixPath('../benchmarking/traceability/datasets/italian/ebt/[ebt-all].txt'),
 PosixPath('../benchmarking/traceability/datasets/italian/ebt/[ebt-raw-src]'),
 PosixPath('../benchmarking/traceability/datasets/italian/ebt/[ebt-raw-tc].txt'),
 PosixPath('../benchmarking/traceability/datasets/italian/ebt/[ebt-raw-req].txt'),
 PosixPath('../benchmarking/traceability/datasets/italian/etour/[etour-raw-src]'),
 PosixPath('../benchmarking/traceability/datasets/italian/etour/[etour-raw-req

In [None]:
from nbdev.export import notebook2script
notebook2script()

Converted 00_mgmnt.prep.i.ipynb.
Converted 01_exp.i.ipynb.
Converted 02_mgmnt.db.mongo.ipynb.
Converted 03_repr.i.ipynb.
Converted 04_mining.ir.model.ipynb.
Converted 05_mining.ir.i.ipynb.
Converted 06_benchmark.traceability.ipynb.
Converted 07_repr.roberta.train.ipynb.
Converted 08_exp.info.ipynb.
Converted 09_desc.stats.ipynb.
Converted 10_vis.ipynb.
Converted 11_mgmnt.prep.nltk.ipynb.
Converted 12_repr.roberta.eval.ipynb.
Converted 14_mgmnt.prep.bpe.ipynb.
Converted 15_desc.metrics.se.ipynb.
Converted 16_repr.word2vec.train.ipynb.
Converted 17_repr.doc2vec.train.ipynb.
Converted 18_repr.doc2vec.eval.ipynb.
Converted 19_repr.word2vec.eval.ipynb.
Converted 20_benchmark.codegen.ipynb.
Converted 21_inf.i.ipynb.
Converted 22_inf.bayesian.ipynb.
Converted 23_inf.causal.ipynb.
Converted aa_blog.example.ipynb.
Converted ab_templates.example.ipynb.
Converted ac_emp.eval.pp1.rq1.ipynb.
Converted ad_emp.eval.pp1.rq2.ipynb.
Converted ae_emp.eval.pp1.rq3.ipynb.
Converted af_emp.eval.pp1.rq4.ipyn