In [None]:
# default_exp data.exploratory

# Exploration of your data

> This module comprises all the statistical and inference techniques to describe the inner properties of software data. The submodules might include:
>
> - Descriptive statistics
> - Software Metrics
> - Information Theory
> - Learning Principles Detection (Occam's Razor, Biased Data, and Data Snooping)
> - Inference: Probabilistic and Causal

In [None]:
# export
# Imports
import dit

import sentencepiece as sp

from collections import Counter
from pathlib import Path

In [None]:
#hide
from nbdev.showdoc import *

# Albergate

In [None]:
# Root folder of the Albergate traceability benchmark (relative to this notebook).
path = Path('../benchmarking/data/traceability/raw/Albergate_semeru_format')
path

PosixPath('../benchmarking/data/traceability/raw/Albergate_semeru_format')

In [None]:
# Short identifier of the system under study; used below as the file-name
# prefix of the trained SentencePiece model.
system_name = "albergate"

In [None]:
# Gather the requirement (.txt) and source (.java) files in sorted order so
# that every run processes them in the same sequence, then preview a few.
req_fns = sorted(path.joinpath('requirements').glob("*.txt"))
src_fns = sorted(path.joinpath('source_code').glob("*.java"))
req_fns[:5], src_fns[:5]

([PosixPath('../benchmarking/data/traceability/raw/Albergate_semeru_format/requirements/F-GES-01.txt'),
  PosixPath('../benchmarking/data/traceability/raw/Albergate_semeru_format/requirements/F-GES-02.txt'),
  PosixPath('../benchmarking/data/traceability/raw/Albergate_semeru_format/requirements/F-GES-03.txt'),
  PosixPath('../benchmarking/data/traceability/raw/Albergate_semeru_format/requirements/F-GES-04.txt'),
  PosixPath('../benchmarking/data/traceability/raw/Albergate_semeru_format/requirements/F-PRE-01.txt')],
 [PosixPath('../benchmarking/data/traceability/raw/Albergate_semeru_format/source_code/AggiungiBeneServizio.java'),
  PosixPath('../benchmarking/data/traceability/raw/Albergate_semeru_format/source_code/AggiungiSoggiornante.java'),
  PosixPath('../benchmarking/data/traceability/raw/Albergate_semeru_format/source_code/AskChiudiConto.java'),
  PosixPath('../benchmarking/data/traceability/raw/Albergate_semeru_format/source_code/AskChiudiSingoloConAgenzia.java'),
  PosixPath('..

In [None]:
# Directory that receives the trained tokenizer files.
output = Path('/tf/data/')
# The trainer below takes its corpus as a single comma-separated list of paths.
all_fns = ",".join(str(fn) for fn in req_fns + src_fns)

In [None]:
# Train one SentencePiece model on the requirement and source files together;
# the model files are written under `output` with `system_name` as prefix.
# NOTE(review): --hard_vocab_limit=false presumably turns the vocabulary size
# into a soft limit so a small corpus does not abort training — confirm
# against the SentencePiece trainer options.
# NOTE(review): the arguments are passed as one space/comma-delimited string,
# so a file path containing a space or comma would presumably be mis-parsed.
sp.SentencePieceTrainer.train(f'--input={all_fns} --model_prefix={output / system_name} --hard_vocab_limit=false')

True

In [None]:
# Load the tokenizer from the `.model` file that shares the prefix passed to
# the training call above.
spm = sp.SentencePieceProcessor()
spm.Load(f"{output / system_name}.model")

True

In [None]:
# Sanity check: tokenize a snippet mixing English prose and Java keywords.
spm.EncodeAsPieces("this is a test, public static void main")

['▁this',
 '▁i',
 's',
 '▁a',
 '▁',
 'tes',
 't',
 ',',
 '▁pu',
 'blic',
 '▁sta',
 'tic',
 '▁vo',
 'id',
 '▁ma',
 'in']

## Get token counts

In [None]:
def get_counter(fns, spm, encoding="ISO-8859-1"):
    """Count tokenizer pieces across a collection of text files.

    Parameters
    ----------
    fns : iterable of path-like
        Files to read; each one is opened in text mode and read entirely.
    spm : object exposing ``EncodeAsPieces(text) -> list of str``
        Tokenizer used to split each file's text (in this notebook, a loaded
        ``SentencePieceProcessor``).
    encoding : str, default "ISO-8859-1"
        Text encoding used to decode every file.

    Returns
    -------
    collections.Counter
        Maps each piece to its number of occurrences over all files. Keys are
        kept in order of first appearance; an empty ``fns`` yields an empty
        counter.
    """
    cnt = Counter()
    for fn in fns:
        with open(fn, encoding=encoding) as f:
            # Fold each file's pieces straight into the counter so the full
            # token stream of the corpus is never held in memory at once.
            cnt.update(spm.EncodeAsPieces(f.read()))
    return cnt

In [None]:
# Build the token counters here: the overlap analysis in the following cells
# reads req_cnt/src_cnt, so they must exist on a fresh top-to-bottom run.
req_cnt = get_counter(req_fns, spm)
src_cnt = get_counter(src_fns, spm)

# Total token occurrences in the requirements. Summing the Counter itself
# would iterate its string keys and raise TypeError, so sum the counts.
sum(req_cnt.values())

In [None]:
# Vocabularies: the distinct tokens seen in requirements and in source code
# (building a set from a Counter keeps only its keys).
req_set = set(req_cnt)
src_set = set(src_cnt)

In [None]:
# Tokens that occur in both the requirements and the source code.
z = req_set & src_set

In [None]:
len(z)

535

In [None]:
# Occurrences, summed over both corpora, of the tokens shared by both vocabularies.
tot = sum(req_cnt[i] + src_cnt[i] for i in z)

In [None]:
tot

43339

In [None]:
sum(req_cnt.values())

6617

In [None]:
(sum(req_cnt.values()) + sum(src_cnt.values()))

103002

In [None]:
# Share of all token occurrences (requirements + source) that belong to
# tokens present in both vocabularies.
tot / (sum(req_cnt.values()) + sum(src_cnt.values()))

0.42075882021708316

In [None]:
# Inspect the tokens shared by requirements and source code. (The previous
# unterminated `sum(` call was a syntax error, and a set of strings cannot be
# summed in any case.)
z


{'"',
 "'",
 ')',
 ').',
 ');',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '1.',
 '2',
 '3',
 '4',
 '4.',
 '7',
 '8',
 '9',
 ':',
 ';',
 'Aggiornamento',
 'All',
 'C',
 'Con',
 'Dati',
 'Descrizione',
 'E',
 'ES',
 'Effettuat',
 'G',
 'Gestione',
 'M',
 'N',
 'Not',
 'Numero',
 'O',
 'PR',
 'Per',
 'Periodo',
 'Q',
 'Richiest',
 'TAL',
 'TO',
 'U',
 'Un',
 'Vis',
 'a',
 'acco',
 'addebitare',
 'addebito',
 'agenzia',
 'aggio',
 'albergo',
 'alva',
 'amente',
 'ance',
 'ando',
 'anno',
 'annulla',
 'ano',
 'anti',
 'appe',
 'ar',
 'are',
 'ari',
 'ata',
 'ate',
 'ategoria',
 'ati',
 'ativo',
 'atti',
 'ava',
 'ave',
 'azione',
 'azioni',
 'b',
 'bia',
 'bile',
 'blic',
 'c',
 'ce',
 'certa',
 'ch',
 'ci',
 'co',
 'cola',
 'compar',
 'con',
 'conosc',
 'consiste',
 'conto',
 'corr',
 'd',
 'data',
 'e',
 'ed',
 'elenco',
 'empi',
 'end',
 'ene',
 'ener',
 'ente',
 'enti',
 'er',
 'ere',
 'ero',
 'errore',
 'es',
 'esce',
 'etto',
 'fa',
 'fficien',
 'ger',
 'get',
 'ggio',
 'gh',
 'gli',
 'gua

In [None]:
# Token frequency tables: one for the requirement files, one for the source files.
req_cnt = get_counter(req_fns, spm)
src_cnt = get_counter(src_fns, spm)

In [None]:
req_cnt.most_common(100)

[('▁', 281),
 (':', 232),
 ('.', 176),
 ('e', 171),
 ('▁di', 149),
 ('o', 140),
 ('i', 134),
 ('à', 112),
 ('▁F', 82),
 (',', 81),
 ('a', 79),
 ("'", 59),
 ('▁il', 57),
 ('re', 54),
 ('▁e', 54),
 ('-', 53),
 ('▁in', 53),
 ('sione', 52),
 ('▁per', 45),
 ('t', 45),
 ('▁del', 45),
 ('▁che', 44),
 ('▁la', 43),
 ('▁stanza', 42),
 ('▁delle', 41),
 ('▁cliente', 40),
 ('▁un', 38),
 ('▁a', 38),
 ('▁stanze', 38),
 ('▁da', 37),
 ('s', 36),
 ('▁prenotazione', 35),
 ('▁Requisito', 34),
 ('to', 34),
 ('ive', 34),
 ('▁collegat', 34),
 ('▁Stabil', 34),
 ('▁essere', 33),
 ('▁una', 33),
 ('▁compren', 33),
 ('▁o', 31),
 ('▁della', 30),
 ('▁si', 29),
 ('G', 28),
 ('▁dell', 28),
 ('▁le', 28),
 ('\x92', 28),
 ('E', 28),
 ('it', 26),
 ('▁i', 26),
 ('▁al', 26),
 ('▁numero', 25),
 ('▁l', 25),
 ('▁conto', 25),
 ('▁L', 24),
 ('zione', 23),
 ('▁dati', 23),
 ('PR', 23),
 ('▁non', 22),
 ('▁richiesto', 22),
 ('▁A', 22),
 ('le', 22),
 ('C', 21),
 ('▁sistema', 20),
 ('spetta', 20),
 ('▁S', 20),
 ('ato', 20),
 ('▁deve'

In [None]:
src_cnt.most_common(100)

[('.', 4883),
 ('▁', 4854),
 ('(', 3461),
 (');', 3017),
 (',', 2373),
 ('_', 2107),
 ('onstrain', 1671),
 ('▁=', 1664),
 ('▁{', 1438),
 ('▁}', 1438),
 ('t', 1310),
 ('s', 1287),
 ('C', 1283),
 ('E', 1177),
 ('5,', 1174),
 (';', 1152),
 ('▁new', 1109),
 ('0,', 1044),
 ('▁1,', 1033),
 ('▁GridBag', 1002),
 ('();', 785),
 ('()', 696),
 ('▁i', 668),
 ('get', 659),
 (')', 654),
 ('Utils', 650),
 ('▁(', 640),
 ('0.0,', 628),
 ('c', 585),
 ('List', 572),
 ('▁if', 563),
 ('id', 541),
 ('ener', 538),
 ('panel', 533),
 ('▁vo', 530),
 ('("', 524),
 ('set', 495),
 ('Action', 492),
 ('Text', 460),
 ('▁pu', 454),
 ('▁panel', 452),
 ('blic', 449),
 ('W', 446),
 ('ST', 446),
 ('NON', 423),
 ('▁"', 398),
 ('setE', 389),
 ('▁testo', 376),
 ('1,', 371),
 ('new', 368),
 ('▁1.0,', 366),
 ('▁e', 363),
 ('this', 356),
 ('i', 352),
 ('▁//', 344),
 ('e', 337),
 ('d', 333),
 ('NORTH', 332),
 ('"', 324),
 ('o', 321),
 ('▁5', 312),
 ('mport', 310),
 ('add', 310),
 ('false', 301),
 ('a', 289),
 ('*', 274),
 ('Stat

In [None]:
# Total number of token occurrences in the source code (normalizer for `ps` below).
sz = sum(src_cnt.values())

In [None]:
# Give every source-code token a zero-padded index label and record its
# relative frequency, echoing each token with its raw count along the way.
alphabet, ps = [], []
for i, tok in enumerate(src_cnt.keys()):
    count = src_cnt[tok]
    print(tok, count)
    alphabet += [f'{i:04}']
    ps += [count / sz]

▁pack 111
age 55
▁interfacce 59
; 1152
▁i 668
mport 310
▁j 178
ava 148
. 4883
a 289
w 110
t 1310
* 274
event 55
▁comm 125
on 107
utili 57
ty 69
▁mod 69
uli 62
▁pu 454
blic 449
▁class 60
▁Aggiungi 30
BeneServizio 54
▁ext 60
end 61
s 1287
▁Inserisci 19
BeniServizi 18
▁{ 1438
▁Button 155
▁annulla 48
1, 371
2, 168
▁conferma 32
2; 73
▁TextField 171
▁ 4854
tf 26
_ 2107
supp 24
, 2373
rid 31
stanza 148
mot 17
▁Label 248
▁label 247
( 3461
Frame 16
▁parent 46
) 654
▁sup 84
er 99
(" 524
Cre 131
azione 6
▁di 136
▁un 50
▁nuovo 6
▁bene 23
/ 16
servizio 31
riduzione 2
supplemento 1
" 324
); 3017
▁padre 141
▁= 1664
setE 389
n 245
able 213
d 333
false 301
▁setup 26
Nuov 2
i 352
(); 785
▁inizializz 25
▁setS 3
iz 4
e 337
4 35
5 24
0, 1044
6 15
00 4
setV 104
isi 104
ble 119
true 237
▁} 1438
▁vo 530
id 541
() 696
▁// 344
o 321
▁pannelli 22
remove 51
panel 533
[0] 59
▁panel 452
▁new 1109
▁Panel 133
[0]. 127
set 495
Layout 127
grid 7
1); 14
▁il 195
▁pannello 90
▁in 52
▁alto 18
Tip 2
Font 123
Configurazione 

scanVector 12
▁!( 26
("") 32
index 21
Period 10
+ 12
▁Sca 1
▁seguente 5
▁utilizz 2
ero 2
▁SubStanze 6
▁caller 9
▁title 8
▁caption 10
title 9
caption 6
nformazioni 2
▁sul 2
▁posti 3
tiche 1
ari 1
▁4 21
▁50 1
▁botton 4
▁cliente 28
caller 17
▁2) 3
==3) 2
▁pro 17
ected 17
Base 3
mycheckboxes 1
setDisp 1
▁errori 9
<= 50
▁4) 3
anca 18
▁52) 5
scrizione 3
▁52 1
(350, 2
))+ 1
torna 1
questo 9
▁serv 5
▁quanto 5
▁este 2
▁classe 2
iniz 3
▁utilizzat 2
▁dall 5
[10]. 9
[10] 11
=3; 2
lo 5
▁while 10
▁5) 4
RicercaStanza 12
▁mas 24
ricerca 12
("",1 1
2) 3
Inserisci 4
",1 4
COMMISSIONATA 9
startAvvi 3
("",2 1
ag 33
PENSIONE 9
COMPLETA 5
nome 16
[7]. 7
[8]. 11
[6]. 9
▁AskDialog 2
▁ask 4
▁an 1
sw 8
myCheckbox 3
▁Assegna 8
▁Bloccar 2
▁fin 2
▁ricev 1
imento 1
▁caparra 1
ggiorna 2
ASSEGNATA 4
▁tipo 24
ins 7
tot 10
Locale 2
OCCUPATA 9
▁1) 12
▁inserito 10
▁digitat 9
▁cognome 9
▁documento 2
8. 9
9. 9
10. 10
11. 9
12. 10
▁nazione 2
▁InserisciStanze 4
▁ListaStanze 9
Stanze 8
Im 1
possibile 1
▁inseri 1
gi 9
unt 1
▁g

In [None]:
sz

96385

In [None]:
src_cnt

Counter({'▁pack': 111,
         'age': 55,
         '▁interfacce': 59,
         ';': 1152,
         '▁i': 668,
         'mport': 310,
         '▁j': 178,
         'ava': 148,
         '.': 4883,
         'a': 289,
         'w': 110,
         't': 1310,
         '*': 274,
         'event': 55,
         '▁comm': 125,
         'on': 107,
         'utili': 57,
         'ty': 69,
         '▁mod': 69,
         'uli': 62,
         '▁pu': 454,
         'blic': 449,
         '▁class': 60,
         '▁Aggiungi': 30,
         'BeneServizio': 54,
         '▁ext': 60,
         'end': 61,
         's': 1287,
         '▁Inserisci': 19,
         'BeniServizi': 18,
         '▁{': 1438,
         '▁Button': 155,
         '▁annulla': 48,
         '1,': 371,
         '2,': 168,
         '▁conferma': 32,
         '2;': 73,
         '▁TextField': 171,
         '▁': 4854,
         'tf': 26,
         '_': 2107,
         'supp': 24,
         ',': 2373,
         'rid': 31,
         'stanza': 148,
         'mot': 

In [None]:
ps

[0.0011516314779270633,
 0.00057062820978368,
 0.0006121284432224931,
 0.011952067230378172,
 0.006930538984281786,
 0.0032162680915080147,
 0.0018467603880271827,
 0.0015355086372360845,
 0.05066140997043108,
 0.002998391865954246,
 0.00114125641956736,
 0.013591326451211288,
 0.002842765990558697,
 0.00057062820978368,
 0.0012968822949629092,
 0.0011101312444882503,
 0.0005913783265030865,
 0.0007158790268195259,
 0.0007158790268195259,
 0.0006432536183016029,
 0.004710276495305286,
 0.00465840120350677,
 0.0006225035015821964,
 0.0003112517507910982,
 0.0005602531514239768,
 0.0006225035015821964,
 0.0006328785599418997,
 0.013352700108938112,
 0.0001971261088343622,
 0.00018675105047465893,
 0.014919333921253308,
 0.0016081340457540073,
 0.0004980028012657571,
 0.0038491466514499143,
 0.0017430098044301499,
 0.00033200186751050474,
 0.0007573792602583389,
 0.0017741349795092596,
 0.05036053327799969,
 0.00026975151735228513,
 0.021860247963894797,
 0.00024900140063287854,
 0.024620

In [None]:
alphabet

['0000',
 '0001',
 '0002',
 '0003',
 '0004',
 '0005',
 '0006',
 '0007',
 '0008',
 '0009',
 '0010',
 '0011',
 '0012',
 '0013',
 '0014',
 '0015',
 '0016',
 '0017',
 '0018',
 '0019',
 '0020',
 '0021',
 '0022',
 '0023',
 '0024',
 '0025',
 '0026',
 '0027',
 '0028',
 '0029',
 '0030',
 '0031',
 '0032',
 '0033',
 '0034',
 '0035',
 '0036',
 '0037',
 '0038',
 '0039',
 '0040',
 '0041',
 '0042',
 '0043',
 '0044',
 '0045',
 '0046',
 '0047',
 '0048',
 '0049',
 '0050',
 '0051',
 '0052',
 '0053',
 '0054',
 '0055',
 '0056',
 '0057',
 '0058',
 '0059',
 '0060',
 '0061',
 '0062',
 '0063',
 '0064',
 '0065',
 '0066',
 '0067',
 '0068',
 '0069',
 '0070',
 '0071',
 '0072',
 '0073',
 '0074',
 '0075',
 '0076',
 '0077',
 '0078',
 '0079',
 '0080',
 '0081',
 '0082',
 '0083',
 '0084',
 '0085',
 '0086',
 '0087',
 '0088',
 '0089',
 '0090',
 '0091',
 '0092',
 '0093',
 '0094',
 '0095',
 '0096',
 '0097',
 '0098',
 '0099',
 '0100',
 '0101',
 '0102',
 '0103',
 '0104',
 '0105',
 '0106',
 '0107',
 '0108',
 '0109',
 '0110',
 

In [None]:
def get_dist(cnt):
    """Turn a token frequency table into a ``dit.Distribution``.

    Parameters
    ----------
    cnt : collections.Counter (or any mapping of token -> count)
        Frequency of each token.

    Returns
    -------
    dit.Distribution
        One outcome per token, labelled by the token's zero-padded position
        in ``cnt``, with probability ``count / total count``.
    """
    sz = sum(cnt.values())
    # Labels stay 4 digits wide (the original format) for up to 10,000 tokens
    # and widen beyond that: a fixed width would produce labels of mixed
    # length, which dit rejects because all outcomes must share one length.
    width = max(4, len(str(max(len(cnt) - 1, 0))))
    alphabet = [f'{i:0{width}}' for i in range(len(cnt))]
    ps = [freq / sz for freq in cnt.values()]

    return dit.Distribution(alphabet, ps)

In [None]:
# Shannon entropy of the requirements token distribution.
# NOTE(review): dit presumably reports this in bits (log base 2) — confirm.
dit.shannon.entropy(get_dist(req_cnt))

8.257906993359518

In [None]:
# Shannon entropy of the source-code token distribution.
# NOTE(review): dit presumably reports this in bits (log base 2) — confirm.
dit.shannon.entropy(get_dist(src_cnt))

8.005993952354078