In [1]:
import logging
from pathlib import Path
import seaborn as sns; sns.set_theme()
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sys
import torch
import os
import re
pd.options.display.float_format = '{:.3f}'.format

In [2]:
from tokenizers import ByteLevelBPETokenizer
import torch
import importlib
from fairseq.models.transformer import TransformerModel
from tokenizers import ByteLevelBPETokenizer

In [3]:
# Append INFO-and-above log records to a text file, one timestamped line per record.
# NOTE(review): the path is relative to the notebook's working directory, and it uses
# a 'datax' directory whereas the paths in param_default use 'data' — confirm that
# this is intentional and that ../datax/logs/ exists before running this cell.
logging.basicConfig(
    filename="../datax/logs/logger_bart_sampling.txt",
    filemode='a',
    format='%(asctime)s : %(levelname)s : %(message)s', 
    level=logging.INFO
    )

In [5]:
def param_default(corpus='fm'):
    """Build the configuration dictionary used throughout this notebook.

    Parameters
    ----------
    corpus : str, optional
        Name of the dataset sub-directory under ``athena_test`` (for example
        ``'fm'`` or ``'fm_fc_ms_ff'``). Defaults to ``'fm'``, the scope that was
        previously hard-coded, so ``param_default()`` returns the same values
        as before.

    Returns
    -------
    dict
        Paths and settings keyed by name:

        * ``bpe_path``: common prefix of the tokenizer vocab/merges files.
        * ``eval_raw`` / ``test_raw`` / ``train_raw``: ``[methods, tests]``
          file pairs of each raw split.
        * ``data_labels``: which of the ``*_raw`` entries are actually loaded.
        * ``super_data_checkpoint``: directory of the pandas checkpoints.
        * ``out_processed``, ``model_name_or_path``, ``checkpoint_file``,
          ``output_sample``: remaining locations used by the notebook.
        * ``corpus``: the corpus name the paths were built for.
    """
    data_path = Path('/workspaces/code-rationales/semeru-datasets/athena_test/' + corpus + '/')
    data_path_raw = data_path / 'raw'

    def raw_pair(split):
        # Every split stores its methods and its tests in two sibling files.
        return [data_path_raw / split / 'input.methods.txt',
                data_path_raw / split / 'output.tests.txt']

    return {
        'bpe_path' : '/workspaces/code-rationales/scripts/tokenizer/universal_tokenizer/roberta_aug_spaces',
        'eval_raw': raw_pair('eval'),
        'test_raw': raw_pair('test'),
        'train_raw': raw_pair('train'),
        'data_labels' : ['test_raw'],#['eval_raw','test_raw','train_raw'], <----- Just Test
        'super_data_checkpoint' : data_path / 'pandas',
        'out_processed' : '/datasets/out_processed/',
        'model_name_or_path' : '/workspaces/code-rationales/data/bart-fairseq/checkpoint_dir_athena_ms/models/', #Model Path
        'checkpoint_file': 'checkpoint_best.pt', #Model
        'output_sample' : '/workspaces/code-rationales/data/sampling/bart/',
        'corpus': corpus
    }

In [6]:
params = param_default()
params['checkpoint_file']

'checkpoint_best.pt'

In [7]:
params['test_raw']

[PosixPath('/workspaces/code-rationales/semeru-datasets/athena_test/fm/raw/test/input.methods.txt'),
 PosixPath('/workspaces/code-rationales/semeru-datasets/athena_test/fm/raw/test/output.tests.txt')]

## Universal Tokenizer

In [8]:
def load_tokenizer(bpe_path):
    """Create a byte-level BPE tokenizer from a vocab/merges file pair.

    ``bpe_path`` is the common prefix of both files: the vocabulary is read
    from ``<bpe_path>-vocab.json`` and the merges from ``<bpe_path>-merges.txt``.
    """
    prefix = str(bpe_path)
    vocab_file = prefix + '-vocab.json'
    merges_file = prefix + '-merges.txt'
    return ByteLevelBPETokenizer(vocab_file, merges_file)

In [9]:
def lazy_decode(bpe_java):
    """Turn a space-joined string of BPE tokens back into plain text.

    Literal spaces (the token separators) are dropped, 'Ġ' becomes a space
    and 'Ċ' becomes a newline.
    """
    # One pass is enough: none of the replacement outputs is itself a key.
    substitutions = str.maketrans({' ': None, 'Ġ': ' ', 'Ċ': '\n'})
    return bpe_java.translate(substitutions)

In [10]:
params = param_default()

In [11]:
def prettify_java(minified_java):
    """Best-effort re-indentation of single-line (minified) Java source.

    A line break is inserted after every '{', '}' and ';', and each resulting
    line is indented by four spaces per open brace. Known limitations: `for`
    headers and brace-initialised sets are split across lines, and no blank
    lines or comments are restored.
    """
    expanded = minified_java
    for token in ('{', '}', ';'):
        expanded = expanded.replace(token, token + '\n')

    depth = 0
    rendered = []
    for raw_line in expanded.splitlines():
        opens_with_closer = raw_line.lstrip().startswith('}')
        # A closing brace at the start of the line dedents the line itself.
        if opens_with_closer:
            depth -= 1
        rendered.append(depth * '    ' + raw_line + '\n')
        if raw_line.endswith('{'):
            depth += 1
        # A closing brace after other content only dedents the lines that follow.
        if raw_line.endswith('}') and not opens_with_closer:
            depth -= 1
    return ''.join(rendered)

In [12]:
params['bpe_path']

'/workspaces/code-rationales/scripts/tokenizer/universal_tokenizer/roberta_aug_spaces'

In [13]:
tokenizer = load_tokenizer(params['bpe_path'])

## Data Loading

In [14]:
def method_size_vector( method_vector ):
    '''Return the number of tokens of every method.

    method_vector is expected to be a sequence in which each element is
    itself the sequence of tokens of one method; the result is a list with
    one length per method, in the same order.
    '''
    return list(map(len, method_vector))

### Super Set Code: preprocess the configured datasets

In [15]:
def super_set_code():
    """Load the configured raw files and tokenize them into one DataFrame.

    Reads the module-level ``params`` and ``tokenizer``. For every file listed
    under each label in ``params['data_labels']`` three columns are appended:

    * ``<label>``: the raw text, one file line per row;
    * ``<label>_bpe``: the BPE tokens of each row;
    * ``<label>_method_size``: the number of BPE tokens of each row.

    ``<label>`` is the first dotted part of the file name
    (``input.methods.txt`` -> ``input``).

    Returns
    -------
    pandas.DataFrame
        The column-wise concatenation of all processed files.
    """
    # NOTE(review): column names depend only on the file name, so enabling more
    # than one split in params['data_labels'] would create duplicate columns —
    # confirm before loading several splits at once.
    df = pd.DataFrame()
    for label in params['data_labels']:
        for path_data in params[ label ]:
            # Raw string: '\.' and '\/' are not valid escapes in a normal string literal.
            new_label = re.split(r'\.|\/', str(path_data))[-3]
            # sep="\0" is presumably chosen so that every whole line lands in a single column.
            raw_column = pd.read_csv( path_data, sep="\0", header=None, names=[new_label] ) #reading file
            df = pd.concat([df, raw_column], axis=1)
            df[new_label+'_bpe'] = [ enc.tokens for enc in tokenizer.encode_batch( df[new_label].values ) ] #bpe
            df[new_label+'_method_size'] = method_size_vector( df[new_label+'_bpe'].values ) #counting tokens

    return df

In [16]:
super_data = super_set_code() #[WARNING] Use it when not computed! Otherwise use Loading Json

In [17]:
super_data.head()

Unnamed: 0,input,input_bpe,input_method_size,output,output_bpe,output_method_size
0,public static Date yearStart() { final Gregori...,"[public, Ġstatic, ĠDate, Ġyear, Start, (), Ġ{,...",42,@Test public void yearStart() { Date date = Da...,"[@, Test, Ġpublic, Ġvoid, Ġyear, Start, (), Ġ{...",61
1,public static Date yearEnd() { final Gregorian...,"[public, Ġstatic, ĠDate, Ġyear, End, (), Ġ{, Ġ...",65,@Test public void yearEnd() { Date date = Date...,"[@, Test, Ġpublic, Ġvoid, Ġyear, End, (), Ġ{, ...",74
2,public void validate(TokenBinding clientDataTo...,"[public, Ġvoid, Ġvalidate, (, Token, B, inding...",170,@Test void validate_invalid_bindingId_test() {...,"[@, Test, Ġvoid, Ġvalidate, _, in, valid, _, b...",110
3,public static int getUnsignedShort(ByteBuffer ...,"[public, Ġstatic, Ġint, Ġget, Un, signed, Shor...",29,@Test void getUnsignedShort_test1() { byte[] b...,"[@, Test, Ġvoid, Ġget, Un, signed, Short, _, t...",73
4,public static boolean isWithinUnsignedLong(Big...,"[public, Ġstatic, Ġboolean, Ġis, Within, Un, s...",41,@Test void isWithinUnsignedLong_test() { asser...,"[@, Test, Ġvoid, Ġis, Within, Un, signed, Long...",145


### Saving each dataset from super_data as checkpoint 

In [18]:
super_data.to_parquet(params['super_data_checkpoint'] / 'test_data_input_output.parquet')

### Loading Super Set

In [19]:
#super_data = pd.read_parquet(params['super_data_checkpoint'] / 'test_data_input_output.parquet')

### Testing Super Set

In [20]:
#Size Statistics of Source Set
super_data.input_method_size.describe()

count   78388.000
mean      162.572
std       357.421
min         4.000
25%        38.000
50%        80.000
75%       173.000
max      6016.000
Name: input_method_size, dtype: float64

## Model Loading and Testing

In [21]:
# Load the pretrained model from the directory and checkpoint file configured in params.
# NOTE(review): the commented-out argument below reads params['data_preprocessed'],
# a key that param_default does not define — add the key before re-enabling it.
model = TransformerModel.from_pretrained(
  model_name_or_path = params['model_name_or_path'],
  checkpoint_file = params['checkpoint_file'],
  #data_name_or_path = params['data_preprocessed']
)

2023-07-23 22:53:04 | INFO | fairseq.file_utils | loading archive file /workspaces/code-rationales/data/bart-fairseq/checkpoint_dir_athena_ms/models/
2023-07-23 22:53:05 | INFO | fairseq.tasks.translation | [input.methods] dictionary: 50348 types
2023-07-23 22:53:05 | INFO | fairseq.tasks.translation | [output.tests] dictionary: 50348 types
2023-07-23 22:53:07 | INFO | fairseq.models.fairseq_model | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'tensorboard_logdir': '/home/davidna/data/dummy/models/checkpoint_dir/tensorboard', 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 2, 'fp16_scale_window': 512, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp'

In [22]:
## Move model to GPU if available and trigger evaluation mode
if torch.cuda.is_available():
  model.cuda()
model.eval()

GeneratorHubInterface(
  (models): ModuleList(
    (0): BARTModel(
      (encoder): TransformerEncoderBase(
        (dropout_module): FairseqDropout()
        (embed_tokens): Embedding(50348, 512, padding_idx=1)
        (embed_positions): SinusoidalPositionalEmbedding()
        (layers): ModuleList(
          (0): TransformerEncoderLayerBase(
            (self_attn): MultiheadAttention(
              (dropout_module): FairseqDropout()
              (k_proj): Linear(in_features=512, out_features=512, bias=True)
              (v_proj): Linear(in_features=512, out_features=512, bias=True)
              (q_proj): Linear(in_features=512, out_features=512, bias=True)
              (out_proj): Linear(in_features=512, out_features=512, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
            (dropout_module): FairseqDropout()
            (activation_dropout_module): FairseqDropout()
            (fc1): Linear(in_features=512,

In [23]:
# Alias the first entry of `model.models` as `model.model`.
# NOTE(review): no visible cell reads `model.model`; presumably code outside this
# notebook section expects the attribute — confirm it is still needed.
model.model = model.models[0]

In [24]:
model.device

device(type='cuda', index=0)

In [25]:
# Upper bound, in BPE tokens, on the input methods kept for sampling and on the
# number of tokens encoded per method.
# NOTE(review): this value was annotated as "comes from the mean of input method size",
# but describe() on the full set reports a mean of about 162.6 — confirm where 412 comes from.
SET_METHOD_SIZE = 412 #<---- HARDCODED
super_data[super_data.input_method_size <= SET_METHOD_SIZE ].input_method_size.describe()

count   72928.000
mean      104.267
std        90.265
min         4.000
25%        36.000
50%        73.000
75%       145.000
max       412.000
Name: input_method_size, dtype: float64

In [26]:
def joining_encode_tokens( arr_tokens, model, max_size=None ):
    """Join BPE tokens with spaces and encode the result with ``model``.

    Parameters
    ----------
    arr_tokens : sequence of str
        BPE tokens of one method.
    model : object
        Anything exposing ``encode(str)``.
    max_size : int, optional
        Maximum number of tokens kept before joining; longer inputs are cut to
        their first ``max_size`` tokens. Defaults to the module-level
        ``SET_METHOD_SIZE`` so existing two-argument calls behave as before.

    Returns
    -------
    Whatever ``model.encode`` returns for the joined string.
    """
    if max_size is None:
        max_size = SET_METHOD_SIZE
    if len(arr_tokens) > max_size:
        arr_tokens = arr_tokens[0:max_size]
    focal_code = " ".join(arr_tokens)
    return model.encode( focal_code )

In [27]:
#Sampling without replacement
#Testing size: 78388
#Sampling size with 95% of confidence and 3% Error = 1053 ~ 1000
def code_sampling(df_super_data ,  FLAG_SAMPLING = True, SIZE_SAMPLING = 1000, random_state = 3):
    """Select methods from ``df_super_data`` and encode their BPE tokens.

    Uses the module-level ``model`` and ``SET_METHOD_SIZE``.

    Parameters
    ----------
    df_super_data : pandas.DataFrame
        Frame with at least the ``input_bpe`` and ``input_method_size`` columns.
    FLAG_SAMPLING : bool, optional
        When True (default), draw ``SIZE_SAMPLING`` rows without replacement
        from the methods whose size is at most ``SET_METHOD_SIZE`` and store
        the encodings in ``input_tokens``. When False, keep every row of
        ``df_super_data`` and store the encodings in ``input_tokens_pos``.
    SIZE_SAMPLING : int, optional
        Number of rows to draw when sampling.
    random_state : int, optional
        Seed passed to ``DataFrame.sample`` for reproducibility.

    Returns
    -------
    pandas.DataFrame
        A new frame (the argument is not modified) with the extra token column.
    """
    if FLAG_SAMPLING:
        # Filter on the frame that was passed in, not on the global `super_data`.
        eligible = df_super_data[df_super_data.input_method_size <= SET_METHOD_SIZE ]
        df_sampled_code = eligible.sample(
                n = SIZE_SAMPLING,
                replace = False,
                random_state = random_state # For reproducibility
        )
        token_column = 'input_tokens'
    else:
        # NOTE(review): presumably "no sampling" means encoding the whole set.
        # Previously the encodings of the full frame were assigned to the sampled
        # frame, which fails on a length mismatch or misaligns rows — confirm intent.
        df_sampled_code = df_super_data.copy()
        token_column = 'input_tokens_pos'

    # Encode exactly the rows that are returned so values and index always line up.
    df_sampled_code[token_column] = [ joining_encode_tokens(arr_sample, model=model) for arr_sample in df_sampled_code.input_bpe.values ]
    return df_sampled_code

In [28]:
df_sampled_code = code_sampling(
    df_super_data = super_data,
    SIZE_SAMPLING = 100
)

In [29]:
df_sampled_code['input_tokens']

1340     [tensor(15110), tensor(9527), tensor(41552), t...
1242     [tensor(1039), tensor(10089), tensor(3850), te...
36220    [tensor(15110), tensor(25156), tensor(28696), ...
21814    [tensor(15110), tensor(2010), tensor(47613), t...
8060     [tensor(37659), tensor(256), tensor(7511), ten...
                               ...                        
697      [tensor(15110), tensor(221), tensor(4628), ten...
70739    [tensor(15110), tensor(25156), tensor(47893), ...
158      [tensor(15110), tensor(25156), tensor(13842), ...
33352    [tensor(1039), tensor(49116), tensor(285), ten...
11193    [tensor(15110), tensor(25156), tensor(26602), ...
Name: input_tokens, Length: 100, dtype: object

In [30]:
df_sampled_code.shape

(100, 7)

In [31]:
df_sampled_code.input_tokens.values[0]

tensor([15110,  9527, 41552,   495, 26769,  6761, 15698,   120, 48720,   495,
        26769, 14768,  1640,  6156, 26602, 20686, 31723,     6,   507, 26602,
        20686, 47322,     6,   507, 49210,   455, 21109, 47779,     6,   507,
        49210,  2365,     6,   507, 49210,   371, 38210,    43,  6989,   272,
         8645,   293, 38644, 14086, 48847, 25522,   507, 33536,  3653,  5457,
          120, 47952, 47006,   507,  6494, 47279,  5799,  5457,  3653,     4,
        44814,  1640, 33806, 42703,   322, 22609,  1640, 45589, 41967,  5290,
            4,  6460, 23295, 46460,   495, 26769, 14768,  1640, 48095, 31723,
            6, 20686, 47322, 48749,   507, 33536, 47806,  1263,  5457,  5799,
            4, 48360, 48348,  1640, 47576, 40104,     4,  3632, 23075,  1215,
        10370, 11337,  3850,  1215, 14280,  2620,     6,    22, 29225,  8070,
          479, 48360, 48348,  1640, 47576, 40104,     4,  3632, 23075,  1215,
        35654,  2688,  1691,  1215, 14280,  2620,     6,    22, 

In [32]:
df_sampled_code.head()

Unnamed: 0,input,input_bpe,input_method_size,output,output_bpe,output_method_size,input_tokens
1340,public List<Dependency> getModuleDependencies(...,"[public, ĠList, <, D, epend, ency, >, Ġget, Mo...",385,@Test public void getModuleDependencies() thro...,"[@, Test, Ġpublic, Ġvoid, Ġget, Module, D, epe...",274,"[tensor(15110), tensor(9527), tensor(41552), t..."
1242,"@DELETE @Path(""/{name}"") public Response delet...","[@, DE, LE, TE, Ġ@, Path, ("", /, {, name, }, ""...",120,@Test public void deleteAProductWithoutDeletio...,"[@, Test, Ġpublic, Ġvoid, Ġdelete, AP, rodu, c...",179,"[tensor(1039), tensor(10089), tensor(3850), te..."
36220,"public static <T, U> FlowableSubscriber<T> sub...","[public, Ġstatic, Ġ<, T, ,, ĠU, >, ĠFlow, able...",84,@Test public void testResultFunctionThrows() {...,"[@, Test, Ġpublic, Ġvoid, Ġtest, Result, Funct...",214,"[tensor(15110), tensor(25156), tensor(28696), ..."
21814,public SecurityRoleFunctionEntity getSecurityR...,"[public, ĠSecurity, Role, Function, Entity, Ġg...",128,@Test public void testGetSecurityRoleFunctionE...,"[@, Test, Ġpublic, Ġvoid, Ġtest, Get, Security...",148,"[tensor(15110), tensor(2010), tensor(47613), t..."
8060,protected M loadByQuery(Bson query) { return l...,"[protected, ĠM, Ġload, By, Query, (, B, son, Ġ...",21,@Test public void testLoadByQuery() throws Exc...,"[@, Test, Ġpublic, Ġvoid, Ġtest, Load, By, Que...",130,"[tensor(37659), tensor(256), tensor(7511), ten..."


In [33]:
SAMPLES = 30 #<---- Hardcoded: passed as `n` (the beam size) to df_sample_generation
MAX_GEN_TOK = 200 # NOTE(review): not referenced by any visible cell (df_sample_generation passes no length limit) — wire it in or remove it

In [34]:
def df_sample_generation(
    df_sampled_code, 
    model, 
    n=1, 
    ):
    """Generate ``n`` beam hypotheses for every encoded input.

    Parameters
    ----------
    df_sampled_code : pandas.DataFrame
        Frame with an ``input_tokens`` column holding one encoded input per row.
    model : object
        Anything exposing ``generate(tokens, beam=..., ...)`` that returns a
        list of dicts, each with a ``'tokens'`` tensor.
    n : int, optional
        Beam size forwarded to ``model.generate``; also the number of
        hypothesis columns expected in the result.

    Returns
    -------
    pandas.DataFrame
        ``df_sampled_code`` with its index reset (the old index is kept in an
        ``index`` column), without ``input_tokens``, plus one column per
        hypothesis position (``0 .. n-1``) and an ``input_id`` column holding
        the input ids, all as numpy arrays.
    """
    def generate_candidates(token_ids):
        # NOTE(review): no maximum generation length is passed, and `do_sample` /
        # `pad_token_id` are forwarded as-is — confirm that model.generate honors them.
        return model.generate(
            token_ids,
            beam = n,
            do_sample = False,
            pad_token_id = 50256 ) ## HARDCODED

    # One row of hypotheses per input, transposed so that each row holds a single
    # hypothesis position across all inputs. Assumes generate() returns the same
    # number of hypotheses for every input.
    arr_generated_code = np.array([ generate_candidates(token_ids) for token_ids in df_sampled_code.input_tokens.values ]).T

    dict_generated_code = {
        position: [ hypothesis['tokens'].cpu().data.numpy() for hypothesis in hypotheses ]
        for position, hypotheses in enumerate(arr_generated_code)
    }
    dict_generated_code['input_id'] = [ token_ids.cpu().data.numpy() for token_ids in df_sampled_code.input_tokens.values ]

    df_generated = pd.DataFrame.from_dict( data=dict_generated_code ) # DataFrame from Generation
    # Reset the index first so both frames align on 0..N-1 when concatenated.
    df_combined = pd.concat([df_sampled_code.reset_index(), df_generated ], axis=1)
    return df_combined.drop(columns=['input_tokens'])

In [35]:
#TODO limit the number of tokens generated
#WARNING TIME CONSUMING
df_generated_input = df_sample_generation( 
    df_sampled_code = df_sampled_code, 
    model = model, 
    n = SAMPLES
    )
# [ sample_generation(input, model=model) for input in input_tokens[:2] ]

In [36]:
df_generated_input

Unnamed: 0,index,input,input_bpe,input_method_size,output,output_bpe,output_method_size,0,1,2,...,21,22,23,24,25,26,27,28,29,input_is
0,1340,public List<Dependency> getModuleDependencies(...,"[public, ĠList, <, D, epend, ency, >, Ġget, Mo...",385,@Test public void getModuleDependencies() thro...,"[@, Test, Ġpublic, Ġvoid, Ġget, Module, D, epe...",274,"[1039, 34603, 285, 13842, 120, 48720, 495, 267...","[1039, 34603, 285, 13842, 120, 48720, 495, 267...","[1039, 34603, 285, 13842, 120, 48720, 495, 267...",...,"[1039, 34603, 285, 13842, 120, 48720, 495, 267...","[1039, 34603, 285, 13842, 120, 48720, 495, 267...","[1039, 34603, 285, 13842, 120, 48720, 495, 267...","[1039, 34603, 285, 13842, 120, 48720, 495, 267...","[1039, 34603, 285, 13842, 120, 48720, 495, 267...","[1039, 34603, 285, 13842, 120, 48720, 495, 267...","[1039, 34603, 285, 13842, 120, 48720, 495, 267...","[1039, 34603, 285, 13842, 120, 48720, 495, 267...","[1039, 34603, 285, 13842, 120, 48720, 495, 267...","[15110, 9527, 41552, 495, 26769, 6761, 15698, ..."
1,1242,"@DELETE @Path(""/{name}"") public Response delet...","[@, DE, LE, TE, Ġ@, Path, ("", /, {, name, }, ""...",120,@Test public void deleteAProductWithoutDeletio...,"[@, Test, Ġpublic, Ġvoid, Ġdelete, AP, rodu, c...",179,"[1039, 34603, 285, 13842, 1296, 46006, 43048, ...","[1039, 34603, 285, 13842, 1296, 46006, 43048, ...","[1039, 34603, 285, 13842, 1296, 46006, 43048, ...",...,"[1039, 34603, 285, 13842, 1296, 46006, 43048, ...","[1039, 34603, 285, 13842, 1296, 46006, 43048, ...","[1039, 34603, 285, 13842, 1296, 46006, 43048, ...","[1039, 34603, 285, 13842, 1296, 46006, 43048, ...","[1039, 34603, 285, 13842, 1296, 46006, 43048, ...","[1039, 34603, 285, 13842, 1296, 46006, 43048, ...","[1039, 34603, 285, 13842, 1296, 46006, 43048, ...","[1039, 34603, 285, 13842, 1296, 46006, 43048, ...","[1039, 34603, 285, 13842, 1296, 46006, 43048, ...","[1039, 10089, 3850, 6433, 787, 42119, 46469, 7..."
2,36220,"public static <T, U> FlowableSubscriber<T> sub...","[public, Ġstatic, Ġ<, T, ,, ĠU, >, ĠFlow, able...",84,@Test public void testResultFunctionThrows() {...,"[@, Test, Ġpublic, Ġvoid, Ġtest, Result, Funct...",214,"[1039, 34603, 285, 13842, 11222, 43048, 25522,...","[1039, 34603, 285, 13842, 11222, 43048, 25522,...","[1039, 34603, 285, 13842, 11222, 43048, 25522,...",...,"[1039, 34603, 285, 13842, 11222, 43048, 25522,...","[1039, 34603, 285, 13842, 11222, 43048, 25522,...","[1039, 34603, 285, 13842, 11222, 43048, 6989, ...","[1039, 34603, 285, 13842, 11222, 43048, 25522,...","[1039, 34603, 285, 13842, 11222, 43048, 6989, ...","[1039, 34603, 285, 13842, 11222, 43048, 25522,...","[1039, 34603, 285, 13842, 11222, 43048, 25522,...","[1039, 34603, 285, 13842, 11222, 43048, 25522,...","[1039, 34603, 285, 13842, 11222, 43048, 25522,...","[15110, 25156, 28696, 565, 6, 121, 15698, 2362..."
3,21814,public SecurityRoleFunctionEntity getSecurityR...,"[public, ĠSecurity, Role, Function, Entity, Ġg...",128,@Test public void testGetSecurityRoleFunctionE...,"[@, Test, Ġpublic, Ġvoid, Ġtest, Get, Security...",148,"[1039, 34603, 1640, 10162, 5457, 35671, 7199, ...","[1039, 34603, 1640, 10162, 5457, 35671, 7199, ...","[1039, 34603, 1640, 10162, 5457, 35671, 7199, ...",...,"[1039, 34603, 1640, 10162, 5457, 35671, 7199, ...","[1039, 34603, 1640, 10162, 5457, 35671, 7199, ...","[1039, 34603, 1640, 10162, 5457, 35671, 7199, ...","[1039, 34603, 1640, 10162, 5457, 35671, 7199, ...","[1039, 34603, 1640, 10162, 9089, 35529, 5457, ...","[1039, 34603, 1640, 10162, 9089, 35529, 5457, ...","[1039, 34603, 1640, 10162, 9089, 35529, 5457, ...","[1039, 34603, 1640, 10162, 5457, 35671, 7199, ...","[1039, 34603, 1640, 10162, 5457, 35671, 7199, ...","[15110, 2010, 47613, 47802, 49448, 120, 36090,..."
4,8060,protected M loadByQuery(Bson query) { return l...,"[protected, ĠM, Ġload, By, Query, (, B, son, Ġ...",21,@Test public void testLoadByQuery() throws Exc...,"[@, Test, Ġpublic, Ġvoid, Ġtest, Load, By, Que...",130,"[1039, 34603, 1640, 10162, 5457, 44840, 26170,...","[1039, 34603, 1640, 10162, 5457, 44840, 26170,...","[1039, 34603, 1640, 10162, 5457, 44840, 26170,...",...,"[1039, 34603, 1640, 10162, 5457, 44840, 26170,...","[1039, 34603, 1640, 10162, 5457, 44840, 26170,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 44840, 26170,...","[1039, 34603, 1640, 10162, 9089, 35529, 5457, ...","[1039, 34603, 1640, 10162, 9089, 35529, 5457, ...","[1039, 34603, 1640, 10162, 5457, 5654, 45589, ...","[1039, 34603, 1640, 10162, 5457, 5654, 45589, ...","[37659, 256, 7511, 2765, 48382, 1640, 387, 147..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,697,public PagedSearchIterable<GHContent> findFile...,"[public, ĠP, aged, Search, Iter, able, <, GH, ...",205,"@Test(dataProvider = ""inputEmptyImages"", expec...","[@, Test, (, data, Provider, Ġ=, Ġ"", input, Em...",180,"[1039, 34603, 285, 13842, 1296, 38195, 14824, ...","[1039, 34603, 285, 13842, 1296, 38195, 14824, ...","[1039, 34603, 285, 13842, 1296, 38195, 14824, ...",...,"[1039, 34603, 285, 13842, 1296, 38195, 14824, ...","[1039, 34603, 285, 13842, 1296, 38195, 14824, ...","[1039, 34603, 285, 13842, 1296, 38195, 14824, ...","[1039, 34603, 285, 13842, 1296, 38195, 14824, ...","[1039, 34603, 285, 13842, 1296, 38195, 14824, ...","[1039, 34603, 285, 13842, 1296, 38195, 14824, ...","[1039, 34603, 285, 13842, 1296, 38195, 14824, ...","[1039, 34603, 285, 13842, 1296, 38195, 14824, ...","[1039, 34603, 285, 13842, 1296, 38195, 14824, ...","[15110, 221, 4628, 39954, 49628, 868, 41552, 1..."
96,70739,public static byte[] parse(byte[] payload) { i...,"[public, Ġstatic, Ġbyte, [], Ġparse, (, byte, ...",143,@Test public void testParseMessage2() { byte[]...,"[@, Test, Ġpublic, Ġvoid, Ġtest, Par, se, Mess...",85,"[1039, 34603, 1640, 10162, 5457, 36993, 13360,...","[1039, 34603, 1640, 10162, 9089, 35529, 5457, ...","[1039, 34603, 1640, 10162, 5457, 36993, 13360,...",...,"[1039, 34603, 1640, 10162, 5457, 36993, 13360,...","[1039, 34603, 1640, 10162, 5457, 36993, 13360,...","[1039, 34603, 1640, 10162, 9089, 35529, 5457, ...","[1039, 34603, 1640, 10162, 9089, 35529, 5457, ...","[1039, 34603, 1640, 10162, 5457, 36993, 13360,...","[1039, 34603, 1640, 10162, 9089, 35529, 5457, ...","[1039, 34603, 1640, 10162, 9089, 35529, 5457, ...","[1039, 34603, 1640, 10162, 9089, 35529, 5457, ...","[1039, 34603, 1640, 10162, 9089, 35529, 5457, ...","[15110, 25156, 47893, 48992, 43756, 1640, 4769..."
97,158,public static void validate(RegistrationData r...,"[public, Ġstatic, Ġvoid, Ġvalidate, (, Registr...",51,@Test void validate_AuthenticationData_with_au...,"[@, Test, Ġvoid, Ġvalidate, _, Authent, icatio...",118,"[1039, 34603, 285, 13842, 1296, 20320, 32890, ...","[1039, 34603, 285, 13842, 1296, 20320, 32890, ...","[1039, 34603, 285, 13842, 1296, 20320, 32890, ...",...,"[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 5654, 45589, ...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 5654, 45589, ...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[15110, 25156, 13842, 28754, 1640, 45366, 3038..."
98,33352,@Override public byte[] getBinary(final int i)...,"[@, Override, Ġpublic, Ġbyte, [], Ġget, B, ina...",30,@Test public void testBinary() { final byte[] ...,"[@, Test, Ġpublic, Ġvoid, Ġtest, B, inary, (),...",145,"[1039, 34603, 285, 13842, 1296, 14181, 387, 15...","[1039, 34603, 285, 13842, 1296, 14181, 387, 15...","[1039, 34603, 285, 13842, 1296, 14181, 387, 15...",...,"[1039, 34603, 285, 13842, 1296, 14181, 387, 15...","[1039, 34603, 285, 13842, 1296, 14181, 387, 15...","[1039, 34603, 285, 13842, 1296, 14181, 387, 15...","[1039, 34603, 285, 13842, 1296, 14181, 387, 15...","[1039, 34603, 285, 13842, 1296, 14181, 387, 15...","[1039, 34603, 285, 13842, 1296, 14181, 387, 15...","[1039, 34603, 285, 13842, 1296, 14181, 387, 15...","[1039, 34603, 285, 13842, 1296, 14181, 387, 15...","[1039, 34603, 285, 13842, 1296, 14181, 387, 15...","[1039, 49116, 285, 47893, 48992, 120, 387, 155..."


In [37]:
df_generated_input.shape

(100, 38)

### Statistics and Checkpoint

In [38]:
# (mean, std) of the generated-sequence length for each hypothesis column 0..SAMPLES-1.
# The column count follows SAMPLES instead of a literal 30, and each length array
# is built once and reused for both statistics.
np_len_method = [
    (lengths.mean(), lengths.std())
    for lengths in (
        np.array([ len(gen_method) for gen_method in df_generated_input[j] ])
        for j in range(SAMPLES)
    )
]

In [39]:
np_len_method

[(123.32, 73.30823691782528),
 (123.14, 73.62296109231141),
 (121.06, 73.71171141684339),
 (120.62, 74.09760859838866),
 (118.27, 73.92845933738913),
 (120.2, 75.55249830415934),
 (118.55, 75.28617070883602),
 (116.27, 74.82176889114558),
 (118.58, 76.61098876793068),
 (116.51, 76.25254028555376),
 (115.5, 76.24965573692776),
 (116.08, 76.40545530261566),
 (115.85, 76.80486638228075),
 (113.15, 76.11075810948148),
 (111.23, 76.53324179727396),
 (112.74, 77.43728042745303),
 (111.63, 77.17819057220764),
 (111.32, 78.64513716689672),
 (110.63, 78.97134353675389),
 (110.08, 79.34843161651024),
 (107.62, 78.18462508703357),
 (106.93, 78.72258316391809),
 (104.6, 78.86748379401996),
 (103.11, 78.4015172047072),
 (100.3, 77.45469643604575),
 (98.88, 78.54709160751912),
 (98.46, 78.69350926220027),
 (95.95, 79.08367404211819),
 (93.31, 79.15133542777404),
 (86.36, 78.39853570061115)]

In [40]:
#Checkpoint of Generation
corpus = params['corpus']
def checkpoint_generation(df):
    """Write ``df`` as a JSON list of records into the sampling output location.

    The target is ``params['output_sample']`` followed by the module-level
    ``corpus`` name and the ``_generated.tests.json`` suffix.
    """
    destination = params['output_sample'] + corpus + '_generated.tests.json'
    df.to_json(destination, orient='records')

In [41]:
checkpoint_generation(df_generated_input )

In [42]:
## Memory deallocation: ask PyTorch to release its unused cached CUDA memory
torch.cuda.empty_cache()