# Large Scale Encoder-Decoder (BART) Sampling
> Empirical notebook for sampling BART on the method2test benchmark.

In [1]:
from pathlib import Path
import csv
import seaborn as sns; sns.set_theme()
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import functools

pd.options.display.float_format = '{:.2f}'.format

In [2]:
from tokenizers import ByteLevelBPETokenizer
import torch
import importlib
from fairseq.models.transformer import TransformerModel

In [3]:
# The package path contains a hyphen ("sequential-rationales"), which a plain
# `import` statement cannot express, so the module is resolved by name via importlib.
rationalization = importlib.import_module("sequential-rationales.fairseq.rationalization")
# Short alias for the module's `rationalize_lm` entry point.
rationalize = rationalization.rationalize_lm

In [4]:
import warnings
from matplotlib import colors
import os
#from rationalization import rationalize_lm, rationalize_conditional_model

In [17]:
def param_default():
    """Return the notebook configuration as a plain dict.

    Keys:
        bpe_path: prefix of the tokenizer files ('-vocab.json' / '-merges.txt'
            are appended by `load_tokenizer`).
        eval_raw / test_raw / train_raw: two-element lists
            [focal-method file, test-case file] for each split.
        data_labels: which of the `*_raw` entries the loading cells iterate over.
        output_pandas: directory of the per-split JSON checkpoints read by
            `load_checkpoint_1`.
        out_processed, model_name_or_path, checkpoint_file, output_results:
            output locations and the fairseq checkpoint to load.
    """
    corpus = 'fm_fc_ms_ff' #<-- Scope
    data_path = Path('../semeru-datasets/athena_test/' + corpus + '/')
    data_path_raw = data_path / 'raw'
    tokenizer_path = Path('../scripts/tokenizer/')

    def _split_files(split):
        # Source (focal methods) first, target (tests) second: downstream code
        # relies on this order through the 0/1 column suffixes.
        return [data_path_raw / split / 'input.methods.txt',
                data_path_raw / split / 'output.tests.txt']

    return {
        'bpe_path' : tokenizer_path / 'universal_tokenizer/roberta_aug_spaces',
        'eval_raw': _split_files('eval'),
        'test_raw': _split_files('test'),
        'train_raw': _split_files('train'),
        'data_labels' : ['test_raw'],#['eval_raw','test_raw','train_raw'], <----- Just Test
        # Restored: load_checkpoint_1 reads this key, so leaving it undefined
        # makes a fresh top-to-bottom run fail with a KeyError.
        'output_pandas' : data_path / 'pandas/',
        'out_processed' : '/workspaces/code-rationales/data/athena-test-out/out_processed/',
        'model_name_or_path' : '/workspaces/code-rationales/data/bart-fairseq/checkpoint_dir_athena_ms/models/', #Model Path
        'checkpoint_file': 'checkpoint_best.pt', #Model
        #'data_preprocessed':'/home/davidna/data/dummy/sequential-rationales/fairseq/fairseq/data-bin/bins/',
        'output_results' : '/workspaces/code-rationales/data/athena-test-out/icse_results/' 
    }

In [18]:
# Build the configuration once; later cells and helpers read it through the
# module-level `params` name.
params = param_default()
params['checkpoint_file']

'checkpoint_best.pt'

In [19]:
params['eval_raw']

[PosixPath('../semeru-datasets/athena_test/fm_fc_ms_ff/raw/eval/input.methods.txt'),
 PosixPath('../semeru-datasets/athena_test/fm_fc_ms_ff/raw/eval/output.tests.txt')]

In [8]:
#Setting experiments 
#! export CUDA_VISIBLE_DEVICES="1"

## Universal Tokenizer

In [20]:
def load_tokenizer(bpe_path):
    """Build a ByteLevelBPETokenizer from a shared file prefix.

    `bpe_path` is converted to text and suffixed with '-vocab.json' and
    '-merges.txt' to obtain the two files handed to the tokenizer.
    """
    vocab_file = f"{bpe_path}-vocab.json"
    merges_file = f"{bpe_path}-merges.txt"
    return ByteLevelBPETokenizer(vocab_file, merges_file)

In [21]:
def lazy_decode(bpe_java):
    """Turn a space-separated BPE token string back into plain source text.

    Literal spaces (the token separators) are dropped, 'Ġ' becomes a space and
    'Ċ' becomes a newline. All three rules apply to the original characters
    only, so a single translation pass is enough.
    """
    bpe_markers = str.maketrans({' ': None, 'Ġ': ' ', 'Ċ': '\n'})
    return bpe_java.translate(bpe_markers)

In [22]:
def prettify_java(minified_java):
    """Best-effort re-indentation of single-line (minified) Java.

    A newline is inserted after every '{', '}' and ';', then each resulting
    line is prefixed with four spaces per nesting level. Existing leading
    whitespace on a line is kept as is, `for (...;...;...)` headers and brace
    initialisers are split across lines, and no blank lines or comments are
    reconstructed.
    """
    for token in ('{', '}', ';'):
        minified_java = minified_java.replace(token, token + '\n')

    depth = 0
    rendered = []
    for raw_line in minified_java.splitlines():
        opens_with_closer = raw_line.lstrip().startswith('}')
        if opens_with_closer:
            # A closing brace is printed at the level of the block it closes.
            depth -= 1
        rendered.append('    ' * depth + raw_line + '\n')
        if raw_line.endswith('{'):
            depth += 1
        elif raw_line.endswith('}') and not opens_with_closer:
            # Brace closed at the end of a content line (e.g. `1,2}`).
            depth -= 1
    return ''.join(rendered)

In [23]:
# Module-level tokenizer instance; super_set_code() uses it through this global name.
tokenizer = load_tokenizer(params['bpe_path'])

## Data Loading and Testing

In [24]:
#export
def method_size_vector( method_vector ):
    '''Return the token count of every method in `method_vector`.

    `method_vector` is an iterable whose items are token sequences; the
    result is a list with one `len(...)` per item, in the same order.
    '''
    return list(map(len, method_vector))

### Super Set Code: Preprocess the Configured Datasets

In [25]:
def super_set_code():
    """Read and tokenize every file of the configured data labels.

    For each label in `params['data_labels']` and each file position `val` in
    `params[label]`, builds a DataFrame with three columns:
        <label><val>       one row per line of the file (raw text),
        <label>_bpe<val>   the BPE tokens of that line,
        method_size<val>   the number of BPE tokens.

    Returns:
        dict mapping '<label><val>' to its DataFrame.

    The file is read line by line instead of through `pd.read_csv(sep="\\n")`:
    recent pandas versions reject a newline separator, and CSV parsing could
    drop blank lines or turn literals such as `null` into NaN, which would
    shift row positions relative to the file.
    """
    data = {}
    for label in params['data_labels']:
        for val, path_data in enumerate( params[ label ] ):
            raw_col = label + str( val )
            bpe_col = label + '_bpe' + str( val )
            with open( path_data, encoding='utf-8' ) as handle:
                # One method per line; only the line terminator is removed.
                lines = [ line.rstrip('\n') for line in handle ]
            df = pd.DataFrame({ raw_col: lines })
            df[bpe_col] = [ enc.tokens for enc in tokenizer.encode_batch( df[raw_col].tolist() ) ] #bpe
            df['method_size' + str( val )] = method_size_vector( df[bpe_col].values ) #counting tokens
            data[raw_col] = df
    return data

In [None]:
# [WARNING] Tokenizes every configured file from scratch. Run this only when
# the data has not been computed yet; otherwise use the JSON loading cell below.
super_data = super_set_code() #[WARNING] Use it when not computed! Otherwise use Loading Json

### Loading Super Set

In [14]:
# Loading Json Sets
def load_checkpoint_1():
    """Load the per-file DataFrames previously saved as JSON.

    Reads '<label><n>.json' from the directory `params['output_pandas']` for
    every label in `params['data_labels']` and every file position of that label.

    Returns:
        dict mapping '<label><n>' to the DataFrame read from disk.

    Raises:
        KeyError: if `params` has no 'output_pandas' entry.
    """
    if 'output_pandas' not in params:
        raise KeyError(
            "params['output_pandas'] is not set; it must point to the directory "
            "holding the '<label><n>.json' checkpoint files"
        )
    # Accept both str and Path configuration values.
    checkpoint_dir = Path( params['output_pandas'] )
    super_df = {}
    for label in params['data_labels']:
        for val, _ in enumerate(params[ label ]):
            name = label + str(val)
            super_df[ name ] = pd.read_json( checkpoint_dir / (name + '.json') )
            print("read:", name)
    return super_df

In [14]:
# Rebinds `super_data` from the JSON checkpoints (alternative to super_set_code()).
super_data = load_checkpoint_1()

read: test_raw0
read: test_raw1


### Testing Super Set

In [15]:
super_data['test_raw0'].head(1) #Source

Unnamed: 0,test_raw0,test_raw_bpe0,method_size0
0,DateUtils { public static Date yearStart() { f...,"[Date, Ut, ils, Ġ{, Ġpublic, Ġstatic, ĠDate, Ġ...",227


In [16]:
#Size Statistics of Source Set
super_data['test_raw0'].method_size0.describe()

count   78388.00
mean      423.42
std       653.26
min         8.00
25%       143.00
50%       248.00
75%       446.00
max     31638.00
Name: method_size0, dtype: float64

In [17]:
# Token budget per focal method. joining_encode_tokens and code_sampling read
# this as a module-level global, so it must be defined before they are called.
SET_METHOD_SIZE = 100 #<---- HARDCODED
# Size statistics restricted to the methods that fit within that budget.
super_data['test_raw0'][super_data['test_raw0'].method_size0 <= SET_METHOD_SIZE ].method_size0.describe()

count   9445.00
mean      73.55
std       18.59
min        8.00
25%       60.00
50%       76.00
75%       89.00
max      100.00
Name: method_size0, dtype: float64

In [18]:
#Target Set
super_data['test_raw1'].head(1) #Target

Unnamed: 0,test_raw1,test_raw_bpe1,method_size1
0,@Test public void yearStart() { Date date = Da...,"[@, Test, Ġpublic, Ġvoid, Ġyear, Start, (), Ġ{...",61


## Model Loading and Testing

In [21]:
# Load the pretrained fairseq checkpoint configured in `params`.
# NOTE(review): data_name_or_path is left commented out, so the dictionaries are
# presumably resolved from the model directory itself — confirm.
model = TransformerModel.from_pretrained(
  model_name_or_path = params['model_name_or_path'],
  checkpoint_file = params['checkpoint_file'],
  #data_name_or_path = params['data_preprocessed']
)

In [22]:
## Move the model to the GPU when CUDA is available, then switch to evaluation mode
if torch.cuda.is_available():
  model.cuda()
model.eval()

GeneratorHubInterface(
  (models): ModuleList(
    (0): BARTModel(
      (encoder): TransformerEncoderBase(
        (dropout_module): FairseqDropout()
        (embed_tokens): Embedding(50348, 512, padding_idx=1)
        (embed_positions): SinusoidalPositionalEmbedding()
        (layers): ModuleList(
          (0): TransformerEncoderLayerBase(
            (self_attn): MultiheadAttention(
              (dropout_module): FairseqDropout()
              (k_proj): Linear(in_features=512, out_features=512, bias=True)
              (v_proj): Linear(in_features=512, out_features=512, bias=True)
              (q_proj): Linear(in_features=512, out_features=512, bias=True)
              (out_proj): Linear(in_features=512, out_features=512, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
            (dropout_module): FairseqDropout()
            (activation_dropout_module): FairseqDropout()
            (fc1): Linear(in_features=512,

In [23]:
# Expose the first entry of the hub interface's `models` list under a `model` attribute.
# NOTE(review): presumably expected by the rationalization code imported above — confirm.
model.model = model.models[0]

In [24]:
model.device

device(type='cuda', index=0)

In [54]:
def joining_encode_tokens( arr_tokens, model ):
    """Encode at most SET_METHOD_SIZE tokens of one method with `model.encode`.

    The tokens are truncated to the global SET_METHOD_SIZE budget, joined with
    single spaces, and the resulting string is passed to `model.encode`, whose
    return value is handed back unchanged.
    """
    # Slicing leaves sequences within the budget untouched, so no length check is needed.
    kept_tokens = arr_tokens[:SET_METHOD_SIZE]
    return model.encode( " ".join(kept_tokens) )

In [55]:
#Sampling without replacement
#Testing size: 78388
#Sampling size with 95% of confidence and 3% Error = 1053 ~ 1000
def code_sampling(df_super_data ,  FLAG_SAMPLING = True, SIZE_SAMPLING = 1000, random_state = 3): #<---- HARDCODED
    """Encode focal methods for generation, optionally on a random sample.

    Args:
        df_super_data: dict of DataFrames; only the 'test_raw0' entry is used. It
            must provide the `method_size0` and `test_raw_bpe0` columns.
        FLAG_SAMPLING: when True (default), draw `SIZE_SAMPLING` rows without
            replacement among the methods with at most SET_METHOD_SIZE tokens.
            When False, no sampling is done and every row is encoded.
        SIZE_SAMPLING: number of rows to draw (ignored when FLAG_SAMPLING is False).
        random_state: seed forwarded to `DataFrame.sample` for reproducibility.

    Returns:
        A new DataFrame (the input frames are not modified) with the encoded
        token ids in 'input_tokens' (sampled) or 'input_tokens_pos' (full set).
        The original row labels are kept as the index.

    Uses the module-level `model` and `SET_METHOD_SIZE`.
    """
    df_source = df_super_data['test_raw0']

    if not FLAG_SAMPLING:
        # The previous implementation attached the encodings of the *whole*
        # dataset to the *sampled* frame, which fails with a length mismatch.
        # NOTE(review): encoding the full frame is the presumed intent — confirm.
        df_all_code = df_source.copy()
        df_all_code['input_tokens_pos'] = [ joining_encode_tokens(arr_sample, model=model) for arr_sample in df_all_code.test_raw_bpe0.values ]
        return df_all_code

    df_candidates = df_source[ df_source.method_size0 <= SET_METHOD_SIZE ]
    df_sampled_code = df_candidates.sample(
            n = SIZE_SAMPLING,
            replace = False,
            random_state = random_state # For reproducibility
    )
    df_sampled_code['input_tokens'] = [ joining_encode_tokens(arr_sample, model=model) for arr_sample in df_sampled_code.test_raw_bpe0.values ]
    return df_sampled_code

In [56]:
# Draw the evaluation sample and encode it; encoding uses the global `model`.
df_sampled_code = code_sampling(
    df_super_data = super_data,
    SIZE_SAMPLING = 1000
)

In [57]:
df_sampled_code.head()

Unnamed: 0,test_raw0,test_raw_bpe0,method_size0,input_tokens
5394,DropImpl extends BaseSqlPart implements Drop {...,"[Drop, Impl, Ġextends, ĠBase, S, ql, Part, Ġim...",162,"[tensor(43542), tensor(48455), tensor(14269), ..."
71045,RuleDatabaseItemUpdateRunnable implements Runn...,"[Rule, Database, Item, Update, Run, n, able, Ġ...",183,"[tensor(47181), tensor(49187), tensor(47599), ..."
30570,InChIToStructure { public IAtomContainer getAt...,"[In, Ch, IT, o, St, ructure, Ġ{, Ġpublic, ĠI, ...",123,"[tensor(1121), tensor(4771), tensor(2068), ten..."
28852,SgfParser { public List<Short> parseGameFromFi...,"[S, g, f, Parser, Ġ{, Ġpublic, ĠList, <, Short...",150,"[tensor(104), tensor(571), tensor(506), tensor..."
54142,PredictionContainerGenerator extends AbstractA...,"[Pred, iction, Container, Gener, ator, Ġextend...",162,"[tensor(45408), tensor(26579), tensor(48557), ..."


In [29]:
#df_sampled_code.reset_index()

In [30]:
#super_data['test_raw0'].filter( items = df_sampled_code.origin_pos.values, axis=0 ) #<-------- Retrieving original Data

In [58]:
# Sanity check on the sample size.
# NOTE(review): the recorded output (5) does not match SIZE_SAMPLING = 1000 in the
# call above, so the stored outputs look stale — re-run before relying on them.
len(df_sampled_code.input_tokens.values)

5

In [59]:
df_sampled_code.input_tokens.values[0]

tensor([43542, 48455, 14269, 11056,   104, 44306,  4741, 36987, 21603, 25522,
          787,  7199, 49302,   787, 49116,   285, 21603, 41836,  2103,  1640,
         1039,  7199, 49302,   507, 26602,  2103, 31723,    43, 25522,   671,
         2103,  1640, 15755,     6,  2103, 31723,  4397, 35524,   787,  7199,
        49302,   787, 49116, 21603, 41836,  2103,  1640,  1039,  7199, 49302,
          507, 26602,  2103, 31723,  4397,   787,  7199, 49302,   787, 49116,
        21603, 41836,  2103,  1640,  1039, 49302,   868,   507, 26602,  8503,
        31723,     6,   787,  7199, 49302,   507, 26602,  2103, 31723,  4397,
          787,  7199, 49302,   787, 49116, 21603, 41836,  1106,  9089,  1952,
         2103,  1106,  9089,  1952,  1640,  1039,  7199, 49302,   507, 26602,
         2103, 31723,  4397,   787,  7199, 49302,   787, 49116, 21603, 41836,
         1106,  9089,  1952,  2103,  1106,  9089,  1952,  1640,  1039, 49302,
          868,   507, 26602,  8503, 31723,     6,   787,  7199, 

In [60]:
# Passed as `n` to df_sample_generation, which forwards it as the beam width.
SAMPLES = 30 #<---- Hardcoded
# Passed as `max_gen_tok`; df_sample_generation forwards it as `maxlen`, which its own comment flags as not working.
MAX_GEN_TOK = 200

In [61]:
def df_sample_generation(
    df_sampled_code, 
    model, 
    n=1, 
    max_gen_tok = 100
    ):
    """Generate `n` hypotheses per encoded method and attach them as columns.

    Args:
        df_sampled_code: DataFrame with an `input_tokens` column of tensors.
        model: object whose `generate(tokens, beam=..., ...)` returns, for one
            input, a sequence of hypothesis dicts exposing a 'tokens' tensor.
        n: beam width; one output column (0 .. n-1) per returned hypothesis.
        max_gen_tok: forwarded as `maxlen` (flagged below as having no effect).

    Returns:
        `df_sampled_code` with its index reset into an 'index' column, followed
        by the integer-named hypothesis columns and a 'source_sampling' column
        holding the input ids; all token sequences are numpy arrays.
    """
    def _generate(token_ids):
        return model.generate(
            token_ids,
            beam = n,
            maxlen = max_gen_tok, ##WARNING, This parameter is not working
            do_sample = False,
            pad_token_id = 50256 ) ## HARDCODED

    source_tensors = df_sampled_code.input_tokens.values
    # Rows are inputs and columns are hypotheses; transposing groups the
    # results by hypothesis rank so each rank becomes one DataFrame column.
    hypotheses_by_rank = np.array([ _generate(token_ids) for token_ids in source_tensors ]).T

    generated_columns = {}
    for rank, hypotheses in enumerate(hypotheses_by_rank):
        generated_columns[rank] = [ hypo['tokens'].cpu().data.numpy() for hypo in hypotheses ]
    generated_columns['source_sampling'] = [ tensor.cpu().data.numpy() for tensor in source_tensors ]

    df_generated = pd.DataFrame.from_dict( data=generated_columns )
    # reset_index() gives both frames the same positional index before the
    # column-wise concat and keeps the original row labels in an 'index' column.
    return pd.concat([ df_sampled_code.reset_index(), df_generated ], axis=1)

In [62]:
#TODO limit the number of tokens generated
#WARNING: time consuming -- calls model.generate once per sampled method, with SAMPLES as the beam width
# Each returned hypothesis rank becomes one integer-named column (0 .. SAMPLES-1).
df_generated_input = df_sample_generation( 
    df_sampled_code = df_sampled_code, 
    model = model, 
    n = SAMPLES, 
    max_gen_tok = MAX_GEN_TOK 
)
# [ sample_generation(input, model=model) for input in input_tokens[:2] ]

In [70]:
df_generated_input

Unnamed: 0,index,test_raw0,test_raw_bpe0,method_size0,input_tokens,0,1,2,3,4,...,21,22,23,24,25,26,27,28,29,source_sampling
0,5394,DropImpl extends BaseSqlPart implements Drop {...,"[Drop, Impl, Ġextends, ĠBase, S, ql, Part, Ġim...",162,"[tensor(43542), tensor(48455), tensor(14269), ...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 44840, 26170,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...",...,"[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[43542, 48455, 14269, 11056, 104, 44306, 4741,..."
1,71045,RuleDatabaseItemUpdateRunnable implements Runn...,"[Rule, Database, Item, Update, Run, n, able, Ġ...",183,"[tensor(47181), tensor(49187), tensor(47599), ...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...",...,"[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[47181, 49187, 47599, 39962, 33177, 282, 868, ..."
2,30570,InChIToStructure { public IAtomContainer getAt...,"[In, Ch, IT, o, St, ructure, Ġ{, Ġpublic, ĠI, ...",123,"[tensor(1121), tensor(4771), tensor(2068), ten...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...",...,"[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1121, 4771, 2068, 139, 5320, 20636, 25522, 28..."
3,28852,SgfParser { public List<Short> parseGameFromFi...,"[S, g, f, Parser, Ġ{, Ġpublic, ĠList, <, Short...",150,"[tensor(104), tensor(571), tensor(506), tensor...","[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...","[1039, 34603, 285, 13842, 197, 22011, 1090, 20...","[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...","[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...","[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...",...,"[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...","[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...","[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...","[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...","[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...","[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...","[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...","[1039, 34603, 285, 13842, 197, 22011, 1090, 20...","[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...","[104, 571, 506, 49707, 25522, 285, 9527, 41552..."
4,54142,PredictionContainerGenerator extends AbstractA...,"[Pred, iction, Container, Gener, ator, Ġextend...",162,"[tensor(45408), tensor(26579), tensor(48557), ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...",...,"[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[45408, 26579, 48557, 40025, 2630, 14269, 4364..."


### Statistics and Checkpoint

In [64]:
# (mean, std) of the generated sequence length for each hypothesis column.
# The column range follows SAMPLES (the beam width used for generation) instead
# of a literal 30, and each length vector is built only once.
np_len_method = [ (lengths.mean(), lengths.std())
                  for lengths in ( np.array([ len(gen_method) for gen_method in df_generated_input[j] ])
                                   for j in range(SAMPLES) ) ]

In [65]:
np_len_method

[(112.2, 62.92980216082043),
 (112.0, 62.504399845130905),
 (108.0, 61.13264267148934),
 (107.4, 59.87520354871455),
 (107.6, 60.45361858482915),
 (104.8, 59.96465625683182),
 (102.8, 58.714223149080325),
 (102.2, 58.87410296556543),
 (102.4, 58.72852799108794),
 (100.0, 58.32323722153975),
 (103.0, 60.02666074337302),
 (100.0, 58.22370651203855),
 (101.0, 59.32284551502903),
 (99.4, 58.67742325630872),
 (97.2, 58.34175177349408),
 (95.2, 57.241243871879654),
 (94.4, 58.3869848510779),
 (92.6, 58.711498022108074),
 (90.8, 58.89448191469214),
 (92.8, 57.80449809487147),
 (92.8, 57.880566686928695),
 (90.0, 59.03558249056242),
 (91.8, 58.47871407614912),
 (85.0, 61.59545437773797),
 (81.4, 60.88875101363141),
 (79.6, 62.31725282776833),
 (78.8, 62.268451080784075),
 (82.4, 60.65509047062744),
 (82.0, 61.01803012225157),
 (74.0, 65.78145635359557)]

In [71]:
#Checkpoint of Generation
def checkpoint_generation( df , name = '1_generation_[max:100]_02.json' ):
    """Write `df`, minus its 'input_tokens' column, as JSON.

    The file is written to `params['output_results'] + name` (plain string
    concatenation, so the configured directory must end with a separator).
    Returns None.
    """
    df_without_tensors = df.drop(columns='input_tokens')
    target_path = params['output_results'] + name
    df_without_tensors.to_json( target_path )

In [72]:
# Persist the generations under the function's default file name.
checkpoint_generation( df = df_generated_input )

In [73]:
# Reload the checkpoint written above; this rebinds `df_generated_input`.
# After the JSON round trip the hypothesis columns are addressed by string
# labels (the decoding cell below uses '1').
df_generated_input = pd.read_json( params['output_results'] + '1_generation_[max:100]_02.json' )

In [74]:
df_generated_input.head()

Unnamed: 0,index,test_raw0,test_raw_bpe0,method_size0,0,1,2,3,4,5,...,21,22,23,24,25,26,27,28,29,source_sampling
0,5394,DropImpl extends BaseSqlPart implements Drop {...,"[Drop, Impl, Ġextends, ĠBase, S, ql, Part, Ġim...",162,"[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 44840, 26170,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 44840, 26170,...",...,"[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[1039, 34603, 1640, 10162, 5457, 36993, 45621,...","[43542, 48455, 14269, 11056, 104, 44306, 4741,..."
1,71045,RuleDatabaseItemUpdateRunnable implements Runn...,"[Rule, Database, Item, Update, Run, n, able, Ġ...",183,"[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...",...,"[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[1039, 34603, 285, 13842, 197, 44514, 43048, 6...","[47181, 49187, 47599, 39962, 33177, 282, 868, ..."
2,30570,InChIToStructure { public IAtomContainer getAt...,"[In, Ch, IT, o, St, ructure, Ġ{, Ġpublic, ĠI, ...",123,"[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...",...,"[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1039, 34603, 285, 13842, 1296, 14181, 3750, 1...","[1121, 4771, 2068, 139, 5320, 20636, 25522, 28..."
3,28852,SgfParser { public List<Short> parseGameFromFi...,"[S, g, f, Parser, Ġ{, Ġpublic, ĠList, <, Short...",150,"[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...","[1039, 34603, 285, 13842, 197, 22011, 1090, 20...","[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...","[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...","[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...","[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...",...,"[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...","[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...","[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...","[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...","[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...","[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...","[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...","[1039, 34603, 285, 13842, 197, 22011, 1090, 20...","[1039, 34603, 285, 13842, 1296, 22011, 1090, 2...","[104, 571, 506, 49707, 25522, 285, 9527, 41552..."
4,54142,PredictionContainerGenerator extends AbstractA...,"[Pred, iction, Container, Gener, ator, Ġextend...",162,"[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...",...,"[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[1039, 34603, 285, 13842, 1296, 21527, 45788, ...","[45408, 26579, 48557, 40025, 2630, 14269, 4364..."


In [75]:
#tst decoding
# Test decoding: map the token ids of hypothesis column '1', row 0, back to a BPE token string.
decoded = model.decode(df_generated_input['1'][0])
decoded

'@ Test ( expected Ġ= ĠIllegal Arg ument Exception . class ) Ġpublic Ġvoid Ġtest Drop Table Null Table () Ġ{ Ġdrop . table ( null ); Ġ}'

In [76]:
# Strip the BPE markers first, then re-indent the resulting single-line Java.
prettify_java( lazy_decode( decoded ) )

'@Test(expected = IllegalArgumentException.class) public void testDropTableNullTable() {\n     drop.table(null);\n }\n'

In [1]:
## Memory deallocation: ask PyTorch to release its cached CUDA memory.
# NOTE(review): the recorded NameError comes from running this cell with
# execution count 1, i.e. on a fresh kernel before the `import torch` cell.
torch.cuda.empty_cache()

NameError: name 'torch' is not defined