# Athena Inferring Notebook (from Scratch)

>
> Exercise to replicate Athena Inferring by @davidN
>

In [1]:
from pathlib import Path
import csv
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
pd.options.display.float_format = '{:.2f}'.format

In [2]:
from tokenizers import ByteLevelBPETokenizer

In [3]:
import torch
from fairseq.models.transformer import TransformerModel

In [4]:
import fairseq as fs
print(fs.__version__)

1.0.0a0+61c20f7


In [34]:
def param_default():
    """Return the default configuration dictionary used by this notebook.

    All filesystem locations are relative `Path` objects rooted at
    ``../athena-datasets/<corpus>/`` and ``../tokenizer/``, except
    ``out_processed``, ``model_name_or_path`` and ``checkpoint_file``,
    which are plain strings.

    Returns:
        dict: keys ``bpe_path``, ``eval_raw``, ``test_raw``, ``train_raw``
        (each ``*_raw`` entry is a two-element list: the input methods file,
        then the output tests file), ``data_labels``, ``output_pandas``,
        ``out_processed``, ``model_name_or_path`` and ``checkpoint_file``.
    """
    corpus = 'fm_fc_ms_ff'  # <-- Scope: name of the corpus folder to work on
    corpus_root = Path(f'../athena-datasets/{corpus}/')
    raw_root = Path(f'../athena-datasets/{corpus}/raw/')
    tokenizer_root = Path('../tokenizer/')

    def raw_pair(split):
        # Input methods file first, output tests file second.
        return [raw_root / f'{split}/input.methods.txt',
                raw_root / f'{split}/output.tests.txt']

    config = {}
    config['bpe_path'] = tokenizer_root / 'universal_tokenizer/universal_tokenizer/roberta_aug_spaces'
    for split in ('eval', 'test', 'train'):
        config[f'{split}_raw'] = raw_pair(split)
    config['data_labels'] = ['eval_raw', 'test_raw', 'train_raw']
    config['output_pandas'] = corpus_root / 'pandas/'
    config['out_processed'] = '/datasets/out_processed/'
    config['model_name_or_path'] = '~/data/dummy/models/'  # Model path
    config['checkpoint_file'] = 'models/checkpoint_best_mod.pt'  # Model checkpoint
    return config

In [37]:
# Build the notebook-wide configuration once; later cells read paths and the checkpoint name from `params`.
params = param_default()

In [38]:
# Sanity check: display the checkpoint file that the model-loading cell passes to from_pretrained.
params['checkpoint_file']

'models/checkpoint_best_mod.pt'

## Universal Tokenizer

In [45]:
def load_tokenizer(bpe_path):
    """Create a ByteLevelBPETokenizer from a shared file prefix.

    Args:
        bpe_path: path prefix (str or Path); the vocabulary is read from
            ``<bpe_path>-vocab.json`` and the merges from ``<bpe_path>-merges.txt``.

    Returns:
        The constructed ``ByteLevelBPETokenizer``.
    """
    prefix = str(bpe_path)
    vocab_file = prefix + '-vocab.json'
    merges_file = prefix + '-merges.txt'
    return ByteLevelBPETokenizer(vocab_file, merges_file)

In [47]:
# Instantiate the shared tokenizer; the generate_* helpers below read this module-level `tokenizer`.
tokenizer = load_tokenizer(params['bpe_path'])

## Generative Functions

In [55]:
def lazy_decode(bpe_java):
    """Turn a space-separated byte-level BPE token string back into source text.

    Token-separating spaces are dropped, every 'Ġ' marker becomes a space and
    every 'Ċ' marker becomes a newline. No other byte-level symbols are
    translated (hence "lazy").

    Args:
        bpe_java: string of BPE tokens joined by single spaces.

    Returns:
        str: the decoded text.
    """
    # One pass is equivalent to the three chained replacements because none of
    # the substitutions produces a character that another one consumes.
    marker_table = str.maketrans({' ': None, 'Ġ': ' ', 'Ċ': '\n'})
    return bpe_java.translate(marker_table)

In [48]:
def generate_unittest(focal_method,sampling=True,temperature=1,max_tokens=1023):
    """Generate, print and return one unit test for `focal_method`.

    The focal method is minified, tokenized with the module-level `tokenizer`,
    passed to the module-level `model` via ``model.sample`` and the result is
    decoded with `lazy_decode` and re-indented with `prettify_java`.

    Args:
        focal_method: Java source of the focal method (plus its class context).
        sampling: forwarded unchanged to ``model.sample``.
        temperature: forwarded unchanged to ``model.sample``.
        max_tokens: keep at most this many BPE tokens of the input, matching
            the 1023-token limit that `generate_k_beam` applies. Pass ``None``
            to disable clipping.

    Returns:
        str: the prettified generated unit test.
    """
    print('The focal method being tested is:\n',focal_method)
    tokens = tokenizer.encode(minify(focal_method)).tokens
    # Clip over-long inputs the same way generate_k_beam does.
    # NOTE(review): presumably longer inputs exceed the model's positional limit — confirm.
    if max_tokens is not None:
        tokens = tokens[:max_tokens]
    source = ' '.join(tokens)
    generated = model.sample(source,sampling=sampling,temperature=temperature)
    unit_test = prettify_java(lazy_decode(generated))
    print('\n-----------------------------------\nThe randomly generated unit test is:\n',unit_test)
    return unit_test

In [54]:
def prettify_java(minified_java):
    """Re-indent minified Java so it is readable.

    Tries to undo Michele's minification: a newline is inserted after every
    '{', '}' and ';', and each resulting line is indented by four spaces per
    open brace level. Known limitations: `for` headers and array/set
    initialisers are split across lines, braces or semicolons inside string
    literals are treated as code, and blank lines or comments are not restored.

    Args:
        minified_java: Java source, typically on a single line.

    Returns:
        str: the re-indented source, one statement per line, each line
        terminated by a newline (empty string for empty input).
    """
    broken = minified_java.replace('{','{\n').replace('}','}\n').replace(';',';\n')
    indent_unit = '    '
    depth = 0
    pretty_lines = []
    for raw_line in broken.splitlines():
        # Minified code keeps a space after '{', ';' and '}'; strip it so it
        # does not leak into the indentation, and skip whitespace-only lines.
        line = raw_line.strip()
        if not line:
            continue
        closes_first = line.startswith('}')
        if closes_first:
            # Clamp at zero: unbalanced generated code must not push the
            # depth negative and under-indent everything that follows.
            depth = max(depth - 1, 0)
        pretty_lines.append(depth * indent_unit + line)
        if line.endswith('{'):
            depth += 1
        elif line.endswith('}') and not closes_first:
            # e.g. the tail of an initialiser "1, 2}" closes a level it did not open on this line.
            depth = max(depth - 1, 0)
    return ''.join(pretty + '\n' for pretty in pretty_lines)

In [49]:
def generate_k(focal_method,k,sampling=True,max_tokens=1023):
    """Sample and print `k` unit tests for `focal_method`.

    The focal method is minified and tokenized once; ``model.sample`` is then
    called `k` times and each result is decoded, prettified and printed.

    Args:
        focal_method: Java source of the focal method (plus its class context).
        k: number of samples to draw. The same value is also passed to
            ``model.sample`` as ``sampling_topk``.
        sampling: forwarded unchanged to ``model.sample``.
        max_tokens: keep at most this many BPE tokens of the input, matching
            the 1023-token limit that `generate_k_beam` applies. Pass ``None``
            to disable clipping.

    Returns:
        None. The generated tests are only printed.
    """
    print('The focal method being tested is:\n',focal_method)
    tokens = tokenizer.encode(minify(focal_method)).tokens
    # Clip over-long inputs the same way generate_k_beam does.
    # NOTE(review): presumably longer inputs exceed the model's positional limit — confirm.
    if max_tokens is not None:
        tokens = tokens[:max_tokens]
    source = ' '.join(tokens)
    for _ in range(k):
        unit_test = prettify_java(lazy_decode(model.sample(source,sampling=sampling, sampling_topk=k)))
        print('\n-----------------------------------\n',unit_test)

In [50]:
def minify(java_code):
    """Collapse every run of whitespace in `java_code` into a single space.

    Leading and trailing whitespace is removed, so the result is one line.
    """
    words = java_code.split()
    return ' '.join(words)

In [57]:
def generate_k_beam(focal_method, k, max_tokens=1023):
    '''
    Generate and print candidate unit tests for `focal_method`.

    The focal method is minified, tokenized with the module-level `tokenizer`,
    clipped to `max_tokens` tokens, encoded with ``model.encode`` and passed to
    ``model.generate``. Every returned candidate is decoded, prettified and
    printed.

    @focal_method Java source of the focal method (plus its class context)
    @k number of elements to be generated; passed as the second positional
       argument of ``model.generate``
       (NOTE(review): that is the beam size in fairseq's hub interface — confirm
       for the installed version)
    @max_tokens maximum number of BPE tokens kept from the input (default 1023,
       the previously hard-coded limit); pass None to disable clipping
    @return None; the candidates are only printed
    '''
    print('The focal method being tested is:\n',focal_method)
    focal_code = minify(focal_method)
    tokens = tokenizer.encode(focal_code).tokens

    if max_tokens is not None and len(tokens) > max_tokens:
        tokens = tokens[:max_tokens]
    focal_code = " ".join(tokens)

    # Run inference. `source_ids` replaces a local that shadowed the builtin `input`.
    source_ids = model.encode(focal_code)
    # Alternative kept for reference: model.generate(source_ids, k, diverse_beam_groups=5)
    candidates = model.generate(source_ids, k)

    for candidate in candidates:
        decoded = model.decode(candidate['tokens'])
        unit_test = prettify_java(lazy_decode(decoded))
        print('\n-----------------------------------\n',unit_test)


## Loading and Testing

In [None]:
#TODO: I believe this step copies the tokenizer vocabulary into place; fairseq expects the dictionary files to be named "dict.input.methods.txt" and "dict.output.tests.txt"
#TODO: since source and target share the same vocabulary, the same file is copied to both names
#!cp universal_tokenizer/roberta_aug_spaces_dict.txt dict.input.methods.txt
#!cp universal_tokenizer/roberta_aug_spaces_dict.txt dict.output.tests.txt

In [39]:
# Load a pretrained model through fairseq's from_pretrained.
# NOTE(review): '.' is passed as the first positional argument, so
# params['model_name_or_path'] is currently unused — confirm that the checkpoint
# path is meant to resolve against the notebook's working directory.
model = TransformerModel.from_pretrained(
  '.',
  # Earlier attempt, kept for reference: model_name_or_path = 'checkpoint_best_mod.pt'
  checkpoint_file = params['checkpoint_file'] # 'models/checkpoint_best_mod.pt' with the default params
)

In [40]:
# Use the GPU when one is available; fall back to CPU so the notebook still runs on machines without CUDA.
model.to('cuda' if torch.cuda.is_available() else 'cpu')

GeneratorHubInterface(
  (models): ModuleList(
    (0): BARTModel(
      (encoder): TransformerEncoderBase(
        (dropout_module): FairseqDropout()
        (embed_tokens): Embedding(50348, 1024, padding_idx=1)
        (embed_positions): LearnedPositionalEmbedding(1026, 1024, padding_idx=1)
        (layernorm_embedding): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (layers): ModuleList(
          (0): TransformerEncoderLayerBase(
            (self_attn): MultiheadAttention(
              (dropout_module): FairseqDropout()
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (dropout_module):

### Testing

In [52]:
focal_method = "NumberUtils { public static long toLong(final String str) { return toLong(str, 0L); } NumberUtils(); static int toInt(final String str); static int toInt(final String str, final int defaultValue); static long toLong(final String str); static long toLong(final String str, final long defaultValue); static float toFloat(final String str); static float toFloat(final String str, final float defaultValue); static double toDouble(final String str); static double toDouble(final String str, final double defaultValue); static byte toByte(final String str); static byte toByte(final String str, final byte defaultValue); static short toShort(final String str); static short toShort(final String str, final short defaultValue); static Number createNumber(final String str); static Float createFloat(final String str); static Double createDouble(final String str); static Integer createInteger(final String str); static Long createLong(final String str); static BigInteger createBigInteger(final String str); static BigDecimal createBigDecimal(final String str); static long min(final long[] array); static int min(final int[] array); static short min(final short[] array); static byte min(final byte[] array); static double min(final double[] array); static float min(final float[] array); static long max(final long[] array); static int max(final int[] array); static short max(final short[] array); static byte max(final byte[] array); static double max(final double[] array); static float max(final float[] array); static long min(long a, final long b, final long c); static int min(int a, final int b, final int c); static short min(short a, final short b, final short c); static byte min(byte a, final byte b, final byte c); static double min(final double a, final double b, final double c); static float min(final float a, final float b, final float c); static long max(long a, final long b, final long c); static int max(int a, final int b, final int c); static short max(short a, 
final short b, final short c); static byte max(byte a, final byte b, final byte c); static double max(final double a, final double b, final double c); static float max(final float a, final float b, final float c); static boolean isDigits(final String str); static boolean isNumber(final String str); static final Long LONG_ZERO; static final Long LONG_ONE; static final Long LONG_MINUS_ONE; static final Integer INTEGER_ZERO; static final Integer INTEGER_ONE; static final Integer INTEGER_MINUS_ONE; static final Short SHORT_ZERO; static final Short SHORT_ONE; static final Short SHORT_MINUS_ONE; static final Byte BYTE_ZERO; static final Byte BYTE_ONE; static final Byte BYTE_MINUS_ONE; static final Double DOUBLE_ZERO; static final Double DOUBLE_ONE; static final Double DOUBLE_MINUS_ONE; static final Float FLOAT_ZERO; static final Float FLOAT_ONE; static final Float FLOAT_MINUS_ONE; }"

In [56]:
# Request 5 candidate unit tests for the NumberUtils focal method defined above; results are printed, not returned.
generate_k_beam(focal_method, 5)

The focal method being tested is:
 NumberUtils { public static long toLong(final String str) { return toLong(str, 0L); } NumberUtils(); static int toInt(final String str); static int toInt(final String str, final int defaultValue); static long toLong(final String str); static long toLong(final String str, final long defaultValue); static float toFloat(final String str); static float toFloat(final String str, final float defaultValue); static double toDouble(final String str); static double toDouble(final String str, final double defaultValue); static byte toByte(final String str); static byte toByte(final String str, final byte defaultValue); static short toShort(final String str); static short toShort(final String str, final short defaultValue); static Number createNumber(final String str); static Float createFloat(final String str); static Double createDouble(final String str); static Integer createInteger(final String str); static Long createLong(final String str); static BigInteger