# Athena Training Notebook (from Scratch)

>
> Exercise to replicate Athena Training by @davidN
>

In [3]:
from pathlib import Path
import csv
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
pd.options.display.float_format = '{:.2f}'.format

In [4]:
from tokenizers import ByteLevelBPETokenizer

In [5]:
def param_default():
    """Build the default configuration dictionary used by this notebook.

    All dataset locations are derived from the hard-coded corpus name and
    the relative ``../athena-datasets`` and ``../tokenizer`` folders.

    Returns:
        dict: ``bpe_path`` (tokenizer file prefix), one ``[methods, tests]``
        pair of raw file paths per split (``eval_raw``, ``test_raw``,
        ``train_raw``), the list of those split keys (``data_labels``), the
        pandas output folder (``output_pandas``) and two plain-string
        locations (``out_processed``, ``in_model``).
    """
    corpus = 'fm_fc_ms_ff'  # <-- Scope
    corpus_dir = Path('../athena-datasets') / corpus
    raw_dir = corpus_dir / 'raw'
    tokenizer_dir = Path('../tokenizer')

    def raw_pair(split):
        # [methods file, tests file] of one split of the raw corpus.
        return [raw_dir / split / 'input.methods.txt',
                raw_dir / split / 'output.tests.txt']

    config = {
        'bpe_path': (tokenizer_dir / 'universal_tokenizer'
                     / 'universal_tokenizer' / 'roberta_aug_spaces'),
    }
    # Insertion order matches the key order callers may iterate over.
    for split in ('eval', 'test', 'train'):
        config[split + '_raw'] = raw_pair(split)
    config['data_labels'] = ['eval_raw', 'test_raw', 'train_raw']
    config['output_pandas'] = corpus_dir / 'pandas'
    config['out_processed'] = '/datasets/out_processed/'
    config['in_model'] = '~/data/dummy/models/checkpoint_best_mod.pt'
    return config

In [6]:
# Materialize the default configuration dictionary used by the cells below.
params = param_default()

In [7]:
# Per-split file names: `inputs` is used as the source (methods) file and
# `outputs` as the target (tests) file by the encoding/preprocessing helpers.
inputs = "input.methods.txt"
outputs = "output.tests.txt"

In [8]:
def normalize_method(method):
    """Return ``method`` unchanged.

    No normalization is needed: the BPE tokenizer takes care of it. The
    function is kept as a single hook where a normalization step could be
    plugged in.
    """
    # Consider using a linter to standardize the Java code.
    # Whitespace clean-up that was tried and then disabled, kept for reference:
    # method.replace(' . ','.').replace(' ;',';').replace(' ,',',').replace(' ( ','(').replace(' )',')')
    normalized = method
    return normalized

## Universal Tokenizer

In [9]:
def load_tokenizer(bpe_path):
    """Instantiate the byte-level BPE tokenizer stored under ``bpe_path``.

    Args:
        bpe_path: File prefix (anything ``str()`` accepts); the suffixes
            ``-vocab.json`` and ``-merges.txt`` are appended to it.

    Returns:
        The ``ByteLevelBPETokenizer`` built from those two files.
    """
    prefix = str(bpe_path)
    vocab_file = prefix + '-vocab.json'
    merges_file = prefix + '-merges.txt'
    return ByteLevelBPETokenizer(vocab_file, merges_file)

In [10]:
# Load the tokenizer once; bpe_encode_folder reads this module-level name.
tokenizer = load_tokenizer(params['bpe_path'])

## BPE Encoding

In [11]:
def bpe_encode_and_write(src_file, dest_file, tokenizer):
    """BPE-encode every line of ``src_file`` and write the tokens to ``dest_file``.

    Each input line is stripped of trailing whitespace (including the line
    break), passed through ``normalize_method`` and encoded in one batch.
    Each output line holds the tokens of one input line joined by single
    spaces.

    Args:
        src_file: Path of the text file to read (one sample per line).
        dest_file: Path of the file to write; overwritten if it exists.
        tokenizer: Object whose ``encode_batch(list_of_str)`` returns items
            exposing a ``tokens`` sequence of strings.
    """
    # Explicit UTF-8: byte-level BPE tokens contain non-ASCII characters, so
    # relying on the platform default encoding can fail or corrupt the output.
    with open(src_file, 'r', encoding='utf-8') as reader:
        methods = [normalize_method(line.rstrip()) for line in reader]

    encodings = tokenizer.encode_batch(methods)

    # The destination is opened only after reading and encoding succeeded, so a
    # failure does not leave a truncated output file behind.
    with open(dest_file, 'w', encoding='utf-8') as writer:
        writer.writelines(' '.join(enc.tokens) + '\n' for enc in encodings)

In [12]:
def bpe_encode_folder(corpus_folder, processed_folder):
    """BPE-encode the train/eval/test files of a corpus folder.

    For every split and for both file names in ``inputs`` / ``outputs`` the
    file ``<corpus_folder>/<split>/<basename>`` is encoded with the
    module-level ``tokenizer`` and written to
    ``<processed_folder>/<split>.<basename without its extension>``.

    Args:
        corpus_folder: Folder containing the ``train``, ``eval`` and ``test``
            sub-folders.
        processed_folder: Folder receiving the encoded files; created
            (including parents) when it does not exist yet.
    """
    corpus_dir = Path(corpus_folder)
    processed_dir = Path(processed_folder)
    # open(..., 'w') does not create missing parent folders.
    processed_dir.mkdir(parents=True, exist_ok=True)

    for split in ['train', 'eval', 'test']:
        for basename in [inputs, outputs]:
            src_file = corpus_dir / split / basename
            print('encoding \t'+str(src_file))
            # Drop only the final extension, whatever its length
            # ("input.methods.txt" -> "input.methods").
            dest_file = processed_dir / (split + '.' + Path(basename).stem)
            bpe_encode_and_write(src_file, dest_file, tokenizer)

# Fairseq Preprocessing

In [17]:
def build_preprocessing_command(processed_root, src_dict, workers=24):
    '''
    Writes the command for preprocessing in fairseq @davidN

    The command is printed, not executed, and nothing is returned.

    Args:
        processed_root: Folder (str or Path) holding the BPE-encoded
            ``train``/``eval``/``test`` file prefixes; the binarized output
            goes to its ``bins`` sub-folder.
        src_dict: Dictionary file passed to ``--srcdict``
            (e.g. ``.../universal_tokenizer/roberta_aug_spaces_dict.txt``).
        workers: Value passed to ``--workers``.
    '''
    # Accept plain strings as well as Path objects; `str / str` would raise.
    processed_root = Path(processed_root)

    source_lang = "input.methods"
    target_lang = "output.tests"

    arguments = [
        "fairseq-preprocess",
        "--source-lang " + source_lang,
        "--target-lang " + target_lang,
        "--trainpref " + str(processed_root / 'train'),
        "--validpref " + str(processed_root / 'eval'),
        "--testpref " + str(processed_root / 'test'),
        "--destdir " + str(processed_root / 'bins'),
        "--workers " + str(workers),
        "--srcdict " + str(src_dict),
        "--joined-dictionary",
    ]

    # Layout kept identical to the historical output: a leading line break,
    # a 12-space indent, and every argument followed by one space plus the
    # indent, all on a single line.
    indent = " " * 12
    separator = " " + indent
    command = "\n" + indent + separator.join(arguments) + separator
    print(command)

In [18]:
#Processing a Folder
#processed_folder = corpus_root + "fm_fc_ms_ff" + "/processed/"
# NOTE(review): the arguments look mismatched -- processed_root receives the
# tokenizer file prefix (params['bpe_path']) and src_dict receives the
# processed-data folder (params['out_processed']). The printed command therefore
# places --trainpref/--destdir under the tokenizer path and points --srcdict at
# a directory. Presumably processed_root should be the processed folder and
# src_dict the tokenizer's dictionary file -- TODO confirm before running.
build_preprocessing_command(processed_root = params['bpe_path'], src_dict = params['out_processed'])


            fairseq-preprocess             --source-lang input.methods             --target-lang output.tests             --trainpref ../tokenizer/universal_tokenizer/universal_tokenizer/roberta_aug_spaces/train             --validpref ../tokenizer/universal_tokenizer/universal_tokenizer/roberta_aug_spaces/eval             --testpref ../tokenizer/universal_tokenizer/universal_tokenizer/roberta_aug_spaces/test             --destdir ../tokenizer/universal_tokenizer/universal_tokenizer/roberta_aug_spaces/bins             --workers 24             --srcdict /datasets/out_processed/             --joined-dictionary             


# Testing Training

In [20]:
# Training hyper-parameters and locations, exposed as environment variables.
# A `! export VAR=...` line runs in its own short-lived subshell, so the value
# is gone as soon as that line finishes. Writing to os.environ sets it for the
# kernel process and for every `!` command started afterwards.
import os

os.environ.update({
    'TOTAL_NUM_UPDATES': '100000',
    'WARMUP_UPDATES': '10000',
    'LR': '4.2e-05',
    'UPDATE_FREQ': '8',
    'DIR': '/tufanodata/work/unit-test-gen-context/results-models/fm_fc_ms_ff',
    'MAX_TOKENS': '1024',
    'PRETRAINED': '/tufanodata/work/unit-test-gen-context/models/bart-english+java/java_finetune_from_english_filtered.pt',
    'DATA_DIR': '/tufanodata/work/unit-test-gen-context/data/corpus/fm_fc_ms_ff/processed/bins',
    'SRC_LANG': 'input.methods',
    'TRG_LANG': 'output.tests',
})