In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
import sparse
from collections import defaultdict

# Dataset Ingestion

In [4]:
# Column positions in the header-less TSV -> readable names.
cols = {0: 'pseudo', 1: 'code'}

eval_df = pd.read_csv('../../data/input-tok-eval.tsv', header=None, delimiter='\t')
eval_df = eval_df.rename(columns=cols)

# fillna returns a new Series, so the result has to be assigned back;
# otherwise missing cells stay NaN and .str.split below yields NaN
# instead of a token list for those rows.
eval_df['pseudo'] = eval_df['pseudo'].fillna(' ')
eval_df['code'] = eval_df['code'].fillna(' ')

# Both columns are already space-tokenised; split them into token lists.
eval_df['pseudo_token'] = eval_df['pseudo'].str.split(' ')
eval_df['code_token'] = eval_df['code'].str.split(' ')
eval_df.head()

Unnamed: 0,pseudo,code,pseudo_token,code_token
0,"let a , b , c , d , e be integers","int a , b , c , d , e ;","[let, a, ,, b, ,, c, ,, d, ,, e, be, integers]","[int, a, ,, b, ,, c, ,, d, ,, e, ;]"
1,read a,cin >> a ;,"[read, a]","[cin, >>, a, ;]"
2,read b,cin >> b ;,"[read, b]","[cin, >>, b, ;]"
3,read c,cin >> c ;,"[read, c]","[cin, >>, c, ;]"
4,read d,cin >> d ;,"[read, d]","[cin, >>, d, ;]"


In [5]:
def create_copy_gen_seq_from_pseudo_row(row):
    """
    Build the copy mask and the generation sequence for one row's pseudocode.

    Reads row['pseudo_token'] and row['code_token'].

    Returns a tuple (copy_mask, gen_tokens); both lists have the same
    length as row['pseudo_token']:
      - copy_mask[i] is 1 when the i-th pseudocode token also occurs among
        the code tokens (it can be copied), otherwise 0
      - gen_tokens[i] is '[CPY]' for a copyable token, otherwise the
        pseudocode token itself
    """
    copyable = set(row['code_token'])
    pseudo_tokens = row['pseudo_token']

    copy_mask = [1 if tok in copyable else 0 for tok in pseudo_tokens]
    gen_tokens = ['[CPY]' if flag else tok for flag, tok in zip(copy_mask, pseudo_tokens)]

    assert len(copy_mask) == len(pseudo_tokens)
    return (copy_mask, gen_tokens)

In [6]:
# One (copy_mask, gen_tokens) pair per dataframe row.
seqs = eval_df.apply(create_copy_gen_seq_from_pseudo_row, axis=1)

# Unzip the pairs into two parallel lists.
code_binary_seq = [copy_mask for copy_mask, _ in seqs]
generate_seq = [gen_tokens for _, gen_tokens in seqs]

eval_df['pseudo_copy_seq'] = code_binary_seq
eval_df['pseudo_gen_seq'] = generate_seq
eval_df

Unnamed: 0,pseudo,code,pseudo_token,code_token,pseudo_copy_seq,pseudo_gen_seq
0,"let a , b , c , d , e be integers","int a , b , c , d , e ;","[let, a, ,, b, ,, c, ,, d, ,, e, be, integers]","[int, a, ,, b, ,, c, ,, d, ,, e, ;]","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]","[let, [CPY], [CPY], [CPY], [CPY], [CPY], [CPY]..."
1,read a,cin >> a ;,"[read, a]","[cin, >>, a, ;]","[0, 1]","[read, [CPY]]"
2,read b,cin >> b ;,"[read, b]","[cin, >>, b, ;]","[0, 1]","[read, [CPY]]"
3,read c,cin >> c ;,"[read, c]","[cin, >>, c, ;]","[0, 1]","[read, [CPY]]"
4,read d,cin >> d ;,"[read, d]","[cin, >>, d, ;]","[0, 1]","[read, [CPY]]"
...,...,...,...,...,...,...
19175,sort array a,"sort ( a , a + m ) ;","[sort, array, a]","[sort, (, a, ,, a, +, m, ), ;]","[1, 0, 1]","[[CPY], array, [CPY]]"
19176,sort b,"sort ( b , b + m ) ;","[sort, b]","[sort, (, b, ,, b, +, m, ), ;]","[1, 1]","[[CPY], [CPY]]"
19177,reverse b,"reverse ( b , b + m ) ;","[reverse, b]","[reverse, (, b, ,, b, +, m, ), ;]","[1, 1]","[[CPY], [CPY]]"
19178,"for i = 0 to m exclusive , set cnt [ b [ i ] ....",for ( int i = 0 ; i < m ; ++ i ) { cnt [ b [ i...,"[for, i, =, 0, to, m, exclusive, ,, set, cnt, ...","[for, (, int, i, =, 0, ;, i, <, m, ;, ++, i, )...","[1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...","[[CPY], [CPY], [CPY], [CPY], to, [CPY], exclus..."


In [7]:
def create_gen_seq_from_code_row(row):
    """
    Build the generation sequence for one row's code tokens.

    Every token of row['code_token'] that also occurs in
    row['pseudo_token'] (i.e. can be copied from the pseudocode) is
    replaced by the '[CPY]' placeholder; all other tokens are kept.

    Returns a list with the same length and order as row['code_token'].
    """
    copyable = set(row['pseudo_token'])
    return ['[CPY]' if tok in copyable else tok for tok in row['code_token']]

In [8]:
# Row-wise: replace every code token that also appears in the row's
# pseudocode with '[CPY]', then store the result as a new column.
gen_code_seq = eval_df.apply(create_gen_seq_from_code_row, axis=1)
eval_df['code_gen_seq'] = gen_code_seq
eval_df

Unnamed: 0,pseudo,code,pseudo_token,code_token,pseudo_copy_seq,pseudo_gen_seq,code_gen_seq
0,"let a , b , c , d , e be integers","int a , b , c , d , e ;","[let, a, ,, b, ,, c, ,, d, ,, e, be, integers]","[int, a, ,, b, ,, c, ,, d, ,, e, ;]","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]","[let, [CPY], [CPY], [CPY], [CPY], [CPY], [CPY]...","[int, [CPY], [CPY], [CPY], [CPY], [CPY], [CPY]..."
1,read a,cin >> a ;,"[read, a]","[cin, >>, a, ;]","[0, 1]","[read, [CPY]]","[cin, >>, [CPY], ;]"
2,read b,cin >> b ;,"[read, b]","[cin, >>, b, ;]","[0, 1]","[read, [CPY]]","[cin, >>, [CPY], ;]"
3,read c,cin >> c ;,"[read, c]","[cin, >>, c, ;]","[0, 1]","[read, [CPY]]","[cin, >>, [CPY], ;]"
4,read d,cin >> d ;,"[read, d]","[cin, >>, d, ;]","[0, 1]","[read, [CPY]]","[cin, >>, [CPY], ;]"
...,...,...,...,...,...,...,...
19175,sort array a,"sort ( a , a + m ) ;","[sort, array, a]","[sort, (, a, ,, a, +, m, ), ;]","[1, 0, 1]","[[CPY], array, [CPY]]","[[CPY], (, [CPY], ,, [CPY], +, m, ), ;]"
19176,sort b,"sort ( b , b + m ) ;","[sort, b]","[sort, (, b, ,, b, +, m, ), ;]","[1, 1]","[[CPY], [CPY]]","[[CPY], (, [CPY], ,, [CPY], +, m, ), ;]"
19177,reverse b,"reverse ( b , b + m ) ;","[reverse, b]","[reverse, (, b, ,, b, +, m, ), ;]","[1, 1]","[[CPY], [CPY]]","[[CPY], (, [CPY], ,, [CPY], +, m, ), ;]"
19178,"for i = 0 to m exclusive , set cnt [ b [ i ] ....",for ( int i = 0 ; i < m ; ++ i ) { cnt [ b [ i...,"[for, i, =, 0, to, m, exclusive, ,, set, cnt, ...","[for, (, int, i, =, 0, ;, i, <, m, ;, ++, i, )...","[1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...","[[CPY], [CPY], [CPY], [CPY], to, [CPY], exclus...","[[CPY], (, int, [CPY], [CPY], [CPY], ;, [CPY],..."


In [9]:
# Decoder sequences are the code generation sequences wrapped in
# '[START]' ... '[STOP]' sentinels.
eval_df['code_gen_seq_aug'] = eval_df['code_gen_seq'].apply(
    lambda tokens: ['[START]', *tokens, '[STOP]']
)
eval_df

Unnamed: 0,pseudo,code,pseudo_token,code_token,pseudo_copy_seq,pseudo_gen_seq,code_gen_seq,code_gen_seq_aug
0,"let a , b , c , d , e be integers","int a , b , c , d , e ;","[let, a, ,, b, ,, c, ,, d, ,, e, be, integers]","[int, a, ,, b, ,, c, ,, d, ,, e, ;]","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]","[let, [CPY], [CPY], [CPY], [CPY], [CPY], [CPY]...","[int, [CPY], [CPY], [CPY], [CPY], [CPY], [CPY]...","[[START], int, [CPY], [CPY], [CPY], [CPY], [CP..."
1,read a,cin >> a ;,"[read, a]","[cin, >>, a, ;]","[0, 1]","[read, [CPY]]","[cin, >>, [CPY], ;]","[[START], cin, >>, [CPY], ;, [STOP]]"
2,read b,cin >> b ;,"[read, b]","[cin, >>, b, ;]","[0, 1]","[read, [CPY]]","[cin, >>, [CPY], ;]","[[START], cin, >>, [CPY], ;, [STOP]]"
3,read c,cin >> c ;,"[read, c]","[cin, >>, c, ;]","[0, 1]","[read, [CPY]]","[cin, >>, [CPY], ;]","[[START], cin, >>, [CPY], ;, [STOP]]"
4,read d,cin >> d ;,"[read, d]","[cin, >>, d, ;]","[0, 1]","[read, [CPY]]","[cin, >>, [CPY], ;]","[[START], cin, >>, [CPY], ;, [STOP]]"
...,...,...,...,...,...,...,...,...
19175,sort array a,"sort ( a , a + m ) ;","[sort, array, a]","[sort, (, a, ,, a, +, m, ), ;]","[1, 0, 1]","[[CPY], array, [CPY]]","[[CPY], (, [CPY], ,, [CPY], +, m, ), ;]","[[START], [CPY], (, [CPY], ,, [CPY], +, m, ), ..."
19176,sort b,"sort ( b , b + m ) ;","[sort, b]","[sort, (, b, ,, b, +, m, ), ;]","[1, 1]","[[CPY], [CPY]]","[[CPY], (, [CPY], ,, [CPY], +, m, ), ;]","[[START], [CPY], (, [CPY], ,, [CPY], +, m, ), ..."
19177,reverse b,"reverse ( b , b + m ) ;","[reverse, b]","[reverse, (, b, ,, b, +, m, ), ;]","[1, 1]","[[CPY], [CPY]]","[[CPY], (, [CPY], ,, [CPY], +, m, ), ;]","[[START], [CPY], (, [CPY], ,, [CPY], +, m, ), ..."
19178,"for i = 0 to m exclusive , set cnt [ b [ i ] ....",for ( int i = 0 ; i < m ; ++ i ) { cnt [ b [ i...,"[for, i, =, 0, to, m, exclusive, ,, set, cnt, ...","[for, (, int, i, =, 0, ;, i, <, m, ;, ++, i, )...","[1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...","[[CPY], [CPY], [CPY], [CPY], to, [CPY], exclus...","[[CPY], (, int, [CPY], [CPY], [CPY], ;, [CPY],...","[[START], [CPY], (, int, [CPY], [CPY], [CPY], ..."


In [10]:
# Persist the three generation-sequence columns so they can be reloaded
# without re-running the preprocessing above.
eval_df[['pseudo_gen_seq', 'code_gen_seq', 'code_gen_seq_aug']].to_pickle('../../data/CPY_dataset_eval.pkl')

### Get Vocabulary of pseudocode and code

In [11]:
# Load consts.csv (no header row); the first column becomes the index, so
# the values are looked up by name via consts_df.loc[<name>].
# NOTE(review): presumably written by the training notebook — confirm.
consts_df = pd.read_csv('consts.csv', header=None, index_col=0)

In [12]:
# Sequence-length constants; the value sits in the column labelled 1
# (the file has no header and column 0 is the index).
max_pseudo_gen = consts_df.loc['max_pseudo_gen', 1]
max_code_gen = consts_df.loc['max_code_gen', 1]
max_code_gen_aug = consts_df.loc['max_code_gen_aug', 1]

In [13]:
# def get_vocab(column_name):
#     vocab = set()
#     for line in train_df[column_name]:
#         for token in line:
#             vocab.add(token) 
#     return vocab

In [14]:
# def get_max_len(column_name):
#     maxlist = max(train_df[column_name], key=len)
#     return len(maxlist)

In [15]:
# pseudo_gen_vocab = get_vocab('pseudo_gen_seq')
# code_gen_aug_vocab = get_vocab('code_gen_seq_aug')

In [16]:
# max_pseudo_gen = get_max_len('pseudo_gen_seq')
# max_code_gen = get_max_len('code_gen_seq')
# max_code_gen_aug = get_max_len('code_gen_seq_aug')

## Build an index for each word

In [17]:
# input_words = sorted(list(pseudo_gen_vocab)) + ['[UNK]']
# output_words = sorted(list(code_gen_aug_vocab))

In [18]:
import pickle

# Load the input (pseudocode) and output (code) vocabularies.
# `with` closes each file handle as soon as the pickle has been read; the
# bare open(...) calls used before left both handles open.
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# load vocabulary files from a trusted source.
with open('input_words.pkl', 'rb') as input_words_file:
    input_words = pickle.load(input_words_file)
with open('output_words.pkl', 'rb') as output_words_file:
    output_words = pickle.load(output_words_file)

In [19]:
# Token -> integer id. Ids start at 1, which leaves 0 unused by any token.
pseudo_token_index = {word: idx for idx, word in enumerate(input_words, start=1)}
code_token_index = {word: idx for idx, word in enumerate(output_words, start=1)}

# Inverse lookups: integer id -> token.
reverse_pseudo_index = {idx: word for word, idx in pseudo_token_index.items()}
reverse_code_index = {idx: word for word, idx in code_token_index.items()}

In [20]:
# Unknown tokens fall back to the id of the empty-string token ''.
# The fallback is looked up in a plain-dict snapshot of the vocabulary: the
# previous factory looked '' up on the defaultdict itself, so a vocabulary
# without '' recursed without bound on the first unknown token. Now that
# case raises a plain KeyError('') instead.
_pseudo_vocab = dict(pseudo_token_index)
_code_vocab = dict(code_token_index)

pseudo_token_index = defaultdict(lambda: _pseudo_vocab[''], _pseudo_vocab)
code_token_index = defaultdict(lambda: _code_vocab[''], _code_vocab)

# Define Seq2Seq model

In [21]:
# Vocabulary sizes get one extra slot: token ids start at 1, id 0 is the padding value.
num_encoder_tokens = len(input_words) + 1
# NOTE(review): latent_dim is not used by any cell visible here; presumably it
# mirrors the hidden size of the saved models — confirm.
latent_dim = 50 
num_decoder_tokens = len(output_words) + 1 
# Example: 1078 output tokens -> ids 1..1078 -> 1079 decoder slots (id 0 = padding).

### Seq2Seq learning for generation

#### Define Placeholders for the input tensors

In [22]:
# Placeholders only; each name is rebound to a sparse.COO array in a later cell.
encoder_input_data = None
decoder_input_data = None
decoder_target_data = None

In [23]:
# COO buffers for the 2-D encoder / decoder inputs:
# 'coords' holds one index list per axis, 'data' the value at each coordinate.
encoder_matrix_inputs = {'coords': [[], []], 'data': []}

decoder_matrix_inputs = {'coords': [[], []], 'data': []}

In [24]:
# COO buffer for the 3-D decoder target: three index lists (one per axis)
# plus the value stored at each coordinate.
decoder_target_matrix_inputs = {'coords': [[], [], []], 'data': []}

In [25]:
# encoder_input_data = np.zeros((train_df['pseudo_gen_seq'].shape[0], max_pseudo_gen), dtype='float32')


In [26]:
# decoder_input_data = np.zeros((train_df['pseudo_gen_seq'].shape[0], max_code_gen_aug), dtype='float32')

In [27]:
# decoder_target_data = np.zeros((train_df['pseudo_gen_seq'].shape[0], max_code_gen_aug, num_decoder_tokens), dtype='float32')

In [28]:
# print(encoder_input_data.shape)
# print(decoder_input_data.shape)
# print(decoder_target_data.shape)

In [29]:
# errpr

In [30]:
# Start from empty COO buffers every time this cell runs. The loop below only
# appends, so re-running it on already-filled buffers would store every
# coordinate twice, and sparse.COO sums duplicate coordinates (token ids and
# one-hot values would silently double).
encoder_matrix_inputs = {'coords': [[], []], 'data': []}
decoder_matrix_inputs = {'coords': [[], []], 'data': []}
decoder_target_matrix_inputs = {'coords': [[], [], []], 'data': []}

for i, (input_seq, target_seq) in enumerate(eval_df[['pseudo_gen_seq', 'code_gen_seq_aug']].values):
  # Encoder input: entry (i, t) is the id of the t-th pseudocode token of row i.
  for t, word in enumerate(input_seq):
    encoder_matrix_inputs['coords'][0].append(i)
    encoder_matrix_inputs['coords'][1].append(t)
    encoder_matrix_inputs['data'].append(pseudo_token_index[word])

  for t, word in enumerate(target_seq):
    token_id = code_token_index[word]

    # Decoder input: entry (i, t) is the id of the t-th target token,
    # including the leading '[START]'.
    decoder_matrix_inputs['coords'][0].append(i)
    decoder_matrix_inputs['coords'][1].append(t)
    decoder_matrix_inputs['data'].append(token_id)

    if t > 0:
      # Decoder target is one timestep ahead of the decoder input and does
      # not include the start token: one-hot entry (i, t-1, token_id) = 1.
      decoder_target_matrix_inputs['coords'][0].append(i)
      decoder_target_matrix_inputs['coords'][1].append(t-1)
      decoder_target_matrix_inputs['coords'][2].append(token_id)
      decoder_target_matrix_inputs['data'].append(1.0)

In [31]:
# (rows, max_pseudo_gen) matrix of pseudocode token ids; unset cells are 0.
encoder_input_data = sparse.COO(
    coords=encoder_matrix_inputs['coords'],
    data=encoder_matrix_inputs['data'],
    shape=(len(eval_df), max_pseudo_gen),
)
encoder_input_data

0,1
Format,coo
Data Type,int64
Shape,"(19180, 85)"
nnz,145765
Density,0.0894099245537631
Read-only,True
Size,3.3M
Storage ratio,0.3


In [32]:
# (rows, max_code_gen_aug) matrix of target token ids; unset cells are 0.
decoder_input_data = sparse.COO(
    coords=decoder_matrix_inputs['coords'],
    data=decoder_matrix_inputs['data'],
    shape=(len(eval_df), max_code_gen_aug),
)
decoder_input_data

0,1
Format,coo
Data Type,int64
Shape,"(19180, 87)"
nnz,207845
Density,0.12455802859779704
Read-only,True
Size,4.8M
Storage ratio,0.4


In [33]:
# (rows, max_code_gen_aug, num_decoder_tokens) one-hot tensor of next-token targets.
decoder_target_data = sparse.COO(
    coords=decoder_target_matrix_inputs['coords'],
    data=decoder_target_matrix_inputs['data'],
    shape=(len(eval_df), max_code_gen_aug, num_decoder_tokens),
)
decoder_target_data

0,1
Format,coo
Data Type,float64
Shape,"(19180, 87, 1079)"
nnz,188665
Density,0.00010478570502709344
Read-only,True
Size,5.8M
Storage ratio,0.0


In [34]:
# Spot-check row 1500: print its augmented target sequence, then show the
# shape of a single timestep of the one-hot decoder target
# (expected: (num_decoder_tokens,)).

print(eval_df['code_gen_seq_aug'][1500])
# Equivalent inspection of the other arrays: encoder_input_data[1500], decoder_input_data[1500]
decoder_target_data[1500][1].shape

['[START]', 'int', '[CPY]', '[CPY]', '[CPY]', '[CPY]', '[CPY]', ';', '[STOP]']


(1079,)

### Inference

In [35]:
# Load the three saved Keras models (HDF5 files): the full model plus the
# separate encoder and decoder that decode_sequence calls at inference time.
# NOTE(review): these paths use '../models/' while the data cells use
# '../../data/' — confirm the relative locations are both correct.

model = tf.keras.models.load_model('../models/50_epochs_saved/model_50_epochs.h5', compile=True)
encoder = tf.keras.models.load_model('../models/50_epochs_saved/encoder_50_epochs.h5', compile=True)
decoder = tf.keras.models.load_model('../models/50_epochs_saved/decoder_50_epochs.h5', compile=True)

2022-05-04 11:28:52.095932: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.




In [36]:
def decode_sequence(input_seq, max_decoded_len=50):
    """
    Greedily decode one encoder input sequence into a list of code tokens.

    Uses the notebook-level `encoder` and `decoder` models and the
    `code_token_index` / `reverse_code_index` vocabulary mappings.

    Parameters
    ----------
    input_seq : array-like
        Passed unchanged to `encoder.predict`. A batch of size 1 is assumed
        (the target buffer is (1, 1) and only output row 0 is read).
    max_decoded_len : int, default 50
        Length cap. Decoding stops once MORE than this many tokens have been
        produced, so up to `max_decoded_len + 1` tokens can be returned.
        The default reproduces the previously hard-coded limit.

    Returns
    -------
    list of str
        Decoded tokens in generation order. The last token is '[STOP]'
        unless decoding was cut off by the length cap.

    Raises
    ------
    KeyError
        If the sampled id has no entry in `reverse_code_index`
        (NOTE(review): ids there start at 1, so an argmax of 0 would raise).
    """
    # Encode the input; the result is the initial decoder state list.
    states_value = encoder.predict(input_seq)

    # Length-1 target sequence seeded with the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = code_token_index['[START]']

    decoded_sentence = []
    while True:
        output_tokens, h, c = decoder.predict([target_seq] + states_value)

        # Greedy choice: most probable token at the last timestep.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_code_index[sampled_token_index]
        decoded_sentence.append(sampled_char)

        # Exit condition: stop token found or length cap exceeded.
        if sampled_char == '[STOP]' or len(decoded_sentence) > max_decoded_len:
            break

        # Feed the sampled token and the returned states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence

In [37]:
# Decode two sample rows and print them next to the ground truth.
for seq_index in [599, 100]:
    row_slice = slice(seq_index, seq_index + 1)
    input_seq = encoder_input_data[row_slice].todense()
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', eval_df['pseudo_gen_seq'][row_slice])
    print('True sentence:', eval_df['code_gen_seq_aug'][row_slice])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: 599    [print, [CPY], and, endline]
Name: pseudo_gen_seq, dtype: object
True sentence: 599    [[START], cout, <<, [CPY], <<, endl, ;, [STOP]]
Name: code_gen_seq_aug, dtype: object
Decoded sentence: ['cout', '<<', '[CPY]', '<<', 'endl', ';', '[STOP]']
-
Input sentence: 100    [print, [CPY], [CPY]]
Name: pseudo_gen_seq, dtype: object
True sentence: 100    [[START], cout, <<, [CPY], [CPY], <<, endl, ;,...
Name: code_gen_seq_aug, dtype: object
Decoded sentence: ['cout', '<<', '[CPY]', '[CPY]', '<<', 'endl', ';', '[STOP]']


In [38]:
# Independent frame holding inputs, ground truth and an (initially empty)
# prediction column.
pred_df = eval_df[['pseudo_gen_seq', 'code_gen_seq']].assign(pred_code_gen_seq='')
pred_df

Unnamed: 0,pseudo_gen_seq,code_gen_seq,pred_code_gen_seq
0,"[let, [CPY], [CPY], [CPY], [CPY], [CPY], [CPY]...","[int, [CPY], [CPY], [CPY], [CPY], [CPY], [CPY]...",
1,"[read, [CPY]]","[cin, >>, [CPY], ;]",
2,"[read, [CPY]]","[cin, >>, [CPY], ;]",
3,"[read, [CPY]]","[cin, >>, [CPY], ;]",
4,"[read, [CPY]]","[cin, >>, [CPY], ;]",
...,...,...,...
19175,"[[CPY], array, [CPY]]","[[CPY], (, [CPY], ,, [CPY], +, m, ), ;]",
19176,"[[CPY], [CPY]]","[[CPY], (, [CPY], ,, [CPY], +, m, ), ;]",
19177,"[[CPY], [CPY]]","[[CPY], (, [CPY], ,, [CPY], +, m, ), ;]",
19178,"[[CPY], [CPY], [CPY], [CPY], to, [CPY], exclus...","[[CPY], (, int, [CPY], [CPY], [CPY], ;, [CPY],...",


In [39]:
def predict_code_gen_seq(row):
    """
    Predict the code generation sequence for one dataframe row.

    `row.name` is used as the row offset into the notebook-level
    `encoder_input_data` (assumes the dataframe index equals the row
    position, i.e. a default RangeIndex — TODO confirm). Progress is
    printed every 500 rows.

    Returns the decoded token list without the trailing '[STOP]'. When
    decoding was cut off by the length cap there is no '[STOP]' to strip,
    so the tokens are returned unchanged (previously the last real token
    was dropped in that case).
    """
    # Index of the row in the dataframe
    seq_index = row.name
    input_seq = encoder_input_data[seq_index: seq_index+1].todense()

    decoded_sentence = decode_sequence(input_seq)

    if seq_index % 500 == 0:
        print(seq_index)

    # Strip the stop sentinel only when it is actually there.
    if decoded_sentence and decoded_sentence[-1] == '[STOP]':
        return decoded_sentence[:-1]
    return decoded_sentence

In [40]:
# Decode every row (one decode_sequence call per row, progress printed every
# 500 rows) and persist the predictions to 'pred_eval.pkl'.
pred_df['pred_code_gen_seq'] = pred_df.apply(predict_code_gen_seq, axis=1)
pred_df.to_pickle('pred_eval.pkl') 

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000


In [41]:
# NOTE(review): the previous cell writes 'pred_eval.pkl', but this reads
# 'pred.pkl' — confirm which predictions file is meant to be inspected here.
read_df = pd.read_pickle('pred.pkl')

In [42]:
# Display the loaded predictions next to the ground-truth sequences.
read_df

Unnamed: 0,pseudo_gen_seq,code_gen_seq,pred_code_gen_seq
0,"[set, [CPY], to, [CPY]]","[[CPY], =, [CPY], ;]","[[CPY], =, [CPY], ;]"
1,"[[CPY], [CPY], is, [CPY]]","[[CPY], (, [CPY], ==, [CPY], )]","[[CPY], (, [CPY], ==, [CPY], )]"
2,"[read, [CPY], and, [CPY]]","[cin, >>, [CPY], >>, [CPY], ;]","[cin, >>, [CPY], >>, [CPY], ;]"
3,"[declare, [CPY], longs, [CPY], [CPY], [CPY], a...","[[CPY], [CPY], [CPY], [CPY], [CPY], ,, [CPY], ...","[[CPY], [CPY], [CPY], [CPY], [CPY], ,, [CPY], ..."
4,"[[CPY], [CPY], integer, array, where, the, the...","[int, [CPY], [CPY], [CPY], [CPY], {, [CPY], [C...","[int, [CPY], [CPY], [CPY], [CPY], [CPY], [CPY]..."
...,...,...,...
181857,"[declare, [CPY], constant, integer, [CPY], [CP...","[[CPY], const, int, [CPY], [CPY], [CPY], ;]","[const, int, [CPY], [CPY], [CPY], ;]"
181858,"[print, [CPY], and, a, new, line]","[cout, <<, "", [CPY], "", <<, ', \n, ', ;]","[cout, <<, [CPY], <<, endl, ;]"
181859,"[change, the, value, of, [CPY], to, [CPY], [CP...","[[CPY], =, [CPY], [CPY], [CPY], [CPY], [CPY], ...","[[CPY], =, [CPY], [CPY], [CPY], [CPY], [CPY], ..."
181860,"[[CPY], [CPY], [CPY], is, less, than, [CPY]]","[[CPY], [CPY], (, [CPY], <, [CPY], )]","[[CPY], [CPY], (, [CPY], <, [CPY], )]"


In [43]:
# test_df = pd.read_csv('../data/input-tok-test.tsv', sep='\t', header=None, names=['pseudo', 'code'])

# test_df['pseudo_token'] = test_df['pseudo'].str.split(' ')
# test_df['code_token'] = test_df['code'].str.split(' ')

# gen_code_seq = test_df.apply(create_gen_seq_from_code_row, axis=1)
# test_df['code_gen_seq'] = gen_code_seq
# test_df

In [44]:
# seqs = test_df.apply(create_copy_gen_seq_from_pseudo_row, axis=1)
# code_binary_seq = [x[0] for x in seqs]
# generate_seq = [x[1] for x in seqs]

# test_df['pseudo_copy_seq'] = code_binary_seq
# test_df['pseudo_gen_seq'] = generate_seq

# test_df['code_gen_seq_aug'] = test_df['code_gen_seq'].apply(lambda x: ['[START]'] + x + ['[STOP]'])
# test_df

In [45]:
# pred_df_test = test_df[['pseudo_gen_seq', 'code_gen_seq']].copy()
# pred_df_test['pred_code_gen_seq'] = ''
# pred_df_test

In [46]:
# for seq_index in range(encoder_input_data.shape[0]):
#     # encoder_input_data.shape[0]
#     # print(encoder_input_data[seq_index])
#     input_seq = encoder_input_data[seq_index: seq_index+1]
#     # print(input_seq)
#     decoded_sentence = decode_sequence(input_seq)
#     # print('-')
#     # print('Input sentence:', train_df['pseudo_gen_seq'][seq_index: seq_index + 1])
#     # print('True sentence:', train_df['code_gen_seq_aug'][seq_index: seq_index + 1])
#     # print('Decoded sentence:', decoded_sentence)
#     pred_df['pred_code_gen_seq'][seq_index] = decoded_sentence[:-1]

In [47]:
# def predict_code_gen_seq(row):
#     """
#     Predict code from pseudocode
#     """
#     # Index of a row in the dataframe
#     seq_index = row.name
#     input_seq = encoder_input_data[seq_index: seq_index+1]

#     # Decode the input as state vectors
#     decoded_sentence = decode_sequence(input_seq)

#     if seq_index % 1000 == 0:
#         print(seq_index)
    
#     return decoded_sentence[:-1]

In [48]:
# for i, (input_seq, target_seq) in enumerate(zip(test_df['pseudo_gen_seq'], test_df['code_gen_seq_aug'])):
#     # print(input_text, target_text)
#     for t, word in enumerate(input_seq):
#         encoder_input_data[i, t] = pseudo_token_index[word] if word in pseudo_token_index else 0
#         # print(word, pseudo_token_index[word])
#     for t, word in enumerate(target_seq):
        
#         # decoder_target_data is ahead of decoder_input_data by one timestep
#         decoder_input_data[i, t] = code_token_index[word] if word in code_token_index else 0
#         # print(word, code_token_index[word])
#         if t > 0:
#             # decoder_target_data will be ahead by one timestep
#             # and will not include the start character.
#             decoder_target_data[i, t - 1, code_token_index[word]] = 1.

In [49]:
# for seq_index in [399, 100]:
#     # encoder_input_data.shape[0]
#     # print(encoder_input_data[seq_index])
#     input_seq = encoder_input_data[seq_index: seq_index+1]
#     # print(input_seq)
#     decoded_sentence = decode_sequence(input_seq)
#     # print('-')
#     # print('Input sentence:', train_df['pseudo_gen_seq'][seq_index: seq_index + 1])
#     # print('True sentence:', train_df['code_gen_seq_aug'][seq_index: seq_index + 1])
#     # print('Decoded sentence:', decoded_sentence)
#     pred_df['pred_code_gen_seq'][seq_index] = decoded_sentence[:-1]