In [14]:
!pip install sparse



In [15]:
import pandas as pd
import numpy as np
import tensorflow as tf
import sparse

# Dataset Ingestion

In [16]:
# The TSV has no header row: column 0 is the pseudocode line, column 1 the code line.
cols = {0: 'pseudo', 1: 'code'}

train_df = pd.read_csv('../../data/input-tok-train-shuf.tsv', header=None, delimiter='\t')
train_df = train_df.rename(columns=cols)

# fillna returns a new Series, so the result has to be assigned back; otherwise
# missing cells stay NaN and .str.split below yields NaN instead of a token list.
train_df['pseudo'] = train_df['pseudo'].fillna(' ')
train_df['code'] = train_df['code'].fillna(' ')

# Tokens are separated by single spaces in both columns.
train_df['pseudo_token'] = train_df['pseudo'].str.split(' ')
train_df['code_token'] = train_df['code'].str.split(' ')
train_df.head()

Unnamed: 0,pseudo,code,pseudo_token,code_token
0,set l to mid,l = mid ;,"[set, l, to, mid]","[l, =, mid, ;]"
1,if i is 0,if ( i == 0 ),"[if, i, is, 0]","[if, (, i, ==, 0, )]"
2,read n and k,cin >> n >> k ;,"[read, n, and, k]","[cin, >>, n, >>, k, ;]"
3,declare long longs sum = 0 and min = LONG_LONG...,"long long min = LONG_LONG_MAX , sum = 0 ;","[declare, long, longs, sum, =, 0, and, min, =,...","[long, long, min, =, LONG_LONG_MAX, ,, sum, =,..."
4,dy = integer array where the the following int...,"int dy [ ] = { 0 , 0 , - 1 , 1 } ;","[dy, =, integer, array, where, the, the, follo...","[int, dy, [, ], =, {, 0, ,, 0, ,, -, 1, ,, 1, ..."


In [17]:
def create_copy_gen_seq_from_pseudo_row(row):
    """
    Build the copy mask and the masked token sequence for one pseudocode line.

    A pseudocode token is "copyable" when the very same token also occurs
    among the row's code tokens.

    Parameters
    ----------
    row : mapping with 'pseudo_token' and 'code_token' entries
        Both entries are token lists (e.g. one row of the training frame).

    Returns
    -------
    tuple (copy_flags, masked_tokens)
        copy_flags has a 1 for every copyable pseudocode token and a 0
        otherwise; masked_tokens is the pseudocode token list with every
        copyable token replaced by the '[CPY]' placeholder.
    """
    copyable = set(row['code_token'])
    pseudo_tokens = row['pseudo_token']

    copy_flags = [int(tok in copyable) for tok in pseudo_tokens]
    masked_tokens = [
        '[CPY]' if flag else tok
        for flag, tok in zip(copy_flags, pseudo_tokens)
    ]

    # Sanity check: exactly one flag per pseudocode token.
    assert len(copy_flags) == len(pseudo_tokens)
    return (copy_flags, masked_tokens)

In [18]:
# Label every row once, then split the (copy mask, masked tokens) pairs into
# two parallel lists that become new columns.
seqs = train_df.apply(create_copy_gen_seq_from_pseudo_row, axis=1)

code_binary_seq = []
generate_seq = []
for pair in seqs:
    code_binary_seq.append(pair[0])
    generate_seq.append(pair[1])

train_df['pseudo_copy_seq'] = code_binary_seq
train_df['pseudo_gen_seq'] = generate_seq
train_df

Unnamed: 0,pseudo,code,pseudo_token,code_token,pseudo_copy_seq,pseudo_gen_seq
0,set l to mid,l = mid ;,"[set, l, to, mid]","[l, =, mid, ;]","[0, 1, 0, 1]","[set, [CPY], to, [CPY]]"
1,if i is 0,if ( i == 0 ),"[if, i, is, 0]","[if, (, i, ==, 0, )]","[1, 1, 0, 1]","[[CPY], [CPY], is, [CPY]]"
2,read n and k,cin >> n >> k ;,"[read, n, and, k]","[cin, >>, n, >>, k, ;]","[0, 1, 0, 1]","[read, [CPY], and, [CPY]]"
3,declare long longs sum = 0 and min = LONG_LONG...,"long long min = LONG_LONG_MAX , sum = 0 ;","[declare, long, longs, sum, =, 0, and, min, =,...","[long, long, min, =, LONG_LONG_MAX, ,, sum, =,...","[0, 1, 0, 1, 1, 1, 0, 1, 1, 1]","[declare, [CPY], longs, [CPY], [CPY], [CPY], a..."
4,dy = integer array where the the following int...,"int dy [ ] = { 0 , 0 , - 1 , 1 } ;","[dy, =, integer, array, where, the, the, follo...","[int, dy, [, ], =, {, 0, ,, 0, ,, -, 1, ,, 1, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...","[[CPY], [CPY], integer, array, where, the, the..."
...,...,...,...,...,...,...
181857,declare static constant integer mod = 1000000009,static const int mod = 1000000009 ;,"[declare, static, constant, integer, mod, =, 1...","[static, const, int, mod, =, 1000000009, ;]","[0, 1, 0, 0, 1, 1, 1]","[declare, [CPY], constant, integer, [CPY], [CP..."
181858,print NO and a new line,"cout << "" NO "" << ' \n ' ;","[print, NO, and, a, new, line]","[cout, <<, "", NO, "", <<, ', \n, ', ;]","[0, 1, 0, 0, 0, 0]","[print, [CPY], and, a, new, line]"
181859,change the value of ans to abs ( x - y ) / d,ans = abs ( x - y ) / d ;,"[change, the, value, of, ans, to, abs, (, x, -...","[ans, =, abs, (, x, -, y, ), /, d, ;]","[0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1]","[change, the, value, of, [CPY], to, [CPY], [CP..."
181860,else if s is less than f,else if ( s < f ),"[else, if, s, is, less, than, f]","[else, if, (, s, <, f, )]","[1, 1, 1, 0, 0, 0, 1]","[[CPY], [CPY], [CPY], is, less, than, [CPY]]"


In [19]:
def create_gen_seq_from_code_row(row):
    """
    Mask the code tokens of one row that can be copied from its pseudocode.

    Every code token that also occurs among the row's pseudocode tokens is
    replaced by the '[CPY]' placeholder; all other code tokens are kept
    unchanged and in their original order.

    Parameters
    ----------
    row : mapping with 'pseudo_token' and 'code_token' entries
        Both entries are token lists (e.g. one row of the training frame).

    Returns
    -------
    list
        The code token list with copyable tokens replaced by '[CPY]'.
    """
    copyable = set(row['pseudo_token'])
    return ['[CPY]' if tok in copyable else tok for tok in row['code_token']]

In [20]:
# Mask the copyable code tokens of every row and keep the result both as a
# standalone Series and as a new column.
train_df['code_gen_seq'] = gen_code_seq = train_df.apply(
    create_gen_seq_from_code_row,
    axis=1,
)
train_df

Unnamed: 0,pseudo,code,pseudo_token,code_token,pseudo_copy_seq,pseudo_gen_seq,code_gen_seq
0,set l to mid,l = mid ;,"[set, l, to, mid]","[l, =, mid, ;]","[0, 1, 0, 1]","[set, [CPY], to, [CPY]]","[[CPY], =, [CPY], ;]"
1,if i is 0,if ( i == 0 ),"[if, i, is, 0]","[if, (, i, ==, 0, )]","[1, 1, 0, 1]","[[CPY], [CPY], is, [CPY]]","[[CPY], (, [CPY], ==, [CPY], )]"
2,read n and k,cin >> n >> k ;,"[read, n, and, k]","[cin, >>, n, >>, k, ;]","[0, 1, 0, 1]","[read, [CPY], and, [CPY]]","[cin, >>, [CPY], >>, [CPY], ;]"
3,declare long longs sum = 0 and min = LONG_LONG...,"long long min = LONG_LONG_MAX , sum = 0 ;","[declare, long, longs, sum, =, 0, and, min, =,...","[long, long, min, =, LONG_LONG_MAX, ,, sum, =,...","[0, 1, 0, 1, 1, 1, 0, 1, 1, 1]","[declare, [CPY], longs, [CPY], [CPY], [CPY], a...","[[CPY], [CPY], [CPY], [CPY], [CPY], ,, [CPY], ..."
4,dy = integer array where the the following int...,"int dy [ ] = { 0 , 0 , - 1 , 1 } ;","[dy, =, integer, array, where, the, the, follo...","[int, dy, [, ], =, {, 0, ,, 0, ,, -, 1, ,, 1, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...","[[CPY], [CPY], integer, array, where, the, the...","[int, [CPY], [CPY], [CPY], [CPY], {, [CPY], [C..."
...,...,...,...,...,...,...,...
181857,declare static constant integer mod = 1000000009,static const int mod = 1000000009 ;,"[declare, static, constant, integer, mod, =, 1...","[static, const, int, mod, =, 1000000009, ;]","[0, 1, 0, 0, 1, 1, 1]","[declare, [CPY], constant, integer, [CPY], [CP...","[[CPY], const, int, [CPY], [CPY], [CPY], ;]"
181858,print NO and a new line,"cout << "" NO "" << ' \n ' ;","[print, NO, and, a, new, line]","[cout, <<, "", NO, "", <<, ', \n, ', ;]","[0, 1, 0, 0, 0, 0]","[print, [CPY], and, a, new, line]","[cout, <<, "", [CPY], "", <<, ', \n, ', ;]"
181859,change the value of ans to abs ( x - y ) / d,ans = abs ( x - y ) / d ;,"[change, the, value, of, ans, to, abs, (, x, -...","[ans, =, abs, (, x, -, y, ), /, d, ;]","[0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1]","[change, the, value, of, [CPY], to, [CPY], [CP...","[[CPY], =, [CPY], [CPY], [CPY], [CPY], [CPY], ..."
181860,else if s is less than f,else if ( s < f ),"[else, if, s, is, less, than, f]","[else, if, (, s, <, f, )]","[1, 1, 1, 0, 0, 0, 1]","[[CPY], [CPY], [CPY], is, less, than, [CPY]]","[[CPY], [CPY], (, [CPY], <, [CPY], )]"


In [21]:
# Wrap each masked code sequence in explicit start / stop markers.
train_df['code_gen_seq_aug'] = train_df['code_gen_seq'].apply(
    lambda tokens: ['[START]'] + tokens + ['[STOP]']
)
train_df

Unnamed: 0,pseudo,code,pseudo_token,code_token,pseudo_copy_seq,pseudo_gen_seq,code_gen_seq,code_gen_seq_aug
0,set l to mid,l = mid ;,"[set, l, to, mid]","[l, =, mid, ;]","[0, 1, 0, 1]","[set, [CPY], to, [CPY]]","[[CPY], =, [CPY], ;]","[[START], [CPY], =, [CPY], ;, [STOP]]"
1,if i is 0,if ( i == 0 ),"[if, i, is, 0]","[if, (, i, ==, 0, )]","[1, 1, 0, 1]","[[CPY], [CPY], is, [CPY]]","[[CPY], (, [CPY], ==, [CPY], )]","[[START], [CPY], (, [CPY], ==, [CPY], ), [STOP]]"
2,read n and k,cin >> n >> k ;,"[read, n, and, k]","[cin, >>, n, >>, k, ;]","[0, 1, 0, 1]","[read, [CPY], and, [CPY]]","[cin, >>, [CPY], >>, [CPY], ;]","[[START], cin, >>, [CPY], >>, [CPY], ;, [STOP]]"
3,declare long longs sum = 0 and min = LONG_LONG...,"long long min = LONG_LONG_MAX , sum = 0 ;","[declare, long, longs, sum, =, 0, and, min, =,...","[long, long, min, =, LONG_LONG_MAX, ,, sum, =,...","[0, 1, 0, 1, 1, 1, 0, 1, 1, 1]","[declare, [CPY], longs, [CPY], [CPY], [CPY], a...","[[CPY], [CPY], [CPY], [CPY], [CPY], ,, [CPY], ...","[[START], [CPY], [CPY], [CPY], [CPY], [CPY], ,..."
4,dy = integer array where the the following int...,"int dy [ ] = { 0 , 0 , - 1 , 1 } ;","[dy, =, integer, array, where, the, the, follo...","[int, dy, [, ], =, {, 0, ,, 0, ,, -, 1, ,, 1, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...","[[CPY], [CPY], integer, array, where, the, the...","[int, [CPY], [CPY], [CPY], [CPY], {, [CPY], [C...","[[START], int, [CPY], [CPY], [CPY], [CPY], {, ..."
...,...,...,...,...,...,...,...,...
181857,declare static constant integer mod = 1000000009,static const int mod = 1000000009 ;,"[declare, static, constant, integer, mod, =, 1...","[static, const, int, mod, =, 1000000009, ;]","[0, 1, 0, 0, 1, 1, 1]","[declare, [CPY], constant, integer, [CPY], [CP...","[[CPY], const, int, [CPY], [CPY], [CPY], ;]","[[START], [CPY], const, int, [CPY], [CPY], [CP..."
181858,print NO and a new line,"cout << "" NO "" << ' \n ' ;","[print, NO, and, a, new, line]","[cout, <<, "", NO, "", <<, ', \n, ', ;]","[0, 1, 0, 0, 0, 0]","[print, [CPY], and, a, new, line]","[cout, <<, "", [CPY], "", <<, ', \n, ', ;]","[[START], cout, <<, "", [CPY], "", <<, ', \n, ',..."
181859,change the value of ans to abs ( x - y ) / d,ans = abs ( x - y ) / d ;,"[change, the, value, of, ans, to, abs, (, x, -...","[ans, =, abs, (, x, -, y, ), /, d, ;]","[0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1]","[change, the, value, of, [CPY], to, [CPY], [CP...","[[CPY], =, [CPY], [CPY], [CPY], [CPY], [CPY], ...","[[START], [CPY], =, [CPY], [CPY], [CPY], [CPY]..."
181860,else if s is less than f,else if ( s < f ),"[else, if, s, is, less, than, f]","[else, if, (, s, <, f, )]","[1, 1, 1, 0, 0, 0, 1]","[[CPY], [CPY], [CPY], is, less, than, [CPY]]","[[CPY], [CPY], (, [CPY], <, [CPY], )]","[[START], [CPY], [CPY], (, [CPY], <, [CPY], ),..."


In [22]:
# Persist only the masked sequences (pseudocode, code, and start/stop-wrapped code).
cpy_columns = ['pseudo_gen_seq', 'code_gen_seq', 'code_gen_seq_aug']
train_df[cpy_columns].to_pickle('../../data/CPY_dataset.pkl')

### Get Vocabulary of pseudocode and code

In [23]:
def get_vocab(column_name, df=None):
    """
    Collect the set of distinct tokens found in a column of token lists.

    Parameters
    ----------
    column_name : str
        Name of the column; every entry must be an iterable of tokens.
    df : DataFrame-like, optional
        Frame to read the column from. Defaults to the notebook-level
        train_df, which keeps existing one-argument calls working.

    Returns
    -------
    set
        Every distinct token in the column (empty set for an empty column).
    """
    if df is None:
        # Looked up lazily so the function can also be used on other frames.
        df = train_df
    return {token for line in df[column_name] for token in line}

In [24]:
def get_max_len(column_name, df=None):
    """
    Return the length of the longest sequence in a column of sequences.

    Parameters
    ----------
    column_name : str
        Name of the column; every entry must support len().
    df : DataFrame-like, optional
        Frame to read the column from. Defaults to the notebook-level
        train_df, which keeps existing one-argument calls working.

    Returns
    -------
    int
        Length of the longest entry, or 0 when the column has no rows
        (a bare max() would raise ValueError in that case).
    """
    if df is None:
        # Looked up lazily so the function can also be used on other frames.
        df = train_df
    return max((len(seq) for seq in df[column_name]), default=0)

In [25]:
# Distinct tokens of the masked pseudocode and of the start/stop-wrapped masked code.
pseudo_gen_vocab, code_gen_aug_vocab = (
    get_vocab(column) for column in ('pseudo_gen_seq', 'code_gen_seq_aug')
)

In [26]:
# Longest sequence per column; these become the padded widths of the model inputs.
max_pseudo_gen, max_code_gen, max_code_gen_aug = (
    get_max_len(column)
    for column in ('pseudo_gen_seq', 'code_gen_seq', 'code_gen_seq_aug')
)

In [27]:
# Write the sequence-length constants to disk as one "name,value" line each.
with open('consts.csv', 'w') as f:
    f.writelines([
        'max_pseudo_gen,{}\n'.format(max_pseudo_gen),
        'max_code_gen,{}\n'.format(max_code_gen),
        'max_code_gen_aug,{}\n'.format(max_code_gen_aug),
    ])

## Build an index for each word

In [28]:
# Sorted vocabularies give a deterministic word order; the encoder side gets an
# extra '[UNK]' entry appended after the sorted words.
input_words = [*sorted(pseudo_gen_vocab), '[UNK]']
output_words = sorted(code_gen_aug_vocab)

In [29]:
import pickle

# Context managers make sure each file is flushed and closed right after the
# dump; a bare open(...) passed inline leaves the handle open.
with open('input_words.pkl', 'wb') as f:
    pickle.dump(input_words, f)

with open('output_words.pkl', 'wb') as f:
    pickle.dump(output_words, f)

In [30]:
# word -> integer id; ids start at 1, so 0 is never assigned to a word.
pseudo_token_index = {word: idx for idx, word in enumerate(input_words, start=1)}
code_token_index = {word: idx for idx, word in enumerate(output_words, start=1)}

# integer id -> word, for turning model output back into tokens.
reverse_pseudo_index = {idx: word for word, idx in pseudo_token_index.items()}
reverse_code_index = {idx: word for word, idx in code_token_index.items()}

# Define Seq2Seq model

In [31]:
# Each vocabulary size gets one extra slot: word ids start at 1, so id 0 stays
# free for padding (1078 decoder tokens in the recorded run -> 1079 slots).
latent_dim = 50
num_encoder_tokens, num_decoder_tokens = (
    len(words) + 1 for words in (input_words, output_words)
)

### Seq2Seq learning for generation

#### Define Placeholders for the input tensors

In [32]:
# Placeholders; the real arrays are built further down in the notebook.
encoder_input_data = decoder_input_data = decoder_target_data = None

In [33]:
# Coordinate-format accumulators for the two 2-D input matrices:
# coords[0] collects row indices, coords[1] column indices, data the cell values.
encoder_matrix_inputs = {'coords': [[], []], 'data': []}

decoder_matrix_inputs = {'coords': [[], []], 'data': []}

In [34]:
# Coordinate-format accumulator for the 3-D target tensor: one index list per
# axis in coords, plus the matching cell values in data.
decoder_target_matrix_inputs = {'coords': [[], [], []], 'data': []}

In [35]:
# encoder_input_data = np.zeros((train_df['pseudo_gen_seq'].shape[0], max_pseudo_gen), dtype='float32')


In [36]:
# decoder_input_data = np.zeros((train_df['pseudo_gen_seq'].shape[0], max_code_gen_aug), dtype='float32')

In [37]:
# decoder_target_data = np.zeros((train_df['pseudo_gen_seq'].shape[0], max_code_gen_aug, num_decoder_tokens), dtype='float32')

In [38]:
# print(encoder_input_data.shape)
# print(decoder_input_data.shape)
# print(decoder_target_data.shape)

In [39]:
# errpr

In [40]:
# Unpack the accumulator lists once so the loop body reads as plain appends.
enc_rows, enc_cols = encoder_matrix_inputs['coords']
enc_vals = encoder_matrix_inputs['data']
dec_rows, dec_cols = decoder_matrix_inputs['coords']
dec_vals = decoder_matrix_inputs['data']
tgt_rows, tgt_steps, tgt_tokens = decoder_target_matrix_inputs['coords']
tgt_vals = decoder_target_matrix_inputs['data']

sequence_pairs = train_df[['pseudo_gen_seq', 'code_gen_seq_aug']].values
for i, (input_seq, target_seq) in enumerate(sequence_pairs):
    # Encoder input: cell (i, t) holds the id of the t-th pseudocode token.
    for t, word in enumerate(input_seq):
        enc_rows.append(i)
        enc_cols.append(t)
        enc_vals.append(pseudo_token_index[word])

    for t, word in enumerate(target_seq):
        # Decoder input: cell (i, t) holds the id of the t-th code token.
        dec_rows.append(i)
        dec_cols.append(t)
        dec_vals.append(code_token_index[word])

        if t == 0:
            # The first token ([START]) is never a prediction target.
            continue

        # Decoder target: one-hot of the same token, shifted one timestep
        # earlier than in the decoder input.
        tgt_rows.append(i)
        tgt_steps.append(t - 1)
        tgt_tokens.append(code_token_index[word])
        tgt_vals.append(1.0)

In [41]:
# Sparse (rows x longest pseudocode sequence) matrix of encoder token ids;
# cells that were never filled read as 0.
encoder_input_data = sparse.COO(
    coords=encoder_matrix_inputs['coords'],
    data=encoder_matrix_inputs['data'],
    shape=(len(train_df), max_pseudo_gen),
)
encoder_input_data

0,1
Format,coo
Data Type,int64
Shape,"(181862, 85)"
nnz,1417923
Density,0.0917258528929822
Read-only,True
Size,32.5M
Storage ratio,0.3


In [42]:
# Sparse (rows x longest wrapped code sequence) matrix of decoder token ids;
# cells that were never filled read as 0.
decoder_input_data = sparse.COO(
    coords=decoder_matrix_inputs['coords'],
    data=decoder_matrix_inputs['data'],
    shape=(len(train_df), max_code_gen_aug),
)
decoder_input_data

0,1
Format,coo
Data Type,int64
Shape,"(181862, 87)"
nnz,2001053
Density,0.12647287061289494
Read-only,True
Size,45.8M
Storage ratio,0.4


In [43]:
# Sparse one-hot targets: (rows x timesteps x decoder vocabulary slots).
decoder_target_data = sparse.COO(
    coords=decoder_target_matrix_inputs['coords'],
    data=decoder_target_matrix_inputs['data'],
    shape=(len(train_df), max_code_gen_aug, num_decoder_tokens),
)
decoder_target_data

0,1
Format,coo
Data Type,float64
Shape,"(181862, 87, 1079)"
nnz,1819191
Density,0.00010656035008279122
Read-only,True
Size,55.5M
Storage ratio,0.0


In [44]:
# Spot-check one training example: show its wrapped target tokens, then the
# shape of a single timestep of its one-hot target.
print(train_df['code_gen_seq_aug'].loc[1500])
decoder_target_data[1500, 1].shape

['[START]', '[CPY]', '(', '[CPY]', '>=', '[CPY]', ')', '[CPY]', ';', '[STOP]']


(1079,)

### Inference

In [46]:
# Load the saved Keras models: the full training model plus the separate
# encoder and decoder used for step-by-step inference.
model, encoder, decoder = (
    tf.keras.models.load_model(path, compile=True)
    for path in (
        '../models/50_epochs_saved/model_50_epochs.h5',
        '../models/50_epochs_saved/encoder_50_epochs.h5',
        '../models/50_epochs_saved/decoder_50_epochs.h5',
    )
)

2022-05-04 11:20:11.677546: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.




In [47]:
def decode_sequence(input_seq, max_decoded_len=50, verbose=False):
    """
    Greedily decode one encoded pseudocode sequence into code tokens.

    Uses the notebook-level `encoder` / `decoder` models and the
    `code_token_index` / `reverse_code_index` lookup tables.

    Parameters
    ----------
    input_seq : array-like
        Encoder input for a single example (batch of size 1).
    max_decoded_len : int, optional
        Decoding stops once more than this many tokens have been produced
        without a '[STOP]' (default 50, the previously hard-coded limit).
    verbose : bool, optional
        When True, print the shape of input_seq (debug aid; off by default so
        bulk decoding does not flood the cell output).

    Returns
    -------
    list of str
        The decoded tokens in order. The list ends with '[STOP]' unless the
        length limit was hit first.
    """
    if verbose:
        print(input_seq.shape)

    # Encode the input as state vectors.
    # Assumes encoder.predict returns a list [h, c]; it is concatenated with a
    # list below - TODO confirm against the saved encoder model.
    states_value = encoder.predict(input_seq)

    # Target sequence of length 1, seeded with the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = code_token_index['[START]']

    decoded_sentence = []
    while True:
        output_tokens, h, c = decoder.predict([target_seq] + states_value)

        # Greedy choice: most probable token at the last timestep.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # Word ids start at 1, so id 0 (padding) has no reverse entry. Treat a
        # predicted padding id as end of sequence instead of raising KeyError.
        sampled_char = reverse_code_index.get(sampled_token_index, '[STOP]')
        decoded_sentence.append(sampled_char)

        # Exit condition: either find the stop token or exceed the length limit.
        if sampled_char == '[STOP]' or len(decoded_sentence) > max_decoded_len:
            break

        # Feed the sampled token and the new states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence

In [48]:
# Decode two hand-picked training rows and compare against the reference.
for seq_index in [399, 100]:
    # A one-row slice keeps the leading batch dimension on the encoder input
    # and makes the Series prints below show the row label.
    row_slice = slice(seq_index, seq_index + 1)
    input_seq = encoder_input_data[row_slice].todense()
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', train_df['pseudo_gen_seq'][row_slice])
    print('True sentence:', train_df['code_gen_seq_aug'][row_slice])
    print('Decoded sentence:', decoded_sentence)

(1, 85)
-
Input sentence: 399    [decrement, [CPY]]
Name: pseudo_gen_seq, dtype: object
True sentence: 399    [[START], [CPY], --, ;, [STOP]]
Name: code_gen_seq_aug, dtype: object
Decoded sentence: ['[CPY]', '--', ';', '[STOP]']
(1, 85)
-
Input sentence: 100    [assign, [CPY], [CPY], to, [CPY]]
Name: pseudo_gen_seq, dtype: object
True sentence: 100    [[START], int, [CPY], =, [CPY], [CPY], ;, [STOP]]
Name: code_gen_seq_aug, dtype: object
Decoded sentence: ['[CPY]', '=', '[CPY]', '[CPY]', '[CPY]', ';', '[STOP]']


In [49]:
# Independent frame holding the inputs, the reference output and an (initially
# empty) column for the model predictions.
pred_df = train_df[['pseudo_gen_seq', 'code_gen_seq']].assign(pred_code_gen_seq='')
pred_df

Unnamed: 0,pseudo_gen_seq,code_gen_seq,pred_code_gen_seq
0,"[set, [CPY], to, [CPY]]","[[CPY], =, [CPY], ;]",
1,"[[CPY], [CPY], is, [CPY]]","[[CPY], (, [CPY], ==, [CPY], )]",
2,"[read, [CPY], and, [CPY]]","[cin, >>, [CPY], >>, [CPY], ;]",
3,"[declare, [CPY], longs, [CPY], [CPY], [CPY], a...","[[CPY], [CPY], [CPY], [CPY], [CPY], ,, [CPY], ...",
4,"[[CPY], [CPY], integer, array, where, the, the...","[int, [CPY], [CPY], [CPY], [CPY], {, [CPY], [C...",
...,...,...,...
181857,"[declare, [CPY], constant, integer, [CPY], [CP...","[[CPY], const, int, [CPY], [CPY], [CPY], ;]",
181858,"[print, [CPY], and, a, new, line]","[cout, <<, "", [CPY], "", <<, ', \n, ', ;]",
181859,"[change, the, value, of, [CPY], to, [CPY], [CP...","[[CPY], =, [CPY], [CPY], [CPY], [CPY], [CPY], ...",
181860,"[[CPY], [CPY], [CPY], is, less, than, [CPY]]","[[CPY], [CPY], (, [CPY], <, [CPY], )]",


In [50]:
def predict_code_gen_seq(row):
    """
    Predict the masked code token sequence for one DataFrame row.

    The row's index label (row.name) is used as the row position in the
    notebook-level encoder_input_data, so the frame this is applied over must
    have the same row order (assumes a default 0..n-1 index - TODO confirm for
    any frame other than the training one).

    Parameters
    ----------
    row : pandas.Series
        One row as passed by DataFrame.apply(..., axis=1).

    Returns
    -------
    list of str
        The decoded tokens without the trailing '[STOP]' marker.
    """
    # Index of the row in the dataframe
    seq_index = row.name
    input_seq = encoder_input_data[seq_index: seq_index+1].todense()

    # Greedy decode of this single example
    decoded_sentence = decode_sequence(input_seq)

    # Lightweight progress indicator for long runs
    if seq_index % 500 == 0:
        print(seq_index)

    # Strip the end marker only when it is really there: if decoding stopped at
    # the length limit, the last token is real output and must be kept.
    if decoded_sentence and decoded_sentence[-1] == '[STOP]':
        return decoded_sentence[:-1]
    return decoded_sentence

In [51]:
# One greedy decode per row, so this is slow on the full frame.
pred_df['pred_code_gen_seq'] = pred_df.apply(predict_code_gen_seq, axis=1)

# Persist the predictions; the next cell reloads them from 'pred.pkl'. This
# line only runs when the apply above finished without being interrupted.
pred_df.to_pickle('pred.pkl')

(1, 85)
0
(1, 85)
(1, 85)
(1, 85)
(1, 85)


KeyboardInterrupt: 

In [None]:
# Reload previously saved predictions from the working directory.
read_df = pd.read_pickle(filepath_or_buffer='pred.pkl')

In [None]:
# Display the reloaded predictions (needs the read_pickle cell above to have run).
read_df

NameError: name 'read_df' is not defined

In [None]:
# test_df = pd.read_csv('../data/input-tok-test.tsv', sep='\t', header=None, names=['pseudo', 'code'])

# test_df['pseudo_token'] = test_df['pseudo'].str.split(' ')
# test_df['code_token'] = test_df['code'].str.split(' ')

# gen_code_seq = test_df.apply(create_gen_seq_from_code_row, axis=1)
# test_df['code_gen_seq'] = gen_code_seq
# test_df

In [None]:
# seqs = test_df.apply(create_copy_gen_seq_from_pseudo_row, axis=1)
# code_binary_seq = [x[0] for x in seqs]
# generate_seq = [x[1] for x in seqs]

# test_df['pseudo_copy_seq'] = code_binary_seq
# test_df['pseudo_gen_seq'] = generate_seq

# test_df['code_gen_seq_aug'] = test_df['code_gen_seq'].apply(lambda x: ['[START]'] + x + ['[STOP]'])
# test_df

In [None]:
# pred_df_test = test_df[['pseudo_gen_seq', 'code_gen_seq']].copy()
# pred_df_test['pred_code_gen_seq'] = ''
# pred_df_test

In [None]:
# for seq_index in range(encoder_input_data.shape[0]):
#     # encoder_input_data.shape[0]
#     # print(encoder_input_data[seq_index])
#     input_seq = encoder_input_data[seq_index: seq_index+1]
#     # print(input_seq)
#     decoded_sentence = decode_sequence(input_seq)
#     # print('-')
#     # print('Input sentence:', train_df['pseudo_gen_seq'][seq_index: seq_index + 1])
#     # print('True sentence:', train_df['code_gen_seq_aug'][seq_index: seq_index + 1])
#     # print('Decoded sentence:', decoded_sentence)
#     pred_df['pred_code_gen_seq'][seq_index] = decoded_sentence[:-1]

In [None]:
# def predict_code_gen_seq(row):
#     """
#     Predict code from pseudocode
#     """
#     # Index of a row in the dataframe
#     seq_index = row.name
#     input_seq = encoder_input_data[seq_index: seq_index+1]

#     # Decode the input as state vectors
#     decoded_sentence = decode_sequence(input_seq)

#     if seq_index % 1000 == 0:
#         print(seq_index)
    
#     return decoded_sentence[:-1]

In [None]:
# for i, (input_seq, target_seq) in enumerate(zip(test_df['pseudo_gen_seq'], test_df['code_gen_seq_aug'])):
#     # print(input_text, target_text)
#     for t, word in enumerate(input_seq):
#         encoder_input_data[i, t] = pseudo_token_index[word] if word in pseudo_token_index else 0
#         # print(word, pseudo_token_index[word])
#     for t, word in enumerate(target_seq):
        
#         # decoder_target_data is ahead of decoder_input_data by one timestep
#         decoder_input_data[i, t] = code_token_index[word] if word in code_token_index else 0
#         # print(word, code_token_index[word])
#         if t > 0:
#             # decoder_target_data will be ahead by one timestep
#             # and will not include the start character.
#             decoder_target_data[i, t - 1, code_token_index[word]] = 1.

In [None]:
# for seq_index in [399, 100]:
#     # encoder_input_data.shape[0]
#     # print(encoder_input_data[seq_index])
#     input_seq = encoder_input_data[seq_index: seq_index+1]
#     # print(input_seq)
#     decoded_sentence = decode_sequence(input_seq)
#     # print('-')
#     # print('Input sentence:', train_df['pseudo_gen_seq'][seq_index: seq_index + 1])
#     # print('True sentence:', train_df['code_gen_seq_aug'][seq_index: seq_index + 1])
#     # print('Decoded sentence:', decoded_sentence)
#     pred_df['pred_code_gen_seq'][seq_index] = decoded_sentence[:-1]