In [4]:
import pandas as pd
import numpy as np

# Dataset Ingestion

In [6]:
cols = {0: 'pseudo', 1: 'code'}

# Load the tab-separated (pseudocode, code) pairs, name the two positional
# columns, and derive whitespace-tokenized versions of each side.
# For the eval split, read '../../data/input-tok-eval.tsv' instead.
train_df = (
    pd.read_csv('../../data/input-tok-train-shuf.tsv', header=None, sep='\t')
    .rename(columns=cols)
    .assign(
        pseudo_token=lambda frame: frame['pseudo'].str.split(' '),
        code_token=lambda frame: frame['code'].str.split(' '),
    )
)
train_df.head()

Unnamed: 0,pseudo,code,pseudo_token,code_token
0,"let a , b , c , d , e be integers","int a , b , c , d , e ;","[let, a, ,, b, ,, c, ,, d, ,, e, be, integers]","[int, a, ,, b, ,, c, ,, d, ,, e, ;]"
1,read a,cin >> a ;,"[read, a]","[cin, >>, a, ;]"
2,read b,cin >> b ;,"[read, b]","[cin, >>, b, ;]"
3,read c,cin >> c ;,"[read, c]","[cin, >>, c, ;]"
4,read d,cin >> d ;,"[read, d]","[cin, >>, d, ;]"


In [7]:
def create_copy_gen_seq_from_pseudo_row(row):
    """
    Returns the copy mask and the placeholder-substituted pseudocode
    sequence for one row, as the tuple (output_seq, generate_seq)

    A pseudocode token is copyable when the identical token also exists
    in the true code tokens. Both returned lists have one entry per
    pseudocode token:

    - output_seq holds 1 where the token is copyable, 0 otherwise
    - generate_seq holds the placeholder '[CPY_k]' where the token is
      copyable, and the token itself otherwise

    Placeholders are numbered from 1 in order of first appearance in the
    pseudocode; every repeat of the same token reuses the same number.

    row must provide row['pseudo_token'] and row['code_token'], both
    sequences of string tokens.
    """
    code_token_set = set(row['code_token'])
    placeholder_for = {}  # copyable token -> its '[CPY_k]' placeholder
    output_seq = []
    generate_seq = []

    for token in row['pseudo_token']:
        if token in code_token_set:
            if token not in placeholder_for:
                # Next free number == how many distinct copyable tokens
                # have been seen so far, plus one.
                placeholder_for[token] = f'[CPY_{len(placeholder_for) + 1}]'
            output_seq.append(1)
            generate_seq.append(placeholder_for[token])
        else:
            output_seq.append(0)
            generate_seq.append(token)

    return (output_seq, generate_seq)

In [8]:
# Label every row once; each result is an (output_seq, generate_seq) pair,
# which is then split into two parallel per-row lists and stored as columns.
seqs = train_df.apply(create_copy_gen_seq_from_pseudo_row, axis=1)
code_binary_seq = [copy_mask for copy_mask, _ in seqs]
generate_seq = [gen_tokens for _, gen_tokens in seqs]
train_df['pseudo_copy_seq'] = code_binary_seq
train_df['pseudo_gen_seq'] = generate_seq
train_df

Unnamed: 0,pseudo,code,pseudo_token,code_token,pseudo_copy_seq,pseudo_gen_seq
0,"let a , b , c , d , e be integers","int a , b , c , d , e ;","[let, a, ,, b, ,, c, ,, d, ,, e, be, integers]","[int, a, ,, b, ,, c, ,, d, ,, e, ;]","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]","[let, [CPY_1], [CPY_2], [CPY_3], [CPY_2], [CPY..."
1,read a,cin >> a ;,"[read, a]","[cin, >>, a, ;]","[0, 1]","[read, [CPY_1]]"
2,read b,cin >> b ;,"[read, b]","[cin, >>, b, ;]","[0, 1]","[read, [CPY_1]]"
3,read c,cin >> c ;,"[read, c]","[cin, >>, c, ;]","[0, 1]","[read, [CPY_1]]"
4,read d,cin >> d ;,"[read, d]","[cin, >>, d, ;]","[0, 1]","[read, [CPY_1]]"
...,...,...,...,...,...,...
19175,sort array a,"sort ( a , a + m ) ;","[sort, array, a]","[sort, (, a, ,, a, +, m, ), ;]","[1, 0, 1]","[[CPY_1], array, [CPY_2]]"
19176,sort b,"sort ( b , b + m ) ;","[sort, b]","[sort, (, b, ,, b, +, m, ), ;]","[1, 1]","[[CPY_1], [CPY_2]]"
19177,reverse b,"reverse ( b , b + m ) ;","[reverse, b]","[reverse, (, b, ,, b, +, m, ), ;]","[1, 1]","[[CPY_1], [CPY_2]]"
19178,"for i = 0 to m exclusive , set cnt [ b [ i ] ....",for ( int i = 0 ; i < m ; ++ i ) { cnt [ b [ i...,"[for, i, =, 0, to, m, exclusive, ,, set, cnt, ...","[for, (, int, i, =, 0, ;, i, <, m, ;, ++, i, )...","[1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...","[[CPY_1], [CPY_2], [CPY_3], [CPY_4], to, [CPY_..."


In [9]:
def create_gen_seq_from_code_row(row):
    """
    Returns the code token sequence with shared tokens replaced by the
    corresponding entries of the generated pseudocode sequence

    For each code token that also exists in row['pseudo_token'], the
    output holds row['pseudo_gen_seq'][k], where k is the position of
    the token's first occurrence in the pseudocode. Code tokens that do
    not appear in the pseudocode are kept unchanged.

    row must provide row['pseudo_token'], row['code_token'] and
    row['pseudo_gen_seq'] (only read when at least one token matches).
    The returned list has one entry per code token.
    """
    # Position of the first occurrence of every pseudocode token, so each
    # code token is resolved with a single dict lookup.
    first_index = {}
    for position, token in enumerate(row['pseudo_token']):
        first_index.setdefault(token, position)

    gen_code_seq = []
    for token in row['code_token']:
        position = first_index.get(token)
        if position is None:
            gen_code_seq.append(token)
        else:
            gen_code_seq.append(row['pseudo_gen_seq'][position])

    return gen_code_seq

In [10]:
# Row-wise: rewrite each example's code tokens, substituting the entries of
# 'pseudo_gen_seq' for tokens shared with the pseudocode. The applied function
# reads row['pseudo_gen_seq'], so that column must already exist on train_df.
gen_code_seq = train_df.apply(create_gen_seq_from_code_row, axis=1)
train_df['code_gen_seq'] = gen_code_seq
train_df

Unnamed: 0,pseudo,code,pseudo_token,code_token,pseudo_copy_seq,pseudo_gen_seq,code_gen_seq
0,"let a , b , c , d , e be integers","int a , b , c , d , e ;","[let, a, ,, b, ,, c, ,, d, ,, e, be, integers]","[int, a, ,, b, ,, c, ,, d, ,, e, ;]","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]","[let, [CPY_1], [CPY_2], [CPY_3], [CPY_2], [CPY...","[int, [CPY_1], [CPY_2], [CPY_3], [CPY_2], [CPY..."
1,read a,cin >> a ;,"[read, a]","[cin, >>, a, ;]","[0, 1]","[read, [CPY_1]]","[cin, >>, [CPY_1], ;]"
2,read b,cin >> b ;,"[read, b]","[cin, >>, b, ;]","[0, 1]","[read, [CPY_1]]","[cin, >>, [CPY_1], ;]"
3,read c,cin >> c ;,"[read, c]","[cin, >>, c, ;]","[0, 1]","[read, [CPY_1]]","[cin, >>, [CPY_1], ;]"
4,read d,cin >> d ;,"[read, d]","[cin, >>, d, ;]","[0, 1]","[read, [CPY_1]]","[cin, >>, [CPY_1], ;]"
...,...,...,...,...,...,...,...
19175,sort array a,"sort ( a , a + m ) ;","[sort, array, a]","[sort, (, a, ,, a, +, m, ), ;]","[1, 0, 1]","[[CPY_1], array, [CPY_2]]","[[CPY_1], (, [CPY_2], ,, [CPY_2], +, m, ), ;]"
19176,sort b,"sort ( b , b + m ) ;","[sort, b]","[sort, (, b, ,, b, +, m, ), ;]","[1, 1]","[[CPY_1], [CPY_2]]","[[CPY_1], (, [CPY_2], ,, [CPY_2], +, m, ), ;]"
19177,reverse b,"reverse ( b , b + m ) ;","[reverse, b]","[reverse, (, b, ,, b, +, m, ), ;]","[1, 1]","[[CPY_1], [CPY_2]]","[[CPY_1], (, [CPY_2], ,, [CPY_2], +, m, ), ;]"
19178,"for i = 0 to m exclusive , set cnt [ b [ i ] ....",for ( int i = 0 ; i < m ; ++ i ) { cnt [ b [ i...,"[for, i, =, 0, to, m, exclusive, ,, set, cnt, ...","[for, (, int, i, =, 0, ;, i, <, m, ;, ++, i, )...","[1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...","[[CPY_1], [CPY_2], [CPY_3], [CPY_4], to, [CPY_...","[[CPY_1], (, int, [CPY_2], [CPY_3], [CPY_4], ;..."


In [11]:
# Frame every placeholder-substituted code sequence with boundary markers.
train_df['code_gen_seq_aug'] = train_df['code_gen_seq'].apply(
    lambda tokens: ['[START]', *tokens, '[STOP]']
)
train_df

Unnamed: 0,pseudo,code,pseudo_token,code_token,pseudo_copy_seq,pseudo_gen_seq,code_gen_seq,code_gen_seq_aug
0,"let a , b , c , d , e be integers","int a , b , c , d , e ;","[let, a, ,, b, ,, c, ,, d, ,, e, be, integers]","[int, a, ,, b, ,, c, ,, d, ,, e, ;]","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]","[let, [CPY_1], [CPY_2], [CPY_3], [CPY_2], [CPY...","[int, [CPY_1], [CPY_2], [CPY_3], [CPY_2], [CPY...","[[START], int, [CPY_1], [CPY_2], [CPY_3], [CPY..."
1,read a,cin >> a ;,"[read, a]","[cin, >>, a, ;]","[0, 1]","[read, [CPY_1]]","[cin, >>, [CPY_1], ;]","[[START], cin, >>, [CPY_1], ;, [STOP]]"
2,read b,cin >> b ;,"[read, b]","[cin, >>, b, ;]","[0, 1]","[read, [CPY_1]]","[cin, >>, [CPY_1], ;]","[[START], cin, >>, [CPY_1], ;, [STOP]]"
3,read c,cin >> c ;,"[read, c]","[cin, >>, c, ;]","[0, 1]","[read, [CPY_1]]","[cin, >>, [CPY_1], ;]","[[START], cin, >>, [CPY_1], ;, [STOP]]"
4,read d,cin >> d ;,"[read, d]","[cin, >>, d, ;]","[0, 1]","[read, [CPY_1]]","[cin, >>, [CPY_1], ;]","[[START], cin, >>, [CPY_1], ;, [STOP]]"
...,...,...,...,...,...,...,...,...
19175,sort array a,"sort ( a , a + m ) ;","[sort, array, a]","[sort, (, a, ,, a, +, m, ), ;]","[1, 0, 1]","[[CPY_1], array, [CPY_2]]","[[CPY_1], (, [CPY_2], ,, [CPY_2], +, m, ), ;]","[[START], [CPY_1], (, [CPY_2], ,, [CPY_2], +, ..."
19176,sort b,"sort ( b , b + m ) ;","[sort, b]","[sort, (, b, ,, b, +, m, ), ;]","[1, 1]","[[CPY_1], [CPY_2]]","[[CPY_1], (, [CPY_2], ,, [CPY_2], +, m, ), ;]","[[START], [CPY_1], (, [CPY_2], ,, [CPY_2], +, ..."
19177,reverse b,"reverse ( b , b + m ) ;","[reverse, b]","[reverse, (, b, ,, b, +, m, ), ;]","[1, 1]","[[CPY_1], [CPY_2]]","[[CPY_1], (, [CPY_2], ,, [CPY_2], +, m, ), ;]","[[START], [CPY_1], (, [CPY_2], ,, [CPY_2], +, ..."
19178,"for i = 0 to m exclusive , set cnt [ b [ i ] ....",for ( int i = 0 ; i < m ; ++ i ) { cnt [ b [ i...,"[for, i, =, 0, to, m, exclusive, ,, set, cnt, ...","[for, (, int, i, =, 0, ;, i, <, m, ;, ++, i, )...","[1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...","[[CPY_1], [CPY_2], [CPY_3], [CPY_4], to, [CPY_...","[[CPY_1], (, int, [CPY_2], [CPY_3], [CPY_4], ;...","[[START], [CPY_1], (, int, [CPY_2], [CPY_3], [..."


In [12]:
# Frame every raw code token sequence with the same boundary markers.
train_df['code_token_aug'] = train_df['code_token'].apply(
    lambda tokens: ['[START]', *tokens, '[STOP]']
)
train_df

Unnamed: 0,pseudo,code,pseudo_token,code_token,pseudo_copy_seq,pseudo_gen_seq,code_gen_seq,code_gen_seq_aug,code_token_aug
0,"let a , b , c , d , e be integers","int a , b , c , d , e ;","[let, a, ,, b, ,, c, ,, d, ,, e, be, integers]","[int, a, ,, b, ,, c, ,, d, ,, e, ;]","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]","[let, [CPY_1], [CPY_2], [CPY_3], [CPY_2], [CPY...","[int, [CPY_1], [CPY_2], [CPY_3], [CPY_2], [CPY...","[[START], int, [CPY_1], [CPY_2], [CPY_3], [CPY...","[[START], int, a, ,, b, ,, c, ,, d, ,, e, ;, [..."
1,read a,cin >> a ;,"[read, a]","[cin, >>, a, ;]","[0, 1]","[read, [CPY_1]]","[cin, >>, [CPY_1], ;]","[[START], cin, >>, [CPY_1], ;, [STOP]]","[[START], cin, >>, a, ;, [STOP]]"
2,read b,cin >> b ;,"[read, b]","[cin, >>, b, ;]","[0, 1]","[read, [CPY_1]]","[cin, >>, [CPY_1], ;]","[[START], cin, >>, [CPY_1], ;, [STOP]]","[[START], cin, >>, b, ;, [STOP]]"
3,read c,cin >> c ;,"[read, c]","[cin, >>, c, ;]","[0, 1]","[read, [CPY_1]]","[cin, >>, [CPY_1], ;]","[[START], cin, >>, [CPY_1], ;, [STOP]]","[[START], cin, >>, c, ;, [STOP]]"
4,read d,cin >> d ;,"[read, d]","[cin, >>, d, ;]","[0, 1]","[read, [CPY_1]]","[cin, >>, [CPY_1], ;]","[[START], cin, >>, [CPY_1], ;, [STOP]]","[[START], cin, >>, d, ;, [STOP]]"
...,...,...,...,...,...,...,...,...,...
19175,sort array a,"sort ( a , a + m ) ;","[sort, array, a]","[sort, (, a, ,, a, +, m, ), ;]","[1, 0, 1]","[[CPY_1], array, [CPY_2]]","[[CPY_1], (, [CPY_2], ,, [CPY_2], +, m, ), ;]","[[START], [CPY_1], (, [CPY_2], ,, [CPY_2], +, ...","[[START], sort, (, a, ,, a, +, m, ), ;, [STOP]]"
19176,sort b,"sort ( b , b + m ) ;","[sort, b]","[sort, (, b, ,, b, +, m, ), ;]","[1, 1]","[[CPY_1], [CPY_2]]","[[CPY_1], (, [CPY_2], ,, [CPY_2], +, m, ), ;]","[[START], [CPY_1], (, [CPY_2], ,, [CPY_2], +, ...","[[START], sort, (, b, ,, b, +, m, ), ;, [STOP]]"
19177,reverse b,"reverse ( b , b + m ) ;","[reverse, b]","[reverse, (, b, ,, b, +, m, ), ;]","[1, 1]","[[CPY_1], [CPY_2]]","[[CPY_1], (, [CPY_2], ,, [CPY_2], +, m, ), ;]","[[START], [CPY_1], (, [CPY_2], ,, [CPY_2], +, ...","[[START], reverse, (, b, ,, b, +, m, ), ;, [ST..."
19178,"for i = 0 to m exclusive , set cnt [ b [ i ] ....",for ( int i = 0 ; i < m ; ++ i ) { cnt [ b [ i...,"[for, i, =, 0, to, m, exclusive, ,, set, cnt, ...","[for, (, int, i, =, 0, ;, i, <, m, ;, ++, i, )...","[1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...","[[CPY_1], [CPY_2], [CPY_3], [CPY_4], to, [CPY_...","[[CPY_1], (, int, [CPY_2], [CPY_3], [CPY_4], ;...","[[START], [CPY_1], (, int, [CPY_2], [CPY_3], [...","[[START], for, (, int, i, =, 0, ;, i, <, m, ;,..."


In [19]:
# Re-display the frame with all derived columns (pandas shows a truncated view).
train_df

Unnamed: 0,pseudo,code,pseudo_token,code_token,pseudo_copy_seq,pseudo_gen_seq,code_gen_seq,code_gen_seq_aug,code_token_aug
0,"let a , b , c , d , e be integers","int a , b , c , d , e ;","[let, a, ,, b, ,, c, ,, d, ,, e, be, integers]","[int, a, ,, b, ,, c, ,, d, ,, e, ;]","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]","[let, [CPY_1], [CPY_2], [CPY_3], [CPY_2], [CPY...","[int, [CPY_1], [CPY_2], [CPY_3], [CPY_2], [CPY...","[[START], int, [CPY_1], [CPY_2], [CPY_3], [CPY...","[[START], int, a, ,, b, ,, c, ,, d, ,, e, ;, [..."
1,read a,cin >> a ;,"[read, a]","[cin, >>, a, ;]","[0, 1]","[read, [CPY_1]]","[cin, >>, [CPY_1], ;]","[[START], cin, >>, [CPY_1], ;, [STOP]]","[[START], cin, >>, a, ;, [STOP]]"
2,read b,cin >> b ;,"[read, b]","[cin, >>, b, ;]","[0, 1]","[read, [CPY_1]]","[cin, >>, [CPY_1], ;]","[[START], cin, >>, [CPY_1], ;, [STOP]]","[[START], cin, >>, b, ;, [STOP]]"
3,read c,cin >> c ;,"[read, c]","[cin, >>, c, ;]","[0, 1]","[read, [CPY_1]]","[cin, >>, [CPY_1], ;]","[[START], cin, >>, [CPY_1], ;, [STOP]]","[[START], cin, >>, c, ;, [STOP]]"
4,read d,cin >> d ;,"[read, d]","[cin, >>, d, ;]","[0, 1]","[read, [CPY_1]]","[cin, >>, [CPY_1], ;]","[[START], cin, >>, [CPY_1], ;, [STOP]]","[[START], cin, >>, d, ;, [STOP]]"
...,...,...,...,...,...,...,...,...,...
19175,sort array a,"sort ( a , a + m ) ;","[sort, array, a]","[sort, (, a, ,, a, +, m, ), ;]","[1, 0, 1]","[[CPY_1], array, [CPY_2]]","[[CPY_1], (, [CPY_2], ,, [CPY_2], +, m, ), ;]","[[START], [CPY_1], (, [CPY_2], ,, [CPY_2], +, ...","[[START], sort, (, a, ,, a, +, m, ), ;, [STOP]]"
19176,sort b,"sort ( b , b + m ) ;","[sort, b]","[sort, (, b, ,, b, +, m, ), ;]","[1, 1]","[[CPY_1], [CPY_2]]","[[CPY_1], (, [CPY_2], ,, [CPY_2], +, m, ), ;]","[[START], [CPY_1], (, [CPY_2], ,, [CPY_2], +, ...","[[START], sort, (, b, ,, b, +, m, ), ;, [STOP]]"
19177,reverse b,"reverse ( b , b + m ) ;","[reverse, b]","[reverse, (, b, ,, b, +, m, ), ;]","[1, 1]","[[CPY_1], [CPY_2]]","[[CPY_1], (, [CPY_2], ,, [CPY_2], +, m, ), ;]","[[START], [CPY_1], (, [CPY_2], ,, [CPY_2], +, ...","[[START], reverse, (, b, ,, b, +, m, ), ;, [ST..."
19178,"for i = 0 to m exclusive , set cnt [ b [ i ] ....",for ( int i = 0 ; i < m ; ++ i ) { cnt [ b [ i...,"[for, i, =, 0, to, m, exclusive, ,, set, cnt, ...","[for, (, int, i, =, 0, ;, i, <, m, ;, ++, i, )...","[1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...","[[CPY_1], [CPY_2], [CPY_3], [CPY_4], to, [CPY_...","[[CPY_1], (, int, [CPY_2], [CPY_3], [CPY_4], ;...","[[START], [CPY_1], (, int, [CPY_2], [CPY_3], [...","[[START], for, (, int, i, =, 0, ;, i, <, m, ;,..."


In [20]:
# # For eval
# train_df[['pseudo_gen_seq', 'code_gen_seq', 'code_gen_seq_aug', 'pseudo_token', 'code_token']].to_pickle('../../data/CPY_dataset_cpynum_eval.pkl')

In [73]:
# Persist the whole frame, including the list-valued token/sequence columns, as a pickle.
train_df.to_pickle('../../data/CPY_dataset_numbered.pkl')

In [53]:
# Also export a tab-separated copy into the notebook's own directory.
# NOTE(review): the list-valued columns are written as their string form, so
# reading this file back yields strings rather than lists — confirm that
# downstream consumers expect that (the pickle keeps the lists intact).
train_df.to_csv('./CPY_dataset.tsv', sep='\t', index=False)