In [1]:
import torch
from torch import nn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
import numpy as np
import importlib
import preprocess

In [3]:
# Constants for vocabulary: the reserved token strings
PAD_TOKEN = '[PAD]'
START_TOKEN = '[START]'
STOP_TOKEN = '[STOP]'
CPY_TOKEN = '[CPY]'
UNKNOWN_TOKEN = '[UNK]'

# Vocab indices of constants: the reserved tokens occupy indices 0..4, in this order
PAD_INDEX, START_INDEX, STOP_INDEX, CPY_INDEX, UNKNOWN_INDEX = range(5)

# Tokens that are already present in every Vocabulary before any corpus is read
RESERVED_TOKENS = {
    PAD_TOKEN,
    START_TOKEN,
    STOP_TOKEN,
    CPY_TOKEN,
    UNKNOWN_TOKEN,
}
class Vocabulary:
    """
    Representation of a vocabulary/language as a pair of lookup dictionaries.

    Every instance starts out holding only the reserved tokens ('[PAD]',
    '[START]', '[STOP]', '[CPY]', '[UNK]') at their fixed indices. Corpus
    tokens are appended after them by `build_vocabulary`.

    Attributes:
        name (str): name of the vocabulary/language (e.g. 'english', 'pseudo', 'code' etc)
        itos (dict): dictionary of index to token/string (e.g. {0: '[PAD]', 1: '[START]', 2: '[STOP]', 3: '[CPY]', 4: '[UNK]', 5: 'int'})
        stoi (dict): dictionary of token/string to indices (e.g. {'[PAD]': 0, '[START]': 1, '[STOP]': 2, '[CPY]': 3, '[UNK]': 4, 'int': 5})
        vocab (set): set of the tokens collected from the corpus by the last `build_vocabulary`
            call (e.g. {'int'}); empty until then, and reserved tokens are only in it if they
            occur in the corpus
        n_words (int): number of entries in itos/stoi, reserved tokens included (e.g. 6)
    """
    def __init__(self, name: str):
        """
        Args:
            name: name of the language
        """
        self.name = name
        self.itos = {
            PAD_INDEX: PAD_TOKEN,
            START_INDEX: START_TOKEN,
            STOP_INDEX: STOP_TOKEN,
            CPY_INDEX: CPY_TOKEN,
            UNKNOWN_INDEX: UNKNOWN_TOKEN
        }
        # Reverse mapping of itos
        self.stoi = {token: index for index, token in self.itos.items()}
        self.vocab = set()
        self.n_words = len(self.itos)

    def __len__(self):
        """Number of index -> token entries, reserved tokens included."""
        return len(self.itos)

    def build_vocabulary(self, train_df: pd.DataFrame, column_name: str):
        """
        Build a vocabulary from a dataframe column containing lists of tokens

        Args:
            train_df: dataframe holding the corpus
            column_name: name of the column whose cells are iterables of tokens

        Raises:
            ValueError: if a non-reserved corpus token already has an index,
                e.g. when this method is called a second time on the same instance
        """
        # Collect the distinct tokens of the corpus
        # TODO: parallelize this
        self.vocab = set()
        for line in train_df[column_name]:
            self.vocab.update(line)

        # Assign indices in sorted order so that the same corpus always yields
        # the same token -> index mapping. New tokens are numbered from
        # n_words upwards, i.e. after the reserved tokens.
        for word in sorted(self.vocab):
            if word not in self.stoi:
                self.stoi[word] = self.n_words
                self.itos[self.n_words] = word
                self.n_words += 1
            elif word not in RESERVED_TOKENS:
                # A reserved token appearing in the corpus keeps its fixed index;
                # anything else being present already is unexpected.
                raise ValueError(f'Word "{word}" already in vocabulary')

    def numericalize(self, text: list) -> list:
        """
        Convert a list of strings to a list of vocab indices. Treats unknown words as '[UNK]'

        Args:
            text: tokens to convert

        Returns:
            list of indices, one per token and in the same order
        """
        # out-of-vocab words are represented by UNK token index
        unknown_index = self.stoi[UNKNOWN_TOKEN]
        return [self.stoi.get(token, unknown_index) for token in text]

    def get_max_len(self, train_df: pd.DataFrame, column_name: str):
        """
        Get the max length (array) of a tokenized sentence in the train dataframe

        Args:
            train_df: dataframe holding the corpus
            column_name: name of the column whose cells are sized sequences of tokens

        Returns:
            length of the longest cell in the column, or 0 if the column has no rows
        """
        # `default` keeps max() from raising ValueError on an empty column
        longest = max(train_df[column_name], key=len, default=())
        return len(longest)

## Read SPoC tokenized input

In [4]:
# The TSV has no header row: column 0 is the pseudocode line, column 1 the code line.
cols = {0: 'pseudo', 1: 'code'}

train_df = pd.read_csv(
    '../../data/input-tok-train-shuf.tsv',
    sep='\t',
    header=None,
).rename(columns=cols)

# Pick up edits made to the local `preprocess` module since it was imported.
importlib.reload(preprocess)

# Tokenize both text columns; with inplace=True the new *_tokens columns land on train_df itself.
preprocess.tokenize_column(
    train_df,
    col_to_tokenize='pseudo',
    tokenized_col_name='pseudo_tokens',
    inplace=True,
)
preprocess.tokenize_column(
    train_df,
    col_to_tokenize='code',
    tokenized_col_name='code_tokens',
    inplace=True,
)

train_df.head()

Unnamed: 0,pseudo,code,pseudo_tokens,code_tokens
0,set l to mid,l = mid ;,"[set, l, to, mid]","[l, =, mid, ;]"
1,if i is 0,if ( i == 0 ),"[if, i, is, 0]","[if, (, i, ==, 0, )]"
2,read n and k,cin >> n >> k ;,"[read, n, and, k]","[cin, >>, n, >>, k, ;]"
3,declare long longs sum = 0 and min = LONG_LONG...,"long long min = LONG_LONG_MAX , sum = 0 ;","[declare, long, longs, sum, =, 0, and, min, =,...","[long, long, min, =, LONG_LONG_MAX, ,, sum, =,..."
4,dy = integer array where the the following int...,"int dy [ ] = { 0 , 0 , - 1 , 1 } ;","[dy, =, integer, array, where, the, the, follo...","[int, dy, [, ], =, {, 0, ,, 0, ,, -, 1, ,, 1, ..."


## Create binary sequence

In [5]:
# Pick up edits made to the local `preprocess` module since it was imported.
importlib.reload(preprocess)

# Row-wise: build the binary sequence from each row's code and pseudocode tokens.
# NOTE(review): the displayed sequences have one 0/1 entry per pseudocode token —
# confirm the exact meaning in preprocess.create_binary_seq_from_row.
code_binary_seq = train_df.apply(
    preprocess.create_binary_seq_from_row,
    axis=1,
    args=('code_tokens', 'pseudo_tokens'),
)
train_df['code_binary_seq'] = code_binary_seq
train_df

Unnamed: 0,pseudo,code,pseudo_tokens,code_tokens,code_binary_seq
0,set l to mid,l = mid ;,"[set, l, to, mid]","[l, =, mid, ;]","[0, 1, 0, 1]"
1,if i is 0,if ( i == 0 ),"[if, i, is, 0]","[if, (, i, ==, 0, )]","[1, 1, 0, 1]"
2,read n and k,cin >> n >> k ;,"[read, n, and, k]","[cin, >>, n, >>, k, ;]","[0, 1, 0, 1]"
3,declare long longs sum = 0 and min = LONG_LONG...,"long long min = LONG_LONG_MAX , sum = 0 ;","[declare, long, longs, sum, =, 0, and, min, =,...","[long, long, min, =, LONG_LONG_MAX, ,, sum, =,...","[0, 1, 0, 1, 1, 1, 0, 1, 1, 1]"
4,dy = integer array where the the following int...,"int dy [ ] = { 0 , 0 , - 1 , 1 } ;","[dy, =, integer, array, where, the, the, follo...","[int, dy, [, ], =, {, 0, ,, 0, ,, -, 1, ,, 1, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ..."
...,...,...,...,...,...
181857,declare static constant integer mod = 1000000009,static const int mod = 1000000009 ;,"[declare, static, constant, integer, mod, =, 1...","[static, const, int, mod, =, 1000000009, ;]","[0, 1, 0, 0, 1, 1, 1]"
181858,print NO and a new line,"cout << "" NO "" << ' \n ' ;","[print, NO, and, a, new, line]","[cout, <<, "", NO, "", <<, ', \n, ', ;]","[0, 1, 0, 0, 0, 0]"
181859,change the value of ans to abs ( x - y ) / d,ans = abs ( x - y ) / d ;,"[change, the, value, of, ans, to, abs, (, x, -...","[ans, =, abs, (, x, -, y, ), /, d, ;]","[0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1]"
181860,else if s is less than f,else if ( s < f ),"[else, if, s, is, less, than, f]","[else, if, (, s, <, f, )]","[1, 1, 1, 0, 0, 0, 1]"


In [74]:
# Vocabulary over the pseudocode tokens of the training set.
pseudo_voc = Vocabulary(name='pseudo_voc')
pseudo_voc.build_vocabulary(train_df=train_df, column_name='pseudo_tokens')

In [75]:
# Model dimensions
embedding_size, hidden_size = 100, 100
window_size = 2  # the tagger input holds 2 * window_size token indices

# Training settings
epochs, dropout_p = 50, 0.5

In [76]:
# from mlp_tagger import MLPTagger

In [77]:
class MLPTagger(nn.Module):
    """
    Feed-forward binary tagger over a fixed-size window of token indices.

    Each index in the window is embedded, the embeddings are flattened into one
    vector per example, and that vector goes through two hidden layers (linear,
    dropout, ReLU) followed by a single sigmoid output unit.

    Args:
        vocab_size: number of rows of the embedding table
        embedding_size: dimension of each token embedding
        window_size: the first linear layer expects 2 * window_size embedded tokens
        hidden_size: width of the first hidden layer; the second one is half as wide
        dropout_p: dropout probability used after both hidden linear layers
    """

    def __init__(self, vocab_size, embedding_size, window_size, hidden_size, dropout_p=0.5):
        super().__init__()

        self.embedding_size = embedding_size
        self.window_size = window_size
        self.hidden_size = hidden_size
        self.hidden_size_2 = int(hidden_size / 2)

        flattened_size = embedding_size * 2 * window_size

        self.embedding = nn.Embedding(vocab_size, embedding_size)

        self.fc1 = nn.Linear(flattened_size, self.hidden_size)
        self.fc2 = nn.Linear(self.hidden_size, self.hidden_size_2)
        self.fc3 = nn.Linear(self.hidden_size_2, 1)
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(p=dropout_p)
        self.relu = nn.ReLU()

    def forward(self, data):
        """
        Args:
            data: integer tensor of token indices whose first dimension is the batch;
                the remaining dimensions must hold 2 * window_size indices per example

        Returns:
            tensor of shape (batch, 1) with sigmoid outputs
        """
        embedded = self.embedding(data)

        # Flatten everything but the batch dimension into one feature vector
        flat = embedded.view(embedded.size(0), -1)

        first_hidden = self.relu(self.dropout(self.fc1(flat)))
        second_hidden = self.relu(self.dropout(self.fc2(first_hidden)))

        return self.sigmoid(self.fc3(second_hidden))


In [78]:
# The embedding table gets one row per vocabulary entry (reserved tokens included).
mlp = MLPTagger(
    vocab_size=len(pseudo_voc),
    embedding_size=embedding_size,
    window_size=window_size,
    hidden_size=hidden_size,
    dropout_p=dropout_p,
)

In [79]:
from tqdm import tqdm
# Binary cross-entropy on the sigmoid outputs, averaged over the tokens of a sentence
# (reduction='mean' is the library default, spelled out here).
criterion = nn.BCELoss(reduction='mean')

# Adam over all model parameters at the library-default learning rate.
optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-3)

In [82]:
# Offsets of the context tokens around each position: window_size tokens on each
# side, excluding the position itself ([-2, -1, 1, 2] for window_size = 2).
# Deriving them from window_size keeps the input width equal to the
# 2 * window_size tokens the model's first linear layer is sized for.
# NOTE(review): the tagged token itself is never fed to the model — confirm this is intended.
context_offsets = [offset for offset in range(-window_size, window_size + 1) if offset != 0]
pad_index = pseudo_voc.stoi[PAD_TOKEN]

# Make sure dropout is active even if the model was switched to eval mode earlier.
mlp.train()

for epoch in range(epochs):
    # Iterate the two needed columns directly; one optimizer step per sentence.
    sentences = zip(train_df['pseudo_tokens'], train_df['code_binary_seq'])
    for tokens, tags in tqdm(sentences, total=len(train_df), unit='row'):
        numer = pseudo_voc.numericalize(tokens)
        seq_len = len(numer)

        # One row per token: the indices of its context tokens, with [PAD]
        # wherever the window runs past either end of the sentence.
        windows = [
            [
                numer[pos + offset] if 0 <= pos + offset < seq_len else pad_index
                for offset in context_offsets
            ]
            for pos in range(seq_len)
        ]
        # reshape keeps the (seq_len, 2 * window_size) shape for an empty sentence too
        train = torch.tensor(windows, dtype=torch.int64).reshape(seq_len, len(context_offsets))

        # BCELoss needs float targets shaped like the model output: (seq_len, 1)
        labels = torch.tensor(tags, dtype=torch.float32).unsqueeze(-1)

        out = mlp(train)
        loss = criterion(out, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

  3%|▎         | 6048/181862 [00:24<12:03, 243.07row/s]


KeyboardInterrupt: 

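# DO NOT RUN THE CELLS BELOW THIS POINT

The evaluation cells below use `clf_loaded`, which is not defined in any of the cells above.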

## Evaluate on the test dataset

In [3]:
# The TSV has no header row: column 0 is the pseudocode line, column 1 the code line.
cols = {0: 'pseudo', 1: 'code'}

test_df = pd.read_csv(
    '../../data/input-tok-eval.tsv',
    sep='\t',
    header=None,
).rename(columns=cols)

# Pick up edits made to the local `preprocess` module since it was imported.
importlib.reload(preprocess)

# Tokenize both text columns; with inplace=True the new *_tokens columns land on test_df itself.
preprocess.tokenize_column(
    test_df,
    col_to_tokenize='pseudo',
    tokenized_col_name='pseudo_tokens',
    inplace=True,
)
preprocess.tokenize_column(
    test_df,
    col_to_tokenize='code',
    tokenized_col_name='code_tokens',
    inplace=True,
)

test_df.head()

Unnamed: 0,pseudo,code,pseudo_tokens,code_tokens
0,"let a , b , c , d , e be integers","int a , b , c , d , e ;","[let, a, ,, b, ,, c, ,, d, ,, e, be, integers]","[int, a, ,, b, ,, c, ,, d, ,, e, ;]"
1,read a,cin >> a ;,"[read, a]","[cin, >>, a, ;]"
2,read b,cin >> b ;,"[read, b]","[cin, >>, b, ;]"
3,read c,cin >> c ;,"[read, c]","[cin, >>, c, ;]"
4,read d,cin >> d ;,"[read, d]","[cin, >>, d, ;]"


In [4]:
# Pick up edits made to the local `preprocess` module since it was imported.
importlib.reload(preprocess)

# Row-wise: build the reference binary sequence from each row's code and pseudocode tokens.
code_binary_seq = test_df.apply(
    preprocess.create_binary_seq_from_row,
    axis=1,
    args=('code_tokens', 'pseudo_tokens'),
)
test_df['code_binary_seq'] = code_binary_seq
test_df

Unnamed: 0,pseudo,code,pseudo_tokens,code_tokens,code_binary_seq
0,"let a , b , c , d , e be integers","int a , b , c , d , e ;","[let, a, ,, b, ,, c, ,, d, ,, e, be, integers]","[int, a, ,, b, ,, c, ,, d, ,, e, ;]","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]"
1,read a,cin >> a ;,"[read, a]","[cin, >>, a, ;]","[0, 1]"
2,read b,cin >> b ;,"[read, b]","[cin, >>, b, ;]","[0, 1]"
3,read c,cin >> c ;,"[read, c]","[cin, >>, c, ;]","[0, 1]"
4,read d,cin >> d ;,"[read, d]","[cin, >>, d, ;]","[0, 1]"
...,...,...,...,...,...
19175,sort array a,"sort ( a , a + m ) ;","[sort, array, a]","[sort, (, a, ,, a, +, m, ), ;]","[1, 0, 1]"
19176,sort b,"sort ( b , b + m ) ;","[sort, b]","[sort, (, b, ,, b, +, m, ), ;]","[1, 1]"
19177,reverse b,"reverse ( b , b + m ) ;","[reverse, b]","[reverse, (, b, ,, b, +, m, ), ;]","[1, 1]"
19178,"for i = 0 to m exclusive , set cnt [ b [ i ] ....",for ( int i = 0 ; i < m ; ++ i ) { cnt [ b [ i...,"[for, i, =, 0, to, m, exclusive, ,, set, cnt, ...","[for, (, int, i, =, 0, ;, i, <, m, ;, ++, i, )...","[1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, ..."


In [5]:
# Row-wise: turn each row's pseudocode tokens into a list of per-token feature dicts
# (the displayed cells start with keys such as 'word', 'length', 'is_numeric').
test_df['pseudo_features'] = test_df.apply(
    preprocess.create_features,
    axis=1,
    args=('pseudo_tokens',),
)
test_df

Unnamed: 0,pseudo,code,pseudo_tokens,code_tokens,code_binary_seq,pseudo_features
0,"let a , b , c , d , e be integers","int a , b , c , d , e ;","[let, a, ,, b, ,, c, ,, d, ,, e, be, integers]","[int, a, ,, b, ,, c, ,, d, ,, e, ;]","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]","[{'word': 'let', 'length': 3, 'is_numeric': Fa..."
1,read a,cin >> a ;,"[read, a]","[cin, >>, a, ;]","[0, 1]","[{'word': 'read', 'length': 4, 'is_numeric': F..."
2,read b,cin >> b ;,"[read, b]","[cin, >>, b, ;]","[0, 1]","[{'word': 'read', 'length': 4, 'is_numeric': F..."
3,read c,cin >> c ;,"[read, c]","[cin, >>, c, ;]","[0, 1]","[{'word': 'read', 'length': 4, 'is_numeric': F..."
4,read d,cin >> d ;,"[read, d]","[cin, >>, d, ;]","[0, 1]","[{'word': 'read', 'length': 4, 'is_numeric': F..."
...,...,...,...,...,...,...
19175,sort array a,"sort ( a , a + m ) ;","[sort, array, a]","[sort, (, a, ,, a, +, m, ), ;]","[1, 0, 1]","[{'word': 'sort', 'length': 4, 'is_numeric': F..."
19176,sort b,"sort ( b , b + m ) ;","[sort, b]","[sort, (, b, ,, b, +, m, ), ;]","[1, 1]","[{'word': 'sort', 'length': 4, 'is_numeric': F..."
19177,reverse b,"reverse ( b , b + m ) ;","[reverse, b]","[reverse, (, b, ,, b, +, m, ), ;]","[1, 1]","[{'word': 'reverse', 'length': 7, 'is_numeric'..."
19178,"for i = 0 to m exclusive , set cnt [ b [ i ] ....",for ( int i = 0 ; i < m ; ++ i ) { cnt [ b [ i...,"[for, i, =, 0, to, m, exclusive, ,, set, cnt, ...","[for, (, int, i, =, 0, ;, i, <, m, ;, ++, i, )...","[1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...","[{'word': 'for', 'length': 3, 'is_numeric': Fa..."


In [6]:
# Sanity check: predict the tags of the first test row only.
# NOTE(review): `clf_loaded` is not defined in any visible cell of this notebook —
# confirm where it is supposed to be loaded from.
clf_loaded.predict(test_df.loc[0, 'pseudo_features'])

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0])

In [7]:
# Pick up edits made to the local `preprocess` module since it was imported.
importlib.reload(preprocess)

# Row-wise: hand clf_loaded.predict and the feature column name to the preprocess helper
# and store whatever it returns per row as that row's prediction.
test_df['predictions'] = test_df.apply(
    preprocess.apply_function_to_column,
    axis=1,
    args=(clf_loaded.predict, 'pseudo_features'),
)

In [8]:
# Display the test frame, which now carries the `predictions` column next to `code_binary_seq`.
test_df

Unnamed: 0,pseudo,code,pseudo_tokens,code_tokens,code_binary_seq,pseudo_features,predictions
0,"let a , b , c , d , e be integers","int a , b , c , d , e ;","[let, a, ,, b, ,, c, ,, d, ,, e, be, integers]","[int, a, ,, b, ,, c, ,, d, ,, e, ;]","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]","[{'word': 'let', 'length': 3, 'is_numeric': Fa...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]"
1,read a,cin >> a ;,"[read, a]","[cin, >>, a, ;]","[0, 1]","[{'word': 'read', 'length': 4, 'is_numeric': F...","[0, 1]"
2,read b,cin >> b ;,"[read, b]","[cin, >>, b, ;]","[0, 1]","[{'word': 'read', 'length': 4, 'is_numeric': F...","[0, 1]"
3,read c,cin >> c ;,"[read, c]","[cin, >>, c, ;]","[0, 1]","[{'word': 'read', 'length': 4, 'is_numeric': F...","[0, 1]"
4,read d,cin >> d ;,"[read, d]","[cin, >>, d, ;]","[0, 1]","[{'word': 'read', 'length': 4, 'is_numeric': F...","[0, 1]"
...,...,...,...,...,...,...,...
19175,sort array a,"sort ( a , a + m ) ;","[sort, array, a]","[sort, (, a, ,, a, +, m, ), ;]","[1, 0, 1]","[{'word': 'sort', 'length': 4, 'is_numeric': F...","[1, 0, 1]"
19176,sort b,"sort ( b , b + m ) ;","[sort, b]","[sort, (, b, ,, b, +, m, ), ;]","[1, 1]","[{'word': 'sort', 'length': 4, 'is_numeric': F...","[1, 1]"
19177,reverse b,"reverse ( b , b + m ) ;","[reverse, b]","[reverse, (, b, ,, b, +, m, ), ;]","[1, 1]","[{'word': 'reverse', 'length': 7, 'is_numeric'...","[1, 1]"
19178,"for i = 0 to m exclusive , set cnt [ b [ i ] ....",for ( int i = 0 ; i < m ; ++ i ) { cnt [ b [ i...,"[for, i, =, 0, to, m, exclusive, ,, set, cnt, ...","[for, (, int, i, =, 0, ;, i, <, m, ;, ++, i, )...","[1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...","[{'word': 'for', 'length': 3, 'is_numeric': Fa...","[1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, ..."


In [9]:
# Strict accuracy: share of rows whose whole predicted sequence equals the reference sequence

strict_bools = test_df.apply(
    lambda row: np.array_equal(row['code_binary_seq'], row['predictions']),
    axis=1,
)
n_rows = len(test_df)
strict_accuracy = strict_bools.sum() / n_rows
strict_accuracy

0.8913451511991658

In [10]:
# Word-by-word accuracy: share of individual tags that match the reference

# Coerce both sides to arrays before comparing. `==` is only element-wise when at
# least one operand is an ndarray; with two plain lists it yields a single bool
# per row and would silently under-count the matches.
word_matches = np.sum(
    test_df.apply(
        lambda row: np.sum(np.asarray(row['predictions']) == np.asarray(row['code_binary_seq'])),
        axis=1,
    )
)
total_words = np.sum(test_df.apply(lambda row: np.size(row['predictions']), axis=1))

word_accuracy = word_matches / total_words
word_accuracy

0.9806949542071142