In [12]:
import pandas as pd
import numpy as np
import importlib
import preprocessing.preprocess_functions as preprocess

## Read SPoC tokenized input

In [13]:
# Load the shuffled training split: a headerless TSV whose first two columns
# are renamed to 'pseudo' and 'code'.
cols = {0: 'pseudo', 1: 'code'}
train_df = pd.read_csv(
    '../../data/input-tok-train-shuf.tsv',
    sep='\t',
    header=None,
).rename(columns=cols)

# Re-import the helper module so edits made to it on disk are picked up.
importlib.reload(preprocess)

# Tokenize each text column into its own *_tokens column (done in place).
for source_col, token_col in (('pseudo', 'pseudo_tokens'), ('code', 'code_tokens')):
    preprocess.tokenize_column(
        train_df,
        col_to_tokenize=source_col,
        tokenized_col_name=token_col,
        inplace=True,
    )

train_df.head()

Unnamed: 0,pseudo,code,pseudo_tokens,code_tokens
0,set l to mid,l = mid ;,"[set, l, to, mid]","[l, =, mid, ;]"
1,if i is 0,if ( i == 0 ),"[if, i, is, 0]","[if, (, i, ==, 0, )]"
2,read n and k,cin >> n >> k ;,"[read, n, and, k]","[cin, >>, n, >>, k, ;]"
3,declare long longs sum = 0 and min = LONG_LONG...,"long long min = LONG_LONG_MAX , sum = 0 ;","[declare, long, longs, sum, =, 0, and, min, =,...","[long, long, min, =, LONG_LONG_MAX, ,, sum, =,..."
4,dy = integer array where the the following int...,"int dy [ ] = { 0 , 0 , - 1 , 1 } ;","[dy, =, integer, array, where, the, the, follo...","[int, dy, [, ], =, {, 0, ,, 0, ,, -, 1, ,, 1, ..."


## Create binary sequence

In [14]:
# Re-import the helper module so edits made to it on disk are picked up.
importlib.reload(preprocess)

# Build one 0/1 label sequence per row from its code and pseudocode tokens.
# NOTE(review): judging by the displayed rows, 1 appears to mark pseudocode
# tokens that also occur in the code -- confirm against the helper.
code_binary_seq = train_df.apply(
    lambda row: preprocess.create_binary_seq_from_row(row, 'code_tokens', 'pseudo_tokens'),
    axis=1,
)
train_df['code_binary_seq'] = code_binary_seq
train_df

Unnamed: 0,pseudo,code,pseudo_tokens,code_tokens,code_binary_seq
0,set l to mid,l = mid ;,"[set, l, to, mid]","[l, =, mid, ;]","[0, 1, 0, 1]"
1,if i is 0,if ( i == 0 ),"[if, i, is, 0]","[if, (, i, ==, 0, )]","[1, 1, 0, 1]"
2,read n and k,cin >> n >> k ;,"[read, n, and, k]","[cin, >>, n, >>, k, ;]","[0, 1, 0, 1]"
3,declare long longs sum = 0 and min = LONG_LONG...,"long long min = LONG_LONG_MAX , sum = 0 ;","[declare, long, longs, sum, =, 0, and, min, =,...","[long, long, min, =, LONG_LONG_MAX, ,, sum, =,...","[0, 1, 0, 1, 1, 1, 0, 1, 1, 1]"
4,dy = integer array where the the following int...,"int dy [ ] = { 0 , 0 , - 1 , 1 } ;","[dy, =, integer, array, where, the, the, follo...","[int, dy, [, ], =, {, 0, ,, 0, ,, -, 1, ,, 1, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ..."
...,...,...,...,...,...
181857,declare static constant integer mod = 1000000009,static const int mod = 1000000009 ;,"[declare, static, constant, integer, mod, =, 1...","[static, const, int, mod, =, 1000000009, ;]","[0, 1, 0, 0, 1, 1, 1]"
181858,print NO and a new line,"cout << "" NO "" << ' \n ' ;","[print, NO, and, a, new, line]","[cout, <<, "", NO, "", <<, ', \n, ', ;]","[0, 1, 0, 0, 0, 0]"
181859,change the value of ans to abs ( x - y ) / d,ans = abs ( x - y ) / d ;,"[change, the, value, of, ans, to, abs, (, x, -...","[ans, =, abs, (, x, -, y, ), /, d, ;]","[0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1]"
181860,else if s is less than f,else if ( s < f ),"[else, if, s, is, less, than, f]","[else, if, (, s, <, f, )]","[1, 1, 1, 0, 0, 0, 1]"


## Create tagged pairs

In [15]:
# Re-import the helper module so edits made to it on disk are picked up.
importlib.reload(preprocess)

# Build the tagged_pseudo column from each row's pseudocode tokens and its
# 0/1 label sequence.
train_df['tagged_pseudo'] = train_df.apply(
    lambda row: preprocess.create_tagged_tuples(row, 'pseudo_tokens', 'code_binary_seq'),
    axis=1,
)
train_df

Unnamed: 0,pseudo,code,pseudo_tokens,code_tokens,code_binary_seq,tagged_pseudo
0,set l to mid,l = mid ;,"[set, l, to, mid]","[l, =, mid, ;]","[0, 1, 0, 1]","[(set, 0), (l, 1), (to, 0), (mid, 1)]"
1,if i is 0,if ( i == 0 ),"[if, i, is, 0]","[if, (, i, ==, 0, )]","[1, 1, 0, 1]","[(if, 1), (i, 1), (is, 0), (0, 1)]"
2,read n and k,cin >> n >> k ;,"[read, n, and, k]","[cin, >>, n, >>, k, ;]","[0, 1, 0, 1]","[(read, 0), (n, 1), (and, 0), (k, 1)]"
3,declare long longs sum = 0 and min = LONG_LONG...,"long long min = LONG_LONG_MAX , sum = 0 ;","[declare, long, longs, sum, =, 0, and, min, =,...","[long, long, min, =, LONG_LONG_MAX, ,, sum, =,...","[0, 1, 0, 1, 1, 1, 0, 1, 1, 1]","[(declare, 0), (long, 1), (longs, 0), (sum, 1)..."
4,dy = integer array where the the following int...,"int dy [ ] = { 0 , 0 , - 1 , 1 } ;","[dy, =, integer, array, where, the, the, follo...","[int, dy, [, ], =, {, 0, ,, 0, ,, -, 1, ,, 1, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...","[(dy, 1), (=, 1), (integer, 0), (array, 0), (w..."
...,...,...,...,...,...,...
181857,declare static constant integer mod = 1000000009,static const int mod = 1000000009 ;,"[declare, static, constant, integer, mod, =, 1...","[static, const, int, mod, =, 1000000009, ;]","[0, 1, 0, 0, 1, 1, 1]","[(declare, 0), (static, 1), (constant, 0), (in..."
181858,print NO and a new line,"cout << "" NO "" << ' \n ' ;","[print, NO, and, a, new, line]","[cout, <<, "", NO, "", <<, ', \n, ', ;]","[0, 1, 0, 0, 0, 0]","[(print, 0), (NO, 1), (and, 0), (a, 0), (new, ..."
181859,change the value of ans to abs ( x - y ) / d,ans = abs ( x - y ) / d ;,"[change, the, value, of, ans, to, abs, (, x, -...","[ans, =, abs, (, x, -, y, ), /, d, ;]","[0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1]","[(change, 0), (the, 0), (value, 0), (of, 0), (..."
181860,else if s is less than f,else if ( s < f ),"[else, if, s, is, less, than, f]","[else, if, (, s, <, f, )]","[1, 1, 1, 0, 0, 0, 1]","[(else, 1), (if, 1), (s, 1), (is, 0), (less, 0..."


## 1. Decision Tree Classifier

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

In [16]:
# Attach per-token feature dicts to the training set.
# Re-import the helper module first so edits made to it on disk are picked up.
importlib.reload(preprocess)

train_df['pseudo_features'] = train_df.apply(
    lambda row: preprocess.create_features(row, 'pseudo_tokens'),
    axis=1,
)
train_df

Unnamed: 0,pseudo,code,pseudo_tokens,code_tokens,code_binary_seq,tagged_pseudo,pseudo_features
0,set l to mid,l = mid ;,"[set, l, to, mid]","[l, =, mid, ;]","[0, 1, 0, 1]","[(set, 0), (l, 1), (to, 0), (mid, 1)]","[{'word': 'set', 'length': 3, 'is_numeric': Fa..."
1,if i is 0,if ( i == 0 ),"[if, i, is, 0]","[if, (, i, ==, 0, )]","[1, 1, 0, 1]","[(if, 1), (i, 1), (is, 0), (0, 1)]","[{'word': 'if', 'length': 2, 'is_numeric': Fal..."
2,read n and k,cin >> n >> k ;,"[read, n, and, k]","[cin, >>, n, >>, k, ;]","[0, 1, 0, 1]","[(read, 0), (n, 1), (and, 0), (k, 1)]","[{'word': 'read', 'length': 4, 'is_numeric': F..."
3,declare long longs sum = 0 and min = LONG_LONG...,"long long min = LONG_LONG_MAX , sum = 0 ;","[declare, long, longs, sum, =, 0, and, min, =,...","[long, long, min, =, LONG_LONG_MAX, ,, sum, =,...","[0, 1, 0, 1, 1, 1, 0, 1, 1, 1]","[(declare, 0), (long, 1), (longs, 0), (sum, 1)...","[{'word': 'declare', 'length': 7, 'is_numeric'..."
4,dy = integer array where the the following int...,"int dy [ ] = { 0 , 0 , - 1 , 1 } ;","[dy, =, integer, array, where, the, the, follo...","[int, dy, [, ], =, {, 0, ,, 0, ,, -, 1, ,, 1, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...","[(dy, 1), (=, 1), (integer, 0), (array, 0), (w...","[{'word': 'dy', 'length': 2, 'is_numeric': Fal..."
...,...,...,...,...,...,...,...
181857,declare static constant integer mod = 1000000009,static const int mod = 1000000009 ;,"[declare, static, constant, integer, mod, =, 1...","[static, const, int, mod, =, 1000000009, ;]","[0, 1, 0, 0, 1, 1, 1]","[(declare, 0), (static, 1), (constant, 0), (in...","[{'word': 'declare', 'length': 7, 'is_numeric'..."
181858,print NO and a new line,"cout << "" NO "" << ' \n ' ;","[print, NO, and, a, new, line]","[cout, <<, "", NO, "", <<, ', \n, ', ;]","[0, 1, 0, 0, 0, 0]","[(print, 0), (NO, 1), (and, 0), (a, 0), (new, ...","[{'word': 'print', 'length': 5, 'is_numeric': ..."
181859,change the value of ans to abs ( x - y ) / d,ans = abs ( x - y ) / d ;,"[change, the, value, of, ans, to, abs, (, x, -...","[ans, =, abs, (, x, -, y, ), /, d, ;]","[0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1]","[(change, 0), (the, 0), (value, 0), (of, 0), (...","[{'word': 'change', 'length': 6, 'is_numeric':..."
181860,else if s is less than f,else if ( s < f ),"[else, if, s, is, less, than, f]","[else, if, (, s, <, f, )]","[1, 1, 1, 0, 0, 0, 1]","[(else, 1), (if, 1), (s, 1), (is, 0), (less, 0...","[{'word': 'else', 'length': 4, 'is_numeric': F..."


In [8]:
# Flatten the per-row lists into one sample per pseudocode token:
# X_train collects the feature dicts, y_train the matching 0/1 labels.
X_train = []
for row_features in train_df['pseudo_features'].tolist():
    X_train.extend(row_features)

y_train = []
for row_labels in train_df['code_binary_seq'].tolist():
    y_train.extend(row_labels)


In [9]:
# Sanity check: show the first ten feature dicts, then the first ten labels.
for flattened in (X_train, y_train):
    print(flattened[:10])

[{'word': 'set', 'length': 3, 'is_numeric': False, 'is_alpha': True, 'is_alphanumeric': True, 'is_punctuation': True, 'prev_word': '', 'next_word': 'l', 'prev_prev_word': '', 'next_next_word': 'to'}, {'word': 'l', 'length': 1, 'is_numeric': False, 'is_alpha': True, 'is_alphanumeric': True, 'is_punctuation': True, 'prev_word': 'set', 'next_word': 'to', 'prev_prev_word': '', 'next_next_word': 'mid'}, {'word': 'to', 'length': 2, 'is_numeric': False, 'is_alpha': True, 'is_alphanumeric': True, 'is_punctuation': True, 'prev_word': 'l', 'next_word': 'mid', 'prev_prev_word': 'set', 'next_next_word': ''}, {'word': 'mid', 'length': 3, 'is_numeric': False, 'is_alpha': True, 'is_alphanumeric': True, 'is_punctuation': True, 'prev_word': 'to', 'next_word': '', 'prev_prev_word': 'l', 'next_next_word': ''}, {'word': 'if', 'length': 2, 'is_numeric': False, 'is_alpha': True, 'is_alphanumeric': True, 'is_punctuation': True, 'prev_word': '', 'next_word': 'i', 'prev_prev_word': '', 'next_next_word': 'is'},

In [10]:
# Token-level classifier: DictVectorizer turns each feature dict into a
# feature vector, which a decision tree then labels 0 or 1.
clf = Pipeline([
    ('vectorizer', DictVectorizer()),
    # random_state is pinned because scikit-learn permutes the candidate
    # features at every split; without a fixed seed, ties between equally good
    # splits are broken differently on each run and the fitted tree (and the
    # accuracies reported below) are not reproducible.
    ('classifier', DecisionTreeClassifier(criterion='entropy', random_state=0))
])

# Fitting on the full training set is slow; when iterating, train on a slice
# such as the first 10K samples (X_train[:10000], y_train[:10000]) instead.
clf.fit(X_train, y_train)

print('Training completed')

Training completed


In [16]:
import pickle

# Persist the fitted pipeline so later sessions can skip retraining.
# The `with` block guarantees the file is flushed and closed; pickle.dump()
# returns None, so its result is not bound to a name.
with open('trained_decision_tree.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [2]:
import pickle

# Restore the previously trained pipeline from disk.
# SECURITY: pickle.load() can execute arbitrary code embedded in the file;
# only load a .pkl that this notebook (or another trusted source) produced.
with open('trained_decision_tree.pkl', 'rb') as model_file:
    clf_loaded = pickle.load(model_file)

## Evaluate on the test set

In [3]:
# Load the evaluation split: a headerless TSV whose first two columns are
# renamed to 'pseudo' and 'code'.
cols = {0: 'pseudo', 1: 'code'}
test_df = pd.read_csv(
    '../../data/input-tok-eval.tsv',
    sep='\t',
    header=None,
).rename(columns=cols)

# Re-import the helper module so edits made to it on disk are picked up.
importlib.reload(preprocess)

# Tokenize each text column into its own *_tokens column (done in place).
for source_col, token_col in (('pseudo', 'pseudo_tokens'), ('code', 'code_tokens')):
    preprocess.tokenize_column(
        test_df,
        col_to_tokenize=source_col,
        tokenized_col_name=token_col,
        inplace=True,
    )

test_df.head()

Unnamed: 0,pseudo,code,pseudo_tokens,code_tokens
0,"let a , b , c , d , e be integers","int a , b , c , d , e ;","[let, a, ,, b, ,, c, ,, d, ,, e, be, integers]","[int, a, ,, b, ,, c, ,, d, ,, e, ;]"
1,read a,cin >> a ;,"[read, a]","[cin, >>, a, ;]"
2,read b,cin >> b ;,"[read, b]","[cin, >>, b, ;]"
3,read c,cin >> c ;,"[read, c]","[cin, >>, c, ;]"
4,read d,cin >> d ;,"[read, d]","[cin, >>, d, ;]"


In [4]:
# Re-import the helper module so edits made to it on disk are picked up.
importlib.reload(preprocess)

# Build the 0/1 label sequences for the evaluation rows; these are what the
# predictions are compared against in the accuracy cells.
code_binary_seq = test_df.apply(
    lambda row: preprocess.create_binary_seq_from_row(row, 'code_tokens', 'pseudo_tokens'),
    axis=1,
)
test_df['code_binary_seq'] = code_binary_seq
test_df

Unnamed: 0,pseudo,code,pseudo_tokens,code_tokens,code_binary_seq
0,"let a , b , c , d , e be integers","int a , b , c , d , e ;","[let, a, ,, b, ,, c, ,, d, ,, e, be, integers]","[int, a, ,, b, ,, c, ,, d, ,, e, ;]","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]"
1,read a,cin >> a ;,"[read, a]","[cin, >>, a, ;]","[0, 1]"
2,read b,cin >> b ;,"[read, b]","[cin, >>, b, ;]","[0, 1]"
3,read c,cin >> c ;,"[read, c]","[cin, >>, c, ;]","[0, 1]"
4,read d,cin >> d ;,"[read, d]","[cin, >>, d, ;]","[0, 1]"
...,...,...,...,...,...
19175,sort array a,"sort ( a , a + m ) ;","[sort, array, a]","[sort, (, a, ,, a, +, m, ), ;]","[1, 0, 1]"
19176,sort b,"sort ( b , b + m ) ;","[sort, b]","[sort, (, b, ,, b, +, m, ), ;]","[1, 1]"
19177,reverse b,"reverse ( b , b + m ) ;","[reverse, b]","[reverse, (, b, ,, b, +, m, ), ;]","[1, 1]"
19178,"for i = 0 to m exclusive , set cnt [ b [ i ] ....",for ( int i = 0 ; i < m ; ++ i ) { cnt [ b [ i...,"[for, i, =, 0, to, m, exclusive, ,, set, cnt, ...","[for, (, int, i, =, 0, ;, i, <, m, ;, ++, i, )...","[1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, ..."


In [5]:
# Attach per-token feature dicts to the evaluation set, using the same helper
# that produced the training features.
test_df['pseudo_features'] = test_df.apply(
    lambda row: preprocess.create_features(row, 'pseudo_tokens'),
    axis=1,
)
test_df

Unnamed: 0,pseudo,code,pseudo_tokens,code_tokens,code_binary_seq,pseudo_features
0,"let a , b , c , d , e be integers","int a , b , c , d , e ;","[let, a, ,, b, ,, c, ,, d, ,, e, be, integers]","[int, a, ,, b, ,, c, ,, d, ,, e, ;]","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]","[{'word': 'let', 'length': 3, 'is_numeric': Fa..."
1,read a,cin >> a ;,"[read, a]","[cin, >>, a, ;]","[0, 1]","[{'word': 'read', 'length': 4, 'is_numeric': F..."
2,read b,cin >> b ;,"[read, b]","[cin, >>, b, ;]","[0, 1]","[{'word': 'read', 'length': 4, 'is_numeric': F..."
3,read c,cin >> c ;,"[read, c]","[cin, >>, c, ;]","[0, 1]","[{'word': 'read', 'length': 4, 'is_numeric': F..."
4,read d,cin >> d ;,"[read, d]","[cin, >>, d, ;]","[0, 1]","[{'word': 'read', 'length': 4, 'is_numeric': F..."
...,...,...,...,...,...,...
19175,sort array a,"sort ( a , a + m ) ;","[sort, array, a]","[sort, (, a, ,, a, +, m, ), ;]","[1, 0, 1]","[{'word': 'sort', 'length': 4, 'is_numeric': F..."
19176,sort b,"sort ( b , b + m ) ;","[sort, b]","[sort, (, b, ,, b, +, m, ), ;]","[1, 1]","[{'word': 'sort', 'length': 4, 'is_numeric': F..."
19177,reverse b,"reverse ( b , b + m ) ;","[reverse, b]","[reverse, (, b, ,, b, +, m, ), ;]","[1, 1]","[{'word': 'reverse', 'length': 7, 'is_numeric'..."
19178,"for i = 0 to m exclusive , set cnt [ b [ i ] ....",for ( int i = 0 ; i < m ; ++ i ) { cnt [ b [ i...,"[for, i, =, 0, to, m, exclusive, ,, set, cnt, ...","[for, (, int, i, =, 0, ;, i, <, m, ;, ++, i, )...","[1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...","[{'word': 'for', 'length': 3, 'is_numeric': Fa..."


In [6]:
# Smoke test: predict the labels for the tokens of the row labelled 0.
clf_loaded.predict(test_df.loc[0, 'pseudo_features'])

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0])

In [7]:
# Re-import the helper module so edits made to it on disk are picked up.
importlib.reload(preprocess)

# Run the loaded classifier over every row's feature dicts and store the
# result in a new predictions column.
test_df['predictions'] = test_df.apply(
    lambda row: preprocess.apply_function_to_column(row, clf_loaded.predict, 'pseudo_features'),
    axis=1,
)

In [8]:
# Display the evaluation frame, which now carries the `predictions` column
# next to the reference `code_binary_seq` column.
test_df

Unnamed: 0,pseudo,code,pseudo_tokens,code_tokens,code_binary_seq,pseudo_features,predictions
0,"let a , b , c , d , e be integers","int a , b , c , d , e ;","[let, a, ,, b, ,, c, ,, d, ,, e, be, integers]","[int, a, ,, b, ,, c, ,, d, ,, e, ;]","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]","[{'word': 'let', 'length': 3, 'is_numeric': Fa...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]"
1,read a,cin >> a ;,"[read, a]","[cin, >>, a, ;]","[0, 1]","[{'word': 'read', 'length': 4, 'is_numeric': F...","[0, 1]"
2,read b,cin >> b ;,"[read, b]","[cin, >>, b, ;]","[0, 1]","[{'word': 'read', 'length': 4, 'is_numeric': F...","[0, 1]"
3,read c,cin >> c ;,"[read, c]","[cin, >>, c, ;]","[0, 1]","[{'word': 'read', 'length': 4, 'is_numeric': F...","[0, 1]"
4,read d,cin >> d ;,"[read, d]","[cin, >>, d, ;]","[0, 1]","[{'word': 'read', 'length': 4, 'is_numeric': F...","[0, 1]"
...,...,...,...,...,...,...,...
19175,sort array a,"sort ( a , a + m ) ;","[sort, array, a]","[sort, (, a, ,, a, +, m, ), ;]","[1, 0, 1]","[{'word': 'sort', 'length': 4, 'is_numeric': F...","[1, 0, 1]"
19176,sort b,"sort ( b , b + m ) ;","[sort, b]","[sort, (, b, ,, b, +, m, ), ;]","[1, 1]","[{'word': 'sort', 'length': 4, 'is_numeric': F...","[1, 1]"
19177,reverse b,"reverse ( b , b + m ) ;","[reverse, b]","[reverse, (, b, ,, b, +, m, ), ;]","[1, 1]","[{'word': 'reverse', 'length': 7, 'is_numeric'...","[1, 1]"
19178,"for i = 0 to m exclusive , set cnt [ b [ i ] ....",for ( int i = 0 ; i < m ; ++ i ) { cnt [ b [ i...,"[for, i, =, 0, to, m, exclusive, ,, set, cnt, ...","[for, (, int, i, =, 0, ;, i, <, m, ;, ++, i, )...","[1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, ...","[{'word': 'for', 'length': 3, 'is_numeric': Fa...","[1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, ..."


In [9]:
# Strict accuracy: a row counts as correct only when its whole predicted
# sequence equals the reference sequence.

strict_bools = pd.Series(
    [
        np.array_equal(expected, predicted)
        for expected, predicted in zip(test_df['code_binary_seq'], test_df['predictions'])
    ],
    index=test_df.index,
)
n_rows = len(test_df)
strict_accuracy = strict_bools.sum() / n_rows
strict_accuracy

0.8913451511991658

In [10]:
# Word-by-word accuracy: fraction of individual token labels predicted
# correctly, pooled over all rows.

# Count the positions where prediction and reference agree, row by row.
word_matches = np.sum([
    np.sum(predicted == expected)
    for predicted, expected in zip(test_df['predictions'], test_df['code_binary_seq'])
])
# Total number of predicted token labels across the whole frame.
total_words = np.sum([np.size(predicted) for predicted in test_df['predictions']])

word_accuracy = word_matches / total_words
word_accuracy

0.9806949542071142