In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

# import torchtext
# from torchtext.legacy.data import Field, BucketIterator#, Iterator
# from torchtext.legacy import data

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# import spacy
import numpy as np
import pandas as pd

import random
import math
import time
import io


In [None]:
# The torchtext imports in the first cell are commented out, so referencing
# `torchtext` directly raises NameError on a fresh kernel. Import it here and
# fall back to None when the package is missing or cannot be loaded.
try:
    import torchtext
    torchtext_version = torchtext.__version__
except (ImportError, OSError):
    torchtext_version = None

torchtext_version

# Reading the text file

In [None]:
# Read every line of the dataset into memory. The context manager closes the
# file handle once the lines have been read instead of leaving it open.
with open(r"english_python_data.txt", "r", encoding="utf8") as f:
    file_lines = f.readlines()

In [None]:
# Peek at the first seven raw lines to see how the file is laid out.
file_lines[:7]

In [None]:
# Split the raw lines into question/solution records.
# A line starting with "#" opens a new record (its text, minus the "#", is the
# question); every following line belongs to that record's solution until the
# next "#" line is seen.
# NOTE(review): a "#" comment at column 0 inside a solution is also treated as
# a new question by this rule -- verify against the data file.
dps = []
dp = None
for line in file_lines:
    if line.startswith("#"):
        if dp is not None:
            # Close the record that was being collected.
            dp['solution'] = ''.join(dp['solution'])
            dps.append(dp)
        dp = {"question": line[1:], "solution": []}
    elif dp is not None:
        dp["solution"].append(line)
    # Lines that appear before the first "#" question have no record to
    # attach to and are skipped.

# Flush the final record: no further "#" line follows it, so the loop above
# never appends it.
if dp is not None:
    dp['solution'] = ''.join(dp['solution'])
    dps.append(dp)

In [None]:
# Show the first five parsed records (question text followed by its solution).
i = 0  # stays 0 when there is nothing to show
for i, dp in enumerate(dps[:5], start=1):
    print("\n Question no: ", i)
    # The stored question still carries the character that followed "#",
    # so drop one more leading character for display.
    print(dp['question'][1:])
    print(dp['solution'])

In [None]:
# Report how many question/solution records were parsed.
print("Dataset size:", len(dps))

## Using a custom tokenizer to tokenize python code

Python is a programming language with its own unique syntax. Regular tokenizers like spaCy are meant to tokenize English sentences and are not optimized for Python's syntax. Here, we write our own custom tokenizer that makes use of Python's built-in tokenize library. When we use this library, we extract only the token type and the token string.

In [None]:
from tokenize import tokenize, untokenize


def tokenize_python_code(python_code_str):
    """Tokenize Python source text with the standard ``tokenize`` module.

    Args:
        python_code_str: Python source code as a string.

    Returns:
        A list of ``(token_type, token_string)`` tuples, one per token
        produced by ``tokenize`` for the UTF-8 encoded source.
    """
    # tokenize() consumes a readline callable that yields bytes.
    source_stream = io.BytesIO(python_code_str.encode('utf-8'))
    return [(tok.type, tok.string) for tok in tokenize(source_stream.readline)]

In [None]:
# Tokenize the solution of the second parsed record as a quick sanity check.
tokenized_sample = tokenize_python_code(dps[1]['solution'])
print(tokenized_sample)

In [None]:
# Round-trip check: rebuild source text from the (type, string) pairs and
# decode the result from UTF-8 before printing it.
print(untokenize(tokenized_sample).decode('utf-8'))

Since we have a mere 5000 data points, we make use of data augmentation to increase the size of our dataset. While tokenizing the Python code, we randomly mask the names of certain variables (with 'var_1', 'var_2', etc.) to ensure that the model we train does not merely fixate on the way the variables are named and actually tries to understand the inherent logic and syntax of the Python code.

However, while randomly picking variables to mask, we avoid keyword literals (keyword.kwlist), control structures (as can be seen in the skip_list below), and object properties. We add all such literals that need to be skipped to the skip_list.

In [None]:
# Names that should never be replaced by a 'var_N' mask.
# NOTE(review): augment_tokenize_python_code builds its own list of protected
# names internally and does not read this module-level one -- confirm whether
# this list is still needed.
skip_list = ['range', 'enumerate', 'print', 'ord', 'int', 'float', 'char', 'list', 'dict', 'tuple', 'set', 'len', 'sum', 'min', 'max']

In [None]:
import keyword

# Show Python's reserved keywords; these are excluded from variable masking.
print(keyword.kwlist)

In [None]:
def augment_tokenize_python_code(python_code_str, mask_factor=0.3):
    """Tokenize Python source and randomly mask variable names.

    Each NAME token that is not protected is, on first sight, replaced with a
    placeholder ``var_1``, ``var_2``, ... with probability ``mask_factor``.
    The decision is remembered per name, so every later occurrence of a masked
    name gets the same placeholder and an unmasked name stays unmasked.

    Protected names are never masked: Python keywords, a fixed set of common
    built-ins, and any name that directly follows ``def``, ``.``, ``import``,
    ``raise``, ``except`` or ``class``.

    Args:
        python_code_str: Python source code as a string.
        mask_factor: Probability in ``[0, 1]`` of masking a name the first
            time it is encountered.

    Returns:
        A list of ``(token_type, token_string)`` tuples in source order, with
        masked names substituted.
    """
    from token import NAME  # local import: the file does not import `token`

    # Reserved words and built-ins that must not be treated as ordinary
    # variables. A set gives constant-time membership checks.
    protected_names = {
        'range', 'enumerate', 'print', 'ord', 'int', 'float', 'zip',
        'char', 'list', 'dict', 'tuple', 'set', 'len', 'sum', 'min', 'max',
    }
    protected_names.update(keyword.kwlist)

    # A NAME right after one of these is a function/class definition, an
    # attribute, a module or an exception -- never mask those.
    no_mask_after = {'def', '.', 'import', 'raise', 'except', 'class'}

    var_dict = {}  # original name -> placeholder
    var_counter = 1
    tokenized_output = []
    previous_string = None  # unmasked string of the preceding token

    for tok in tokenize(io.BytesIO(python_code_str.encode('utf-8')).readline):
        emitted = tok.string
        if tok.type == NAME and tok.string not in protected_names:
            if previous_string in no_mask_after:
                # Keep this name and never mask it later either.
                protected_names.add(tok.string)
            elif tok.string in var_dict:
                # Already masked: reuse the same placeholder.
                emitted = var_dict[tok.string]
            elif random.uniform(0, 1) > 1 - mask_factor:
                # First sighting, chosen for masking.
                var_dict[tok.string] = 'var_' + str(var_counter)
                var_counter += 1
                emitted = var_dict[tok.string]
            else:
                # First sighting, not chosen: keep it unmasked from now on so
                # the output stays consistent.
                protected_names.add(tok.string)
        tokenized_output.append((tok.type, emitted))
        previous_string = tok.string

    return tokenized_output