# Code Autocompletion
Today, I wrote the code that automatically give you what was suppose to be
the next code e.g. If you input line "import numpy as" it suppose to return "import numpy as np"

## Data preparation

In [54]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import math
from tqdm import tqdm

In [55]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

#make our work comparable if restarted the kernel
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

cuda:0


### Download

In [56]:
# This "github jupyter code to text" datasets is what we gonna use.
# My good friend Todsavad recommended me to use this as it light and easy to work with.
import datasets 
train_jupyter = datasets.load_dataset("codeparrot/github-jupyter-code-to-text", split="train")
test_jupyter = datasets.load_dataset("codeparrot/github-jupyter-code-to-text", split="test")
print(train_jupyter, test_jupyter)

Using custom data configuration codeparrot--github-jupyter-code-to-text-cf9b56d996fd17e1
Found cached dataset parquet (/root/.cache/huggingface/datasets/codeparrot___parquet/codeparrot--github-jupyter-code-to-text-cf9b56d996fd17e1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Using custom data configuration codeparrot--github-jupyter-code-to-text-cf9b56d996fd17e1
Found cached dataset parquet (/root/.cache/huggingface/datasets/codeparrot___parquet/codeparrot--github-jupyter-code-to-text-cf9b56d996fd17e1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


Dataset({
    features: ['repo_name', 'path', 'license', 'content'],
    num_rows: 47452
}) Dataset({
    features: ['repo_name', 'path', 'license', 'content'],
    num_rows: 11864
})


In [57]:
# Let's take a peek at the dataset
train_jupyter["content"][0]

'import numpy as np\nfrom tensorflow import keras\nfrom tensorflow.keras import layers\n\n"""\nExplanation: Simple MNIST convnet\nAuthor: fchollet<br>\nDate created: 2015/06/19<br>\nLast modified: 2020/04/21<br>\nDescription: A simple convnet that achieves ~99% test accuracy on MNIST.\nSetup\nEnd of explanation\n"""\n\n\n# Model / data parameters\nnum_classes = 10\ninput_shape = (28, 28, 1)\n\n# the data, split between train and test sets\n(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()\n\n# Scale images to the [0, 1] range\nx_train = x_train.astype("float32") / 255\nx_test = x_test.astype("float32") / 255\n# Make sure images have shape (28, 28, 1)\nx_train = np.expand_dims(x_train, -1)\nx_test = np.expand_dims(x_test, -1)\nprint("x_train shape:", x_train.shape)\nprint(x_train.shape[0], "train samples")\nprint(x_test.shape[0], "test samples")\n\n\n# convert class vectors to binary class matrices\ny_train = keras.utils.to_categorical(y_train, num_classes)\ny_test

### Clean

In [58]:
# Split new line
# As we can see from the cell above the data is not cleaned.
# We want the data to be in format ["import numpy as np", "import pandas as pd", ..... ,"...."]

train_split = [split for text in train_jupyter['content'] for split in text.split('\n') if split != ""]
test_split = [split for text in test_jupyter['content'] for split in text.split('\n') if split != ""]

In [59]:
# Check the  sentence amounts.
len(train_split), len(test_split)

(11367363, 2875424)

In [60]:
# Even when we cleaned it, it still have some character that we do not want. e.g. html tags or another languages
# beside english.

from spacy.lang.en.stop_words import STOP_WORDS
import spacy
import re

nlp = spacy.load('en_core_web_md')

def preprocessing(sentence):
    
    # Clear the html tag by using regular expression.
    sentence = re.sub("<[^>]*>", "", sentence) # RegEx to remove html tags
    sentence = re.sub("[^\x00-\x7F]+", "", sentence) # RegEx to remove another languages that is not English.
    stopwords = list(STOP_WORDS)
    doc = nlp(sentence)
    cleaned_tokens = []
    
    # Cleaned its with spacy, remove symbol, punct and empty space
    # It's smart enough to not remove the . in the middle of sentences. E.g. "np.array"

    for token in doc: 
        if token.text not in stopwords and token.pos_ != 'PUNCT' and token.pos_ != 'SPACE' and \
            token.pos_ != 'SYM' and token.pos_!= 'X':
                cleaned_tokens.append(token.lemma_.lower().strip())
                
    return " ".join(cleaned_tokens)

### Tokenized

In [61]:
# Create tokenizer

from torchtext.data.utils import get_tokenizer
def yield_tokens(data_iter):
    for text in data_iter:
        text = preprocessing(text) 
        yield tokenizer(text)

tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

# My pc can only handle this.
# Gtx 1660ti can handle only around 100,000 sentences in total otherwise it will crashed.
train_size = 100000
tokenized_dataset_train = yield_tokens(train_split[:train_size])
tokenized_dataset_test = yield_tokens(test_split[:20000])

In [62]:
from torchtext.vocab import build_vocab_from_iterator
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_split[:train_size]), min_freq=5) 
vocab.insert_token('<unk>', 0)           
vocab.insert_token('<eos>', 1)            
vocab.set_default_index(vocab['<unk>'])   
print(len(vocab))                         
print(vocab.get_itos()[:10])       

9442
['<unk>', '<eos>', '"', ',', ')', '=', ':', 'the', '#', "'"]


### Batch Loader

In [63]:
def get_data(dataset, vocab, batch_size):
    data = []                                                   
    for example in dataset:       
        #appends eos so we know it ends....so model learn how to end...                             
        tokens = example.append('<eos>') #end of sentence
        #numericalize          
        tokens = [vocab[token] for token in example] 
        data.extend(tokens)                                    
    data = torch.LongTensor(data)                                 
    num_batches = data.shape[0] // batch_size 
    data = data[:num_batches * batch_size]                       
    data = data.view(batch_size, num_batches)        
    return data

In [64]:
batch_size = 128
train_data = get_data(tokenized_dataset_train, vocab, batch_size)
valid_data = get_data(tokenized_dataset_test, vocab, batch_size)
# test_data  = get_data(tokenized_dataset['test'], vocab, batch_size)

In [65]:
# Just save for later used, because it take very long time to preprocess.
import pickle

# Save preprocessed data
object_data = train_data
file_train = open('/root/projects/NLP/Assignment/16_Feb_Code_Autocompletion/obj/train_data.pkl', 'wb') 
pickle.dump(object_data, file_train)

object_data = valid_data
file_val = open('/root/projects/NLP/Assignment/16_Feb_Code_Autocompletion/obj/valid_data.pkl', 'wb') 
pickle.dump(object_data, file_val)

# Save vocab
object_data = vocab
file_vocab = open('/root/projects/NLP/Assignment/16_Feb_Code_Autocompletion/obj/vocab.pkl', 'wb') 
pickle.dump(object_data, file_vocab)


In [66]:
train_data.shape #[batch_size, all the next length]

torch.Size([128, 3818])

## Modeling 
We will just use the same model from the classes.

In [67]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
                
        super().__init__()
        self.hid_dim = hid_dim
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size,emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, 
                                        dropout = dropout_rate, batch_first = True)
        self.dropout = nn.Dropout(dropout_rate)
        #when you do LM, you look forward, so it does not make sense to do bidirectional
        self.fc = nn.Linear(hid_dim,vocab_size)

    def init_hidden(self, batch_size, device):
        #this function gonna be run in the beginning of the epoch
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)

        return hidden, cell #return as tuple

    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach() #removing this hidden from gradients graph
        cell =  cell.detach() #removing this hidden from gradients graph
        return hidden, cell

    def forward(self, src, hidden):
        #src: [batch_size, seq_len]

        #embed 
        embedded = self.embedding(src)
        #embed : [batch_size, seq_len, emb_dim]

        #send this to the lstm
        #we want to put hidden here... because we want to reset hidden .....
        output, hidden = self.lstm(embedded, hidden)
        #output : [batch_size, seq_len, hid_dim] ==> all hidden states
        #hidden : [batch_size, seq_len, hid_dim] ==> last hidden states from each layer

        output = self.dropout(output)
        prediction = self.fc(output)
        #prediction: [batch size, seq_len, vocab_size]
        return prediction, hidden

## Training

In [68]:
vocab_size = len(vocab)
emb_dim = 1024                # 400 in the paper
hid_dim = 1024                # 1150 in the paper
num_layers = 2                # 3 in the paper
dropout_rate = 0.65              
lr = 1e-3                     

In [69]:
model = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 36,140,258 trainable parameters


In [70]:
def get_batch(data, seq_len, idx):
    #this data is from get_data()
    #train_data.shape # [batch_size, number of batches....]
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [71]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)               

        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]  
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip) #prevent gradient explosion - clip is basically 
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [72]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [73]:
n_epochs = 50
seq_len  = 50
clip    = 0.25

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, 
                seq_len, device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-val-auto.pt')

    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

                                                         

	Train Perplexity: 160.062
	Valid Perplexity: 70.551


                                                         

	Train Perplexity: 70.376
	Valid Perplexity: 58.200


                                                         

	Train Perplexity: 56.390
	Valid Perplexity: 53.903


                                                         

	Train Perplexity: 48.368
	Valid Perplexity: 50.318


                                                         

	Train Perplexity: 42.208
	Valid Perplexity: 47.934


                                                         

	Train Perplexity: 37.380
	Valid Perplexity: 46.452


                                                         

	Train Perplexity: 33.421
	Valid Perplexity: 45.242


                                                         

	Train Perplexity: 30.027
	Valid Perplexity: 44.186


                                                         

	Train Perplexity: 26.990
	Valid Perplexity: 44.068


                                                         

	Train Perplexity: 24.411
	Valid Perplexity: 43.408


                                                         

	Train Perplexity: 22.250
	Valid Perplexity: 43.756


                                                         

	Train Perplexity: 19.875
	Valid Perplexity: 43.182


                                                         

	Train Perplexity: 18.276
	Valid Perplexity: 44.188


                                                         

	Train Perplexity: 17.039
	Valid Perplexity: 44.407


                                                         

	Train Perplexity: 16.169
	Valid Perplexity: 45.123


                                                         

	Train Perplexity: 15.673
	Valid Perplexity: 45.609


                                                         

	Train Perplexity: 15.375
	Valid Perplexity: 45.622


                                                         

	Train Perplexity: 15.202
	Valid Perplexity: 45.634


                                                         

	Train Perplexity: 15.150
	Valid Perplexity: 45.678


                                                         

	Train Perplexity: 15.083
	Valid Perplexity: 45.717


                                                         

	Train Perplexity: 15.059
	Valid Perplexity: 45.740


                                                         

	Train Perplexity: 15.074
	Valid Perplexity: 45.741


                                                         

	Train Perplexity: 15.051
	Valid Perplexity: 45.747


                                                         

	Train Perplexity: 15.054
	Valid Perplexity: 45.750


                                                         

	Train Perplexity: 15.030
	Valid Perplexity: 45.752


                                                         

	Train Perplexity: 15.065
	Valid Perplexity: 45.752


                                                         

	Train Perplexity: 15.033
	Valid Perplexity: 45.753


                                                         

	Train Perplexity: 15.049
	Valid Perplexity: 45.753


                                                         

	Train Perplexity: 15.056
	Valid Perplexity: 45.753


                                                         

	Train Perplexity: 15.082
	Valid Perplexity: 45.753


                                                         

	Train Perplexity: 15.046
	Valid Perplexity: 45.754


                                                         

	Train Perplexity: 15.050
	Valid Perplexity: 45.754


                                                         

	Train Perplexity: 15.042
	Valid Perplexity: 45.754


                                                         

	Train Perplexity: 15.086
	Valid Perplexity: 45.754


                                                         

	Train Perplexity: 15.055
	Valid Perplexity: 45.754


                                                         

	Train Perplexity: 15.059
	Valid Perplexity: 45.754


                                                         

	Train Perplexity: 15.037
	Valid Perplexity: 45.754


                                                         

	Train Perplexity: 15.068
	Valid Perplexity: 45.755


                                                         

	Train Perplexity: 15.046
	Valid Perplexity: 45.755


                                                         

	Train Perplexity: 15.045
	Valid Perplexity: 45.755


                                                         

	Train Perplexity: 15.060
	Valid Perplexity: 45.755


                                                         

	Train Perplexity: 15.048
	Valid Perplexity: 45.755


                                                         

	Train Perplexity: 15.040
	Valid Perplexity: 45.755


                                                         

	Train Perplexity: 15.072
	Valid Perplexity: 45.756


                                                         

	Train Perplexity: 15.065
	Valid Perplexity: 45.756


                                                         

	Train Perplexity: 15.055
	Valid Perplexity: 45.756


                                                         

	Train Perplexity: 15.040
	Valid Perplexity: 45.756


                                                         

	Train Perplexity: 15.050
	Valid Perplexity: 45.756


                                                         

	Train Perplexity: 15.065
	Valid Perplexity: 45.756


                                                         

	Train Perplexity: 15.023
	Valid Perplexity: 45.756


## Inference

Mine model is still dumb, but you can tried to train its on cloud for better performance.
This is limitation of mine gtx 1660ti

In [74]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab
            
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [75]:
prompt = 'import numpy as'
max_seq_len = 30
seed = 0
            #superdiverse   more diverse
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0] 
#sample from this distribution higher probability will get more change
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
import numpy as

0.7
import numpy as

0.75
import numpy as

0.8
import numpy as start

1.0
import numpy as result start library import numpy np numpy what choice field essentially

