# Code Generation using CodeGPT

In [None]:
import pandas as pd
import numpy as np 
from transformers import RobertaTokenizer, AutoModelForCausalLM,GPT2Tokenizer,GPT2LMHeadModel,GPT2Config
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader,TensorDataset
from matplotlib import pyplot as plt

### Set the language for the dataset

In [None]:
# Language of the StaQC split to work on. The value is substituted into the
# CSV file names that are loaded and into the checkpoint/plot/CSV names that
# are written later in this file.
language = "sql"
# Alternative setting: uncomment to use the "py" split instead.
#language = "py"

### Check if a GPU is available for faster training/testing

In [None]:
# Ask PyTorch whether a CUDA GPU can be used (True/False).
is_cuda = torch.cuda.is_available()

# Select the device once; the model and every batch are moved onto it later.
device = torch.device("cuda" if is_cuda else "cpu")
print("GPU is available" if is_cuda else "GPU not available, CPU used")

### Instantiate Microsoft's Pre-trained CodeGPT model 

In [None]:
# Load the pre-trained model, its tokenizer and its config from one checkpoint.
_checkpoint = "microsoft/CodeGPT-small-java"
CodeGPT = GPT2LMHeadModel.from_pretrained(_checkpoint)
tokenizer = GPT2Tokenizer.from_pretrained(
    _checkpoint,
    do_lower_case=True,
    bos_token='<s>',
    eos_token='</s>',
    pad_token='<pad>',
    unk_token='<|UNKNOWN|>',
    sep_token='concode_elem_sep',
)
config = GPT2Config.from_pretrained(_checkpoint)

# Match the embedding table size to the tokenizer's vocabulary length, then
# copy the tokenizer's special-token ids into the model config.
CodeGPT.resize_token_embeddings(len(tokenizer))
for _token_attr in ("bos_token_id", "eos_token_id", "pad_token_id"):
    setattr(CodeGPT.config, _token_attr, getattr(tokenizer, _token_attr))

# Place the model on the selected device (GPU when available).
CodeGPT = CodeGPT.to(device)

### Build and Load the Training and Validation Datasets

In [None]:
# Load the training and validation CSVs for the selected language.
# The training frame is read into `train_data` so that the column selection
# below refers to a name that actually exists (it previously raised NameError
# on a fresh run because `train_data` was never assigned).
train_data = pd.read_csv("../StaQC_Data/{}_single_answer.train.csv".format(language))
val_data = pd.read_csv("../StaQC_Data/{}_single_answer.val.csv".format(language))

# Keep only the two columns that ConcodeDataset reads: "NL" and "Code".
data = train_data[["NL", "Code"]]
val_data = val_data[["NL", "Code"]]

In [None]:
class ConcodeDataset(Dataset):
    '''Format the StaQC single-answer data as token-id / label sequences.

    Every row of ``data`` supplies an "NL" string and a "Code" string. Both
    are encoded with ``tokenizer`` and joined into one id sequence. A parallel
    label sequence tags every position: 1 for an NL token, 2 for the BOS
    token and the code tokens, 0 for the EOS token and padding.

    Args:
        tokenizer: object providing ``encode(text)`` and the attributes
            ``bos_token_id``, ``eos_token_id`` and ``pad_token_id``.
        data: pandas DataFrame with "NL" and "Code" columns.
        file_type: accepted for backward compatibility; it is not used.
        block_size: maximum sequence length, and the exact (padded) length of
            every example built in 'train' mode. Must be at least 2.
        mode: 'train' builds padded ``NL + BOS + code + EOS`` examples;
            'test' discards the code before truncation; any mode other than
            'train' yields an unpadded ``NL + BOS`` prompt.

    Raises:
        ValueError: when a row is processed with ``block_size`` below 2.
    '''

    def __init__(self, tokenizer, data, file_type='train', block_size=150, mode='train'):
        self.block_size = block_size
        self.mode = mode
        self.inputs = []
        self.token_labels = []

        # Walk the two columns in lockstep instead of building a row Series
        # with .iloc for every index.
        for nl_text, code_text in zip(data["NL"], data["Code"]):
            code = tokenizer.encode(code_text)
            nl = tokenizer.encode(nl_text)

            input_ids, input_labels = self.pad_and_get_mask(code, nl, tokenizer)
            self.inputs.append(input_ids)
            self.token_labels.append(input_labels)

    def pad_and_get_mask(self, code, nl, tokenizer):
        '''Truncate, join and (in 'train' mode) pad one NL/code pair.

        Args:
            code: list of token ids for the code.
            nl: list of token ids for the natural-language text.
            tokenizer: supplies ``bos_token_id``, ``eos_token_id`` and
                ``pad_token_id``.

        Returns:
            ``(inputs, labels)``: two lists of equal length. In 'train' mode
            both have exactly ``block_size`` entries; otherwise they hold the
            unpadded ``NL + BOS`` prompt and its labels.

        Raises:
            ValueError: if ``self.block_size`` is below 2, because the BOS and
                EOS slots alone would not fit (the truncation loop could
                never finish).
        '''
        if self.block_size < 2:
            raise ValueError(
                "block_size must be at least 2, got {}".format(self.block_size)
            )

        if self.mode == 'test':
            code = []

        # Two slots are reserved for BOS and EOS. Drop one token at a time
        # from the end of whichever sequence is currently longer (NL on ties)
        # until the pair fits. Only the lengths are tracked here so the lists
        # are sliced once rather than on every step.
        code_len, nl_len = len(code), len(nl)
        budget = self.block_size - 2
        while code_len + nl_len > budget:
            if code_len > nl_len:
                code_len -= 1
            else:
                nl_len -= 1
        code, nl = code[:code_len], nl[:nl_len]

        if self.mode != 'train':
            # Prompt only: no code, no EOS and no padding.
            return nl + [tokenizer.bos_token_id], [1] * len(nl) + [2]

        inputs = nl + [tokenizer.bos_token_id] + code + [tokenizer.eos_token_id]
        labels = [1] * len(nl) + [2] * (len(code) + 1) + [0]

        # The truncation above guarantees len(inputs) <= block_size, so the
        # pad length is never negative.
        pad_len = self.block_size - len(inputs)
        inputs += [tokenizer.pad_token_id] * pad_len
        labels += [0] * pad_len
        return inputs, labels

    def __len__(self):
        '''Return the number of formatted examples.'''
        return len(self.inputs)

    def __getitem__(self, item):
        '''Return the ``(input_ids, token_labels)`` tensors of example ``item``.'''
        return torch.tensor(self.inputs[item]), torch.tensor(self.token_labels[item])

In [None]:
# Wrap the training and validation frames as ConcodeDataset instances; both
# are built in 'train' mode with a block size of 150.
dataset = ConcodeDataset(tokenizer, data, file_type='train', block_size=150)
val_dataset = ConcodeDataset(tokenizer, val_data, file_type='dev', block_size=150, mode='train')

# Batch both datasets with the same settings; only the training loader shuffles.
_loader_kwargs = dict(batch_size=16, drop_last=True)
train_dataloader = DataLoader(dataset, shuffle=True, **_loader_kwargs)
val_dataloader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)

### Train and Validate the CodeGPT Model

In [None]:
# Cross-entropy over the vocabulary as the loss; AdamW over all model
# parameters with a learning rate of 1e-5 as the optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(CodeGPT.parameters(), lr=1e-5)

In [None]:
def _masked_code_loss(model, batch, token_labels):
    """Return the next-token loss of ``model`` on the code part of ``batch``.

    Args:
        model: the language model; called as ``model(ids, attention_mask=...)``
            and expected to return an object with a ``.logits`` attribute.
        batch: tensor of input token ids (one row per example).
        token_labels: tensor with the same layout as ``batch`` whose entries
            are the per-position tags produced by ConcodeDataset
            (0 = EOS/padding, 1 = NL token, 2 = BOS/code token).

    Returns:
        The scalar ``criterion`` loss over the positions tagged 2.
    """
    token_labels = token_labels.to(device)
    batch = batch.to(device)

    # Positions tagged 0 are hidden from attention; positions tagged 2 are
    # the ones scored. The masks are derived directly from the comparison
    # (no torch.tensor(<tensor>) copy-construction, which emits a UserWarning).
    attn_mask = (token_labels != 0).to(torch.uint8)
    loss_mask = token_labels == 2

    logits = model(batch, attention_mask=attn_mask).logits

    # The logits at position t predict the token at position t + 1, so drop
    # the last logit and the first label, and align the loss mask with the
    # shifted logits.
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = batch[..., 1:].contiguous()
    keep = loss_mask[..., :-1].contiguous().view(-1)

    return criterion(
        shift_logits.view(-1, shift_logits.size(-1))[keep],
        shift_labels.view(-1)[keep],
    )


torch.cuda.empty_cache()
num_epochs = 8

train_loss_graph = []
val_loss_graph = []

# One pass over the training data followed by one pass over the validation
# data per epoch.
for epoch in range(1, num_epochs + 1):
    tr_loss = 0.0
    eval_loss = 0.0

    # Training phase: switch to training mode once, then update on every batch.
    CodeGPT.train()
    for batch, token_labels in train_dataloader:
        optimizer.zero_grad()

        loss = _masked_code_loss(CodeGPT, batch, token_labels)

        # Compute gradients, then apply the parameter update.
        loss.backward()
        optimizer.step()
        tr_loss += loss.item()

    # Validation phase: evaluation mode, and no gradients are tracked.
    CodeGPT.eval()
    with torch.no_grad():
        for batch, token_labels in val_dataloader:
            eval_loss += _masked_code_loss(CodeGPT, batch, token_labels).item()

    # Average the summed batch losses over the number of batches.
    tr_loss = tr_loss / len(train_dataloader)
    eval_loss = eval_loss / len(val_dataloader)
    train_loss_graph.append(tr_loss)
    val_loss_graph.append(eval_loss)
    print("Epoch: {} Train Loss: {} Val Loss: {}".format(epoch, tr_loss, eval_loss))

    # Save the model weights after every epoch.
    torch.save(CodeGPT.state_dict(), 'CodeGPT-{}-{}.pkl'.format(language, epoch))


### Plot Training and Validation Loss

In [None]:
# Draw both loss curves on one figure: training in black, validation in yellow.
for _curve, _fmt in ((train_loss_graph, 'k'), (val_loss_graph, 'y')):
    plt.plot(_curve, _fmt)
plt.legend(["Training Loss", "Validation Loss"])
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Loss vs Epoch")
plt.savefig("CodeGPT-{}-Loss-graph.png".format(language.upper()))

# Store the per-epoch losses as a CSV file as well.
_loss_columns = {'Tr_Loss': train_loss_graph, 'Val_Loss': val_loss_graph}
losses = pd.DataFrame(_loss_columns)
losses.to_csv('Training_loss_{}_GPT.csv'.format(language.upper()))