# Code Generation using PLBart

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt 
import torch
from transformers import PLBartTokenizer, PLBartModel, RobertaTokenizer,PLBartForConditionalGeneration
import sentencepiece as spm
from torch.utils.data import Dataset, DataLoader
from collections import namedtuple

### Set the language for the dataset

In [None]:
# Dataset selector: this value is substituted into the StaQC CSV paths read
# below and into the names of the saved checkpoints, loss plot and loss CSV.
language = "sql"
# Uncomment to use the "py" dataset files instead.
#language = "py"

### Check if a GPU is available for faster training/testing

In [2]:
# Probe CUDA once; the resulting `device` is what the model and every batch
# are moved to later in the notebook.
is_cuda = torch.cuda.is_available()

# Prefer the GPU when one is present, otherwise fall back to the CPU.
device = torch.device("cuda" if is_cuda else "cpu")
print("GPU is available" if is_cuda else "GPU not available, CPU used")

GPU is available


### Standardize Features and Targets

In [None]:
class InputFeatures:
    """Tokenized form of one example: input and target tokens with their ids."""

    def __init__(self, input_tokens, input_ids, target_tokens, target_ids):
        # Input side (tokens and the ids derived from them).
        self.input_tokens, self.input_ids = input_tokens, input_ids
        # Target side (tokens and the ids derived from them).
        self.target_tokens, self.target_ids = target_tokens, target_ids

In [None]:
def convert_examples_to_features(data, tokenizer, max_source_length=20, max_target_length=150):
    """Tokenize one example into fixed-length input and target id sequences.

    The natural-language text in ``data['NL']`` is the model input and the
    snippet in ``data['Code']`` is the target. Each text is tokenized,
    truncated, wrapped in the tokenizer's BOS/EOS tokens, converted to ids and
    right-padded with ``tokenizer.pad_token_id``.

    Args:
        data: Mapping-like row providing the ``'NL'`` and ``'Code'`` entries.
        tokenizer: Object exposing ``tokenize``, ``convert_tokens_to_ids``,
            ``bos_token``, ``eos_token`` and ``pad_token_id``.
        max_source_length: Length of the padded input id list (default 20).
        max_target_length: Length of the padded target id list (default 150).

    Returns:
        An ``InputFeatures`` holding the unpadded token lists and the padded
        id lists for both input and target.

    Raises:
        ValueError: If either maximum length cannot hold the BOS/EOS pair.
    """
    if max_source_length < 2 or max_target_length < 2:
        raise ValueError("maximum lengths must be at least 2 to fit the BOS and EOS tokens")

    def _encode(text, max_length):
        # Two positions are reserved for the BOS and EOS markers.
        body = tokenizer.tokenize(text)[:max_length - 2]
        tokens = [tokenizer.bos_token] + body + [tokenizer.eos_token]
        ids = tokenizer.convert_tokens_to_ids(tokens)
        # Right-pad so that every example has exactly `max_length` ids.
        ids += [tokenizer.pad_token_id] * (max_length - len(ids))
        return tokens, ids

    # Standardize features (natural-language description)
    input_tokens, input_ids = _encode(data['NL'], max_source_length)

    # Standardize target (code snippet)
    target_tokens, target_ids = _encode(data['Code'], max_target_length)

    return InputFeatures(input_tokens, input_ids, target_tokens, target_ids)

### Instantiate uclanlp's Pre-trained PLBart Base Model

In [None]:
# Load the pre-trained PLBart tokenizer and sequence-to-sequence model
tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-base")
model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-base")

# Move the model's parameters to the selected device (GPU when available)
model = model.to(device)

### Build training and testing Dataloaders

In [None]:
class CodeData(Dataset):
    """Map-style dataset of tokenized examples.

    Every row of ``dataset`` is converted once, at construction time, with
    ``convert_examples_to_features``; items are then served as a pair of
    tensors ``(input_ids, target_ids)``.
    """

    def __init__(self, tokenizer, dataset):
        """Tokenize all rows of ``dataset`` (a DataFrame) with ``tokenizer``."""
        # iloc is positional, so frames whose index is not 0..n-1 also work.
        self.examples = [
            convert_examples_to_features(dataset.iloc[i], tokenizer)
            for i in range(len(dataset))
        ]

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.examples)

    def __getitem__(self, indx):
        """Return ``(input_ids, target_ids)`` tensors for example ``indx``."""
        example = self.examples[indx]
        # Convert features and targets into tensors
        return torch.tensor(example.input_ids), torch.tensor(example.target_ids)

In [None]:
# Training hyper-parameters used by the loaders and the training loop
train_batch_size = 16  # examples per batch
num_train_epochs = 8  # passes over the training set

# Read the train/validation splits for the selected language
train_data = pd.read_csv("../StaQC_Data/{}_single_answer.train.csv".format(language))
val_data = pd.read_csv("../StaQC_Data/{}_single_answer.val.csv".format(language))

# Keep only the two columns the tokenization step reads
train_data = train_data[['Code', 'NL']]
val_data = val_data[['Code', 'NL']]

# Tokenize every row up front
train_dataset = CodeData(tokenizer, train_data)
val_dataset = CodeData(tokenizer, val_data)

# Release cached GPU memory before batching starts
torch.cuda.empty_cache()

# Only the training loader shuffles; validation keeps file order
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    num_workers=1,
    batch_size=train_batch_size,
)
val_dataloader = DataLoader(
    val_dataset,
    num_workers=1,
    batch_size=train_batch_size,
)

### Train and Validate the PLBart Model

In [None]:
# Instantiate the optimizer (AdamW)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)

train_loss_graph = []  # average training loss per epoch
val_loss_graph = []  # average validation loss per epoch
model.zero_grad()


def _batch_loss(batch):
    """Run the model on one (input_ids, target_ids) batch and return its loss."""
    # Separate the features and targets from the batch
    input_ids = batch[0].to(device)
    target_ids = batch[1].to(device)

    # Mark real tokens with 1 and padding with 0 so the encoder does not
    # attend to padding positions.
    attention_mask = input_ids.ne(tokenizer.pad_token_id).long()

    # Label positions equal to -100 are skipped by the loss, so padding in the
    # target no longer contributes to (and dilutes) the reported loss.
    labels = target_ids.masked_fill(target_ids.eq(tokenizer.pad_token_id), -100)

    model_out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    return model_out.loss


# One pass over the training data and one over the validation data per epoch
for idx in range(num_train_epochs):
    tr_loss = 0.0
    val_loss = 0.0

    # Training pass: switch to train mode once per epoch
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()

        # Forward pass and loss for the current batch
        loss = _batch_loss(batch)

        # Compute gradients from the loss
        loss.backward()

        # Update the parameters using the gradients
        optimizer.step()
        tr_loss += loss.item()

    # Validation pass: eval mode, no gradient tracking
    model.eval()
    with torch.no_grad():
        for batch in val_dataloader:
            val_loss += _batch_loss(batch).item()

    # Average the summed batch losses over the number of batches
    epoch_loss = tr_loss / len(train_dataloader)  # training loss per epoch
    epoch_val_loss = val_loss / len(val_dataloader)  # validation loss per epoch
    print("epoch {} train loss {} val loss {}".format(idx+1,epoch_loss,epoch_val_loss))
    train_loss_graph.append(epoch_loss)
    val_loss_graph.append(epoch_val_loss)

    # Save a checkpoint of the weights after every epoch
    torch.save(model.state_dict(), 'PLBART_{}_model-{}.pkl'.format(language, idx+1))


### Plot Training and Validation Loss

In [None]:
# Collect the per-epoch losses in a table (written to CSV below)
d = pd.DataFrame({'Train_Loss':train_loss_graph,'Val_Loss':val_loss_graph})

# Draw both curves: training in black, validation in yellow
plt.plot(train_loss_graph, 'k')
plt.plot(val_loss_graph, 'y')

# Title, axis labels and legend (legend order matches the plot order above)
plt.title('Loss vs Epoch')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(['Train Loss','Val Loss'])

# Persist the figure first, then the raw numbers
plt.savefig('Loss-Plot-{}-PLBART.png'.format(language.upper()))
d.to_csv('Training_loss_{}_PLBART.csv'.format(language.upper()))