In [1]:
# BASED ON PYTORCH NLP TUTORIAL https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html and https://github.com/sic-rus-ai/stepik-dl-nlp/blob/master/task8_generate_stackoverflow_code.ipynb

In [1]:
%%capture
%pip install torchtext==0.6.0
%pip install transformers
%pip install evaluate
%pip install rouge_score

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Sub-directory of the project (relative to the repository root on Drive) that
# this notebook treats as its working directory.
QPATH = 'Quantlet/4-seq2seq'

import sys
# True when the google.colab module has been loaded, i.e. the notebook runs in Colab.
IN_COLAB = 'google.colab' in sys.modules

import os
if IN_COLAB:
  # The relative data paths used further down are resolved against this directory.
  # NOTE(review): absolute, user-specific Drive path — adjust when running elsewhere.
  os.chdir(f'/content/drive/MyDrive/ColabNotebooks/IRTG/Encode_the_Qode/Encode-the-Qode/{QPATH}')

In [4]:
import torchtext
torchtext.__version__

'0.6.0'

In [5]:
import os
import torch
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

from tqdm import tqdm

import re

#from torchtext import data, datasets
#import torchdata.datapipes as dp
#import torchtext.transforms as T
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator, TabularDataset

import spacy

import random
import math
import time

import importlib
from seq2seq_modeling import *
from seq2seq_modeling import Seq2Seq, train, epoch_time
#from seq2seq_modeling import  evaluate as ev
from transformers import AutoTokenizer

In [6]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
#os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [7]:
DATE = '20231104'

In [8]:
# Pretrained CodeT5 tokenizer; in this notebook it is only referenced by
# compute_metrics (via tokenizer.pad_token_id).
# NOTE(review): skip_special_tokens is normally an argument of decode(), not of
# from_pretrained() — confirm it has any effect here.
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-base-multi-sum", skip_special_tokens=False)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

In [9]:
def tokenize_summary(text):
    """
    Split a description into word tokens and return them in reversed order.

    A token is a maximal run of word characters and apostrophes; tokens that
    are 15 characters or longer are discarded.
    """
    words = re.findall(r"[\w']+", text)
    return [word for word in reversed(words) if len(word) < 15]

def tokenize_snippet(text):
    """
    Split a code snippet into tokens, keeping those of 2 to 14 characters.

    Candidates are runs of word characters/apostrophes or single punctuation
    characters. Because the length filter requires at least two characters,
    every single-character candidate (including all punctuation matches) is
    dropped from the result.
    """
    candidates = re.findall(r"[\w']+|[.,!?;:@~(){}\[\]+-/=\\\'\"\`]", text)
    return [tok for tok in candidates if 2 <= len(tok) < 15]

In [10]:
train2 = pd.read_csv(f"../../data/preprocessed/Quantlet/{DATE}/train_df_{DATE}_sample0.csv")

In [11]:
#f = re.sub('[^A-Za-z0-9]+', ' ', train2.code_script[0])

In [12]:
# Source field: code snippets, tokenised with tokenize_snippet, lower-cased and
# wrapped in <sos>/<eos>. include_lengths=True makes batch.src a
# (tensor, lengths) pair, which the training/evaluation loops unpack.
SRC = Field(
    tokenize = tokenize_snippet,
    init_token = '<sos>',
    eos_token = '<eos>',
    lower = True,
    include_lengths = True
)

# Target field: descriptions, tokenised (and reversed) by tokenize_summary.
TRG = Field(
    tokenize = tokenize_summary,
    init_token = '<sos>',
    eos_token = '<eos>',
    lower = True
)

# CSV column name -> (attribute name on each example, field used to process it).
fields = {
    'code_script': ('src', SRC),
    'Description': ('trg', TRG)
}

# Load the three pre-split CSV files for the configured DATE.
train_data, valid_data, test_data = TabularDataset.splits(
                            path = f'../../data/preprocessed/Quantlet/{DATE}/',
                            train = f"train_df_{DATE}_sample0.csv",
                            validation = f"val_df_{DATE}_sample0.csv",
                            test = f"test_df_{DATE}_sample0.csv",
                            format = 'csv',
                            fields = fields
)

In [13]:
# Vocabularies are built from the training split only, so validation/test
# tokens never influence the token set.
SRC.build_vocab([train_data.src], max_size=10000, min_freq=3)
print(SRC.vocab.freqs.most_common(20))


TRG.build_vocab([train_data.trg], max_size=3000, min_freq=2)
print(TRG.vocab.freqs.most_common(20))

# The recorded output shows max_size + 4 entries for both vocabularies —
# presumably the special tokens (<unk>, <pad>, <sos>, <eos>); verify if it matters.
print(f"Unique tokens in code: {len(SRC.vocab)}")
print(f"Unique tokens in descriptions: {len(TRG.vocab)}")

[('data', 12616), ('the', 11002), ('of', 8542), ('for', 8404), ('in', 7731), ('plot', 6616), ('if', 5606), ('and', 5515), ('as', 5140), ('true', 4777), ('function', 4461), ('to', 4445), ('list', 4377), ('df', 4376), ('import', 4087), ('length', 4026), ('matrix', 3696), ('plt', 3678), ('col', 3494), ('all', 3377)]
[('the', 6010), ('of', 3291), ('and', 2544), ('a', 1716), ('for', 1585), ('to', 1078), ('data', 843), ('with', 750), ('plots', 697), ('in', 693), ('is', 671), ('from', 596), ('on', 512), ('distribution', 438), ('are', 424), ('function', 412), ('by', 391), ('model', 382), ('computes', 354), ('using', 352)]
Unique tokens in code: 10004
Unique tokens in descriptions: 3004


In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cpu')

In [15]:
# Number of examples per batch for all three splits.
BATCH_SIZE = 4

# Batches are sorted internally by source length (sort_key on x.src).
# NOTE(review): presumably required because the encoder packs padded
# sequences using the lengths from include_lengths — confirm in seq2seq_modeling.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
        batch_size = BATCH_SIZE,
        sort_within_batch = True,
        sort_key = lambda x : len(x.src),
        device = device)

In [16]:
# Vocabulary sizes define the encoder input and decoder output dimensions.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
# Embedding / hidden sizes and dropout rates for encoder and decoder.
ENC_EMB_DIM = 128
DEC_EMB_DIM = 128
ENC_HID_DIM = 100
DEC_HID_DIM = 100
ENC_DROPOUT = 0.8
DEC_DROPOUT = 0.3
# NOTE(review): PAD_IDX is read from the SRC vocab but is also used as
# ignore_index of the loss over TRG tokens — this only works if '<pad>' has
# the same index in both vocabularies; confirm.
PAD_IDX = SRC.vocab.stoi['<pad>']
SOS_IDX = TRG.vocab.stoi['<sos>']
EOS_IDX = TRG.vocab.stoi['<eos>']

# Attention, Encoder and Decoder come from the star import of seq2seq_modeling.
attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, PAD_IDX, SOS_IDX, EOS_IDX, device).to(device)

In [17]:
def init_weights(m):
    """
    Re-initialise, in place, every parameter reachable from module ``m``.

    Parameters whose name contains 'weight' are drawn from a normal
    distribution with mean 0 and std 0.01; all other parameters are set to 0.
    """
    for param_name, tensor in m.named_parameters():
        is_weight = 'weight' in param_name
        if not is_weight:
            nn.init.constant_(tensor.data, 0)
            continue
        nn.init.normal_(tensor.data, mean=0, std=0.01)

def count_parameters(model):
    """Return the total number of scalar entries in the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

print(f'N Parameters {count_parameters(model):,}')

N Parameters 3,271,040


In [18]:
# Module.apply calls init_weights on the model and each of its sub-modules.
model.apply(init_weights)

optimizer = optim.Adam(model.parameters(), lr=0.01)

# Target positions equal to PAD_IDX do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [19]:
import numpy as np
import evaluate
# Loaded metric objects keyed by name, so repeated calls (one per batch in the
# training loop) do not reload the metric scripts every time.
_METRIC_CACHE = {}


def _load_metric(name):
    """Return the `evaluate` metric called `name`, loading it on first use only."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(decoded_preds, decoded_labels):
    """
    Score predicted texts against reference texts with BLEU and ROUGE.

    Args:
        decoded_preds: list of predicted strings.
        decoded_labels: list of reference strings, aligned with `decoded_preds`.

    Returns:
        dict with the BLEU and ROUGE results (BLEU's 'precisions' list is
        dropped) plus 'gen_len', the mean number of whitespace-separated
        tokens per prediction. All values are rounded to 4 decimals.
    """
    results_dict = {}
    for m in ['bleu', 'rouge']:
        metric = _load_metric(m)

        if m == 'bleu':
            result = metric.compute(
                predictions=decoded_preds, references=decoded_labels
            )
        else:
            result = metric.compute(
                predictions=decoded_preds, references=decoded_labels, use_stemmer=True
            )
        # 'precisions' is a list and cannot be rounded like the scalar entries.
        result = {key: value for key, value in result.items() if key != 'precisions'}
        results_dict.update({k: round(v, 4) for k, v in result.items()})

    # Predictions are strings here, so comparing them with the tokenizer's
    # integer pad id always yielded a length of 1; count tokens instead.
    prediction_lens = [len(str(pred).split()) for pred in decoded_preds]
    results_dict["gen_len"] = (
        round(float(np.mean(prediction_lens)), 4) if prediction_lens else 0.0
    )
    return results_dict

In [20]:
def train_local(model, iterator, optimizer, criterion, clip):
    """
    Train the model for one pass over `iterator`.

    Args:
        model: seq2seq model called as model(src, src_len, trg, teacher_forcing_ratio).
        iterator: batch iterator whose batches expose `.src` (tensor, lengths) and `.trg`.
        optimizer: optimizer stepping the model parameters.
        criterion: loss applied to the flattened logits and targets.
        clip: maximum gradient norm passed to clip_grad_norm_.

    Returns:
        (mean_loss, rouge_list, loss_list): the mean batch loss of the epoch,
        the per-batch ROUGE-1 scores and the per-batch loss values (floats).
    """
    rouge_list = []
    loss_list = []

    model.train()

    epoch_loss = 0

    # Wrap the iterator itself (not enumerate) so tqdm can read its length
    # and display a total.
    for batch in tqdm(iterator):

        src, src_len = batch.src
        trg = batch.trg

        optimizer.zero_grad()

        output, attention = model(src, src_len, trg, 0.4)

        #trg = [trg sent len, batch size]
        #output = [trg sent len, batch size, output dim]

        # Drop the first time step (the <sos> position) and flatten time x batch.
        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)

        #trg = [(trg sent len - 1) * batch size]
        #output = [(trg sent len - 1) * batch size, output dim]

        loss = criterion(output, trg)

        # Convert to plain Python ints once instead of indexing the vocabulary
        # with one tensor element at a time.
        predicted_ids = output.argmax(dim=1).tolist()
        translation = [TRG.vocab.itos[idx] for idx in predicted_ids]
        target = [TRG.vocab.itos[idx] for idx in trg.tolist()]

        rouge_list.append(compute_metrics(translation, target)['rouge1'])

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        # Store a float, not the tensor: keeping the tensor would keep every
        # batch's autograd graph alive for the whole run.
        batch_loss = loss.item()
        loss_list.append(batch_loss)
        epoch_loss += batch_loss

    return (epoch_loss / len(iterator), rouge_list, loss_list)

In [21]:
def ev(model, iterator, criterion):
    """
    Evaluate the model on `iterator` without teacher forcing or gradients.

    Args:
        model: seq2seq model called as model(src, src_len, trg, teacher_forcing_ratio).
        iterator: batch iterator whose batches expose `.src` (tensor, lengths) and `.trg`.
        criterion: loss applied to the flattened logits and targets.

    Returns:
        (mean_loss, rouge_list, loss_list): the mean batch loss, the per-batch
        ROUGE-1 scores and the per-batch loss values (floats). This is the
        same shape train_local returns and the shape the training loop unpacks.
    """
    rouge_list = []
    loss_list = []

    model.eval()

    epoch_loss = 0

    with torch.no_grad():

        for batch in iterator:

            src, src_len = batch.src
            trg = batch.trg

            output, attention = model(src, src_len, trg, 0) #turn off teacher forcing

            #trg = [trg sent len, batch size]
            #output = [trg sent len, batch size, output dim]

            # Drop the first time step (the <sos> position) and flatten time x batch.
            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)

            #trg = [(trg sent len - 1) * batch size]
            #output = [(trg sent len - 1) * batch size, output dim]

            # The loss must be computed before it is recorded; previously it
            # was appended before being assigned.
            batch_loss = criterion(output, trg).item()
            loss_list.append(batch_loss)
            epoch_loss += batch_loss

            predicted_ids = output.argmax(dim=1).tolist()
            translation = [TRG.vocab.itos[idx] for idx in predicted_ids]
            target = [TRG.vocab.itos[idx] for idx in trg.tolist()]

            rouge_list.append(compute_metrics(translation, target)['rouge1'])

    return (epoch_loss / len(iterator), rouge_list, loss_list)

In [22]:
results_dict = {}

In [23]:
# Training schedule shared by every seed.
N_EPOCHS = 15
CLIP = 1

for SEED in range(15):

    random.seed(SEED)
    torch.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

    # enc and dec are the same module objects for every seed; init_weights
    # resets their parameters so each run starts from a fresh initialisation.
    model = Seq2Seq(enc, dec, PAD_IDX, SOS_IDX, EOS_IDX, device).to(device)
    model.apply(init_weights)

    optimizer = optim.Adam(model.parameters(), lr=0.01)

    criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

    best_valid_loss = float('inf')

    # Per-batch histories, concatenated over all epochs of this seed.
    training_rouge = []
    training_loss = []

    evaluation_rouge = []
    evaluation_loss = []

    for epoch in range(N_EPOCHS):

        start_time = time.time()

        # Keep the epoch mean (a float) and the per-batch list under different
        # names; reusing `train_loss` for both replaced the float with the
        # list and broke math.exp() below.
        train_loss, train_rouge, train_batch_losses = train_local(model, train_iterator, optimizer, criterion, CLIP)
        valid_loss, eval_rouge, eval_batch_losses = ev(model, valid_iterator, criterion)

        training_rouge.extend(train_rouge)
        evaluation_rouge.extend(eval_rouge)

        training_loss.extend(train_batch_losses)
        evaluation_loss.extend(eval_batch_losses)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        # best_valid_loss is reset per seed and the file name is shared, so
        # after the outer loop the checkpoint holds the best epoch of the
        # last seed only.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'conala_model_attention_test.pt')

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'Perplexity (training): {math.exp(train_loss):7.3f}')
        print(f'Perplexity (validation): {math.exp(valid_loss):7.3f}')

        print(f'Loss (training): {train_loss:7.3f}')
        print(f'Loss (validation): {valid_loss:7.3f}')

    results_dict[SEED] = {'training_rouge' : training_rouge,
                          'evaluation_rouge' : evaluation_rouge,
                          'training_loss' : training_loss,
                          'evaluation_loss' : evaluation_loss}

0


0it [00:00, ?it/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

4it [00:23,  5.76s/it]


KeyboardInterrupt: ignored

In [None]:
# map_location lets a checkpoint saved on GPU be restored on a CPU-only runtime.
model.load_state_dict(torch.load('conala_model_attention_test.pt', map_location=device))

# `evaluate` is the Hugging Face metrics module imported earlier, not a
# function; the evaluation loop defined in this notebook is `ev`.
test_result = ev(model, test_iterator, criterion)
# Accept either a bare mean loss or a tuple whose first element is the mean loss.
test_loss = test_result[0] if isinstance(test_result, tuple) else test_result

# These numbers come from the test split, so label them accordingly.
print(f'Perplexity (test): {math.exp(test_loss):7.3f}')
print(f'Loss (test): {test_loss:7.3f}')

In [None]:
def translate_sentence(model, sentence):
    """
    Generate a description for a single code snippet.

    Args:
        model: seq2seq model called as model(src, src_len, trg, teacher_forcing_ratio);
            it is called with trg=None here.
        sentence: the code snippet as one string; it is tokenised with
            tokenize_snippet and lower-cased.

    Returns:
        (translation, attention): the predicted target tokens and the
        attention returned by the model, both with their first step removed.
    """
    model.eval()
    tokenized = tokenize_snippet(sentence)
    tokenized = ['<sos>'] + [t.lower() for t in tokenized] + ['<eos>']
    numericalized = [SRC.vocab.stoi[t] for t in tokenized]
    sentence_length = torch.LongTensor([len(numericalized)]).to(device)
    # unsqueeze(1) adds a batch dimension of size 1: [src len, 1].
    tensor = torch.LongTensor(numericalized).unsqueeze(1).to(device)
    # Inference only: do not build an autograd graph.
    with torch.no_grad():
        translation_tensor_logits, attention = model(tensor, sentence_length, None, 0)
    # Remove the size-1 batch dimension, then take the most likely token per step.
    translation_tensor = torch.argmax(translation_tensor_logits.squeeze(1), 1)
    translation = [TRG.vocab.itos[idx] for idx in translation_tensor.tolist()]
    translation, attention = translation[1:], attention[1:]
    return translation, attention

In [None]:
example_idx = 20

src = ' '.join(vars(train_data.examples[example_idx])['src'])
trg = ' '.join(vars(train_data.examples[example_idx])['trg'])

print(f'src = {src}')
print(f'trg = {trg}')

In [None]:
translation, attention = translate_sentence(model, src)

print('predicted trg = ', ' '.join(translation))

#display_attention(src, translation, attention)

In [None]:
# compute_metrics takes (predictions, references): the model output goes first,
# the ground-truth description second.
compute_metrics([' '.join(translation)], [trg])

NAIVE BASELINE

In [None]:
val_df = pd.read_csv(f'../../data/preprocessed/Quantlet/{DATE}/val_df_{DATE}_sample0.csv')
test_df = pd.read_csv(f'../../data/preprocessed/Quantlet/{DATE}/test_df_{DATE}_sample0.csv')

In [None]:
# Naive baselines: use the first line of the script (pred_line) or the text up
# to the first '.' (pred_sent) as the predicted description.
# n=1 stops after the first separator and .str[0] picks the first piece, which
# avoids building a frame with one column per split just to read column 0.
val_df['pred_line'] = val_df.code_script.str.split('\n', n=1).str[0]
val_df['pred_sent'] = val_df.code_script.str.split('.', n=1).str[0]

test_df['pred_line'] = test_df.code_script.str.split('\n', n=1).str[0]
test_df['pred_sent'] = test_df.code_script.str.split('.', n=1).str[0]

Validation

In [None]:
# compute_metrics takes (predictions, references): baseline output first,
# ground-truth descriptions second.
compute_metrics(val_df.pred_line.tolist(), val_df.Description.tolist())

In [None]:
# compute_metrics takes (predictions, references): baseline output first,
# ground-truth descriptions second.
compute_metrics(val_df.pred_sent.tolist(), val_df.Description.tolist())

Test

In [None]:
# compute_metrics takes (predictions, references): baseline output first,
# ground-truth descriptions second.
compute_metrics(test_df.pred_line.tolist(), test_df.Description.tolist())

In [None]:
# compute_metrics takes (predictions, references): baseline output first,
# ground-truth descriptions second.
compute_metrics(test_df.pred_sent.tolist(), test_df.Description.tolist())