In [1]:
# BASED ON PYTORCH NLP TUTORIAL https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

In [2]:
%pip install spacy



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import torch
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker


from torchtext import data, datasets
import torchdata.datapipes as dp
import torchtext.transforms as T
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader

import spacy

import random
import math
import time

import importlib
from seq2seq_modeling import *
from seq2seq_modeling import Seq2Seq, train, evaluate, epoch_time

In [3]:
from transformers import AutoTokenizer

In [4]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [5]:
SEED = 1234

random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [6]:
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-base-multi-sum", skip_special_tokens=False)

In [7]:
import re

def tokenize_summary(text):
    """
    Tokenizes question from a string into a list of strings (tokens) and reverses it
    """
    return list(filter(lambda x: len(x) < 15, re.findall(r"[\w']+", text)[::-1]))

def tokenize_snippet(text):
    """
    Tokenizes code snippet into a list of operands
    """
    return list(filter(lambda x: len(x) < 15, re.findall(r"[\w']+|[.,!?;:@~(){}\[\]+-/=\\\'\"\`]", text)))

In [30]:
def tokenize_descr(text):
    return tokenizer.tokenize(text, max_length=75)

def tokenize_code(code):
    return tokenizer.tokenize(code, max_length=512)

In [9]:
DATE = '20231104'

In [10]:
FILE_PATH = f'../../data/preprocessed/Quantlet/{DATE}/train_df_{DATE}_sample0.csv'

In [31]:
data_pipe = dp.iter.IterableWrapper([FILE_PATH])
data_pipe = dp.iter.FileOpener(data_pipe, mode='rb')
data_pipe = data_pipe.parse_csv(skip_lines=1, delimiter=',', as_tuple=True)

In [32]:
def removeAttribution(row):
    """
    Function to keep the first two elements in a tuple
    """
    return (row[1], row[5])

data_pipe = data_pipe.map(removeAttribution)

In [55]:
for i, sample in enumerate(data_pipe):
    print(sample)
    if i ==1:
        break

(tensor([[   1,  123,    5,  ...,    0,    0,    0],
        [   1, 2122,   44,  ...,    0,    0,    0],
        [   1, 1210,  124,  ...,    0,    0,    0],
        [   1, 1210,  124,  ..., 2793,   83,    2]]), tensor([[   1,   44,   29,    9,  598,  579,   10,    4, 1993,    6, 2665, 1557,
            5, 3588, 2089,  292, 3352,   21,  447,    9,  579,   10,    4, 1993,
            6, 2665, 1557,   15,  317, 1720,   80,    5,  374, 3792,    7, 1307,
           76,    5,  261,   28,  289,    2],
        [   1,   27,    9, 1091,    6, 1792, 1031, 1845, 2250, 1124,  339,  820,
          518,  383,   33,    2,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0],
        [   1,   44,  118,   35,    6,   55,  782,   10,  133,  131,  927,  290,
          239, 1554,  109,   21,  741,   12, 2886,   87,   44,    2,    0,    0,
            0,    0,    0,    0,    0,    0,    0

In [49]:
def getTokens(data_iter, place):
    """
    Function to yield tokens from an iterator. Since, our iterator contains
    tuple of sentences (source and target), `place` parameters defines for which
    index to return the tokens for. `place=0` for source and `place=1` for target
    """
    for code, summary in data_iter:
        if place == 0:
            #yield tokenize_snippet(code)
            yield tokenize_code(code)
        else:
            #yield tokenize_summary(summary)
            yield tokenize_descr(summary)

In [50]:
source_vocab = build_vocab_from_iterator(
    getTokens(data_pipe, 0),
    min_freq=2,
    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True
)
source_vocab.set_default_index(source_vocab['<unk>'])

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [36]:
target_vocab = build_vocab_from_iterator(
    getTokens(data_pipe,1),
    min_freq=2,
    specials= ['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True
)
target_vocab.set_default_index(target_vocab['<unk>'])

In [37]:
def getTransform(vocab):
    """
    Create transforms based on given vocabulary. The returned transform is applied to sequence
    of tokens.
    """
    text_tranform = T.Sequential(
        ## converts the sentences to indices based on given vocabulary
        T.VocabTransform(vocab=vocab),
        ## Add <sos> at beginning of each sentence. 1 because the index for <sos> in vocabulary is
        # 1 as seen in previous section
        T.AddToken(1, begin=True),
        ## Add <eos> at beginning of each sentence. 2 because the index for <eos> in vocabulary is
        # 2 as seen in previous section
        T.AddToken(2, begin=False)
    )
    return text_tranform

In [38]:
temp_list = list(data_pipe)
some_sentence = temp_list[798][0]

In [39]:
transformed_sentence = getTransform(source_vocab)(tokenize_code(some_sentence))

In [40]:
index_to_string = source_vocab.get_itos()
for index in transformed_sentence:
    print(index_to_string[index], end=" ")

<sos> function Ġy Ġ= ĠComp ĠCor r ĠG auss ĠModel ĠC DO ( a , ĠR , Ġdef ĠProb , ĠU AP , ĠD F , ĠDay ĠCount , Ġtrue ĠSp read ) Ċ C Ġ= Ġnorm inv ( def ĠProb , Ġ0 , Ġ1 ); Ċ N inv ĠK Ġ= Ġnorm inv ( ĠU AP Ġ/ Ġ( 1 Ġ- ĠR ), Ġ0 , Ġ1 ); Ċ A Ġ= Ġ( ĠC Ġ- Ġsqrt ( 1 Ġ- Ġa ^ 2 ) Ġ* ĠN inv ĠK ) Ġ/ Ġa ; Ċ Sigma Ġ= Ġ[ 1 Ġ- a ; Ġ1 ]; Ċ Mu Ġ= Ġ[ 0 Ġ0 ]; Ċ EL 1 Ġ= Ġmv nc df ([ ĠC , Ġ- ĠA ], ĠM u , ĠS igma ); Ċ EL 2 Ġ= Ġnorm cdf ( ĠA ); if ĠL AP Ġ== Ġ0 Ċ EL Ġ= ĠEL 1 Ġ/ ĠU AP Ġ* Ġ( 1 - ĠR ) Ġ+ ĠEL 2 ; Ċ else Ċ N inv ĠL Ġ= Ġnorm inv ( ĠL AP Ġ/ Ġ( 1 Ġ- ĠR ), Ġ0 , Ġ1 ); Ċ B Ġ= Ġ( ĠC Ġ- Ġsqrt ( 1 Ġ- Ġa ^ 2 ) Ġ* ĠN inv ĠL ) Ġ/ Ġa ; Ċ EL 3 Ġ= Ġmv nc df ([ ĠC , Ġ- ĠB ], ĠM u , ĠS igma ); Ċ EL 4 Ġ= Ġnorm cdf ( ĠB ); Ċ Upper ĠET L Ġ= ĠEL 1 Ġ+ ĠEL 2 Ġ* ĠU AP Ġ/ Ġ( 1 Ġ- ĠR ); Ċ EL Ġ= Ġ( ĠUpper ĠET L Ġ- ĠLower ĠET L ) Ġ/ Ġ( ĠU AP Ġ- ĠL AP ) Ġ* Ġ( 1 Ġ- ĠR ); Ċ end Ċ Protect ĠLeg Ġ= Ġsum ( diff ([ 0 ; ĠEL ]) Ġ. * ĠD F ); Ċ Prem ium ĠLeg Ġ= Ġsum (( 1 Ġ- ĠEL ) Ġ. * ĠD F Ġ. * ĠDay ĠCount ); Ċ spread Ġ= ĠPro t ect ĠLeg Ġ/ ĠP 

In [41]:
def applyTransform(sequence_pair):
    """
    Apply transforms to sequence of tokens in a sequence pair
    """

    return (
        getTransform(source_vocab)(tokenize_code(sequence_pair[0])),
        getTransform(target_vocab)(tokenize_descr(sequence_pair[1]))
    )
data_pipe = data_pipe.map(applyTransform) ## Apply the function to each element in the iterator
temp_list = list(data_pipe)
print(temp_list[0])

([1, 401, 2901, 5563, 29, 135, 2422, 2502, 296, 674, 55, 2842, 1130, 868, 16, 110, 2631, 79, 377, 132, 708, 60, 1971, 599, 205, 4, 122, 6, 28, 31, 47, 1047, 46, 17, 4, 65, 4575, 1914, 35, 4, 128, 2642, 5, 151, 129, 5, 122, 7, 261, 7, 271, 8, 257, 6, 62, 21, 4384, 1445, 20, 3945, 1143, 4, 1829, 6, 28, 31, 116, 1255, 608, 3734, 8, 90, 17, 4, 65, 116, 1255, 1548, 2653, 1192, 8, 90, 17, 4, 65, 116, 1255, 10006, 164, 4775, 8, 90, 35, 4, 128, 2642, 5, 151, 129, 5, 1829, 7, 1478, 21, 1769, 69, 4464, 1065, 3368, 237, 4, 3546, 651, 4693, 6, 72, 5, 12, 7, 64, 9, 59, 575, 5, 12, 7, 64, 7, 478, 6, 732, 1847, 3932, 2970, 78, 7498, 1763, 3873, 42, 2502, 3331, 2040, 4, 263, 28, 6, 15, 1447, 51, 32, 2801, 1501, 4, 1113, 119, 6, 4679, 5, 97, 8, 1113, 5, 263, 119, 7, 2669, 6, 15, 4058, 146, 65, 23, 4, 10, 7, 1332, 453, 47, 5, 763, 7, 1107, 7, 813, 7, 7989, 7, 5741, 7, 8787, 389, 599, 1871, 5563, 50, 799, 716, 20, 5983, 234, 1679, 4, 3965, 6, 134, 5, 543, 7, 76, 133, 6, 7992, 1006, 7, 76, 144, 6, 186, 5,

In [42]:
def sortBucket(bucket):
    """
    Function to sort a given bucket. Here, we want to sort based on the length of
    source and target sequence.
    """
    return sorted(bucket, key=lambda x: (len(x[0]), len(x[1])))

In [43]:
data_pipe = data_pipe.bucketbatch(
    batch_size = 4, batch_num=5,  bucket_num=1,
    use_in_batch_shuffle=False, sort_key=sortBucket
)

In [44]:
def separateSourceTarget(sequence_pairs):
    """
    input of form: `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`
    output of form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))`
    """
    sources,targets = zip(*sequence_pairs)
    return sources,targets

## Apply the function to each element in the iterator
data_pipe = data_pipe.map(separateSourceTarget)
print(list(data_pipe)[0])

(([1, 123, 5, 97, 19, 445, 5, 112, 19, 62, 21, 4, 150, 8, 131, 53, 12, 6, 28, 5, 10, 8, 2079, 7, 10, 8, 3306, 7, 10, 8, 1817, 7, 10, 8, 1535, 7, 10, 8, 1461, 7, 10, 8, 2025, 7, 10, 8, 3305, 7, 10, 8, 940, 7, 10, 8, 1534, 7, 10, 8, 2795, 7, 10, 8, 1520, 7, 10, 8, 1593, 7, 10, 8, 2185, 9, 2], [1, 2122, 44, 121, 500, 1265, 32, 116, 3581, 89, 250, 5440, 84, 206, 662, 89, 32, 2494, 2518, 32, 398, 1595, 231, 178, 42, 7394, 8033, 160, 4, 215, 47, 4, 103, 4, 171, 44, 2098, 66, 5391, 8, 275, 73, 4, 457, 10, 6, 1898, 44, 3509, 2412, 33, 49, 79, 71, 7873, 160, 4, 795, 10, 6, 850, 44, 6899, 33, 1898, 10, 7, 676, 4, 86, 506, 1136, 1285, 33, 1121, 252, 33, 49, 2469, 33, 49, 39, 424, 86, 1120, 676, 4, 86, 190, 1136, 33, 113, 18, 5482, 356, 242, 192, 1041, 63, 676, 4, 86, 82, 1136, 1285, 33, 1121, 344, 16, 344, 74, 625, 16, 344, 74, 242, 16, 344, 74, 784, 16, 344, 74, 800, 16, 344, 86, 1120, 676, 4, 86, 82, 1136, 33, 113, 18, 1167, 733, 3262, 253, 63, 676, 4, 86, 217, 339, 33, 588, 41, 4, 3846, 5, 795

In [45]:
def applyPadding(pair_of_sequences):
    """
    Convert sequences to tensors and apply padding
    """
    return (T.ToTensor(0)(list(pair_of_sequences[0])), T.ToTensor(0)(list(pair_of_sequences[1])))
## `T.ToTensor(0)` returns a transform that converts the sequence to `torch.tensor` and also applies
# padding. Here, `0` is passed to the constructor to specify the index of the `<pad>` token in the
# vocabulary.
data_pipe = data_pipe.map(applyPadding)

In [46]:
source_index_to_string = source_vocab.get_itos()
target_index_to_string = target_vocab.get_itos()

def showSomeTransformedSentences(data_pipe):
    """
    Function to show how the sentences look like after applying all transforms.
    Here we try to print actual words instead of corresponding index
    """
    for sources,targets in data_pipe:
        if sources[0][-1] != 0:
            continue # Just to visualize padding of shorter sentences
        for i in range(4):
            source = ""
            for token in sources[i]:
                source += " " + source_index_to_string[token]
            target = ""
            for token in targets[i]:
                target += " " + target_index_to_string[token]
            print(f"Source: {source}")
            print(f"Traget: {target}")
        break

showSomeTransformedSentences(data_pipe)

Source:  <sos> rm ( list = ls ( all = ĠTRUE )) Ċ graphics . off () x Ġ= Ġc ( 1 . 72 , 1 . 83 , 1 . 74 , 1 . 79 , 1 . 94 , 1 . 71 , 1 . 66 , 1 . 60 , 1 . 78 , 1 . 77 , 1 . 85 , 1 . 70 , 1 . 76 ) <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <p

In [None]:
data_pipe

In [47]:

dl = DataLoader(dataset=data_pipe, batch_size=5, num_workers=2)

In [48]:
first = next(iter(dl))

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/RDC/zinovyee.hub/.conda/envs/encode_code/lib/python3.9/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/RDC/zinovyee.hub/.conda/envs/encode_code/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 42, in fetch
    return self.collate_fn(data)
  File "/home/RDC/zinovyee.hub/.conda/envs/encode_code/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 265, in default_collate
    return collate(batch, collate_fn_map=default_collate_fn_map)
  File "/home/RDC/zinovyee.hub/.conda/envs/encode_code/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 142, in collate
    return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]  # Backwards compatibility.
  File "/home/RDC/zinovyee.hub/.conda/envs/encode_code/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 142, in <listcomp>
    return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]  # Backwards compatibility.
  File "/home/RDC/zinovyee.hub/.conda/envs/encode_code/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 119, in collate
    return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
  File "/home/RDC/zinovyee.hub/.conda/envs/encode_code/lib/python3.9/site-packages/torch/utils/data/_utils/collate.py", line 161, in collate_tensor_fn
    out = elem.new(storage).resize_(len(batch), *list(elem.size()))
RuntimeError: Trying to resize storage that is not resizable


In [7]:


SRC = Field(
    tokenize = tokenize_code, 
    init_token = '<sos>', 
    eos_token = '<eos>', 
    lower = True,
    include_lengths = True
)

TRG = Field(
    tokenize = tokenize_summary, 
    init_token = '<sos>', 
    eos_token = '<eos>', 
    lower = True
)

fields = {
    'code_script': ('src', SRC),
    'Description': ('trg', TRG)
}

train_data, valid_data, test_data = TabularDataset.splits(
                            path = f'../../data/preprocessed/Quantlet/{DATE}/',
                            train = f"train_df_{DATE}_sample0.csv",
                            validation = f"val_df_{DATE}_sample0.csv",
                            test = f"test_df_{DATE}_sample0.csv",
                            format = 'csv',
                            fields = fields
)

In [8]:
SRC.build_vocab([train_data.src], max_size=25000, min_freq=3)
print(SRC.vocab.freqs.most_common(20))


TRG.build_vocab([train_data.trg], min_freq=5)
print(TRG.vocab.freqs.most_common(20))

print(f"Unique tokens in code: {len(SRC.vocab)}")
print(f"Unique tokens in descriptions: {len(TRG.vocab)}")

[(',', 160783), ('(', 155363), (')', 153066), ('=', 132957), ('.', 94494), ('"', 69028), ('[', 41935), (']', 41715), ('1', 41517), ('-', 33500), (':', 26800), ('0', 22978), ('2', 21118), ("'", 19160), (';', 18083), ('x', 17087), ('/', 13954), ('+', 13534), ('i', 12632), ('data', 12200)]
[('the', 6010), ('of', 3291), ('and', 2544), ('a', 1716), ('for', 1585), ('to', 1078), ('data', 843), ('with', 750), ('in', 693), ('is', 671), ('from', 596), ('on', 512), ('are', 424), ('by', 391), ('m', 334), ('time', 329), ('plot', 315), ('as', 284), ('an', 245), ('this', 239)]
Unique tokens in code: 5429
Unique tokens in descriptions: 478


In [1]:
SRC

NameError: name 'SRC' is not defined

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')

In [13]:
BATCH_SIZE = 2

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
        batch_size = BATCH_SIZE,
        sort_within_batch = True,
        sort_key = lambda x : len(x.src),
        device = device)


In [14]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 128
DEC_EMB_DIM = 128
ENC_HID_DIM = 100
DEC_HID_DIM = 100
ENC_DROPOUT = 0.8
DEC_DROPOUT = 0.8
PAD_IDX = SRC.vocab.stoi['<pad>']
SOS_IDX = TRG.vocab.stoi['<sos>']
EOS_IDX = TRG.vocab.stoi['<eos>']

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, PAD_IDX, SOS_IDX, EOS_IDX, device).to(device)

In [15]:
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
            


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'Модель содержит {count_parameters(model):,} параметров')

Модель содержит 1,278,458 параметров


In [16]:
model.apply(init_weights)

optimizer = optim.Adam(model.parameters())

criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [17]:
print(torch.cuda.get_arch_list())

['sm_37', 'sm_50', 'sm_60', 'sm_70']


In [19]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    print(epoch)
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'conala_model_attention_test.pt')
    
    print(f'Эпоха: {epoch+1:02} | Время: {epoch_mins}m {epoch_secs}s')
    print(f'Перплексия (обучение): {math.exp(train_loss):7.3f}')
    print(f'Перплексия (валидация): {math.exp(valid_loss):7.3f}')

0


In [None]:
model.load_state_dict(torch.load('conala_model_attention_test.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'Перплексия (валидация): {math.exp(test_loss):7.3f}')