In [136]:
import json
import requests

import pandas as pd

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence

import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer, ngrams_iterator

In [None]:
file_url = 'https://raw.githubusercontent.com/google-research/google-research/master/mbpp/mbpp.jsonl'
response = requests.get(file_url)

data = list()

if response.status_code == 200:
    json_lines = response.text.splitlines()
    for line in json_lines:
        json_object = json.loads(line)
        data.append(json_object)
else:
    print(f"Failed to download file. Status code: {response.status_code}")


In [16]:
for key, values in data[0].items():
    print(f"{key} - {json_object[key]}")

    print("*" * 10)

text - Write a function to find the minimum total path sum in the given triangle.
**********
code - def min_sum_path(A): 
	memo = [None] * len(A) 
	n = len(A) - 1
	for i in range(len(A[n])): 
		memo[i] = A[n][i] 
	for i in range(len(A) - 2, -1,-1): 
		for j in range( len(A[i])): 
			memo[j] = A[i][j] + min(memo[j], 
									memo[j + 1]) 
	return memo[0]
**********
task_id - 974
**********
test_setup_code - 
**********
test_list - ['assert min_sum_path([[ 2 ], [3, 9 ], [1, 6, 7 ]]) == 6', 'assert min_sum_path([[ 2 ], [3, 7 ], [8, 5, 6 ]]) == 10 ', 'assert min_sum_path([[ 3 ], [6, 4 ], [5, 2, 7 ]]) == 9']
**********
challenge_test_list - []
**********


In [22]:
df = pd.DataFrame(data)
df.head(5)

Unnamed: 0,text,code,task_id,test_setup_code,test_list,challenge_test_list
0,Write a function to find the minimum cost path...,"R = 3\r\nC = 3\r\ndef min_cost(cost, m, n): \r...",1,,"[assert min_cost([[1, 2, 3], [4, 8, 2], [1, 5,...",[]
1,Write a function to find the similar elements ...,"def similar_elements(test_tup1, test_tup2):\r\...",2,,"[assert similar_elements((3, 4, 5, 6),(5, 7, 4...",[]
2,Write a python function to identify non-prime ...,import math\r\ndef is_not_prime(n):\r\n res...,3,,"[assert is_not_prime(2) == False, assert is_no...",[]
3,Write a function to find the largest integers ...,import heapq as hq\r\ndef heap_queue_largest(n...,4,,"[assert heap_queue_largest( [25, 35, 22, 85, 1...",[]
4,Write a function to find the number of ways to...,def count_ways(n): \r\n\tA = [0] * (n + 1) \r\...,5,,"[assert count_ways(2) == 3, assert count_ways(...",[]


In [77]:
sample_data = df.sample(1)

In [78]:
tokenizer = get_tokenizer('basic_english')

In [83]:
" ".join([text.split() for text in sample_data['text']][0])

"Write a function to find all words starting with 'a' or 'e' in a given string."

In [84]:
" ".join([tokenizer(text) for text in sample_data['text']][0])

"write a function to find all words starting with ' a ' or ' e ' in a given string ."

In [121]:
class Preprocessor():
    def __init__(self, df, tokenize=True):
        self.df = df
        self.tokenize_ = tokenize
        self.text_tokenized = None
        self.code_tokenized = None
        self.text_vocab = None
        self.code_vocab = None
        self.text_numericalized = None
        self.code_numericalized = None
        self.text_padded = None
        self.code_padded = None
    
    def functionToTokenize(self):
        if self.tokenize_:
            tokenizer = get_tokenizer('basic_english')
            self.text_tokenized = [tokenizer(text) for text in self.df['text']]
            self.code_tokenized = [tokenizer(code) for code in self.df['code']]
        else:
            self.text_tokenized = [text.split() for text in self.df['text']]
            self.code_tokenized = [code.split() for code in self.df['code']]
            
    def functionToVocab(self):
        self.text_vocab = build_vocab_from_iterator(self.text_tokenized, specials=['<pad>'])
        self.code_vocab = build_vocab_from_iterator(self.code_tokenized, specials=['<pad>'])

        self.text_vocab.set_default_index(self.text_vocab['<pad>'])
        self.code_vocab.set_default_index(self.code_vocab['<pad>'])

        self.text_numericalized = [self.text_vocab.lookup_indices(tokens) for tokens in self.text_tokenized]
        self.code_numericalized = [self.code_vocab.lookup_indices(tokens) for tokens in self.code_tokenized]

    def functionToPad(self):
        self.text_padded = pad_sequence([torch.tensor(indices) for indices in self.text_numericalized], padding_value=self.text_vocab['<pad>'])
        self.code_padded = pad_sequence([torch.tensor(indices) for indices in self.code_numericalized], padding_value=self.code_vocab['<pad>'])
        
        return self.text_padded, self.code_padded
    
    def setup(self):
        self.functionToTokenize()
        self.functionToVocab()
        return self.functionToPad()

In [122]:
preprocessor = Preprocessor(df)

In [129]:
text_padded, code_padded = preprocessor.setup()

In [130]:
len(text_padded), len(code_padded)

(48, 258)

In [131]:
class CodeGenerationDataset(Dataset):
    def __init__(self, text_padded, code_padded):
        self.text_padded = text_padded
        self.code_padded = code_padded

    def __len__(self):
        return len(self.text_padded)

    def __getitem__(self, idx):
        return {'text': self.text_padded[idx], 'code': self.code_padded[idx]}

In [132]:
dataset = CodeGenerationDataset(text_padded, code_padded)
batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [133]:
for batch in dataloader:
    texts = batch['text']
    codes = batch['code']
    print(texts)
    print()
    print(codes)
    break

tensor([[  0,   0,   0,  ...,   0,   0,   0],
        [426,   0,   0,  ...,   0,   0,   0],
        [212,   0,   0,  ...,   0,   0,   0],
        ...,
        [  0,   0,   0,  ...,   0,   0,   0],
        [ 49,   0,   0,  ...,   0,   0,   0],
        [425,  24,  17,  ...,  26,   6, 171]])

tensor([[2915,    0,    0,  ...,    0,    0,   57],
        [  10,    0,   52,  ...,    0,    0,   17],
        [  10,    0,   16,  ...,    0,    0,   26],
        ...,
        [ 321,    0,    0,  ...,    0,    0,   23],
        [  34,    0,   18,  ...,    0,    0, 1573],
        [2378,   21,   27,  ...,   21,  588,  888]])


In [137]:
class Seq2SeqModel(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, pad_idx):
        super(Seq2SeqModel, self).__init__()

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.GRU(embedding_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
        self.pad_idx = pad_idx

    def forward(self, input_sequence):
        embedded = self.embedding(input_sequence)
        output, hidden = self.rnn(embedded)
        return self.fc(output)

# Example usage:
# Assuming text_vocab_size and code_vocab_size are the sizes of your vocabularies
text_vocab_size = len(preprocessor.text_vocab)
code_vocab_size = len(preprocessor.code_vocab)
embedding_size = 256
hidden_size = 512
output_size = code_vocab_size  # Output size is the size of the code vocabulary
pad_idx = preprocessor.text_vocab['<pad>']  # Use the pad index from the text vocabulary

# Initialize the model
model = Seq2SeqModel(text_vocab_size, embedding_size, hidden_size, output_size, pad_idx)

# Example training loop (you will need to customize this based on your task)
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Move the model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Move the preprocessed data to GPU if available
text_padded = text_padded.to(device)
code_padded = code_padded.to(device)

# Training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()

    output = model(text_padded)

    # Calculate loss
    loss = criterion(output.view(-1, output.size(2)), code_padded.view(-1))

    # Backward pass and optimization
    loss.backward()
    optimizer.step()

    print(f'Epoch: {epoch + 1}, Loss: {loss.item()}')

ValueError: Expected input batch_size (46752) to match target batch_size (251292).