<a href="https://colab.research.google.com/github/Rayyan-Portfolio/Gen_Ai/blob/main/transformer_text_to_code_Q1_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

def load_clean_data(file_path):
    """Loads and cleans the dataset by selecting relevant columns,
    removing missing values, and stripping whitespace."""
    df = pd.read_csv(file_path, sep="\t", usecols=['text', 'code']).dropna()
    df['text'] = df['text'].str.strip().str.lower()
    df['code'] = df['code'].str.strip()
    return df

# Example usage
file_path = "/kaggle/input/spoc-train/spoc-train-train.tsv"
train_df = load_clean_data(file_path)

# Print sample data
train_df.head()


Unnamed: 0,text,code
1,create string s,string s;
2,"create integers x1, y1, x2, y2","int x1, y1, x2, y2;"
3,read s,cin >> s;
4,set x1 to s[0] - 96,x1 = s[0] - 96;
5,set y1 to s[1] - '0',y1 = s[1] - '0';


In [None]:
import torch
import torch.nn as nn

class TransformerEncoder(nn.Module):
    def __init__(self, input_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len):
        super(TransformerEncoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, emb_dim))
        self.encoder_layers = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=emb_dim, nhead=n_heads, dim_feedforward=ff_dim, dropout=dropout),
            num_layers=num_layers
        )

    def forward(self, src):
        src_emb = self.embedding(src) + self.positional_encoding[:, :src.size(1), :]
        return self.encoder_layers(src_emb.permute(1, 0, 2)).permute(1, 0, 2)

class TransformerDecoder(nn.Module):
    def __init__(self, output_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len):
        super(TransformerDecoder, self).__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, emb_dim))
        self.decoder_layers = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=emb_dim, nhead=n_heads, dim_feedforward=ff_dim, dropout=dropout),
            num_layers=num_layers
        )
        self.fc_out = nn.Linear(emb_dim, output_dim)

    def forward(self, tgt, memory):
        tgt_emb = self.embedding(tgt) + self.positional_encoding[:, :tgt.size(1), :]
        output = self.decoder_layers(tgt_emb.permute(1, 0, 2), memory.permute(1, 0, 2))
        return self.fc_out(output.permute(1, 0, 2))

class TransformerSeq2Seq(nn.Module):
    def __init__(self, input_dim, output_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len):
        super(TransformerSeq2Seq, self).__init__()
        self.encoder = TransformerEncoder(input_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len)
        self.decoder = TransformerDecoder(output_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len)

    def forward(self, src, tgt):
        memory = self.encoder(src)
        output = self.decoder(tgt, memory)
        return output


In [None]:
import nltk
from nltk.tokenize import word_tokenize
import torch

nltk.download('punkt')

# Tokenization function
def tokenize_text(text):
    return word_tokenize(text.lower())  # Lowercase and tokenize

# Example usage
sample_text = "if x > 0 then print x"
sample_code = "if (x > 0) { cout << x; }"

tokenized_text = tokenize_text(sample_text)
tokenized_code = tokenize_text(sample_code)

print("Tokenized Text:", tokenized_text)
print("Tokenized Code:", tokenized_code)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Tokenized Text: ['if', 'x', '>', '0', 'then', 'print', 'x']
Tokenized Code: ['if', '(', 'x', '>', '0', ')', '{', 'cout', '<', '<', 'x', ';', '}']


In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import time

nltk.download('punkt')

# Load and clean dataset
def load_clean_data(file_path):
    df = pd.read_csv(file_path, sep="\t", usecols=['text', 'code']).dropna()
    df['text'] = df['text'].str.strip().str.lower()
    df['code'] = df['code'].str.strip()
    return df

# Tokenization function
def tokenize_text(text):
    return word_tokenize(text.lower())

# Build vocabulary
def build_vocab(texts, min_freq=2):
    counter = Counter()
    for text in texts:
        counter.update(tokenize_text(text))
    vocab = {word: idx + 2 for idx, (word, freq) in enumerate(counter.items()) if freq >= min_freq}
    vocab['<pad>'] = 0
    vocab['<unk>'] = 1
    return vocab

# Custom dataset class
class CodeDataset(Dataset):
    def __init__(self, data, text_vocab, code_vocab):
        self.data = data
        self.text_vocab = text_vocab
        self.code_vocab = code_vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text_tokens = tokenize_text(self.data.iloc[idx]['text'])
        code_tokens = tokenize_text(self.data.iloc[idx]['code'])
        text_indices = [min(self.text_vocab.get(token, 1), len(self.text_vocab) - 1) for token in text_tokens]
        code_indices = [min(self.code_vocab.get(token, 1), len(self.code_vocab) - 1) for token in code_tokens]

        return torch.tensor(text_indices), torch.tensor(code_indices)

# Transformer Components
class TransformerEncoder(nn.Module):
    def __init__(self, input_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, emb_dim))
        self.encoder_layers = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=emb_dim, nhead=n_heads, dim_feedforward=ff_dim, dropout=dropout),
            num_layers=num_layers
        )

    def forward(self, src):
        src_emb = self.embedding(src) + self.positional_encoding[:, :src.size(1), :]
        return self.encoder_layers(src_emb.permute(1, 0, 2)).permute(1, 0, 2)

class TransformerDecoder(nn.Module):
    def __init__(self, output_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, emb_dim))
        self.decoder_layers = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=emb_dim, nhead=n_heads, dim_feedforward=ff_dim, dropout=dropout),
            num_layers=num_layers
        )
        self.fc_out = nn.Linear(emb_dim, output_dim)

    def forward(self, tgt, memory):
        tgt_emb = self.embedding(tgt) + self.positional_encoding[:, :tgt.size(1), :]
        output = self.decoder_layers(tgt_emb.permute(1, 0, 2), memory.permute(1, 0, 2))
        return self.fc_out(output.permute(1, 0, 2))

class TransformerSeq2Seq(nn.Module):
    def __init__(self, input_dim, output_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len):
        super().__init__()
        self.encoder = TransformerEncoder(input_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len)
        self.decoder = TransformerDecoder(output_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len)

    def forward(self, src, tgt):
        memory = self.encoder(src)
        output = self.decoder(tgt, memory)
        return output


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load data
file_path = "/kaggle/input/spoc-train/spoc-train-train.tsv"
train_df = load_clean_data(file_path)
train_df
# Load data
file_path = "/kaggle/input/spoc-train/spoc-train-test.tsv"
test_df = load_clean_data(file_path)
# Load data
file_path = "/kaggle/input/spoc-train/spoc-train-eval.tsv"
eval_df = load_clean_data(file_path)

In [None]:
# Build vocab
text_vocab = build_vocab(train_df['text'])
code_vocab = build_vocab(train_df['code'])

# Prepare dataset and dataloader
dataset = CodeDataset(train_df, text_vocab, code_vocab)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=lambda batch: tuple(zip(*batch)))

# Model Parameters
input_dim = len(text_vocab)
output_dim = len(code_vocab)
emb_dim, n_heads, num_layers, ff_dim, dropout, max_len = 256, 8, 6, 512, 0.1, 100
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TransformerSeq2Seq(input_dim, output_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()



RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Training function
def train_model(model, dataloader, optimizer, criterion, epochs, device):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for src, tgt in dataloader:
            src = [torch.tensor(s, dtype=torch.long) for s in src]
            tgt = [torch.tensor(t, dtype=torch.long) for t in tgt]

            # Ensure tensors are properly padded
            src = torch.nn.utils.rnn.pad_sequence(src, batch_first=True, padding_value=0).to(device)
            tgt = torch.nn.utils.rnn.pad_sequence(tgt, batch_first=True, padding_value=0).to(device)

            # Debugging: Check max index before passing to embedding layer
            if torch.max(src) >= input_dim or torch.max(tgt) >= output_dim:
                print(f"Error: Index out of bounds! Max src index: {torch.max(src)}, Max tgt index: {torch.max(tgt)}")
                continue  # Skip this batch to prevent crashing

            optimizer.zero_grad()
            output = model(src, tgt[:, :-1])  # Shift target for teacher forcing
            loss = criterion(output.view(-1, output.shape[-1]), tgt[:, 1:].contiguous().view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {total_loss/len(train_loader):.4f} | Val Loss: {val_loss/len(val_loader):.4f}")


In [None]:
# Train the model
train_model(model, dataloader, optimizer, criterion, epochs=10, device=device)


In [None]:
torch.save(model.state_dict(), "transformer_seq2seq.pth")
print("Model saved successfully.")

# Load model
model.load_state_dict(torch.load("transformer_seq2seq.pth"))
model.eval()
print("Model loaded successfully.")

In [None]:

def generate_output(model, test_loader, device):
    model.eval()
    with torch.no_grad():
        for src, _ in test_loader:
            src = torch.nn.utils.rnn.pad_sequence(src, batch_first=True).to(device)
            tgt = torch.full((src.shape[0], 1), code_vocab['<pad>'], dtype=torch.long, device=device)
            for _ in range(50):
                output = model(src, tgt)
                next_word = output[:, -1, :].argmax(dim=-1, keepdim=True)
                tgt = torch.cat((tgt, next_word), dim=1)
            print("Generated Code:", tgt.cpu().numpy())
            break

generate_output(model, test_loader, device)


In [None]:
# Function to generate predictions
def generate_code(model, input_text, text_vocab, code_vocab, max_len=50, device="cpu"):
    model.eval()

    # Tokenize and convert text to indices
    input_tokens = tokenize_text(input_text)
    input_indices = [text_vocab.get(token, 1) for token in input_tokens]  # 1 = <unk>
    input_tensor = torch.tensor(input_indices).unsqueeze(0).to(device)

    # Encode the input
    with torch.no_grad():
        memory = model.encoder(input_tensor)

    # Start decoding with <start> token
    tgt_indices = [code_vocab.get('<start>', 1)]

    for _ in range(max_len):
        tgt_tensor = torch.tensor(tgt_indices).unsqueeze(0).to(device)

        with torch.no_grad():
            output = model.decoder(tgt_tensor, memory)

        # Get the most probable next token
        next_token = output.argmax(-1)[:, -1].item()
        tgt_indices.append(next_token)

        # Stop if <end> token is generated
        if next_token == code_vocab.get('<end>', 1):
            break

    # Convert indices back to tokens
    inv_code_vocab = {idx: token for token, idx in code_vocab.items()}
    generated_code = " ".join(inv_code_vocab.get(idx, "<unk>") for idx in tgt_indices[1:])

    return generated_code

# Test the model with an example
test_text = "sort an array in ascending order"
predicted_code = generate_code(model, test_text, text_vocab, code_vocab, device=device)
print("\n📝 Input Text:", test_text)
print("💻 Generated Code:", predicted_code)



📝 Input Text: sort an array in ascending order
💻 Generated Code: <unk>


In [None]:
import torch
import heapq

def generate_code_beam_search(model, input_text, text_vocab, code_vocab, max_len=50, beam_size=3, device="cpu"):
    model.eval()

    # Tokenize and convert text to indices
    input_tokens = tokenize_text(input_text)
    input_indices = [text_vocab.get(token, 1) for token in input_tokens]  # 1 = <unk>
    input_tensor = torch.tensor(input_indices).unsqueeze(0).to(device)

    # Encode the input
    with torch.no_grad():
        memory = model.encoder(input_tensor)

    # Start decoding with <start> token
    start_token = code_vocab.get('<start>', 1)
    end_token = code_vocab.get('<end>', 1)

    # Beam search: Maintain multiple candidates
    beam = [(0, [start_token])]  # (score, sequence)

    for _ in range(max_len):
        new_beam = []
        for score, seq in beam:
            tgt_tensor = torch.tensor(seq).unsqueeze(0).to(device)

            with torch.no_grad():
                output = model.decoder(tgt_tensor, memory)

            # Get top `beam_size` next tokens
            probs = torch.nn.functional.log_softmax(output[:, -1, :], dim=-1)
            top_scores, top_tokens = torch.topk(probs, beam_size)

            for new_score, token in zip(top_scores[0], top_tokens[0]):
                new_seq = seq + [token.item()]
                new_beam.append((score + new_score.item(), new_seq))

        # Keep `beam_size` best sequences
        beam = heapq.nlargest(beam_size, new_beam, key=lambda x: x[0])

        # Stop if all candidates end with <end>
        if all(seq[-1] == end_token for _, seq in beam):
            break

    # Get the best sequence
    best_seq = max(beam, key=lambda x: x[0])[1]

    # Convert indices back to tokens
    inv_code_vocab = {idx: token for token, idx in code_vocab.items()}
    generated_code = " ".join(inv_code_vocab.get(idx, "<unk>") for idx in best_seq[1:])  # Skip <start>

    return generated_code

# Test with beam search
test_text = "sort an array in ascending order"
predicted_code = generate_code_beam_search(model, test_text, text_vocab, code_vocab, device=device)
print("\n📝 Input Text:", test_text)
print("💻 Generated Code:", predicted_code)



📝 Input Text: sort an array in ascending order
💻 Generated Code: <unk> ; <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


TESTING NEW CODE

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import time

nltk.download('punkt')

# Load and clean dataset
def load_clean_data(file_path):
    df = pd.read_csv(file_path, sep="\t", usecols=['text', 'code']).dropna()
    df['text'] = df['text'].str.strip().str.lower()
    df['code'] = df['code'].str.strip()
    return df

# Tokenization function
def tokenize_text(text):
    return word_tokenize(text.lower())

# Build vocabulary
def build_vocab(texts, min_freq=2):
    counter = Counter()
    for text in texts:
        counter.update(tokenize_text(text))
    vocab = {word: idx + 2 for idx, (word, freq) in enumerate(counter.items()) if freq >= min_freq}
    vocab['<pad>'] = 0
    vocab['<unk>'] = 1
    return vocab

# Custom dataset class
class CodeDataset(Dataset):
    def __init__(self, data, text_vocab, code_vocab):
        self.data = data
        self.text_vocab = text_vocab
        self.code_vocab = code_vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text_tokens = tokenize_text(self.data.iloc[idx]['text'])
        code_tokens = tokenize_text(self.data.iloc[idx]['code'])
        text_indices = [self.text_vocab.get(token, 1) for token in text_tokens]  # 1 = <unk>
        code_indices = [self.code_vocab.get(token, 1) for token in code_tokens]  # 1 = <unk>
        return torch.tensor(text_indices), torch.tensor(code_indices)

# Padding function for DataLoader
def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)  # Unzipping batch into src and tgt
    src_padded = nn.utils.rnn.pad_sequence(src_batch, batch_first=True, padding_value=0)
    tgt_padded = nn.utils.rnn.pad_sequence(tgt_batch, batch_first=True, padding_value=0)
    return src_padded, tgt_padded

# Transformer Components
class TransformerEncoder(nn.Module):
    def __init__(self, input_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=0)
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, emb_dim))
        self.encoder_layers = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=emb_dim, nhead=n_heads, dim_feedforward=ff_dim, dropout=dropout),
            num_layers=num_layers
        )

    def forward(self, src):
        src_emb = self.embedding(src) + self.positional_encoding[:, :src.size(1), :]
        return self.encoder_layers(src_emb.permute(1, 0, 2)).permute(1, 0, 2)

class TransformerDecoder(nn.Module):
    def __init__(self, output_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=0)
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, emb_dim))
        self.decoder_layers = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=emb_dim, nhead=n_heads, dim_feedforward=ff_dim, dropout=dropout),
            num_layers=num_layers
        )
        self.fc_out = nn.Linear(emb_dim, output_dim)

    def forward(self, tgt, memory):
        tgt_emb = self.embedding(tgt) + self.positional_encoding[:, :tgt.size(1), :]
        output = self.decoder_layers(tgt_emb.permute(1, 0, 2), memory.permute(1, 0, 2))
        return self.fc_out(output.permute(1, 0, 2))

class TransformerSeq2Seq(nn.Module):
    def __init__(self, input_dim, output_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len):
        super().__init__()
        self.encoder = TransformerEncoder(input_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len)
        self.decoder = TransformerDecoder(output_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len)

    def forward(self, src, tgt):
        memory = self.encoder(src)
        output = self.decoder(tgt, memory)
        return output

# Load data
file_path = "/kaggle/input/spoc-train/spoc-train-train.tsv"
train_df = load_clean_data(file_path)
file_path = "/kaggle/input/spoc-train/spoc-train-test.tsv"
test_df = load_clean_data(file_path)
file_path = "/kaggle/input/spoc-train/spoc-train-eval.tsv"
eval_df = load_clean_data(file_path)

# Build vocab
text_vocab = build_vocab(train_df['text'])
code_vocab = build_vocab(train_df['code'])

# Prepare dataset and dataloader
dataset = CodeDataset(train_df, text_vocab, code_vocab)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# Model Parameters
input_dim = len(text_vocab)
output_dim = len(code_vocab)
emb_dim, n_heads, num_layers, ff_dim, dropout, max_len = 256, 8, 6, 512, 0.1, 100
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TransformerSeq2Seq(input_dim, output_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding index

# Training Function
def train_model(model, train_loader, optimizer, criterion, epochs, device):
    model.train()
    for epoch in range(epochs):
        start_time = time.time()
        total_loss = 0
        for batch in train_loader:
            src, tgt = batch  # Now correctly structured as tensors
            src, tgt = src.to(device), tgt.to(device)  # Move tensors to device

            optimizer.zero_grad()
            output = model(src, tgt[:, :-1])

            loss = criterion(output.reshape(-1, output.shape[-1]), tgt[:, 1:].reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        train_perplexity = torch.exp(torch.tensor(avg_train_loss))
        epoch_time = time.time() - start_time

        print(f'\nEpoch {epoch+1}/{epochs} - {epoch_time:.2f}s')
        print(f'Train Loss: {avg_train_loss:.4f} - Train Perplexity: {train_perplexity:.2f}')
        print(f'Learning Rate: {optimizer.param_groups[0]["lr"]:.6f}\n')

#train_model(model, dataloader, optimizer, criterion, epochs=10, device=device)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!




RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


RANDOM TESTING


In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

nltk.download('punkt')

# Load and clean dataset
def load_clean_data(file_path):
    df = pd.read_csv(file_path, sep="\t", usecols=['text', 'code']).dropna()
    df['text'] = df['text'].str.strip().str.lower()
    df['code'] = df['code'].str.strip()
    return df

# Tokenization function
def tokenize_text(text):
    return word_tokenize(text.lower())

# Build vocabulary
def build_vocab(texts, min_freq=2):
    counter = Counter()
    for text in texts:
        counter.update(tokenize_text(text))
    vocab = {word: idx + 2 for idx, (word, freq) in enumerate(counter.items()) if freq >= min_freq}
    vocab['<pad>'] = 0
    vocab['<unk>'] = 1
    return vocab

# Custom dataset class
class CodeDataset(Dataset):
    def __init__(self, data, text_vocab, code_vocab):
        self.data = data
        self.text_vocab = text_vocab
        self.code_vocab = code_vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text_tokens = tokenize_text(self.data.iloc[idx]['text'])
        code_tokens = tokenize_text(self.data.iloc[idx]['code'])
        text_indices = [self.text_vocab.get(token, 1) for token in text_tokens]  # 1 = <unk>
        code_indices = [self.code_vocab.get(token, 1) for token in code_tokens]
        return torch.tensor(text_indices), torch.tensor(code_indices)

# Transformer Components
class TransformerEncoder(nn.Module):
    def __init__(self, input_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, emb_dim))
        self.encoder_layers = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=emb_dim, nhead=n_heads, dim_feedforward=ff_dim, dropout=dropout),
            num_layers=num_layers
        )

    def forward(self, src):
        src_emb = self.embedding(src) + self.positional_encoding[:, :src.size(1), :]
        return self.encoder_layers(src_emb.permute(1, 0, 2)).permute(1, 0, 2)

class TransformerDecoder(nn.Module):
    def __init__(self, output_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, emb_dim))
        self.decoder_layers = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=emb_dim, nhead=n_heads, dim_feedforward=ff_dim, dropout=dropout),
            num_layers=num_layers
        )
        self.fc_out = nn.Linear(emb_dim, output_dim)

    def forward(self, tgt, memory):
        tgt_emb = self.embedding(tgt) + self.positional_encoding[:, :tgt.size(1), :]
        output = self.decoder_layers(tgt_emb.permute(1, 0, 2), memory.permute(1, 0, 2))
        return self.fc_out(output.permute(1, 0, 2))

class TransformerSeq2Seq(nn.Module):
    def __init__(self, input_dim, output_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len):
        super().__init__()
        self.encoder = TransformerEncoder(input_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len)
        self.decoder = TransformerDecoder(output_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len)

    def forward(self, src, tgt):
        memory = self.encoder(src)
        output = self.decoder(tgt, memory)
        return output

# Load data
train_df = load_clean_data("/kaggle/input/spoc-train/spoc-train-train.tsv")
test_df = load_clean_data("/kaggle/input/spoc-train/spoc-train-test.tsv")
eval_df = load_clean_data("/kaggle/input/spoc-train/spoc-train-eval.tsv")

# Build vocab
text_vocab = build_vocab(train_df['text'])
code_vocab = build_vocab(train_df['code'])

# Prepare dataset and dataloader
train_loader = DataLoader(CodeDataset(train_df, text_vocab, code_vocab), batch_size=32, shuffle=True)
test_loader = DataLoader(CodeDataset(test_df, text_vocab, code_vocab), batch_size=32, shuffle=False)

# Model Parameters
input_dim = len(text_vocab)
output_dim = len(code_vocab)
emb_dim, n_heads, num_layers, ff_dim, dropout, max_len = 256, 8, 6, 512, 0.1, 100
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TransformerSeq2Seq(input_dim, output_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()

def train_model(model, train_loader, optimizer, criterion, epochs, device):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for src, tgt in train_loader:
            src = torch.nn.utils.rnn.pad_sequence(src, batch_first=True).to(device)
            tgt = torch.nn.utils.rnn.pad_sequence(tgt, batch_first=True).to(device)
            optimizer.zero_grad()
            output = model(src, tgt[:, :-1])
            loss = criterion(output.view(-1, output.shape[-1]), tgt[:, 1:].contiguous().view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {total_loss/len(train_loader):.4f}")


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!




In [None]:
train_model(model, train_loader, optimizer, criterion, epochs=10, device=device)

RuntimeError: stack expects each tensor to be equal size, but got [5] at entry 0 and [4] at entry 1

In [None]:
torch.save(model.state_dict(), "transformer_seq2seq.pth")
print("Model saved successfully.")

# Load model
model.load_state_dict(torch.load("transformer_seq2seq.pth"))
model.eval()
print("Model loaded successfully.")

In [None]:


def generate_output(model, test_loader, device):
    model.eval()
    with torch.no_grad():
        for src, _ in test_loader:
            src = torch.nn.utils.rnn.pad_sequence(src, batch_first=True).to(device)
            tgt = torch.full((src.shape[0], 1), code_vocab['<pad>'], dtype=torch.long, device=device)
            for _ in range(50):
                output = model(src, tgt)
                next_word = output[:, -1, :].argmax(dim=-1, keepdim=True)
                tgt = torch.cat((tgt, next_word), dim=1)
            print("Generated Code:", tgt.cpu().numpy())
            break

generate_output(model, test_loader, device)


In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


nltk.download('punkt')

# Load and clean dataset
def load_clean_data(file_path):
    df = pd.read_csv(file_path, sep="\t", usecols=['text', 'code']).dropna()
    df['text'] = df['text'].str.strip().str.lower()
    df['code'] = df['code'].str.strip()
    return df

# Tokenization function
def tokenize_text(text):
    return word_tokenize(text.lower())

# Build vocabulary
def build_vocab(texts, min_freq=2):
    counter = Counter()
    for text in texts:
        counter.update(tokenize_text(text))
    vocab = {word: idx + 2 for idx, (word, freq) in enumerate(counter.items()) if freq >= min_freq}
    vocab['<pad>'] = 0
    vocab['<unk>'] = 1
    return vocab

# Custom dataset class
class CodeDataset(Dataset):
    def __init__(self, data, text_vocab, code_vocab):
        self.data = data
        self.text_vocab = text_vocab
        self.code_vocab = code_vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text_tokens = tokenize_text(self.data.iloc[idx]['text'])
        code_tokens = tokenize_text(self.data.iloc[idx]['code'])
        text_indices = [self.text_vocab.get(token, 1) for token in text_tokens]  # 1 = <unk>
        code_indices = [self.code_vocab.get(token, 1) for token in code_tokens]
        return torch.tensor(text_indices), torch.tensor(code_indices)

# Custom collate function for padding
def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_padded = torch.nn.utils.rnn.pad_sequence(src_batch, batch_first=True, padding_value=0)
    tgt_padded = torch.nn.utils.rnn.pad_sequence(tgt_batch, batch_first=True, padding_value=0)
    return src_padded, tgt_padded

# Transformer Components
class TransformerEncoder(nn.Module):
    def __init__(self, input_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, emb_dim))
        self.encoder_layers = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=emb_dim, nhead=n_heads, dim_feedforward=ff_dim, dropout=dropout, batch_first=True),
            num_layers=num_layers
        )

    def forward(self, src):
        src_emb = self.embedding(src) + self.positional_encoding[:, :src.size(1), :]
        return self.encoder_layers(src_emb)

class TransformerDecoder(nn.Module):
    def __init__(self, output_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, emb_dim))
        self.decoder_layers = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=emb_dim, nhead=n_heads, dim_feedforward=ff_dim, dropout=dropout, batch_first=True),
            num_layers=num_layers
        )
        self.fc_out = nn.Linear(emb_dim, output_dim)

    def forward(self, tgt, memory):
        tgt_emb = self.embedding(tgt) + self.positional_encoding[:, :tgt.size(1), :]
        output = self.decoder_layers(tgt_emb, memory)
        return self.fc_out(output)

class TransformerSeq2Seq(nn.Module):
    def __init__(self, input_dim, output_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len):
        super().__init__()
        self.encoder = TransformerEncoder(input_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len)
        self.decoder = TransformerDecoder(output_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len)

    def forward(self, src, tgt):
        memory = self.encoder(src)
        output = self.decoder(tgt, memory)
        return output

# Load data
train_df = load_clean_data("/kaggle/input/spoc-train/spoc-train-train.tsv")
test_df = load_clean_data("/kaggle/input/spoc-train/spoc-train-test.tsv")
eval_df = load_clean_data("/kaggle/input/spoc-train/spoc-train-eval.tsv")

# Build vocab
text_vocab = build_vocab(train_df['text'])
code_vocab = build_vocab(train_df['code'])

# Prepare dataset and dataloader
train_loader = DataLoader(CodeDataset(train_df, text_vocab, code_vocab), batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(CodeDataset(test_df, text_vocab, code_vocab), batch_size=32, shuffle=False, collate_fn=collate_fn)

# Model Parameters
input_dim = len(text_vocab)
output_dim = len(code_vocab)
emb_dim, n_heads, num_layers, ff_dim, dropout, max_len = 256, 8, 6, 512, 0.1, 100
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TransformerSeq2Seq(input_dim, output_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()

def train_model(model, train_loader, optimizer, criterion, epochs, device):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for src, tgt in train_loader:
            src, tgt = src.to(device), tgt.to(device)
            optimizer.zero_grad()
            output = model(src, tgt[:, :-1].contiguous())
            loss = criterion(output.view(-1, output.shape[-1]), tgt[:, 1:].contiguous().view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {total_loss/len(train_loader):.4f}")

train_model(model, train_loader, optimizer, criterion, epochs=10, device=device)

torch.save(model.state_dict(), "transformer_seq2seq.pth")
print("Model saved successfully.")

# Load model
model.load_state_dict(torch.load("transformer_seq2seq.pth"))
model.eval()
print("Model loaded successfully.")

def generate_output(model, test_loader, device):
    model.eval()
    with torch.no_grad():
        for src, _ in test_loader:
            src = src.to(device)
            tgt = torch.full((src.shape[0], 1), code_vocab['<pad>'], dtype=torch.long, device=device)
            for _ in range(50):
                output = model(src, tgt)
                next_word = output[:, -1, :].argmax(dim=-1, keepdim=True)
                tgt = torch.cat((tgt, next_word), dim=1)
            print("Generated Code:", tgt.cpu().numpy())
            break

generate_output(model, test_loader, device)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


# TRYING DIfferent Approach!

In [None]:
import pandas as pd

# Load the dataset (replace 'dataset.csv' with your actual file path)
data = load_clean_data('/kaggle/input/spoc-data/spoc-train.tsv')

# Keep only 'text' and 'code' columns
data = data[['text', 'code']]

# Check for missing values and drop them if any
data = data.dropna()
data

Unnamed: 0,text,code
0,"in the function gcd(a,b=integers)","int gcd(int a, int b) {"
1,"if b=1 return a, else call function gcd(b, a%b)","return !b ? a : gcd(b, a % b);"
4,"n , nn, ans = integers with ans =0","int n, nn, ans = 0;"
5,read n,cin >> n;
6,for i=2 to n-1 execute,for (int i = 2; i <= n - 1; ++i) {
...,...,...
293846,set sum to sum1 + sum2,sum = sum1 + sum2;
293847,if sum % 2 is 0,if (sum % 2 == 0) {
293848,"print ""chat with her!"" print newline","cout << ""CHAT WITH HER!"" << endl;"
293849,else,} else {


In [None]:
from sklearn.model_selection import train_test_split

# Split into train (80%) and temp (20%)
train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)

# Split temp into validation (10%) and test (10%)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print(f"Test samples: {len(test_data)}")

Training samples: 172980
Validation samples: 21622
Test samples: 21623


In [None]:
# Simple tokenizer (splits on spaces)
def tokenize(text):
    return text.split()

# Build vocabularies from training data
def build_vocab(data, tokenizer):
    tokens = set()
    for item in data:
        tokens.update(tokenizer(item))
    # Reserve 0 for padding, 1 for <sos>, 2 for <eos>
    vocab = {token: idx for idx, token in enumerate(tokens, start=3)}
    vocab['<pad>'] = 0
    vocab['<sos>'] = 1
    vocab['<eos>'] = 2
    return vocab

# Create vocabularies
pseudocode_vocab = build_vocab(train_data['text'], tokenize)
cpp_vocab = build_vocab(train_data['code'], tokenize)

# Inverse vocabularies for decoding (optional)
inv_cpp_vocab = {idx: token for token, idx in cpp_vocab.items()}

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class CodeDataset(Dataset):
    def __init__(self, data, pseudocode_vocab, cpp_vocab, tokenizer):
        self.data = data
        self.pseudocode_vocab = pseudocode_vocab
        self.cpp_vocab = cpp_vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        pseudocode = self.data.iloc[idx]['text']
        cpp = self.data.iloc[idx]['code']
        # Add special tokens
        pseudo_tokens = ['<sos>'] + self.tokenizer(pseudocode) + ['<eos>']
        cpp_tokens = ['<sos>'] + self.tokenizer(cpp) + ['<eos>']
        # Convert to indices
        pseudo_indices = [self.pseudocode_vocab.get(token, 0) for token in pseudo_tokens]
        cpp_indices = [self.cpp_vocab.get(token, 0) for token in cpp_tokens]
        return torch.tensor(pseudo_indices), torch.tensor(cpp_indices)

# Padding function for batches
def collate_fn(batch):
    pseudocode, cpp = zip(*batch)
    pseudocode = torch.nn.utils.rnn.pad_sequence(pseudocode, padding_value=0, batch_first=True)
    cpp = torch.nn.utils.rnn.pad_sequence(cpp, padding_value=0, batch_first=True)
    return pseudocode, cpp

# Create DataLoaders
train_dataset = CodeDataset(train_data, pseudocode_vocab, cpp_vocab, tokenize)
val_dataset = CodeDataset(val_data, pseudocode_vocab, cpp_vocab, tokenize)
test_dataset = CodeDataset(test_data, pseudocode_vocab, cpp_vocab, tokenize)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [None]:
import torch.nn as nn
import math

# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # Shape: [1, max_len, d_model]
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

# Multi-Head Attention
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        d_k = Q.size(-1)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn = torch.softmax(scores, dim=-1)
        return torch.matmul(attn, V), attn

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0)
        Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        output, attn = self.scaled_dot_product_attention(Q, K, V, mask)
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.W_o(output)

# Feed-Forward Network
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.linear2(self.dropout(self.relu(self.linear1(x))))

# Encoder Layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        x = self.norm1(x + self.dropout(self.mha(x, x, x, mask)))
        x = self.norm2(x + self.dropout(self.ff(x)))
        return x

# Decoder Layer
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        x = self.norm1(x + self.dropout(self.mha1(x, x, x, tgt_mask)))
        x = self.norm2(x + self.dropout(self.mha2(x, enc_output, enc_output, src_mask)))
        x = self.norm3(x + self.dropout(self.ff(x)))
        return x

# Transformer Model
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, num_heads=8, num_layers=6, d_ff=2048, dropout=0.1, max_len=512):
        super().__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.d_model = d_model

    def create_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)  # Padding mask
        tgt_seq_len = tgt.size(1)
        nopeak_mask = torch.tril(torch.ones(tgt_seq_len, tgt_seq_len)).bool().to(tgt.device)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(2) & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.create_mask(src, tgt)
        src_embedded = self.dropout(self.pos_encoding(self.src_embedding(src) * math.sqrt(self.d_model)))
        tgt_embedded = self.dropout(self.pos_encoding(self.tgt_embedding(tgt) * math.sqrt(self.d_model)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        return self.fc_out(dec_output)

# Initialize the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Transformer(
    src_vocab_size=len(pseudocode_vocab),
    tgt_vocab_size=len(cpp_vocab),
    d_model=512,
    num_heads=8,
    num_layers=6,
    d_ff=2048,
    dropout=0.1
)

In [None]:
import time
import torch.optim as optim

# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    start_time = time.time()
    train_loss = 0

    for pseudocode, cpp in train_loader:
        pseudocode, cpp = pseudocode, cpp
        optimizer.zero_grad()

        # Decoder input: <sos> + target[:-1]
        # Target output: target[1:] + <eos>
        output = model(pseudocode, cpp[:, :-1])
        loss = criterion(output.view(-1, len(cpp_vocab)), cpp[:, 1:].contiguous().view(-1))

        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    train_loss /= len(train_loader)
    end_time = time.time()

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for pseudocode, cpp in val_loader:
            pseudocode, cpp = pseudocode.to(device), cpp.to(device)
            output = model(pseudocode, cpp[:, :-1])
            loss = criterion(output.view(-1, len(cpp_vocab)), cpp[:, 1:].contiguous().view(-1))
            val_loss += loss.item()
    val_loss /= len(val_loader)

    # Display results
    epoch_time = end_time - start_time
    lr = optimizer.param_groups[0]['lr']
    print(f"Epoch {epoch+1}/{num_epochs}, Time: {epoch_time:.2f}s, Val Loss: {val_loss:.4f}, LR: {lr:.6f}")


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:

# Optional: Evaluate on test set
model.eval()
test_loss = 0
with torch.no_grad():
    for pseudocode, cpp in test_loader:
        pseudocode, cpp = pseudocode, cpp
        output = model(pseudocode, cpp[:, :-1])
        loss = criterion(output.view(-1, len(cpp_vocab)), cpp[:, 1:].contiguous().view(-1))
        test_loss += loss.item()
test_loss /= len(test_loader)
print(f"Test Loss: {test_loss:.4f}")

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import time
import math

nltk.download('punkt')

# Load and clean dataset
def load_clean_data(file_path):
    df = pd.read_csv(file_path, sep="\t", usecols=['text', 'code']).dropna()
    df['text'] = df['text'].str.strip().str.lower()
    df['code'] = df['code'].str.strip()
    return df

# Tokenization function
def tokenize_text(text):
    return word_tokenize(text.lower())

# Build vocabulary
def build_vocab(texts, min_freq=2):
    counter = Counter()
    for text in texts:
        counter.update(tokenize_text(text))
    vocab = {word: idx + 4 for idx, (word, freq) in enumerate(counter.items()) if freq >= min_freq}
    vocab['<pad>'] = 0
    vocab['<unk>'] = 1
    vocab['<sos>'] = 2
    vocab['<eos>'] = 3
    return vocab

# Custom dataset class
class CodeDataset(Dataset):
    def __init__(self, data, text_vocab, code_vocab):
        self.data = data
        self.text_vocab = text_vocab
        self.code_vocab = code_vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text_tokens = tokenize_text(self.data.iloc[idx]['text'])
        code_tokens = tokenize_text(self.data.iloc[idx]['code'])
        text_indices = [self.text_vocab.get(token, 1) for token in text_tokens]  # 1 = <unk>
        code_indices = [self.code_vocab.get(token, 1) for token in code_tokens]
        code_indices = [2] + code_indices + [3]  # Add <sos> and <eos>
        return torch.tensor(text_indices), torch.tensor(code_indices)

# Custom collate function for padding
def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_padded = torch.nn.utils.rnn.pad_sequence(src_batch, batch_first=True, padding_value=0)
    tgt_padded = torch.nn.utils.rnn.pad_sequence(tgt_batch, batch_first=True, padding_value=0)
    return src_padded, tgt_padded

# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

# Transformer Components
class TransformerEncoder(nn.Module):
    def __init__(self, input_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.positional_encoding = PositionalEncoding(emb_dim, max_len)
        self.encoder_layers = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=emb_dim, nhead=n_heads, dim_feedforward=ff_dim, dropout=dropout, batch_first=True),
            num_layers=num_layers
        )

    def forward(self, src, src_mask=None):
        src_emb = self.embedding(src)
        src_emb = self.positional_encoding(src_emb)
        return self.encoder_layers(src_emb, src_key_padding_mask=src_mask)

class TransformerDecoder(nn.Module):
    def __init__(self, output_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.positional_encoding = PositionalEncoding(emb_dim, max_len)
        self.decoder_layers = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=emb_dim, nhead=n_heads, dim_feedforward=ff_dim, dropout=dropout, batch_first=True),
            num_layers=num_layers
        )
        self.fc_out = nn.Linear(emb_dim, output_dim)

    def forward(self, tgt, memory, tgt_mask=None, tgt_key_padding_mask=None):
        tgt_emb = self.embedding(tgt)
        tgt_emb = self.positional_encoding(tgt_emb)
        output = self.decoder_layers(tgt_emb, memory, tgt_mask=tgt_mask, tgt_key_padding_mask=tgt_key_padding_mask)
        return self.fc_out(output)

class TransformerSeq2Seq(nn.Module):
    def __init__(self, input_dim, output_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len):
        super().__init__()
        self.encoder = TransformerEncoder(input_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len)
        self.decoder = TransformerDecoder(output_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len)

    def forward(self, src, tgt):
        src_mask = (src == 0).transpose(0, 1)
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)
        memory = self.encoder(src, src_mask)
        output = self.decoder(tgt, memory, tgt_mask=tgt_mask, tgt_key_padding_mask=(tgt == 0))
        return output

# Model Parameters
input_dim = len(text_vocab)
output_dim = len(code_vocab)
emb_dim, n_heads, num_layers, ff_dim, dropout, max_len = 256, 8, 6, 512, 0.1, 100
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TransformerSeq2Seq(input_dim, output_dim, emb_dim, n_heads, num_layers, ff_dim, dropout, max_len).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss(ignore_index=0)

torch.save(model.state_dict(), "transformer_seq2seq.pth")
print("Model saved successfully.")


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


# ***TESTING WITH HELP***

In [None]:
import pandas as pd

# Load the dataset
file_path = '/kaggle/input/spoc-data/spoc-train.tsv'
data = pd.read_csv(file_path, sep='\t')

# Display the first few rows of the dataset to inspect
data.head()


Unnamed: 0,text,code,workerid,probid,subid,line,indent
0,"in the function gcd(a,b=integers)","int gcd(int a, int b) {",38,13A,41120785,0,0
1,"if b=1 return a, else call function gcd(b, a%b)","return !b ? a : gcd(b, a % b);",38,13A,41120785,1,1
2,,},38,13A,41120785,2,0
3,,int main() {,38,13A,41120785,3,0
4,"n , nn, ans = integers with ans =0","int n, nn, ans = 0;",38,13A,41120785,4,1


In [None]:
# Clean the dataset by dropping rows where 'text' or 'code' is missing
cleaned_data = data.dropna(subset=['text', 'code'])

# Split the data: 80% training, 10% testing, 10% evaluation
train_data = cleaned_data.sample(frac=0.8, random_state=42)
remaining_data = cleaned_data.drop(train_data.index)
test_data = remaining_data.sample(frac=0.5, random_state=42)
eval_data = remaining_data.drop(test_data.index)

# Display the number of rows in each split
train_data.shape, test_data.shape, eval_data.shape


((172980, 7), (21622, 7), (21623, 7))

In [None]:
import re
import numpy as np

# Basic tokenizer function to split text into words (handles punctuation as well)
def basic_tokenizer(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.split()

# Tokenize the text and code columns using the basic tokenizer
train_text_tokens = [basic_tokenizer(text) for text in train_data['text']]
train_code_tokens = [basic_tokenizer(code) for code in train_data['code']]

# Create vocabulary mappings (word to index and index to word)
text_vocab = {word: idx + 1 for idx, word in enumerate(set([word for sentence in train_text_tokens for word in sentence]))}
code_vocab = {word: idx + 1 for idx, word in enumerate(set([word for sentence in train_code_tokens for word in sentence]))}

# Function to convert tokens to indices
def tokens_to_indices(tokens, vocab):
    return [vocab.get(token, 0) for token in tokens]  # use 0 for unknown tokens

# Convert the tokens to indices for the training data
train_text_sequences = [tokens_to_indices(tokens, text_vocab) for tokens in train_text_tokens]
train_code_sequences = [tokens_to_indices(tokens, code_vocab) for tokens in train_code_tokens]

# Pad sequences to ensure uniform length
max_len = 100
train_text_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in train_text_sequences])
train_code_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in train_code_sequences])

# Repeat the process for the test and evaluation sets
test_text_tokens = [basic_tokenizer(text) for text in test_data['text']]
test_code_tokens = [basic_tokenizer(code) for code in test_data['code']]

test_text_sequences = [tokens_to_indices(tokens, text_vocab) for tokens in test_text_tokens]
test_code_sequences = [tokens_to_indices(tokens, code_vocab) for tokens in test_code_tokens]

test_text_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in test_text_sequences])
test_code_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in test_code_sequences])

eval_text_tokens = [basic_tokenizer(text) for text in eval_data['text']]
eval_code_tokens = [basic_tokenizer(code) for code in eval_data['code']]

eval_text_sequences = [tokens_to_indices(tokens, text_vocab) for tokens in eval_text_tokens]
eval_code_sequences = [tokens_to_indices(tokens, code_vocab) for tokens in eval_code_tokens]

eval_text_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in eval_text_sequences])
eval_code_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in eval_code_sequences])

# Display the shapes of the sequences
train_text_sequences.shape, train_code_sequences.shape, test_text_sequences.shape, test_code_sequences.shape, eval_text_sequences.shape, eval_code_sequences.shape


((172980, 100),
 (172980, 100),
 (21622, 100),
 (21622, 100),
 (21623, 100),
 (21623, 100))

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np
import time

# Re-initialize the tokenizer and model parameters
vocab_size_text = len(text_vocab) + 1  # +1 for padding
vocab_size_code = len(code_vocab) + 1  # +1 for padding
embedding_dim = 128
hidden_units = 512
num_heads = 8
num_layers = 6
max_len = 100

# Transformer Encoder Layer
def transformer_encoder(inputs, num_heads, hidden_units):
    # Multi-head Self-attention
    attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=hidden_units)(inputs, inputs)
    attention = layers.Dropout(0.1)(attention)
    attention = layers.LayerNormalization()(inputs + attention)

    # Feed-forward layer
    ff = layers.Dense(hidden_units, activation='relu')(attention)
    ff = layers.Dense(hidden_units)(ff)
    ff = layers.Dropout(0.1)(ff)
    ff = layers.LayerNormalization()(attention + ff)

    return ff

# Transformer Decoder Layer
def transformer_decoder(inputs, enc_output, num_heads, hidden_units):
    # Multi-head Self-attention
    attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=hidden_units)(inputs, inputs)
    attention = layers.Dropout(0.1)(attention)
    attention = layers.LayerNormalization()(inputs + attention)

    # Cross-attention with encoder output
    cross_attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=hidden_units)(attention, enc_output)
    cross_attention = layers.Dropout(0.1)(cross_attention)
    cross_attention = layers.LayerNormalization()(attention + cross_attention)

    # Feed-forward layer
    ff = layers.Dense(hidden_units, activation='relu')(cross_attention)
    ff = layers.Dense(hidden_units)(ff)
    ff = layers.Dropout(0.1)(ff)
    ff = layers.LayerNormalization()(cross_attention + ff)

    return ff

# Build the Transformer Model
def build_transformer_model(vocab_size_text, vocab_size_code, embedding_dim, hidden_units, num_heads, num_layers, max_len):
    # Input layers
    input_text = layers.Input(shape=(max_len,), dtype=tf.int32, name='text_input')
    input_code = layers.Input(shape=(max_len,), dtype=tf.int32, name='code_input')

    # Embedding layers
    text_embedding = layers.Embedding(input_dim=vocab_size_text, output_dim=embedding_dim)(input_text)
    code_embedding = layers.Embedding(input_dim=vocab_size_code, output_dim=embedding_dim)(input_code)

    # Positional Encoding (for simplicity, we'll add sine/cosine functions as positional encodings)
    pos_encoding = layers.Embedding(input_dim=max_len, output_dim=embedding_dim)(tf.range(max_len))
    text_embedding += pos_encoding
    code_embedding += pos_encoding

    # Encoder stack
    enc_output = text_embedding
    for _ in range(num_layers):
        enc_output = transformer_encoder(enc_output, num_heads, hidden_units)

    # Decoder stack
    dec_output = code_embedding
    for _ in range(num_layers):
        dec_output = transformer_decoder(dec_output, enc_output, num_heads, hidden_units)

    # Final dense layer for output
    output = layers.Dense(vocab_size_code, activation='softmax')(dec_output)

    # Define the model
    model = models.Model(inputs=[input_text, input_code], outputs=output)

    return model

# Instantiate the model
transformer_model = build_transformer_model(vocab_size_text, vocab_size_code, embedding_dim, hidden_units, num_heads, num_layers, max_len)

# Compile the model
transformer_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model summary
transformer_model.summary()


RuntimeError: Bad StatusOr access: INTERNAL: failed initializing StreamExecutor for CUDA device ordinal 1: INTERNAL: failed call to cuDevicePrimaryCtxRetain: CUDA_ERROR_ASSERT: device-side assert triggered

In [None]:
import pandas as pd

# Load the dataset
file_path = '/kaggle/input/spoc-data/spoc-train.tsv'
data = pd.read_csv(file_path, sep='\t')

# Display the first few rows of the dataset to inspect
data.head()





# Clean the dataset by dropping rows where 'text' or 'code' is missing
cleaned_data = data.dropna(subset=['text', 'code'])

# Split the data: 80% training, 10% testing, 10% evaluation
train_data = cleaned_data.sample(frac=0.8, random_state=42)
remaining_data = cleaned_data.drop(train_data.index)
test_data = remaining_data.sample(frac=0.5, random_state=42)
eval_data = remaining_data.drop(test_data.index)

# Display the number of rows in each split
train_data.shape, test_data.shape, eval_data.shape




import re
import numpy as np

# Basic tokenizer function to split text into words (handles punctuation as well)
def basic_tokenizer(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.split()

# Tokenize the text and code columns using the basic tokenizer
train_text_tokens = [basic_tokenizer(text) for text in train_data['text']]
train_code_tokens = [basic_tokenizer(code) for code in train_data['code']]

# Create vocabulary mappings (word to index and index to word)
text_vocab = {word: idx + 1 for idx, word in enumerate(set([word for sentence in train_text_tokens for word in sentence]))}
code_vocab = {word: idx + 1 for idx, word in enumerate(set([word for sentence in train_code_tokens for word in sentence]))}

# Function to convert tokens to indices
def tokens_to_indices(tokens, vocab):
    return [vocab.get(token, 0) for token in tokens]  # use 0 for unknown tokens

# Convert the tokens to indices for the training data
train_text_sequences = [tokens_to_indices(tokens, text_vocab) for tokens in train_text_tokens]
train_code_sequences = [tokens_to_indices(tokens, code_vocab) for tokens in train_code_tokens]

# Pad sequences to ensure uniform length
max_len = 100
train_text_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in train_text_sequences])
train_code_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in train_code_sequences])

# Repeat the process for the test and evaluation sets
test_text_tokens = [basic_tokenizer(text) for text in test_data['text']]
test_code_tokens = [basic_tokenizer(code) for code in test_data['code']]

test_text_sequences = [tokens_to_indices(tokens, text_vocab) for tokens in test_text_tokens]
test_code_sequences = [tokens_to_indices(tokens, code_vocab) for tokens in test_code_tokens]

test_text_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in test_text_sequences])
test_code_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in test_code_sequences])

eval_text_tokens = [basic_tokenizer(text) for text in eval_data['text']]
eval_code_tokens = [basic_tokenizer(code) for code in eval_data['code']]

eval_text_sequences = [tokens_to_indices(tokens, text_vocab) for tokens in eval_text_tokens]
eval_code_sequences = [tokens_to_indices(tokens, code_vocab) for tokens in eval_code_tokens]

eval_text_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in eval_text_sequences])
eval_code_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in eval_code_sequences])

# Display the shapes of the sequences
train_text_sequences.shape, train_code_sequences.shape, test_text_sequences.shape, test_code_sequences.shape, eval_text_sequences.shape, eval_code_sequences.shape


import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np
import time

# Re-initialize the tokenizer and model parameters
vocab_size_text = len(text_vocab) + 1  # +1 for padding
vocab_size_code = len(code_vocab) + 1  # +1 for padding
embedding_dim = 128
hidden_units = 512
num_heads = 8
num_layers = 6
max_len = 100

# Transformer Encoder Layer
def transformer_encoder(inputs, num_heads, hidden_units):
    # Multi-head Self-attention
    attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=hidden_units)(inputs, inputs)
    attention = layers.Dropout(0.1)(attention)
    attention = layers.LayerNormalization()(inputs + attention)

    # Feed-forward layer
    ff = layers.Dense(hidden_units, activation='relu')(attention)
    ff = layers.Dense(hidden_units)(ff)
    ff = layers.Dropout(0.1)(ff)
    ff = layers.LayerNormalization()(attention + ff)

    return ff

# Transformer Decoder Layer
def transformer_decoder(inputs, enc_output, num_heads, hidden_units):
    # Multi-head Self-attention
    attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=hidden_units)(inputs, inputs)
    attention = layers.Dropout(0.1)(attention)
    attention = layers.LayerNormalization()(inputs + attention)

    # Cross-attention with encoder output
    cross_attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=hidden_units)(attention, enc_output)
    cross_attention = layers.Dropout(0.1)(cross_attention)
    cross_attention = layers.LayerNormalization()(attention + cross_attention)

    # Feed-forward layer
    ff = layers.Dense(hidden_units, activation='relu')(cross_attention)
    ff = layers.Dense(hidden_units)(ff)
    ff = layers.Dropout(0.1)(ff)
    ff = layers.LayerNormalization()(cross_attention + ff)

    return ff

# Build the Transformer Model
def build_transformer_model(vocab_size_text, vocab_size_code, embedding_dim, hidden_units, num_heads, num_layers, max_len):
    # Input layers
    input_text = layers.Input(shape=(max_len,), dtype=tf.int32, name='text_input')
    input_code = layers.Input(shape=(max_len,), dtype=tf.int32, name='code_input')

    # Embedding layers
    text_embedding = layers.Embedding(input_dim=vocab_size_text, output_dim=embedding_dim)(input_text)
    code_embedding = layers.Embedding(input_dim=vocab_size_code, output_dim=embedding_dim)(input_code)

    # Positional Encoding (for simplicity, we'll add sine/cosine functions as positional encodings)
    pos_encoding = layers.Embedding(input_dim=max_len, output_dim=embedding_dim)(tf.range(max_len))
    text_embedding += pos_encoding
    code_embedding += pos_encoding

    # Encoder stack
    enc_output = text_embedding
    for _ in range(num_layers):
        enc_output = transformer_encoder(enc_output, num_heads, hidden_units)

    # Decoder stack
    dec_output = code_embedding
    for _ in range(num_layers):
        dec_output = transformer_decoder(dec_output, enc_output, num_heads, hidden_units)

    # Final dense layer for output
    output = layers.Dense(vocab_size_code, activation='softmax')(dec_output)

    # Define the model
    model = models.Model(inputs=[input_text, input_code], outputs=output)

    return model

# Instantiate the model
transformer_model = build_transformer_model(vocab_size_text, vocab_size_code, embedding_dim, hidden_units, num_heads, num_layers, max_len)

# Compile the model
transformer_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model summary
transformer_model.summary()


((172980, 100),
 (172980, 100),
 (21622, 100),
 (21622, 100),
 (21623, 100),
 (21623, 100))

In [None]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras import layers, models
import time

# Load the dataset
file_path = '/kaggle/input/spoc-data/spoc-train.tsv'
data = pd.read_csv(file_path, sep='\t')

# Clean the dataset by dropping rows where 'text' or 'code' is missing
cleaned_data = data.dropna(subset=['text', 'code'])

# Split the data: 80% training, 10% testing, 10% evaluation
train_data = cleaned_data.sample(frac=0.8, random_state=42)
remaining_data = cleaned_data.drop(train_data.index)
test_data = remaining_data.sample(frac=0.5, random_state=42)
eval_data = remaining_data.drop(test_data.index)

# Basic tokenizer function to split text into words (handles punctuation as well)
def basic_tokenizer(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.split()

# Tokenize the text and code columns using the basic tokenizer
train_text_tokens = [basic_tokenizer(text) for text in train_data['text']]
train_code_tokens = [basic_tokenizer(code) for code in train_data['code']]

# Create vocabulary mappings (word to index and index to word)
text_vocab = {word: idx + 1 for idx, word in enumerate(set([word for sentence in train_text_tokens for word in sentence]))}
code_vocab = {word: idx + 1 for idx, word in enumerate(set([word for sentence in train_code_tokens for word in sentence]))}

# Function to convert tokens to indices
def tokens_to_indices(tokens, vocab):
    return [vocab.get(token, 0) for token in tokens]  # use 0 for unknown tokens

# Convert the tokens to indices for the training data
train_text_sequences = [tokens_to_indices(tokens, text_vocab) for tokens in train_text_tokens]
train_code_sequences = [tokens_to_indices(tokens, code_vocab) for tokens in train_code_tokens]

# Pad sequences to ensure uniform length
max_len = 100
train_text_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in train_text_sequences])
train_code_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in train_code_sequences])

# Repeat the process for the test and evaluation sets
test_text_tokens = [basic_tokenizer(text) for text in test_data['text']]
test_code_tokens = [basic_tokenizer(code) for code in test_data['code']]

test_text_sequences = [tokens_to_indices(tokens, text_vocab) for tokens in test_text_tokens]
test_code_sequences = [tokens_to_indices(tokens, code_vocab) for tokens in test_code_tokens]

test_text_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in test_text_sequences])
test_code_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in test_code_sequences])

eval_text_tokens = [basic_tokenizer(text) for text in eval_data['text']]
eval_code_tokens = [basic_tokenizer(code) for code in eval_data['code']]

eval_text_sequences = [tokens_to_indices(tokens, text_vocab) for tokens in eval_text_tokens]
eval_code_sequences = [tokens_to_indices(tokens, code_vocab) for tokens in eval_code_tokens]

eval_text_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in eval_text_sequences])
eval_code_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in eval_code_sequences])

# Define the Transformer model
def transformer_encoder(inputs, num_heads, hidden_units):
    # Multi-head Self-attention
    attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=hidden_units)(inputs, inputs)
    attention = layers.Dropout(0.1)(attention)
    attention = layers.LayerNormalization()(inputs + attention)

    # Feed-forward layer
    ff = layers.Dense(hidden_units, activation='relu')(attention)
    ff = layers.Dense(hidden_units)(ff)
    ff = layers.Dropout(0.1)(ff)
    ff = layers.LayerNormalization()(attention + ff)

    return ff

def transformer_decoder(inputs, enc_output, num_heads, hidden_units):
    # Multi-head Self-attention
    attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=hidden_units)(inputs, inputs)
    attention = layers.Dropout(0.1)(attention)
    attention = layers.LayerNormalization()(inputs + attention)

    # Cross-attention with encoder output
    cross_attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=hidden_units)(attention, enc_output)
    cross_attention = layers.Dropout(0.1)(cross_attention)
    cross_attention = layers.LayerNormalization()(attention + cross_attention)

    # Feed-forward layer
    ff = layers.Dense(hidden_units, activation='relu')(cross_attention)
    ff = layers.Dense(hidden_units)(ff)
    ff = layers.Dropout(0.1)(ff)
    ff = layers.LayerNormalization()(cross_attention + ff)

    return ff

def build_transformer_model(vocab_size_text, vocab_size_code, embedding_dim, hidden_units, num_heads, num_layers, max_len):
    input_text = layers.Input(shape=(max_len,), dtype=tf.int32, name='text_input')
    input_code = layers.Input(shape=(max_len,), dtype=tf.int32, name='code_input')

    text_embedding = layers.Embedding(input_dim=vocab_size_text, output_dim=embedding_dim)(input_text)
    code_embedding = layers.Embedding(input_dim=vocab_size_code, output_dim=embedding_dim)(input_code)

    pos_encoding = layers.Embedding(input_dim=max_len, output_dim=embedding_dim)(tf.range(max_len))
    text_embedding += pos_encoding
    code_embedding += pos_encoding

    enc_output = text_embedding
    for _ in range(num_layers):
        enc_output = transformer_encoder(enc_output, num_heads, hidden_units)

    dec_output = code_embedding
    for _ in range(num_layers):
        dec_output = transformer_decoder(dec_output, enc_output, num_heads, hidden_units)

    output = layers.Dense(vocab_size_code, activation='softmax')(dec_output)

    model = models.Model(inputs=[input_text, input_code], outputs=output)

    return model

# Compile the model
transformer_model = build_transformer_model(vocab_size_text, vocab_size_code, embedding_dim=128, hidden_units=512, num_heads=8, num_layers=6, max_len=100)
transformer_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model summary
transformer_model.summary()

# Training function
def train_transformer_model(model, train_text_sequences, train_code_sequences, eval_text_sequences, eval_code_sequences, batch_size=32, epochs=10, learning_rate=0.001):
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    epoch_times = []
    validation_losses = []

    for epoch in range(epochs):
        start_time = time.time()
        print(f"Epoch {epoch+1}/{epochs}")

        indices = np.random.permutation(len(train_text_sequences))
        train_text_sequences = train_text_sequences[indices]
        train_code_sequences = train_code_sequences[indices]

        total_loss = 0
        num_batches = len(train_text_sequences) // batch_size

        for batch_num in range(num_batches):
            batch_start = batch_num * batch_size
            batch_end = batch_start + batch_size
            batch_text = train_text_sequences[batch_start:batch_end]
            batch_code = train_code_sequences[batch_start:batch_end]

            with tf.GradientTape() as tape:
                output = model([batch_text, batch_code], training=True)
                loss = loss_fn(batch_code, output)
                total_loss += loss

            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        avg_loss = total_loss / num_batches
        epoch_time = time.time() - start_time
        epoch_times.append(epoch_time)
        print(f"Epoch {epoch+1} - Time: {epoch_time:.2f}s - Loss: {avg_loss:.4f}")

        val_output = model([eval_text_sequences, eval_code_sequences], training=False)
        val_loss = loss_fn(eval_code_sequences, val_output)
        validation_losses.append(val_loss)

        print(f"Validation Loss after Epoch {epoch+1}: {val_loss:.4f}")

    return epoch_times, validation_losses

# Start training the model
epoch_times, validation_losses = train_transformer_model(transformer_model, train_text_sequences, train_code_sequences, eval_text_sequences, eval_code_sequences, epochs=10, batch_size=32, learning_rate=0.001)


NameError: name 'vocab_size_text' is not defined

In [None]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras import layers, models
import time

# Load the dataset
file_path = '/kaggle/input/spoc-data/spoc-train.tsv'
data = pd.read_csv(file_path, sep='\t')

# Clean the dataset by dropping rows where 'text' or 'code' is missing
cleaned_data = data.dropna(subset=['text', 'code'])

# Split the data: 80% training, 10% testing, 10% evaluation
train_data = cleaned_data.sample(frac=0.8, random_state=42)
remaining_data = cleaned_data.drop(train_data.index)
test_data = remaining_data.sample(frac=0.5, random_state=42)
eval_data = remaining_data.drop(test_data.index)

# Basic tokenizer function to split text into words (handles punctuation as well)
def basic_tokenizer(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.split()

# Tokenize the text and code columns using the basic tokenizer
train_text_tokens = [basic_tokenizer(text) for text in train_data['text']]
train_code_tokens = [basic_tokenizer(code) for code in train_data['code']]

# Create vocabulary mappings (word to index and index to word)
text_vocab = {word: idx + 1 for idx, word in enumerate(set([word for sentence in train_text_tokens for word in sentence]))}
code_vocab = {word: idx + 1 for idx, word in enumerate(set([word for sentence in train_code_tokens for word in sentence]))}

# Function to convert tokens to indices
def tokens_to_indices(tokens, vocab):
    return [vocab.get(token, 0) for token in tokens]  # use 0 for unknown tokens

# Convert the tokens to indices for the training data
train_text_sequences = [tokens_to_indices(tokens, text_vocab) for tokens in train_text_tokens]
train_code_sequences = [tokens_to_indices(tokens, code_vocab) for tokens in train_code_tokens]

# Pad sequences to ensure uniform length
max_len = 100
train_text_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in train_text_sequences])
train_code_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in train_code_sequences])

# Repeat the process for the test and evaluation sets
test_text_tokens = [basic_tokenizer(text) for text in test_data['text']]
test_code_tokens = [basic_tokenizer(code) for code in test_data['code']]

test_text_sequences = [tokens_to_indices(tokens, text_vocab) for tokens in test_text_tokens]
test_code_sequences = [tokens_to_indices(tokens, code_vocab) for tokens in test_code_tokens]

test_text_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in test_text_sequences])
test_code_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in test_code_sequences])

eval_text_tokens = [basic_tokenizer(text) for text in eval_data['text']]
eval_code_tokens = [basic_tokenizer(code) for code in eval_data['code']]

eval_text_sequences = [tokens_to_indices(tokens, text_vocab) for tokens in eval_text_tokens]
eval_code_sequences = [tokens_to_indices(tokens, code_vocab) for tokens in eval_code_tokens]

eval_text_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in eval_text_sequences])
eval_code_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in eval_code_sequences])

# Define the Transformer model
def transformer_encoder(inputs, num_heads, hidden_units):
    # Multi-head Self-attention
    attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=hidden_units)(inputs, inputs)
    attention = layers.Dropout(0.1)(attention)
    attention = layers.LayerNormalization()(inputs + attention)

    # Feed-forward layer
    ff = layers.Dense(hidden_units, activation='relu')(attention)
    ff = layers.Dense(hidden_units)(ff)
    ff = layers.Dropout(0.1)(ff)
    ff = layers.LayerNormalization()(attention + ff)

    return ff

def transformer_decoder(inputs, enc_output, num_heads, hidden_units):
    # Multi-head Self-attention
    attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=hidden_units)(inputs, inputs)
    attention = layers.Dropout(0.1)(attention)
    attention = layers.LayerNormalization()(inputs + attention)

    # Cross-attention with encoder output
    cross_attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=hidden_units)(attention, enc_output)
    cross_attention = layers.Dropout(0.1)(cross_attention)
    cross_attention = layers.LayerNormalization()(attention + cross_attention)

    # Feed-forward layer
    ff = layers.Dense(hidden_units, activation='relu')(cross_attention)
    ff = layers.Dense(hidden_units)(ff)
    ff = layers.Dropout(0.1)(ff)
    ff = layers.LayerNormalization()(cross_attention + ff)

    return ff


In [None]:

# Calculate vocab sizes
vocab_size_text = len(text_vocab) + 1  # +1 for padding
vocab_size_code = len(code_vocab) + 1  # +1 for padding
vocab_size_text
vocab_size_code

18893

In [None]:

def build_transformer_model(vocab_size_text, vocab_size_code, embedding_dim, hidden_units, num_heads, num_layers, max_len):
    input_text = layers.Input(shape=(max_len,), dtype=tf.int32, name='text_input')
    input_code = layers.Input(shape=(max_len,), dtype=tf.int32, name='code_input')

    text_embedding = layers.Embedding(input_dim=vocab_size_text, output_dim=embedding_dim)(input_text)
    code_embedding = layers.Embedding(input_dim=vocab_size_code, output_dim=embedding_dim)(input_code)

    pos_encoding = layers.Embedding(input_dim=max_len, output_dim=embedding_dim)(tf.range(max_len))
    text_embedding += pos_encoding
    code_embedding += pos_encoding

    enc_output = text_embedding
    for _ in range(num_layers):
        enc_output = transformer_encoder(enc_output, num_heads, hidden_units)

    dec_output = code_embedding
    for _ in range(num_layers):
        dec_output = transformer_decoder(dec_output, enc_output, num_heads, hidden_units)

    output = layers.Dense(vocab_size_code, activation='softmax')(dec_output)

    model = models.Model(inputs=[input_text, input_code], outputs=output)

    return model

# Compile the model
transformer_model = build_transformer_model(vocab_size_text, vocab_size_code, embedding_dim=128, hidden_units=512, num_heads=8, num_layers=6, max_len=100)
transformer_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model summary
transformer_model.summary()


ValueError: Cannot broadcast shape, the failure dim has value 128, which cannot be broadcasted to 512. Input shapes are: [None, 100, 128] and [None, 100, 512].

In [None]:

# Training function
def train_transformer_model(model, train_text_sequences, train_code_sequences, eval_text_sequences, eval_code_sequences, batch_size=32, epochs=10, learning_rate=0.001):
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    epoch_times = []
    validation_losses = []

    for epoch in range(epochs):
        start_time = time.time()
        print(f"Epoch {epoch+1}/{epochs}")

        indices = np.random.permutation(len(train_text_sequences))
        train_text_sequences = train_text_sequences[indices]
        train_code_sequences = train_code_sequences[indices]

        total_loss = 0
        num_batches = len(train_text_sequences) // batch_size

        for batch_num in range(num_batches):
            batch_start = batch_num * batch_size
            batch_end = batch_start + batch_size
            batch_text = train_text_sequences[batch_start:batch_end]
            batch_code = train_code_sequences[batch_start:batch_end]

            with tf.GradientTape() as tape:
                output = model([batch_text, batch_code], training=True)
                loss = loss_fn(batch_code, output)
                total_loss += loss

            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        avg_loss = total_loss / num_batches
        epoch_time = time.time() - start_time
        epoch_times.append(epoch_time)
        print(f"Epoch {epoch+1} - Time: {epoch_time:.2f}s - Loss: {avg_loss:.4f}")

        val_output = model([eval_text_sequences, eval_code_sequences], training=False)
        val_loss = loss_fn(eval_code_sequences, val_output)
        validation_losses.append(val_loss)

        print(f"Validation Loss after Epoch {epoch+1}: {val_loss:.4f}")

    return epoch_times, validation_losses

# Start training the model
epoch_times, validation_losses = train_transformer_model(transformer_model, train_text_sequences, train_code_sequences, eval_text_sequences, eval_code_sequences, epochs=10, batch_size=32, learning_rate=0.001)


In [None]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras import layers, models
import time

# Load the dataset
file_path = '/kaggle/input/spoc-data/spoc-train.tsv'
data = pd.read_csv(file_path, sep='\t')

# Clean the dataset by dropping rows where 'text' or 'code' is missing
cleaned_data = data.dropna(subset=['text', 'code'])

# Split the data: 80% training, 10% testing, 10% evaluation
train_data = cleaned_data.sample(frac=0.8, random_state=42)
remaining_data = cleaned_data.drop(train_data.index)
test_data = remaining_data.sample(frac=0.5, random_state=42)
eval_data = remaining_data.drop(test_data.index)

# Basic tokenizer function to split text into words (handles punctuation as well)
def basic_tokenizer(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.split()

# Tokenize the text and code columns using the basic tokenizer
train_text_tokens = [basic_tokenizer(text) for text in train_data['text']]
train_code_tokens = [basic_tokenizer(code) for code in train_data['code']]

# Create vocabulary mappings (word to index and index to word)
text_vocab = {word: idx + 1 for idx, word in enumerate(set([word for sentence in train_text_tokens for word in sentence]))}
code_vocab = {word: idx + 1 for idx, word in enumerate(set([word for sentence in train_code_tokens for word in sentence]))}

# Calculate vocab sizes
vocab_size_text = len(text_vocab) + 1  # +1 for padding
vocab_size_code = len(code_vocab) + 1  # +1 for padding

# Function to convert tokens to indices
def tokens_to_indices(tokens, vocab):
    return [vocab.get(token, 0) for token in tokens]  # use 0 for unknown tokens

# Convert the tokens to indices for the training data
train_text_sequences = [tokens_to_indices(tokens, text_vocab) for tokens in train_text_tokens]
train_code_sequences = [tokens_to_indices(tokens, code_vocab) for tokens in train_code_tokens]

# Pad sequences to ensure uniform length
max_len = 100
train_text_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in train_text_sequences])
train_code_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in train_code_sequences])

# Repeat the process for the test and evaluation sets
test_text_tokens = [basic_tokenizer(text) for text in test_data['text']]
test_code_tokens = [basic_tokenizer(code) for code in test_data['code']]

test_text_sequences = [tokens_to_indices(tokens, text_vocab) for tokens in test_text_tokens]
test_code_sequences = [tokens_to_indices(tokens, code_vocab) for tokens in test_code_tokens]

test_text_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in test_text_sequences])
test_code_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in test_code_sequences])

eval_text_tokens = [basic_tokenizer(text) for text in eval_data['text']]
eval_code_tokens = [basic_tokenizer(code) for code in eval_data['code']]

eval_text_sequences = [tokens_to_indices(tokens, text_vocab) for tokens in eval_text_tokens]
eval_code_sequences = [tokens_to_indices(tokens, code_vocab) for tokens in eval_code_tokens]

eval_text_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in eval_text_sequences])
eval_code_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in eval_code_sequences])

# Define the Transformer model
def transformer_encoder(inputs, num_heads, hidden_units):
    # Multi-head Self-attention
    attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=hidden_units)(inputs, inputs)
    attention = layers.Dropout(0.1)(attention)
    attention = layers.LayerNormalization()(inputs + attention)

    # Feed-forward layer
    ff = layers.Dense(hidden_units, activation='relu')(attention)
    ff = layers.Dense(hidden_units)(ff)
    ff = layers.Dropout(0.1)(ff)
    ff = layers.LayerNormalization()(attention + ff)  # Ensure matching dimensions here

    return ff

def transformer_decoder(inputs, enc_output, num_heads, hidden_units):
    # Multi-head Self-attention
    attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=hidden_units)(inputs, inputs)
    attention = layers.Dropout(0.1)(attention)
    attention = layers.LayerNormalization()(inputs + attention)

    # Cross-attention with encoder output
    cross_attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=hidden_units)(attention, enc_output)
    cross_attention = layers.Dropout(0.1)(cross_attention)
    cross_attention = layers.LayerNormalization()(attention + cross_attention)

    # Feed-forward layer
    ff = layers.Dense(hidden_units, activation='relu')(cross_attention)
    ff = layers.Dense(hidden_units)(ff)
    ff = layers.Dropout(0.1)(ff)
    ff = layers.LayerNormalization()(cross_attention + ff)  # Ensure matching dimensions here

    return ff
def build_transformer_model(vocab_size_text, vocab_size_code, embedding_dim, hidden_units, num_heads, num_layers, max_len):
    input_text = layers.Input(shape=(max_len,), dtype=tf.int32, name='text_input')
    input_code = layers.Input(shape=(max_len,), dtype=tf.int32, name='code_input')

    # Ensure embedding dimensions match hidden_units
    text_embedding = layers.Embedding(input_dim=vocab_size_text, output_dim=hidden_units)(input_text)  # Use hidden_units
    code_embedding = layers.Embedding(input_dim=vocab_size_code, output_dim=hidden_units)(input_code)  # Use hidden_units

    pos_encoding = layers.Embedding(input_dim=max_len, output_dim=hidden_units)(tf.range(max_len))
    text_embedding += pos_encoding
    code_embedding += pos_encoding

    enc_output = text_embedding
    for _ in range(num_layers):
        enc_output = transformer_encoder(enc_output, num_heads, hidden_units)

    dec_output = code_embedding
    for _ in range(num_layers):
        dec_output = transformer_decoder(dec_output, enc_output, num_heads, hidden_units)

    output = layers.Dense(vocab_size_code, activation='softmax')(dec_output)

    model = models.Model(inputs=[input_text, input_code], outputs=output)

    return model

# Compile the model
transformer_model = build_transformer_model(vocab_size_text, vocab_size_code, embedding_dim=128, hidden_units=512, num_heads=8, num_layers=6, max_len=100)
transformer_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model summary
transformer_model.summary()

# Training function
def train_transformer_model(model, train_text_sequences, train_code_sequences, eval_text_sequences, eval_code_sequences, batch_size=32, epochs=10, learning_rate=0.001):
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    epoch_times = []
    validation_losses = []

    for epoch in range(epochs):
        start_time = time.time()
        print(f"Epoch {epoch+1}/{epochs}")

        indices = np.random.permutation(len(train_text_sequences))
        train_text_sequences = train_text_sequences[indices]
        train_code_sequences = train_code_sequences[indices]

        total_loss = 0
        num_batches = len(train_text_sequences) // batch_size

        for batch_num in range(num_batches):
            batch_start = batch_num * batch_size
            batch_end = batch_start + batch_size
            batch_text = train_text_sequences[batch_start:batch_end]
            batch_code = train_code_sequences[batch_start:batch_end]

            with tf.GradientTape() as tape:
                output = model([batch_text, batch_code], training=True)
                loss = loss_fn(batch_code, output)
                total_loss += loss

            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        avg_loss = total_loss / num_batches
        epoch_time = time.time() - start_time
        epoch_times.append(epoch_time)
        print(f"Epoch {epoch+1} - Time: {epoch_time:.2f}s - Loss: {avg_loss:.4f}")

        val_output = model([eval_text_sequences, eval_code_sequences], training=False)
        val_loss = loss_fn(eval_code_sequences, val_output)
        validation_losses.append(val_loss)

        print(f"Validation Loss after Epoch {epoch+1}: {val_loss:.4f}")

    return epoch_times, validation_losses

# Start training the model
epoch_times, validation_losses = train_transformer_model(transformer_model, train_text_sequences, train_code_sequences, eval_text_sequences, eval_code_sequences, epochs=10, batch_size=32, learning_rate=0.001)


Epoch 1/10


  output, from_logits = _get_logits(


ResourceExhaustedError: Exception encountered when calling MultiHeadAttention.call().

[1m{{function_node __wrapped__Transpose_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[32,100,8,512] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Transpose][0m

Arguments received by MultiHeadAttention.call():
  • query=tf.Tensor(shape=(32, 100, 512), dtype=float32)
  • value=tf.Tensor(shape=(32, 100, 512), dtype=float32)
  • key=None
  • query_mask=None
  • value_mask=None
  • key_mask=None
  • attention_mask=None
  • return_attention_scores=False
  • training=True
  • use_causal_mask=False

In [None]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras import mixed_precision
import time

# Enable mixed precision training for memory optimization
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

# Load the dataset
file_path = '/kaggle/input/spoc-data/spoc-train.tsv'
data = pd.read_csv(file_path, sep='\t')

# Clean the dataset by dropping rows where 'text' or 'code' is missing
cleaned_data = data.dropna(subset=['text', 'code'])

# Split the data: 80% training, 10% testing, 10% evaluation
train_data = cleaned_data.sample(frac=0.8, random_state=42)
remaining_data = cleaned_data.drop(train_data.index)
test_data = remaining_data.sample(frac=0.5, random_state=42)
eval_data = remaining_data.drop(test_data.index)

# Basic tokenizer function to split text into words (handles punctuation as well)
def basic_tokenizer(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.split()

# Tokenize the text and code columns using the basic tokenizer
train_text_tokens = [basic_tokenizer(text) for text in train_data['text']]
train_code_tokens = [basic_tokenizer(code) for code in train_data['code']]

# Create vocabulary mappings (word to index and index to word)
text_vocab = {word: idx + 1 for idx, word in enumerate(set([word for sentence in train_text_tokens for word in sentence]))}
code_vocab = {word: idx + 1 for idx, word in enumerate(set([word for sentence in train_code_tokens for word in sentence]))}

# Calculate vocab sizes
vocab_size_text = len(text_vocab) + 1  # +1 for padding
vocab_size_code = len(code_vocab) + 1  # +1 for padding

# Function to convert tokens to indices
def tokens_to_indices(tokens, vocab):
    return [vocab.get(token, 0) for token in tokens]  # use 0 for unknown tokens

# Convert the tokens to indices for the training data
train_text_sequences = [tokens_to_indices(tokens, text_vocab) for tokens in train_text_tokens]
train_code_sequences = [tokens_to_indices(tokens, code_vocab) for tokens in train_code_tokens]

# Pad sequences to ensure uniform length
max_len = 100
train_text_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in train_text_sequences])
train_code_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in train_code_sequences])

# Repeat the process for the test and evaluation sets
test_text_tokens = [basic_tokenizer(text) for text in test_data['text']]
test_code_tokens = [basic_tokenizer(code) for code in test_data['code']]

test_text_sequences = [tokens_to_indices(tokens, text_vocab) for tokens in test_text_tokens]
test_code_sequences = [tokens_to_indices(tokens, code_vocab) for tokens in test_code_tokens]

test_text_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in test_text_sequences])
test_code_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in test_code_sequences])

eval_text_tokens = [basic_tokenizer(text) for text in eval_data['text']]
eval_code_tokens = [basic_tokenizer(code) for code in eval_data['code']]

eval_text_sequences = [tokens_to_indices(tokens, text_vocab) for tokens in eval_text_tokens]
eval_code_sequences = [tokens_to_indices(tokens, code_vocab) for tokens in eval_code_tokens]

eval_text_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in eval_text_sequences])
eval_code_sequences = np.array([seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in eval_code_sequences])

# Define the Transformer model
def transformer_encoder(inputs, num_heads, hidden_units):
    # Multi-head Self-attention
    attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=hidden_units)(inputs, inputs)
    attention = layers.Dropout(0.1)(attention)
    attention = layers.LayerNormalization()(inputs + attention)

    # Feed-forward layer
    ff = layers.Dense(hidden_units, activation='relu')(attention)
    ff = layers.Dense(hidden_units)(ff)
    ff = layers.Dropout(0.1)(ff)
    ff = layers.LayerNormalization()(attention + ff)  # Ensure matching dimensions here

    return ff

def transformer_decoder(inputs, enc_output, num_heads, hidden_units):
    # Multi-head Self-attention
    attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=hidden_units)(inputs, inputs)
    attention = layers.Dropout(0.1)(attention)
    attention = layers.LayerNormalization()(inputs + attention)

    # Cross-attention with encoder output
    cross_attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=hidden_units)(attention, enc_output)
    cross_attention = layers.Dropout(0.1)(cross_attention)
    cross_attention = layers.LayerNormalization()(attention + cross_attention)

    # Feed-forward layer
    ff = layers.Dense(hidden_units, activation='relu')(cross_attention)
    ff = layers.Dense(hidden_units)(ff)
    ff = layers.Dropout(0.1)(ff)
    ff = layers.LayerNormalization()(cross_attention + ff)  # Ensure matching dimensions here

    return ff

def build_transformer_model(vocab_size_text, vocab_size_code, embedding_dim, hidden_units, num_heads, num_layers, max_len):
    input_text = layers.Input(shape=(max_len,), dtype=tf.int32, name='text_input')
    input_code = layers.Input(shape=(max_len,), dtype=tf.int32, name='code_input')

    # Ensure embedding dimensions match hidden_units
    text_embedding = layers.Embedding(input_dim=vocab_size_text, output_dim=hidden_units)(input_text)
    code_embedding = layers.Embedding(input_dim=vocab_size_code, output_dim=hidden_units)(input_code)

    pos_encoding = layers.Embedding(input_dim=max_len, output_dim=hidden_units)(tf.range(max_len))
    text_embedding += pos_encoding
    code_embedding += pos_encoding

    enc_output = text_embedding
    for _ in range(num_layers):
        enc_output = transformer_encoder(enc_output, num_heads, hidden_units)

    dec_output = code_embedding
    for _ in range(num_layers):
        dec_output = transformer_decoder(dec_output, enc_output, num_heads, hidden_units)

    output = layers.Dense(vocab_size_code, activation='softmax')(dec_output)

    model = models.Model(inputs=[input_text, input_code], outputs=output)

    return model

# Reduce hidden units and batch size
hidden_units = 256  # Reduce the hidden units to 256
batch_size = 16  # Reduce batch size to 16

# Rebuild and compile the model with the updated parameters
transformer_model = build_transformer_model(vocab_size_text, vocab_size_code, embedding_dim=128, hidden_units=hidden_units, num_heads=8, num_layers=6, max_len=100)
transformer_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model summary
transformer_model.summary()

# Training function
def train_transformer_model(model, train_text_sequences, train_code_sequences, eval_text_sequences, eval_code_sequences, batch_size=16, epochs=10, learning_rate=0.001):
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    epoch_times = []
    validation_losses = []

    for epoch in range(epochs):
        start_time = time.time()
        print(f"Epoch {epoch+1}/{epochs}")

        indices = np.random.permutation(len(train_text_sequences))
        train_text_sequences = train_text_sequences[indices]
        train_code_sequences = train_code_sequences[indices]

        total_loss = 0
        num_batches = len(train_text_sequences) // batch_size

        for batch_num in range(num_batches):
            batch_start = batch_num * batch_size
            batch_end = batch_start + batch_size
            batch_text = train_text_sequences[batch_start:batch_end]
            batch_code = train_code_sequences[batch_start:batch_end]

            with tf.GradientTape() as tape:
                output = model([batch_text, batch_code], training=True)
                loss = loss_fn(batch_code, output)
                total_loss += loss

            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        avg_loss = total_loss / num_batches
        epoch_time = time.time() - start_time
        epoch_times.append(epoch_time)
        print(f"Epoch {epoch+1} - Time: {epoch_time:.2f}s - Loss: {avg_loss:.4f}")

        val_output = model([eval_text_sequences, eval_code_sequences], training=False)
        val_loss = loss_fn(eval_code_sequences, val_output)
        validation_losses.append(val_loss)

        print(f"Validation Loss after Epoch {epoch+1}: {val_loss:.4f}")

    return epoch_times, validation_losses

# Start training the model with reduced batch size
epoch_times, validation_losses = train_transformer_model(transformer_model, train_text_sequences, train_code_sequences, eval_text_sequences, eval_code_sequences, epochs=10, batch_size=batch_size, learning_rate=0.001)


Epoch 1/10


KeyboardInterrupt: 

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import math
import time
import re
from sklearn.model_selection import train_test_split

# Load the dataset (replace 'spoc-train.tsv' with your actual file path)
data = pd.read_csv('/kaggle/input/spoc-data/spoc-train.tsv', sep='\t')

# Keep only 'text' and 'code' columns
data = data[['text', 'code']]

# Check for missing values and drop them if any
data = data.dropna()

# Split into train (80%) and temp (20%)
train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)

# Split temp into validation (10%) and test (10%)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print(f"Test samples: {len(test_data)}")

# Simple tokenizer (splits on spaces)
def tokenize(text):
    return text.split()

# Build vocabularies from training data
def build_vocab(data, tokenizer):
    tokens = set()
    for item in data:
        tokens.update(tokenizer(item))
    # Reserve 0 for padding, 1 for <sos>, 2 for <eos>
    vocab = {token: idx for idx, token in enumerate(tokens, start=3)}
    vocab['<pad>'] = 0
    vocab['<sos>'] = 1
    vocab['<eos>'] = 2
    return vocab

# Create vocabularies
pseudocode_vocab = build_vocab(train_data['text'], tokenize)
cpp_vocab = build_vocab(train_data['code'], tokenize)

# Inverse vocabularies for decoding (optional)
inv_cpp_vocab = {idx: token for token, idx in cpp_vocab.items()}

# Create Dataset class
class CodeDataset(Dataset):
    def __init__(self, data, pseudocode_vocab, cpp_vocab, tokenizer):
        self.data = data
        self.pseudocode_vocab = pseudocode_vocab
        self.cpp_vocab = cpp_vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        pseudocode = self.data.iloc[idx]['text']
        cpp = self.data.iloc[idx]['code']
        # Add special tokens
        pseudo_tokens = ['<sos>'] + self.tokenizer(pseudocode) + ['<eos>']
        cpp_tokens = ['<sos>'] + self.tokenizer(cpp) + ['<eos>']
        # Convert to indices
        pseudo_indices = [self.pseudocode_vocab.get(token, 0) for token in pseudo_tokens]
        cpp_indices = [self.cpp_vocab.get(token, 0) for token in cpp_tokens]
        return torch.tensor(pseudo_indices), torch.tensor(cpp_indices)

# Padding function for batches
def collate_fn(batch):
    pseudocode, cpp = zip(*batch)
    pseudocode = torch.nn.utils.rnn.pad_sequence(pseudocode, padding_value=0, batch_first=True)
    cpp = torch.nn.utils.rnn.pad_sequence(cpp, padding_value=0, batch_first=True)
    return pseudocode, cpp

# Create DataLoaders
train_dataset = CodeDataset(train_data, pseudocode_vocab, cpp_vocab, tokenize)
val_dataset = CodeDataset(val_data, pseudocode_vocab, cpp_vocab, tokenize)
test_dataset = CodeDataset(test_data, pseudocode_vocab, cpp_vocab, tokenize)

batch_size = 16  # Reduced batch size due to memory constraints
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # Shape: [1, max_len, d_model]
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]


# Multi-Head Attention
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        d_k = Q.size(-1)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn = torch.softmax(scores, dim=-1)
        return torch.matmul(attn, V), attn

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0)
        Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        output, attn = self.scaled_dot_product_attention(Q, K, V, mask)
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.W_o(output)


# Feed-Forward Network
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.linear2(self.dropout(self.relu(self.linear1(x))))


# Encoder Layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        x = self.norm1(x + self.dropout(self.mha(x, x, x, mask)))
        x = self.norm2(x + self.dropout(self.ff(x)))
        return x


# Decoder Layer
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        x = self.norm1(x + self.dropout(self.mha1(x, x, x, tgt_mask)))
        x = self.norm2(x + self.dropout(self.mha2(x, enc_output, enc_output, src_mask)))
        x = self.norm3(x + self.dropout(self.ff(x)))
        return x


# Transformer Model
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, num_heads=8, num_layers=6, d_ff=2048, dropout=0.1, max_len=512):
        super().__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.d_model = d_model

    def create_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)  # Padding mask
        tgt_seq_len = tgt.size(1)
        nopeak_mask = torch.tril(torch.ones(tgt_seq_len, tgt_seq_len)).bool().to(tgt.device)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(2) & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.create_mask(src, tgt)
        src_embedded = self.dropout(self.pos_encoding(self.src_embedding(src) * math.sqrt(self.d_model)))
        tgt_embedded = self.dropout(self.pos_encoding(self.tgt_embedding(tgt) * math.sqrt(self.d_model)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        return self.fc_out(dec_output)

# Initialize the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Transformer(
    src_vocab_size=len(pseudocode_vocab),
    tgt_vocab_size=len(cpp_vocab),
    d_model=512,
    num_heads=8,
    num_layers=6,
    d_ff=2048,
    dropout=0.1
).to(device)


# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)



Training samples: 172980
Validation samples: 21622
Test samples: 21623


In [None]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    start_time = time.time()
    train_loss = 0

    for pseudocode, cpp in train_loader:
        pseudocode, cpp = pseudocode.to(device), cpp.to(device)
        optimizer.zero_grad()

        # Decoder input: <sos> + target[:-1]
        # Target output: target[1:] + <eos>
        output = model(pseudocode, cpp[:, :-1])
        loss = criterion(output.view(-1, len(cpp_vocab)), cpp[:, 1:].contiguous().view(-1))

        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    train_loss /= len(train_loader)
    end_time = time.time()

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for pseudocode, cpp in val_loader:
            pseudocode, cpp = pseudocode.to(device), cpp.to(device)
            output = model(pseudocode, cpp[:, :-1])
            loss = criterion(output.view(-1, len(cpp_vocab)), cpp[:, 1:].contiguous().view(-1))
            val_loss += loss.item()
    val_loss /= len(val_loader)

    # Display results
    epoch_time = end_time - start_time
    lr = optimizer.param_groups[0]['lr']
    print(f"Epoch {epoch+1}/{num_epochs}, Time: {epoch_time:.2f}s, Val Loss: {val_loss:.4f}, LR: {lr:.6f}")



OutOfMemoryError: CUDA out of memory. Tried to allocate 70.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 66.12 MiB is free. Process 2433 has 14.67 GiB memory in use. Of the allocated memory 862.97 MiB is allocated by PyTorch, and 73.03 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Optional: Evaluate on test set
model.eval()
test_loss = 0
with torch.no_grad():
    for pseudocode, cpp in test_loader:
        pseudocode, cpp = pseudocode.to(device), cpp.to(device)
        output = model(pseudocode, cpp[:, :-1])
        loss = criterion(output.view(-1, len(cpp_vocab)), cpp[:, 1:].contiguous().view(-1))
        test_loss += loss.item()
test_loss /= len(test_loader)
print(f"Test Loss: {test_loss:.4f}")


In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import math
import time
from sklearn.model_selection import train_test_split
from torch.cuda.amp import autocast, GradScaler

# Load the dataset (replace 'spoc-train.tsv' with your actual file path)
data = pd.read_csv('/kaggle/input/spoc-data/spoc-train.tsv', sep='\t')

# Keep only 'text' and 'code' columns
data = data[['text', 'code']]

# Check for missing values and drop them if any
data = data.dropna()

# Split into train (80%) and temp (20%)
train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)

# Split temp into validation (10%) and test (10%)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print(f"Test samples: {len(test_data)}")

# Simple tokenizer (splits on spaces)
def tokenize(text):
    return text.split()

# Build vocabularies from training data
def build_vocab(data, tokenizer):
    tokens = set()
    for item in data:
        tokens.update(tokenizer(item))
    # Reserve 0 for padding, 1 for <sos>, 2 for <eos>
    vocab = {token: idx for idx, token in enumerate(tokens, start=3)}
    vocab['<pad>'] = 0
    vocab['<sos>'] = 1
    vocab['<eos>'] = 2
    return vocab

# Create vocabularies
pseudocode_vocab = build_vocab(train_data['text'], tokenize)
cpp_vocab = build_vocab(train_data['code'], tokenize)

# Inverse vocabularies for decoding (optional)
inv_cpp_vocab = {idx: token for token, idx in cpp_vocab.items()}

# Create Dataset class
class CodeDataset(Dataset):
    def __init__(self, data, pseudocode_vocab, cpp_vocab, tokenizer):
        self.data = data
        self.pseudocode_vocab = pseudocode_vocab
        self.cpp_vocab = cpp_vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        pseudocode = self.data.iloc[idx]['text']
        cpp = self.data.iloc[idx]['code']
        # Add special tokens
        pseudo_tokens = ['<sos>'] + self.tokenizer(pseudocode) + ['<eos>']
        cpp_tokens = ['<sos>'] + self.tokenizer(cpp) + ['<eos>']
        # Convert to indices
        pseudo_indices = [self.pseudocode_vocab.get(token, 0) for token in pseudo_tokens]
        cpp_indices = [self.cpp_vocab.get(token, 0) for token in cpp_tokens]
        return torch.tensor(pseudo_indices), torch.tensor(cpp_indices)

# Padding function for batches
def collate_fn(batch):
    pseudocode, cpp = zip(*batch)
    pseudocode = torch.nn.utils.rnn.pad_sequence(pseudocode, padding_value=0, batch_first=True)
    cpp = torch.nn.utils.rnn.pad_sequence(cpp, padding_value=0, batch_first=True)
    return pseudocode, cpp

# Create DataLoaders
batch_size = 8  # Reduced batch size due to memory constraints
train_dataset = CodeDataset(train_data, pseudocode_vocab, cpp_vocab, tokenize)
val_dataset = CodeDataset(val_data, pseudocode_vocab, cpp_vocab, tokenize)
test_dataset = CodeDataset(test_data, pseudocode_vocab, cpp_vocab, tokenize)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # Shape: [1, max_len, d_model]
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]


# Multi-Head Attention
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        d_k = Q.size(-1)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn = torch.softmax(scores, dim=-1)
        return torch.matmul(attn, V), attn

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0)
        Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        output, attn = self.scaled_dot_product_attention(Q, K, V, mask)
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.W_o(output)


# Feed-Forward Network
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.linear2(self.dropout(self.relu(self.linear1(x))))


# Encoder Layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        x = self.norm1(x + self.dropout(self.mha(x, x, x, mask)))
        x = self.norm2(x + self.dropout(self.ff(x)))
        return x


# Decoder Layer
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        x = self.norm1(x + self.dropout(self.mha1(x, x, x, tgt_mask)))
        x = self.norm2(x + self.dropout(self.mha2(x, enc_output, enc_output, src_mask)))
        x = self.norm3(x + self.dropout(self.ff(x)))
        return x


# Transformer Model
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=128, num_heads=8, num_layers=6, d_ff=2048, dropout=0.1, max_len=512):
        super().__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.d_model = d_model

    def create_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)  # Padding mask
        tgt_seq_len = tgt.size(1)
        nopeak_mask = torch.tril(torch.ones(tgt_seq_len, tgt_seq_len)).bool().to(tgt.device)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(2) & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.create_mask(src, tgt)
        src_embedded = self.dropout(self.pos_encoding(self.src_embedding(src) * math.sqrt(self.d_model)))
        tgt_embedded = self.dropout(self.pos_encoding(self.tgt_embedding(tgt) * math.sqrt(self.d_model)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        return self.fc_out(dec_output)

# Initialize the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Transformer(
    src_vocab_size=len(pseudocode_vocab),
    tgt_vocab_size=len(cpp_vocab),
    d_model=128,
    num_heads=8,
    num_layers=6,
    d_ff=2048,
    dropout=0.1
).to(device)



Training samples: 172980
Validation samples: 21622
Test samples: 21623


OutOfMemoryError: CUDA out of memory. Tried to allocate 14.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 2.12 MiB is free. Process 2433 has 14.74 GiB memory in use. Of the allocated memory 935.34 MiB is allocated by PyTorch, and 64.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:

# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# Initialize the GradScaler for mixed precision
scaler = GradScaler()

# Training loop with mixed precision
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    start_time = time.time()
    train_loss = 0

    for pseudocode, cpp in train_loader:
        pseudocode, cpp = pseudocode.to(device), cpp.to(device)
        optimizer.zero_grad()

        with autocast():  # Automatic mixed precision context
            # Decoder input: <sos> + target[:-1]
            # Target output: target[1:] + <eos>
            output = model(pseudocode, cpp[:, :-1])
            loss = criterion(output.view(-1, len(cpp_vocab)), cpp[:, 1:].contiguous().view(-1))

        # Scales the loss and updates the weights
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()  # Updates the scale for the next iteration

        train_loss += loss.item()

    train_loss /= len(train_loader)
    end_time = time.time()

    # Validation loop with mixed precision
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for pseudocode, cpp in val_loader:
            pseudocode, cpp = pseudocode.to(device), cpp.to(device)
            with autocast():  # Automatic mixed precision context
                output = model(pseudocode, cpp[:, :-1])
                loss = criterion(output.view(-1, len(cpp_vocab)), cpp[:, 1:].contiguous().view(-1))
            val_loss += loss.item()
    val_loss /= len(val_loader)

    # Display results
    epoch_time = end_time - start_time
    print(f"Epoch {epoch+1}/{num_epochs}, Time: {epoch_time:.2f}s, Val Loss: {val_loss:.4f}")

# Optional: Evaluate on test set
model.eval()
test_loss = 0
with torch.no_grad():
    for pseudocode, cpp in test_loader:
        pseudocode, cpp = pseudocode.to(device), cpp.to(device)
        with autocast():  # Automatic mixed precision context
            output = model(pseudocode, cpp[:, :-1])
            loss = criterion(output.view(-1, len(cpp_vocab)), cpp[:, 1:].contiguous().view(-1))
        test_loss += loss.item()
test_loss /= len(test_loader)
print(f"Test Loss: {test_loss:.4f}")


In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import math
import time
from sklearn.model_selection import train_test_split
from torch.cuda.amp import autocast, GradScaler
import os

# Set the environment variable to avoid fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Load the dataset (replace 'spoc-train.tsv' with your actual file path)
data = pd.read_csv('/kaggle/input/spoc-data/spoc-train.tsv', sep='\t')

# Keep only 'text' and 'code' columns
data = data[['text', 'code']]

# Check for missing values and drop them if any
data = data.dropna()

# Split into train (80%) and temp (20%)
train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)

# Split temp into validation (10%) and test (10%)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print(f"Test samples: {len(test_data)}")

# Simple tokenizer (splits on spaces)
def tokenize(text):
    return text.split()

# Build vocabularies from training data
def build_vocab(data, tokenizer):
    tokens = set()
    for item in data:
        tokens.update(tokenizer(item))
    # Reserve 0 for padding, 1 for <sos>, 2 for <eos>
    vocab = {token: idx for idx, token in enumerate(tokens, start=3)}
    vocab['<pad>'] = 0
    vocab['<sos>'] = 1
    vocab['<eos>'] = 2
    return vocab

# Create vocabularies
pseudocode_vocab = build_vocab(train_data['text'], tokenize)
cpp_vocab = build_vocab(train_data['code'], tokenize)

# Inverse vocabularies for decoding (optional)
inv_cpp_vocab = {idx: token for token, idx in cpp_vocab.items()}

# Create Dataset class
class CodeDataset(Dataset):
    def __init__(self, data, pseudocode_vocab, cpp_vocab, tokenizer):
        self.data = data
        self.pseudocode_vocab = pseudocode_vocab
        self.cpp_vocab = cpp_vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        pseudocode = self.data.iloc[idx]['text']
        cpp = self.data.iloc[idx]['code']
        # Add special tokens
        pseudo_tokens = ['<sos>'] + self.tokenizer(pseudocode) + ['<eos>']
        cpp_tokens = ['<sos>'] + self.tokenizer(cpp) + ['<eos>']
        # Convert to indices
        pseudo_indices = [self.pseudocode_vocab.get(token, 0) for token in pseudo_tokens]
        cpp_indices = [self.cpp_vocab.get(token, 0) for token in cpp_tokens]
        return torch.tensor(pseudo_indices), torch.tensor(cpp_indices)

# Padding function for batches
def collate_fn(batch):
    pseudocode, cpp = zip(*batch)
    pseudocode = torch.nn.utils.rnn.pad_sequence(pseudocode, padding_value=0, batch_first=True)
    cpp = torch.nn.utils.rnn.pad_sequence(cpp, padding_value=0, batch_first=True)
    return pseudocode, cpp

# Create DataLoaders
batch_size = 8  # Reduced batch size due to memory constraints
train_dataset = CodeDataset(train_data, pseudocode_vocab, cpp_vocab, tokenize)
val_dataset = CodeDataset(val_data, pseudocode_vocab, cpp_vocab, tokenize)
test_dataset = CodeDataset(test_data, pseudocode_vocab, cpp_vocab, tokenize)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # Shape: [1, max_len, d_model]
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]


# Multi-Head Attention
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        d_k = Q.size(-1)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn = torch.softmax(scores, dim=-1)
        return torch.matmul(attn, V), attn

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0)
        Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        output, attn = self.scaled_dot_product_attention(Q, K, V, mask)
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.W_o(output)


# Feed-Forward Network
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.linear2(self.dropout(self.relu(self.linear1(x))))


# Encoder Layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        x = self.norm1(x + self.dropout(self.mha(x, x, x, mask)))
        x = self.norm2(x + self.dropout(self.ff(x)))
        return x


# Decoder Layer
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        x = self.norm1(x + self.dropout(self.mha1(x, x, x, tgt_mask)))
        x = self.norm2(x + self.dropout(self.mha2(x, enc_output, enc_output, src_mask)))
        x = self.norm3(x + self.dropout(self.ff(x)))
        return x


# Transformer Model
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=256, num_heads=4, num_layers=4, d_ff=1024, dropout=0.1, max_len=512):
        super().__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.d_model = d_model

    def create_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)  # Padding mask
        tgt_seq_len = tgt.size(1)
        nopeak_mask = torch.tril(torch.ones(tgt_seq_len, tgt_seq_len)).bool().to(tgt.device)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(2) & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.create_mask(src, tgt)
        src_embedded = self.dropout(self.pos_encoding(self.src_embedding(src) * math.sqrt(self.d_model)))
        tgt_embedded = self.dropout(self.pos_encoding(self.tgt_embedding(tgt) * math.sqrt(self.d_model)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        return self.fc_out(dec_output)

# Initialize the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Transformer(
    src_vocab_size=len(pseudocode_vocab),
    tgt_vocab_size=len(cpp_vocab),
    d_model=256,  # Reduced hidden units for memory efficiency
    num_heads=4,  # Reduced number of attention heads
    num_layers=4,  # Reduced number of layers
    d_ff=1024,
    dropout=0.1
).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# Initialize the GradScaler for mixed precision
scaler = GradScaler()

# Training loop with mixed precision
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    start_time = time.time()
    train_loss = 0

    for pseudocode, cpp in train_loader:
        pseudocode, cpp = pseudocode.to(device), cpp.to(device)
        optimizer.zero_grad()

        with autocast():  # Automatic mixed precision context
            # Decoder input: <sos> + target[:-1]
            # Target output: target[1:] + <eos>
            output = model(pseudocode, cpp[:, :-1])
            loss = criterion(output.view(-1, len(cpp_vocab)), cpp[:, 1:].contiguous().view(-1))

        # Scales the loss and updates the weights
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()  # Updates the scale for the next iteration

        train_loss += loss.item()

    train_loss /= len(train_loader)
    end_time = time.time()

    # Validation loop with mixed precision
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for pseudocode, cpp in val_loader:
            pseudocode, cpp = pseudocode.to(device), cpp.to(device)
            with autocast():  # Automatic mixed precision context
                output = model(pseudocode, cpp[:, :-1])
                loss = criterion(output.view(-1, len(cpp_vocab)), cpp[:, 1:].contiguous().view(-1))
            val_loss += loss.item()
    val_loss /= len(val_loader)

    # Display results
    epoch_time = end_time - start_time
    print(f"Epoch {epoch+1}/{num_epochs}, Time: {epoch_time:.2f}s, Val Loss: {val_loss:.4f}")

# Optional: Evaluate on test set
model.eval()
test_loss = 0
with torch.no_grad():
    for pseudocode, cpp in test_loader:
        pseudocode, cpp = pseudocode.to(device), cpp.to(device)
        with autocast():  # Automatic mixed precision context
            output = model(pseudocode, cpp[:, :-1])
            loss = criterion(output.view(-1, len(cpp_vocab)), cpp[:, 1:].contiguous().view(-1))
        test_loss += loss.item()
test_loss /= len(test_loader)
print(f"Test Loss: {test_loss:.4f}")


Training samples: 172980
Validation samples: 21622
Test samples: 21623


OutOfMemoryError: CUDA out of memory. Tried to allocate 26.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 2.12 MiB is free. Process 2433 has 14.74 GiB memory in use. Of the allocated memory 935.34 MiB is allocated by PyTorch, and 64.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import math
import time
from sklearn.model_selection import train_test_split
from torch.cuda.amp import autocast, GradScaler
import os

# Set the environment variable to avoid fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Load the dataset (replace 'spoc-train.tsv' with your actual file path)
data = pd.read_csv('/kaggle/input/spoc-data/spoc-train.tsv', sep='\t')

# Keep only 'text' and 'code' columns
data = data[['text', 'code']]

# Check for missing values and drop them if any
data = data.dropna()

# Split into train (80%) and temp (20%)
train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)

# Split temp into validation (10%) and test (10%)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print(f"Test samples: {len(test_data)}")

# Simple tokenizer (splits on spaces)
def tokenize(text):
    return text.split()

# Build vocabularies from training data
def build_vocab(data, tokenizer):
    tokens = set()
    for item in data:
        tokens.update(tokenizer(item))
    # Reserve 0 for padding, 1 for <sos>, 2 for <eos>
    vocab = {token: idx for idx, token in enumerate(tokens, start=3)}
    vocab['<pad>'] = 0
    vocab['<sos>'] = 1
    vocab['<eos>'] = 2
    return vocab

# Create vocabularies
pseudocode_vocab = build_vocab(train_data['text'], tokenize)
cpp_vocab = build_vocab(train_data['code'], tokenize)

# Inverse vocabularies for decoding (optional)
inv_cpp_vocab = {idx: token for token, idx in cpp_vocab.items()}

# Create Dataset class
class CodeDataset(Dataset):
    def __init__(self, data, pseudocode_vocab, cpp_vocab, tokenizer):
        self.data = data
        self.pseudocode_vocab = pseudocode_vocab
        self.cpp_vocab = cpp_vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        pseudocode = self.data.iloc[idx]['text']
        cpp = self.data.iloc[idx]['code']
        # Add special tokens
        pseudo_tokens = ['<sos>'] + self.tokenizer(pseudocode) + ['<eos>']
        cpp_tokens = ['<sos>'] + self.tokenizer(cpp) + ['<eos>']
        # Convert to indices
        pseudo_indices = [self.pseudocode_vocab.get(token, 0) for token in pseudo_tokens]
        cpp_indices = [self.cpp_vocab.get(token, 0) for token in cpp_tokens]
        return torch.tensor(pseudo_indices), torch.tensor(cpp_indices)

# Padding function for batches
def collate_fn(batch):
    pseudocode, cpp = zip(*batch)
    pseudocode = torch.nn.utils.rnn.pad_sequence(pseudocode, padding_value=0, batch_first=True)
    cpp = torch.nn.utils.rnn.pad_sequence(cpp, padding_value=0, batch_first=True)
    return pseudocode, cpp

# Create DataLoaders
batch_size = 8  # Reduced batch size due to memory constraints
train_dataset = CodeDataset(train_data, pseudocode_vocab, cpp_vocab, tokenize)
val_dataset = CodeDataset(val_data, pseudocode_vocab, cpp_vocab, tokenize)
test_dataset = CodeDataset(test_data, pseudocode_vocab, cpp_vocab, tokenize)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # Shape: [1, max_len, d_model]
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]


# Multi-Head Attention
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        d_k = Q.size(-1)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn = torch.softmax(scores, dim=-1)
        return torch.matmul(attn, V), attn

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0)
        Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        output, attn = self.scaled_dot_product_attention(Q, K, V, mask)
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.W_o(output)


# Feed-Forward Network
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.linear2(self.dropout(self.relu(self.linear1(x))))


# Encoder Layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        x = self.norm1(x + self.dropout(self.mha(x, x, x, mask)))
        x = self.norm2(x + self.dropout(self.ff(x)))
        return x


# Decoder Layer
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        x = self.norm1(x + self.dropout(self.mha1(x, x, x, tgt_mask)))
        x = self.norm2(x + self.dropout(self.mha2(x, enc_output, enc_output, src_mask)))
        x = self.norm3(x + self.dropout(self.ff(x)))
        return x


# Transformer Model
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=128, num_heads=2, num_layers=2, d_ff=512, dropout=0.1, max_len=512):
        super().__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.d_model = d_model

    def create_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)  # Padding mask
        tgt_seq_len = tgt.size(1)
        nopeak_mask = torch.tril(torch.ones(tgt_seq_len, tgt_seq_len)).bool().to(tgt.device)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(2) & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.create_mask(src, tgt)
        src_embedded = self.dropout(self.pos_encoding(self.src_embedding(src) * math.sqrt(self.d_model)))
        tgt_embedded = self.dropout(self.pos_encoding(self.tgt_embedding(tgt) * math.sqrt(self.d_model)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        return self.fc_out(dec_output)

# Initialize the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Transformer(
    src_vocab_size=len(pseudocode_vocab),
    tgt_vocab_size=len(cpp_vocab),
    d_model=128,  # Further reduced hidden units for memory efficiency
    num_heads=2,  # Reduced number of attention heads
    num_layers=2,  # Reduced number of layers
    d_ff=512,  # Reduced feed-forward dimension
    dropout=0.1
).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# Initialize the GradScaler for mixed precision
scaler = GradScaler()

# Training loop with mixed precision
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    start_time = time.time()
    train_loss = 0

    for pseudocode, cpp in train_loader:
        pseudocode, cpp = pseudocode.to(device), cpp.to(device)
        optimizer.zero_grad()

        with autocast():  # Automatic mixed precision context
            # Decoder input: <sos> + target[:-1]
            # Target output: target[1:] + <eos>
            output = model(pseudocode, cpp[:, :-1])
            loss = criterion(output.view(-1, len(cpp_vocab)), cpp[:, 1:].contiguous().view(-1))

        # Scales the loss and updates the weights
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()  # Updates the scale for the next iteration

        train_loss += loss.item()

    train_loss /= len(train_loader)
    end_time = time.time()

    # Validation loop with mixed precision
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for pseudocode, cpp in val_loader:
            pseudocode, cpp = pseudocode.to(device), cpp.to(device)
            with autocast():  # Automatic mixed precision context
                output = model(pseudocode, cpp[:, :-1])
                loss = criterion(output.view(-1, len(cpp_vocab)), cpp[:, 1:].contiguous().view(-1))
            val_loss += loss.item()
    val_loss /= len(val_loader)

    # Display results
    epoch_time = end_time - start_time
    print(f"Epoch {epoch+1}/{num_epochs}, Time: {epoch_time:.2f}s, Val Loss: {val_loss:.4f}")

# Optional: Evaluate on test set
model.eval()
test_loss = 0
with torch.no_grad():
    for pseudocode, cpp in test_loader:
        pseudocode, cpp = pseudocode.to(device), cpp.to(device)
        with autocast():  # Automatic mixed precision context
            output = model(pseudocode, cpp[:, :-1])
            loss = criterion(output.view(-1, len(cpp_vocab)), cpp[:, 1:].contiguous().view(-1))
        test_loss += loss.item()
test_loss /= len(test_loader)
print(f"Test Loss: {test_loss:.4f}")


Training samples: 172980
Validation samples: 21622
Test samples: 21623


OutOfMemoryError: CUDA out of memory. Tried to allocate 14.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 2.12 MiB is free. Process 2433 has 14.74 GiB memory in use. Of the allocated memory 935.34 MiB is allocated by PyTorch, and 64.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import math
import time
from sklearn.model_selection import train_test_split
from torch.cuda.amp import autocast, GradScaler
import os

# Set the environment variable to avoid fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Load the dataset (replace 'spoc-train.tsv' with your actual file path)
data = pd.read_csv('/kaggle/input/spoc-data/spoc-train.tsv', sep='\t')

# Keep only 'text' and 'code' columns
data = data[['text', 'code']]

# Check for missing values and drop them if any
data = data.dropna()

# Split into train (80%) and temp (20%)
train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)

# Split temp into validation (10%) and test (10%)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print(f"Test samples: {len(test_data)}")

# Simple tokenizer (splits on spaces)
def tokenize(text):
    return text.split()

# Build vocabularies from training data
def build_vocab(data, tokenizer):
    tokens = set()
    for item in data:
        tokens.update(tokenizer(item))
    # Reserve 0 for padding, 1 for <sos>, 2 for <eos>
    vocab = {token: idx for idx, token in enumerate(tokens, start=3)}
    vocab['<pad>'] = 0
    vocab['<sos>'] = 1
    vocab['<eos>'] = 2
    return vocab

# Create vocabularies
pseudocode_vocab = build_vocab(train_data['text'], tokenize)
cpp_vocab = build_vocab(train_data['code'], tokenize)

# Inverse vocabularies for decoding (optional)
inv_cpp_vocab = {idx: token for token, idx in cpp_vocab.items()}

# Create Dataset class
class CodeDataset(Dataset):
    def __init__(self, data, pseudocode_vocab, cpp_vocab, tokenizer):
        self.data = data
        self.pseudocode_vocab = pseudocode_vocab
        self.cpp_vocab = cpp_vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        pseudocode = self.data.iloc[idx]['text']
        cpp = self.data.iloc[idx]['code']
        # Add special tokens
        pseudo_tokens = ['<sos>'] + self.tokenizer(pseudocode) + ['<eos>']
        cpp_tokens = ['<sos>'] + self.tokenizer(cpp) + ['<eos>']
        # Convert to indices
        pseudo_indices = [self.pseudocode_vocab.get(token, 0) for token in pseudo_tokens]
        cpp_indices = [self.cpp_vocab.get(token, 0) for token in cpp_tokens]
        return torch.tensor(pseudo_indices), torch.tensor(cpp_indices)

# Padding function for batches
def collate_fn(batch):
    pseudocode, cpp = zip(*batch)
    pseudocode = torch.nn.utils.rnn.pad_sequence(pseudocode, padding_value=0, batch_first=True)
    cpp = torch.nn.utils.rnn.pad_sequence(cpp, padding_value=0, batch_first=True)
    return pseudocode, cpp

# Create DataLoaders
batch_size = 8  # Reduced batch size due to memory constraints
train_dataset = CodeDataset(train_data, pseudocode_vocab, cpp_vocab, tokenize)
val_dataset = CodeDataset(val_data, pseudocode_vocab, cpp_vocab, tokenize)
test_dataset = CodeDataset(test_data, pseudocode_vocab, cpp_vocab, tokenize)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # Shape: [1, max_len, d_model]
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]


# Multi-Head Attention
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        d_k = Q.size(-1)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn = torch.softmax(scores, dim=-1)
        return torch.matmul(attn, V), attn

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0)
        Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        output, attn = self.scaled_dot_product_attention(Q, K, V, mask)
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.W_o(output)


# Feed-Forward Network
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.linear2(self.dropout(self.relu(self.linear1(x))))


# Encoder Layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        x = self.norm1(x + self.dropout(self.mha(x, x, x, mask)))
        x = self.norm2(x + self.dropout(self.ff(x)))
        return x


# Decoder Layer
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        x = self.norm1(x + self.dropout(self.mha1(x, x, x, tgt_mask)))
        x = self.norm2(x + self.dropout(self.mha2(x, enc_output, enc_output, src_mask)))
        x = self.norm3(x + self.dropout(self.ff(x)))
        return x


# Transformer Model
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=64, num_heads=2, num_layers=2, d_ff=256, dropout=0.1, max_len=512):
        super().__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.d_model = d_model

    def create_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)  # Padding mask
        tgt_seq_len = tgt.size(1)
        nopeak_mask = torch.tril(torch.ones(tgt_seq_len, tgt_seq_len)).bool().to(tgt.device)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(2) & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.create_mask(src, tgt)
        src_embedded = self.dropout(self.pos_encoding(self.src_embedding(src) * math.sqrt(self.d_model)))
        tgt_embedded = self.dropout(self.pos_encoding(self.tgt_embedding(tgt) * math.sqrt(self.d_model)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        return self.fc_out(dec_output)

# Initialize the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Transformer(
    src_vocab_size=len(pseudocode_vocab),
    tgt_vocab_size=len(cpp_vocab),
    d_model=64,  # Further reduced hidden units for memory efficiency
    num_heads=2,  # Reduced number of attention heads
    num_layers=2,  # Reduced number of layers
    d_ff=256,  # Reduced feed-forward dimension
    dropout=0.1
).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# Initialize the GradScaler for mixed precision
scaler = GradScaler()

# Training loop with mixed precision
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    start_time = time.time()
    train_loss = 0

    for pseudocode, cpp in train_loader:
        pseudocode, cpp = pseudocode.to(device), cpp.to(device)
        optimizer.zero_grad()

        with autocast():  # Automatic mixed precision context
            # Decoder input: <sos> + target[:-1]
            # Target output: target[1:] + <eos>
            output = model(pseudocode, cpp[:, :-1])
            loss = criterion(output.view(-1, len(cpp_vocab)), cpp[:, 1:].contiguous().view(-1))

        # Scales the loss and updates the weights
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()  # Updates the scale for the next iteration

        train_loss += loss.item()

    train_loss /= len(train_loader)
    end_time = time.time()

    # Validation loop with mixed precision
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for pseudocode, cpp in val_loader:
            pseudocode, cpp = pseudocode.to(device), cpp.to(device)
            with autocast():  # Automatic mixed precision context
                output = model(pseudocode, cpp[:, :-1])
                loss = criterion(output.view(-1, len(cpp_vocab)), cpp[:, 1:].contiguous().view(-1))
            val_loss += loss.item()
    val_loss /= len(val_loader)

    # Display results
    epoch_time = end_time - start_time
    print(f"Epoch {epoch+1}/{num_epochs}, Time: {epoch_time:.2f}s, Val Loss: {val_loss:.4f}")

# Optional: Evaluate on test set
model.eval()
test_loss = 0
with torch.no_grad():
    for pseudocode, cpp in test_loader:
        pseudocode, cpp = pseudocode.to(device), cpp.to(device)
        with autocast():  # Automatic mixed precision context
            output = model(pseudocode, cpp[:, :-1])
            loss = criterion(output.view(-1, len(cpp_vocab)), cpp[:, 1:].contiguous().view(-1))
        test_loss += loss.item()
test_loss /= len(test_loader)
print(f"Test Loss: {test_loss:.4f}")


Training samples: 172980
Validation samples: 21622
Test samples: 21623


  scaler = GradScaler()
  with autocast():  # Automatic mixed precision context


RuntimeError: value cannot be converted to type at::Half without overflow

# ***MODEL WITH SOME OUTPUT***

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import math
import time
from sklearn.model_selection import train_test_split
import os

# Set the environment variable to avoid fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Load the dataset (replace 'spoc-train.tsv' with your actual file path)
data = pd.read_csv('/kaggle/input/spoc-data/spoc-train.tsv', sep='\t')

# Keep only 'text' and 'code' columns
data = data[['text', 'code']]

# Check for missing values and drop them if any
data = data.dropna()

# Split into train (80%) and temp (20%)
train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)

# Split temp into validation (10%) and test (10%)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print(f"Test samples: {len(test_data)}")

# Simple tokenizer (splits on spaces)
def tokenize(text):
    return text.split()

# Build vocabularies from training data
def build_vocab(data, tokenizer):
    tokens = set()
    for item in data:
        tokens.update(tokenizer(item))
    # Reserve 0 for padding, 1 for <sos>, 2 for <eos>
    vocab = {token: idx for idx, token in enumerate(tokens, start=3)}
    vocab['<pad>'] = 0
    vocab['<sos>'] = 1
    vocab['<eos>'] = 2
    return vocab

# Create vocabularies
pseudocode_vocab = build_vocab(train_data['text'], tokenize)
cpp_vocab = build_vocab(train_data['code'], tokenize)

# Inverse vocabularies for decoding (optional)
inv_cpp_vocab = {idx: token for token, idx in cpp_vocab.items()}

# Create Dataset class
class CodeDataset(Dataset):
    def __init__(self, data, pseudocode_vocab, cpp_vocab, tokenizer):
        self.data = data
        self.pseudocode_vocab = pseudocode_vocab
        self.cpp_vocab = cpp_vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        pseudocode = self.data.iloc[idx]['text']
        cpp = self.data.iloc[idx]['code']
        # Add special tokens
        pseudo_tokens = ['<sos>'] + self.tokenizer(pseudocode) + ['<eos>']
        cpp_tokens = ['<sos>'] + self.tokenizer(cpp) + ['<eos>']
        # Convert to indices
        pseudo_indices = [self.pseudocode_vocab.get(token, 0) for token in pseudo_tokens]
        cpp_indices = [self.cpp_vocab.get(token, 0) for token in cpp_tokens]
        return torch.tensor(pseudo_indices), torch.tensor(cpp_indices)

# Padding function for batches
def collate_fn(batch):
    pseudocode, cpp = zip(*batch)
    pseudocode = torch.nn.utils.rnn.pad_sequence(pseudocode, padding_value=0, batch_first=True)
    cpp = torch.nn.utils.rnn.pad_sequence(cpp, padding_value=0, batch_first=True)
    return pseudocode, cpp

# Create DataLoaders
batch_size = 8  # Reduced batch size due to memory constraints
train_dataset = CodeDataset(train_data, pseudocode_vocab, cpp_vocab, tokenize)
val_dataset = CodeDataset(val_data, pseudocode_vocab, cpp_vocab, tokenize)
test_dataset = CodeDataset(test_data, pseudocode_vocab, cpp_vocab, tokenize)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # Shape: [1, max_len, d_model]
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]


# Multi-Head Attention
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        d_k = Q.size(-1)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn = torch.softmax(scores, dim=-1)
        return torch.matmul(attn, V), attn

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0)
        Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        output, attn = self.scaled_dot_product_attention(Q, K, V, mask)
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.W_o(output)


# Feed-Forward Network
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.linear2(self.dropout(self.relu(self.linear1(x))))


# Encoder Layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        x = self.norm1(x + self.dropout(self.mha(x, x, x, mask)))
        x = self.norm2(x + self.dropout(self.ff(x)))
        return x


# Decoder Layer
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        x = self.norm1(x + self.dropout(self.mha1(x, x, x, tgt_mask)))
        x = self.norm2(x + self.dropout(self.mha2(x, enc_output, enc_output, src_mask)))
        x = self.norm3(x + self.dropout(self.ff(x)))
        return x


# Transformer Model
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=64, num_heads=2, num_layers=2, d_ff=256, dropout=0.1, max_len=512):
        super().__init__()
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.d_model = d_model

    def create_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)  # Padding mask
        tgt_seq_len = tgt.size(1)
        nopeak_mask = torch.tril(torch.ones(tgt_seq_len, tgt_seq_len)).bool().to(tgt.device)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(2) & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.create_mask(src, tgt)
        src_embedded = self.dropout(self.pos_encoding(self.src_embedding(src) * math.sqrt(self.d_model)))
        tgt_embedded = self.dropout(self.pos_encoding(self.tgt_embedding(tgt) * math.sqrt(self.d_model)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        return self.fc_out(dec_output)

# Initialize the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Transformer(
    src_vocab_size=len(pseudocode_vocab),
    tgt_vocab_size=len(cpp_vocab),
    d_model=64,  # Further reduced hidden units for memory efficiency
    num_heads=2,  # Reduced number of attention heads
    num_layers=2,  # Reduced number of layers
    d_ff=128,  # Reduced feed-forward dimension
    dropout=0.1
).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)


Training samples: 172980
Validation samples: 21622
Test samples: 21623


In [None]:

# Training loop without mixed precision
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    start_time = time.time()
    train_loss = 0

    for pseudocode, cpp in train_loader:
        pseudocode, cpp = pseudocode.to(device), cpp.to(device)
        optimizer.zero_grad()

        # Decoder input: <sos> + target[:-1]
        # Target output: target[1:] + <eos>
        output = model(pseudocode, cpp[:, :-1])
        loss = criterion(output.view(-1, len(cpp_vocab)), cpp[:, 1:].contiguous().view(-1))

        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    train_loss /= len(train_loader)
    end_time = time.time()

    # Validation loop
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for pseudocode, cpp in val_loader:
            pseudocode, cpp = pseudocode.to(device), cpp.to(device)
            output = model(pseudocode, cpp[:, :-1])
            loss = criterion(output.view(-1, len(cpp_vocab)), cpp[:, 1:].contiguous().view(-1))
            val_loss += loss.item()
    val_loss /= len(val_loader)

    # Display results
    epoch_time = end_time - start_time
    print(f"Epoch {epoch+1}/{num_epochs}, Time: {epoch_time:.2f}s, Val Loss: {val_loss:.4f}")


Epoch 1/5, Time: 363.58s, Val Loss: 2.1023
Epoch 2/5, Time: 360.98s, Val Loss: 1.8708
Epoch 3/5, Time: 360.65s, Val Loss: 1.7683
Epoch 4/5, Time: 360.75s, Val Loss: 1.7206
Epoch 5/5, Time: 360.35s, Val Loss: 1.8546
Test Loss: 1.8228


In [None]:
# Optional: Evaluate on test set
model.eval()
test_loss = 0
with torch.no_grad():
    for pseudocode, cpp in test_loader:
        pseudocode, cpp = pseudocode.to(device), cpp.to(device)
        output = model(pseudocode, cpp[:, :-1])
        loss = criterion(output.view(-1, len(cpp_vocab)), cpp[:, 1:].contiguous().view(-1))
        test_loss += loss.item()
test_loss /= len(test_loader)
print(f"Test Loss: {test_loss:.4f}")

Test Loss: 1.8228


In [None]:
torch.save(model.state_dict(), "transformer_seq2seq_testing0.2.pth")
print("Model saved successfully.")

Model saved successfully.


In [None]:
# Load model
model.load_state_dict(torch.load("/kaggle/input/test0.1/pytorch/test0.1/1/transformer_seq2seq_testing0.1.pth"))
model.eval()
print("Model loaded successfully.")

In [None]:
# Inverse vocabularies for decoding (for C++ code)
inv_code_vocab = {idx: token for token, idx in cpp_vocab.items()}

def generate_code(model, input_text, text_vocab, code_vocab, max_len=100):
    """
    Generate C++ code from pseudocode using the trained Transformer model.

    :param model: The trained Transformer model.
    :param input_text: The input pseudocode as a string.
    :param text_vocab: Vocabulary for pseudocode (text).
    :param code_vocab: Vocabulary for C++ code.
    :param max_len: Maximum length of the generated code.
    :return: The generated C++ code as a string.
    """
    model.eval()  # Set the model to evaluation mode

    # Tokenize the input text (pseudocode)
    input_tokens = ['<sos>'] + input_text.split() + ['<eos>']

    # Convert tokens to indices using the text vocabulary
    input_indices = [text_vocab.get(token, 0) for token in input_tokens]
    input_tensor = torch.tensor(input_indices).unsqueeze(0).to(device)  # Add batch dimension

    # Start decoding the output C++ code
    generated_code = []
    tgt_input = torch.tensor([text_vocab['<sos>']]).unsqueeze(0).to(device)  # Initial target token

    for _ in range(max_len):
        with torch.no_grad():
            # Get the model's prediction for the next token
            output = model(input_tensor, tgt_input)

            # Get the predicted token (with highest probability) for each position
            predicted_token_idx = output.argmax(dim=-1)[:, -1].item()  # Get the last token prediction

            # If we predict the <eos> token, stop generating
            if predicted_token_idx == code_vocab['<eos>']:
                break

            # Add the predicted token to the output sequence
            generated_code.append(inv_code_vocab.get(predicted_token_idx, '<unk>'))

            # Update the target input (append the predicted token for the next step)
            tgt_input = torch.cat([tgt_input, torch.tensor([[predicted_token_idx]]).to(device)], dim=-1)

    # Join the generated tokens into a single string
    generated_code_str = ' '.join(generated_code)
    return generated_code_str

# Example usage:
input_text = "for i from 1 to n do"
generated_code = generate_code(model, input_text, pseudocode_vocab, cpp_vocab, max_len=100)
print("Generated C++ code:", generated_code)


Generated C++ code: for (int i = 1; i <= n; i++) {


In [None]:
input_text = "create integers x1, y1, x2, y2"
generated_code = generate_code(model, input_text, pseudocode_vocab, cpp_vocab, max_len=100)
print("Generated C++ code:", generated_code)

Generated C++ code: int n, m, k;


In [None]:
input_text = "read s"
generated_code = generate_code(model, input_text, pseudocode_vocab, cpp_vocab, max_len=100)
print("Generated C++ code:", generated_code)

Generated C++ code: cin >> s;


In [None]:
import gradio as gr

# Define the function that uses the generate_code logic
def generate_cpp_code_from_pseudocode(input_text):
    generated_code = generate_code(model, input_text, pseudocode_vocab, cpp_vocab, max_len=100)
    return generated_code

# Create a Gradio interface
interface = gr.Interface(
    fn=generate_cpp_code_from_pseudocode,          # Function to run
    inputs=gr.Textbox(lines=2, placeholder="Enter pseudocode here..."),  # Input text box for pseudocode
    outputs=gr.Textbox(label="Generated C++ Code"),  # Output text box for generated C++ code
    title="Pseudocode to C++ Code Generator",  # Title of the interface
    description="Enter pseudocode and get the corresponding C++ code generated using the Transformer model.",
    theme="compact"
)

# Launch the interface
interface.launch()



Sorry, we can't find the page you are looking for.


* Running on local URL:  http://127.0.0.1:7860
Kaggle notebooks require sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://57ed0cbbb7bf79fefa.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.19.0-py3-none-any.whl.metadata (16 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.8-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.2 (from gradio)
  Downloading gradio_client-1.7.2-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.9.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (from gradio)
  Downloading safehttpx-0.1.6-py3-none-any.whl.metadata (4.2 kB)
Collecting semantic-version~=2.0 (from gradio)
  Downloading semantic_version-2.1