In [26]:
!pip install datasets transformers nltk sentencepiece

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer
from nltk.translate.bleu_score import sentence_bleu
import random
import numpy as np

In [28]:
special_tokens = {"pad_token":"<PAD>", "bos_token":"<SOS>", "eos_token":"<EOS>"}
tokenizer.add_special_tokens(special_tokens)

PAD_IDX = tokenizer.pad_token_id
SOS_IDX = tokenizer.bos_token_id
EOS_IDX = tokenizer.eos_token_id

# Now, get updated vocab size
INPUT_DIM = len(tokenizer)
OUTPUT_DIM = len(tokenizer)

print("Vocab size:", INPUT_DIM)
print("PAD_IDX:", PAD_IDX, "SOS_IDX:", SOS_IDX, "EOS_IDX:", EOS_IDX)


Vocab size: 30525
PAD_IDX: 30522 SOS_IDX: 30523 EOS_IDX: 30524


In [29]:
dataset = load_dataset("Nan-Do/code-search-net-python")

# Take subset for faster training
full_data = dataset["train"].select(range(7000))

# Split 80% train, 10% val, 10% test
split1 = full_data.train_test_split(test_size=0.2, seed=42)
train_data = split1["train"]
temp_data  = split1["test"]
split2 = temp_data.train_test_split(test_size=0.5, seed=42)
val_data  = split2["train"]
test_data = split2["test"]

print(len(train_data), len(val_data), len(test_data))


5600 700 700


In [30]:
class CodeDataset(Dataset):
    def __init__(self, data, tokenizer, max_doc_len=50, max_code_len=80):
        self.data = data
        self.tokenizer = tokenizer
        self.max_doc_len = max_doc_len
        self.max_code_len = max_code_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        # Tokenize docstring
        doc = self.tokenizer.encode(
            item["docstring"], truncation=True, max_length=self.max_doc_len, add_special_tokens=False
        )
        # Tokenize code
        code = self.tokenizer.encode(
            item["code"], truncation=True, max_length=self.max_code_len, add_special_tokens=False
        )
        return {"doc": doc, "code": code}


In [31]:
def collate_fn(batch):
    docs = [b["doc"] for b in batch]
    codes = [b["code"] for b in batch]
    max_doc = max(len(d) for d in docs)
    max_code = max(len(c) for c in codes)

    doc_pad, trg_in, trg_out = [], [], []

    for d,c in zip(docs,codes):
        doc_pad.append(d + [PAD_IDX]*(max_doc-len(d)))
        trg_in.append([SOS_IDX] + c + [PAD_IDX]*(max_code-len(c)))
        trg_out.append(c + [EOS_IDX] + [PAD_IDX]*(max_code-len(c)))

    return {
        "src": torch.tensor(doc_pad),
        "trg_in": torch.tensor(trg_in),
        "trg_out": torch.tensor(trg_out)
    }


In [32]:
train_ds = CodeDataset(train_data, tokenizer)
val_ds   = CodeDataset(val_data, tokenizer)
test_ds  = CodeDataset(test_data, tokenizer)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader   = DataLoader(val_ds, batch_size=32, collate_fn=collate_fn)
test_loader  = DataLoader(test_ds, batch_size=32, collate_fn=collate_fn)


In [33]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=PAD_IDX)
        self.rnn = nn.RNN(emb_dim, hid_dim)

    def forward(self, src):
        embedded = self.embedding(src)
        _, hidden = self.rnn(embedded)
        return hidden

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=PAD_IDX)
        self.rnn = nn.RNN(emb_dim, hid_dim)
        self.fc = nn.Linear(hid_dim, output_dim)

    def forward(self, x, hidden):
        x = x.unsqueeze(0)
        emb = self.embedding(x)
        out, hidden = self.rnn(emb, hidden)
        pred = self.fc(out.squeeze(0))
        return pred, hidden

class Seq2Seq(nn.Module):
    def __init__(self, enc, dec, device):
        super().__init__()
        self.enc = enc
        self.dec = dec
        self.device = device

    def forward(self, src, trg, teacher_forcing=0.5):
        batch = trg.shape[1]
        trg_len = trg.shape[0]
        vocab = self.dec.fc.out_features

        outputs = torch.zeros(trg_len, batch, vocab).to(self.device)

        hidden = self.enc(src)
        x = trg[0]

        for t in range(1, trg_len):
            out, hidden = self.dec(x, hidden)
            outputs[t] = out
            top1 = out.argmax(1)
            x = trg[t] if random.random() < teacher_forcing else top1

        return outputs


In [34]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = Seq2Seq(
    Encoder(INPUT_DIM, 128, 256),
    Decoder(OUTPUT_DIM, 128, 256),
    device
).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)


cuda


In [35]:
def train(model, loader):
    model.train()
    total_loss = 0
    for b in loader:
        src = b["src"].transpose(0,1).to(device)
        trg_in = b["trg_in"].transpose(0,1).to(device)
        trg_out = b["trg_out"].transpose(0,1).to(device)

        optimizer.zero_grad()
        output = model(src, trg_in)

        output = output[1:].reshape(-1, output.shape[-1])
        trg_out = trg_out[1:].reshape(-1)

        loss = criterion(output, trg_out)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(loader)

def evaluate(model, loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for b in loader:
            src = b["src"].transpose(0,1).to(device)
            trg_in = b["trg_in"].transpose(0,1).to(device)
            trg_out = b["trg_out"].transpose(0,1).to(device)

            output = model(src, trg_in, teacher_forcing=0)
            output = output[1:].reshape(-1, output.shape[-1])
            trg_out = trg_out[1:].reshape(-1)

            loss = criterion(output, trg_out)
            total_loss += loss.item()
    return total_loss / len(loader)


In [40]:
best_val_loss = float('inf')

for epoch in range(10):
    train_loss = train(model, train_loader)
    val_loss = evaluate(model, val_loader)
    print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_model.pt")
        print("ðŸ’¾ Saved Best Model!")


Epoch 1: Train Loss = 4.8843, Val Loss = 6.0607
ðŸ’¾ Saved Best Model!
Epoch 2: Train Loss = 4.8615, Val Loss = 6.2293
Epoch 3: Train Loss = 4.8751, Val Loss = 5.9968
ðŸ’¾ Saved Best Model!
Epoch 4: Train Loss = 4.8054, Val Loss = 6.0667
Epoch 5: Train Loss = 4.7570, Val Loss = 6.1780
Epoch 6: Train Loss = 4.7393, Val Loss = 6.2079
Epoch 7: Train Loss = 4.7004, Val Loss = 6.0770
Epoch 8: Train Loss = 4.6934, Val Loss = 6.1861
Epoch 9: Train Loss = 4.6961, Val Loss = 6.0962
Epoch 10: Train Loss = 4.6772, Val Loss = 6.0596


In [None]:
def generate_code(model, docstring, max_len=80):
    model.eval()
    tokens = tokenizer.encode(docstring, add_special_tokens=False)
    src = torch.tensor(tokens).unsqueeze(1).to(device)  # [seq_len, batch=1]

    with torch.no_grad():
        hidden = model.enc(src)
        x = torch.tensor([SOS_IDX]).to(device)  # [1]
        output_tokens = []

        for _ in range(max_len):
            out, hidden = model.dec(x, hidden)  # out: [batch=1, vocab_size]
            top1 = out.argmax(dim=1).item()      # âœ… argmax over vocab dimension
            if top1 == EOS_IDX:
                break
            output_tokens.append(top1)
            x = torch.tensor([top1]).to(device)

    return tokenizer.decode(output_tokens)


In [48]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
smooth = SmoothingFunction().method1

def bleu_score(model, dataset, n_samples=100):
    scores = []
    for i in range(n_samples):
        ref_text = dataset[i]["code"]
        doc_text = dataset[i]["docstring"]

        # Generate prediction
        pred_text = generate_code(model, doc_text)

        # Encode both reference and prediction using tokenizer
        ref_tokens = tokenizer.encode(ref_text, add_special_tokens=False)
        pred_tokens = tokenizer.encode(pred_text, add_special_tokens=False)

        # Convert to strings for BLEU
        ref_tokens = [str(tok) for tok in ref_tokens]
        pred_tokens = [str(tok) for tok in pred_tokens]

        # Compute BLEU score with smoothing
        scores.append(sentence_bleu([ref_tokens], pred_tokens, smoothing_function=smooth))
    return np.mean(scores)


In [49]:
# Load best model
model.load_state_dict(torch.load("best_model.pt"))
model.to(device)

print("BLEU score on 100 test examples:", bleu_score(model, test_data, n_samples=100))


BLEU score on 100 test examples: 0.021226243711369244
