In [1]:
!pip install datasets tokenizers nltk sacrebleu -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.8/100.8 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np
import random
import os
from tqdm import tqdm

In [3]:
# Configuration
MAX_DOC_LEN = 40
MAX_CODE_LEN = 80
BATCH_SIZE = 32
EMB_DIM = 256
HID_DIM = 512
NUM_LAYERS = 2
DROPOUT = 0.3
EPOCHS = 10
LR = 1e-3
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Load dataset
dataset = load_dataset("Nan-Do/code-search-net-python")
full_data = dataset["train"].select(range(7000))

split1 = full_data.train_test_split(test_size=0.2, seed=42)
train_data = split1["train"]
temp_data = split1["test"]

split2 = temp_data.train_test_split(test_size=0.5, seed=42)
val_data = split2["train"]
test_data = split2["test"]

print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00004-ee77a7de79eb2a(…):   0%|          | 0.00/155M [00:00<?, ?B/s]

data/train-00001-of-00004-648b3bede2edf6(…):   0%|          | 0.00/139M [00:00<?, ?B/s]

data/train-00002-of-00004-1dfd72b171e6b2(…):   0%|          | 0.00/153M [00:00<?, ?B/s]

data/train-00003-of-00004-184ab6d0e3c690(…):   0%|          | 0.00/151M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/455243 [00:00<?, ? examples/s]

Train: 5600, Val: 700, Test: 700


In [5]:
# Tokenizer setup
def setup_tokenizer(train_data, vocab_size=8000):
    tokenizer_path = "tokenizer.json"
    
    if os.path.exists(tokenizer_path):
        tokenizer = Tokenizer.from_file(tokenizer_path)
        print("Loaded existing tokenizer")
    else:
        print("Training new tokenizer...")
        tokenizer = Tokenizer(models.BPE(unk_token="<UNK>"))
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        
        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            special_tokens=["<PAD>", "<SOS>", "<EOS>", "<UNK>"]
        )
        
        corpus = []
        for i in range(min(5000, len(train_data))):
            corpus.append(train_data[i]["docstring"])
            corpus.append(train_data[i]["code"])
        
        tokenizer.train_from_iterator(corpus, trainer)
        tokenizer.save(tokenizer_path)
        print("Tokenizer trained and saved")
    
    # Enable padding and truncation
    tokenizer.enable_padding(pad_id=tokenizer.token_to_id("<PAD>"), 
                            pad_token="<PAD>", 
                            length=MAX_CODE_LEN)
    tokenizer.enable_truncation(max_length=MAX_CODE_LEN)
    
    return tokenizer

tokenizer = setup_tokenizer(train_data)

# Token helpers
PAD_IDX = tokenizer.token_to_id("<PAD>")
SOS_IDX = tokenizer.token_to_id("<SOS>")
EOS_IDX = tokenizer.token_to_id("<EOS>")
UNK_IDX = tokenizer.token_to_id("<UNK>")
VOCAB_SIZE = tokenizer.get_vocab_size()

print(f"Vocab size: {VOCAB_SIZE}")
print(f"PAD: {PAD_IDX}, SOS: {SOS_IDX}, EOS: {EOS_IDX}, UNK: {UNK_IDX}")

def encode(text):
    return tokenizer.encode(text).ids

def decode(ids):
    return tokenizer.decode(ids)

Training new tokenizer...



Tokenizer trained and saved
Vocab size: 8000
PAD: 0, SOS: 1, EOS: 2, UNK: 3


In [6]:
# Dataset
class CodeDataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        
        # Ensure we have valid strings
        docstring = item["docstring"] if item["docstring"] else ""
        code = item["code"] if item["code"] else ""
        
        doc = encode(docstring)[:MAX_DOC_LEN]
        code = encode(code)[:MAX_CODE_LEN]
        
        # Ensure sequences are not empty
        if len(doc) == 0:
            doc = [PAD_IDX]
        if len(code) == 0:
            code = [PAD_IDX]
            
        return {"doc": doc, "code": code}

In [7]:
# Collate function
def collate_fn(batch):
    docs = [b["doc"] for b in batch]
    codes = [b["code"] for b in batch]
    
    max_doc = max(len(d) for d in docs)
    max_code = max(len(c) for c in codes)
    
    src_list, trg_in_list, trg_out_list = [], [], []
    
    for d, c in zip(docs, codes):
        # Pad source
        src_list.append(d + [PAD_IDX] * (max_doc - len(d)))
        
        # Target input (with SOS)
        trg_in_list.append([SOS_IDX] + c + [PAD_IDX] * (max_code - len(c)))
        
        # Target output (with EOS)
        trg_out_list.append(c + [EOS_IDX] + [PAD_IDX] * (max_code - len(c)))
    
    return {
        "src": torch.tensor(src_list, dtype=torch.long),
        "trg_in": torch.tensor(trg_in_list, dtype=torch.long),
        "trg_out": torch.tensor(trg_out_list, dtype=torch.long)
    }

In [8]:
# DataLoaders
train_loader = DataLoader(CodeDataset(train_data), batch_size=BATCH_SIZE, 
                         shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(CodeDataset(val_data), batch_size=BATCH_SIZE, 
                       collate_fn=collate_fn)
test_loader = DataLoader(CodeDataset(test_data), batch_size=BATCH_SIZE, 
                        collate_fn=collate_fn)

In [9]:
# Encoder
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, num_layers=2, dropout=0.3):
        super().__init__()
        self.hid_dim = hid_dim
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=PAD_IDX)
        self.dropout = nn.Dropout(dropout)
        self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, 
                          dropout=dropout, batch_first=False)
        
    def forward(self, src):
        # src shape: (seq_len, batch_size)
        embedded = self.dropout(self.embedding(src))
        
        # Initialize hidden states
        batch_size = src.shape[1]
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(src.device)
        cell = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(src.device)
        
        outputs, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        return hidden, cell

In [10]:
# Decoder
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, num_layers=2, dropout=0.3):
        super().__init__()
        self.hid_dim = hid_dim
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=PAD_IDX)
        self.dropout = nn.Dropout(dropout)
        self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, 
                          dropout=dropout, batch_first=False)
        self.fc = nn.Linear(hid_dim, output_dim)
        
    def forward(self, x, hidden, cell):
        # x shape: (batch_size)
        x = x.unsqueeze(0)  # (1, batch_size)
        embedded = self.dropout(self.embedding(x))
        
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc(output.squeeze(0))  # (batch_size, output_dim)
        
        return prediction, hidden, cell

In [11]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        # src: (seq_len, batch_size)
        # trg: (seq_len, batch_size)
        
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        vocab_size = self.decoder.fc.out_features
        
        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)
        
        hidden, cell = self.encoder(src)
        x = trg[0]  # First input is SOS token
        
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output
            
            # Teacher forcing
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            x = trg[t] if teacher_force else top1
            
        return outputs

In [12]:
# Initialize model
encoder = Encoder(VOCAB_SIZE, EMB_DIM, HID_DIM, NUM_LAYERS, DROPOUT)
decoder = Decoder(VOCAB_SIZE, EMB_DIM, HID_DIM, NUM_LAYERS, DROPOUT)
model = Seq2Seq(encoder, decoder, DEVICE).to(DEVICE)

# Count parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"The model has {count_parameters(model):,} trainable parameters")

# Optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

The model has 15,556,416 trainable parameters


In [13]:
# Training function
def train_epoch(model, loader, optimizer, criterion, clip=1):
    model.train()
    epoch_loss = 0
    
    for batch in tqdm(loader, desc="Training"):
        src = batch["src"].transpose(0, 1).to(DEVICE)
        trg_in = batch["trg_in"].transpose(0, 1).to(DEVICE)
        trg_out = batch["trg_out"].transpose(0, 1).to(DEVICE)
        
        optimizer.zero_grad()
        
        output = model(src, trg_in)
        
        # Reshape for loss
        output = output[1:].reshape(-1, output.shape[-1])
        trg_out = trg_out[1:].reshape(-1)
        
        loss = criterion(output, trg_out)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        epoch_loss += loss.item()
    
    return epoch_loss / len(loader)


In [14]:
# Evaluation function
def evaluate_epoch(model, loader, criterion):
    model.eval()
    epoch_loss = 0
    
    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            src = batch["src"].transpose(0, 1).to(DEVICE)
            trg_in = batch["trg_in"].transpose(0, 1).to(DEVICE)
            trg_out = batch["trg_out"].transpose(0, 1).to(DEVICE)
            
            output = model(src, trg_in, teacher_forcing_ratio=0)
            
            output = output[1:].reshape(-1, output.shape[-1])
            trg_out = trg_out[1:].reshape(-1)
            
            loss = criterion(output, trg_out)
            epoch_loss += loss.item()
    
    return epoch_loss / len(loader)

In [15]:
# Training loop
best_val_loss = float('inf')

for epoch in range(EPOCHS):
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    val_loss = evaluate_epoch(model, val_loader, criterion)
    
    print(f"Epoch {epoch+1:02}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_model_lstm.pt")
        print(f"✓ Saved best model with val loss: {val_loss:.4f}")

Training: 100%|██████████| 175/175 [00:40<00:00,  4.32it/s]
Evaluating: 100%|██████████| 22/22 [00:02<00:00,  8.72it/s]


Epoch 01: Train Loss = 6.6759, Val Loss = 6.6036
✓ Saved best model with val loss: 6.6036


Training: 100%|██████████| 175/175 [00:39<00:00,  4.40it/s]
Evaluating: 100%|██████████| 22/22 [00:02<00:00,  8.57it/s]


Epoch 02: Train Loss = 6.3142, Val Loss = 6.6697


Training: 100%|██████████| 175/175 [00:39<00:00,  4.39it/s]
Evaluating: 100%|██████████| 22/22 [00:02<00:00,  8.68it/s]


Epoch 03: Train Loss = 6.1330, Val Loss = 6.6012
✓ Saved best model with val loss: 6.6012


Training: 100%|██████████| 175/175 [00:39<00:00,  4.39it/s]
Evaluating: 100%|██████████| 22/22 [00:02<00:00,  8.51it/s]


Epoch 04: Train Loss = 6.0043, Val Loss = 6.6833


Training: 100%|██████████| 175/175 [00:40<00:00,  4.31it/s]
Evaluating: 100%|██████████| 22/22 [00:02<00:00,  8.43it/s]


Epoch 05: Train Loss = 5.8931, Val Loss = 6.4968
✓ Saved best model with val loss: 6.4968


Training: 100%|██████████| 175/175 [00:40<00:00,  4.37it/s]
Evaluating: 100%|██████████| 22/22 [00:02<00:00,  8.52it/s]


Epoch 06: Train Loss = 5.7868, Val Loss = 6.4615
✓ Saved best model with val loss: 6.4615


Training: 100%|██████████| 175/175 [00:39<00:00,  4.39it/s]
Evaluating: 100%|██████████| 22/22 [00:02<00:00,  8.67it/s]


Epoch 07: Train Loss = 5.6818, Val Loss = 6.4578
✓ Saved best model with val loss: 6.4578


Training: 100%|██████████| 175/175 [00:39<00:00,  4.38it/s]
Evaluating: 100%|██████████| 22/22 [00:02<00:00,  8.59it/s]


Epoch 08: Train Loss = 5.5931, Val Loss = 6.4002
✓ Saved best model with val loss: 6.4002


Training: 100%|██████████| 175/175 [00:39<00:00,  4.38it/s]
Evaluating: 100%|██████████| 22/22 [00:02<00:00,  8.39it/s]


Epoch 09: Train Loss = 5.4952, Val Loss = 6.5477


Training: 100%|██████████| 175/175 [00:39<00:00,  4.38it/s]
Evaluating: 100%|██████████| 22/22 [00:02<00:00,  8.53it/s]

Epoch 10: Train Loss = 5.4142, Val Loss = 6.4204





In [16]:
def generate_code(model, docstring, max_len=MAX_CODE_LEN):
    model.eval()
    
    # Encode docstring
    tokens = encode(docstring)[:MAX_DOC_LEN]
    if len(tokens) == 0:
        tokens = [PAD_IDX]
    
    src = torch.tensor(tokens).unsqueeze(1).to(DEVICE)
    
    with torch.no_grad():
        hidden, cell = model.encoder(src)
        
        x = torch.tensor([SOS_IDX]).to(DEVICE)
        outputs = []
        
        for _ in range(max_len):
            output, hidden, cell = model.decoder(x, hidden, cell)
            
            # Use sampling for diversity
            probs = torch.softmax(output, dim=1)
            top1 = output.argmax(1).item()
            
            if top1 == EOS_IDX:
                break
                
            outputs.append(top1)
            x = torch.tensor([top1]).to(DEVICE)
    
    return decode(outputs)

In [17]:
# BLEU score calculation
def calculate_bleu(model, test_data, n_samples=100):
    smooth = SmoothingFunction().method1
    scores = []
    
    for i in range(min(n_samples, len(test_data))):
        ref = test_data[i]["code"]
        doc = test_data[i]["docstring"]
        
        pred = generate_code(model, doc)
        
        # Use subword tokens for BLEU
        ref_tokens = tokenizer.encode(ref).tokens
        pred_tokens = tokenizer.encode(pred).tokens
        
        if len(ref_tokens) > 0 and len(pred_tokens) > 0:
            try:
                score = sentence_bleu([ref_tokens], pred_tokens, 
                                    smoothing_function=smooth)
                scores.append(score)
            except:
                continue
    
    return np.mean(scores) if scores else 0.0

In [23]:
# Final evaluation
model.load_state_dict(torch.load("best_model_lstm.pt"))
model.to(DEVICE)

bleu = calculate_bleu(model, test_data, n_samples=100)
print(f"\nBLEU Score on 50 test samples: {bleu:.4f}")

# Example generation
print("\n" + "="*50)
print("Example Generation:")
print("-"*50)
idx = random.randint(0, len(test_data)-1)
docstring = test_data[idx]["docstring"]
reference = test_data[idx]["code"]
generated = generate_code(model, docstring)

print(f"Docstring: {docstring[:100]}..." if len(docstring) > 100 else f"Docstring: {docstring}")
print(f"\nGenerated:\n{generated[:200]}..." if len(generated) > 200 else f"\nGenerated:\n{generated}")
print(f"\nReference:\n{reference[:200]}..." if len(reference) > 200 else f"\nReference:\n{reference}")


BLEU Score on 50 test samples: 0.0067

Example Generation:
--------------------------------------------------
Docstring: Decorator for Layers, overriding add_weight for trainable initializers.

Generated:
_ ( ( ( , """ """ the the the the the .""" = . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . =

Reference:
def add_weight(cls):
  """Decorator for Layers, overriding add_weight for trainable initializers."""
  @functools.wraps(cls.add_weight)
  def _add_weight(self,
                  name=None,
           ...
