# NLBSE 2025 competition submission
by: Wyatt Markham

In [None]:
!pip install datasets
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu12x


In [42]:
import pandas as pd
import time
from datasets import Dataset, DatasetDict, load_dataset
from tqdm.auto import tqdm
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from collections import Counter
from sklearn.model_selection import ParameterGrid
tqdm.pandas()

In [4]:
# dataset from hf_hub
langs = ['java', 'python', 'pharo']
labels = {
    'java': ['summary', 'Ownership', 'Expand', 'usage', 'Pointer', 'deprecation', 'rational'],
    'python': ['Usage', 'Parameters', 'DevelopmentNotes', 'Expand', 'Summary'],
    'pharo': ['Keyimplementationpoints', 'Example', 'Responsibilities', 'Classreferences', 'Intent', 'Keymessages', 'Collaborators']
}
ds = load_dataset('NLBSE/nlbse25-code-comment-classification')
ds

DatasetDict({
    java_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 7614
    })
    java_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1725
    })
    python_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1884
    })
    python_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 406
    })
    pharo_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1298
    })
    pharo_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 289
    })
})

## Tokenizing helper methods

In [5]:
def sentence_to_tensor(sentence, vocab):
      return torch.tensor([vocab.get(word, vocab['<PAD>']) for word in sentence.split()])

In [6]:
class TextDataset(Dataset):
    def __init__(self, sentences, labels, vocab):
        self.sentences = sentences
        self.labels = labels
        self.vocab = vocab
        self.max_len = max([len(sentence.split()) for sentence in sentences])

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label = self.labels[idx]
        sentence_tensor = sentence_to_tensor(sentence, self.vocab)
        padding = torch.tensor([self.vocab['<PAD>']] * (self.max_len - len(sentence_tensor)))
        sentence_tensor = torch.cat([sentence_tensor, padding])
        sentence_tensor = sentence_tensor.long()
        return sentence_tensor, label

## Neural Network

In [82]:
class NN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim1, hidden_dim2, num_classes):
        super(NN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc1 = nn.Linear(embed_dim, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        x = x.mean(dim=1)
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.relu(x)
        x = self.fc3(x)
        x= self.sigmoid(x)
        return x

## Find Best hyper parameters 
don't run this code takes 3+ hours

In [53]:
param_grid = {
    'lr': [0.0001, 0.0005, 0.001],
    'batch_size': [16, 32, 64],
    'embed_dim': [50, 70, 100],
    'hidden_dim1': [128, 256],
    'hidden_dim2': [64, 128],
    'num_epochs': [10, 20, 30]
}
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
for lang in langs:
    print("Starting:", lang)
    NNlabels = ds[lang+"_train"]["labels"]
    data = ds[lang+"_train"]["combo"]
    words = [word for sentence in data for word in sentence.split()]
    vocab = {word: i+1 for i, (word, _) in enumerate(Counter(words).items())}
    vocab['<PAD>'] = 0

    NNlabels = torch.tensor(NNlabels, dtype=torch.float)

    dataset = TextDataset(data, NNlabels, vocab)

    class_counts = torch.sum(NNlabels, dim=0)  
    class_weights = 1.0 / class_counts
    sample_weights = torch.matmul(NNlabels, class_weights)  

    grid = ParameterGrid(param_grid)
    best_f1 = 0
    best_params = {}
    NNtestlabels = data = ds[lang+"_test"]["labels"]
    NNtestlabels = torch.tensor(NNtestlabels, dtype=torch.float)
    testdata = ds[lang+"_test"]["combo"]
    maxLen = max([len(sentence.split()) for sentence in testdata])

    testdata = [
        torch.tensor(sentence_to_tensor(sentence, vocab)).tolist() +
        [vocab['<PAD>']] * (maxLen - len(sentence.split()))
        if len(sentence.split()) < maxLen
        else sentence_to_tensor(sentence, vocab)[:maxLen]
        for sentence in testdata
    ]
    testdata = torch.tensor(testdata).to(device)
    for params in grid:
        print(f"Training with params: {params}")
        
        sampler = WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True)
        dataloader = DataLoader(dataset, batch_size=params['batch_size'], sampler=sampler)

        model = NN(len(vocab), params['embed_dim'], params['hidden_dim1'], params['hidden_dim2'], len(labels[lang])).to(device)

        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=params['lr'], weight_decay=1e-5)
        
        for epoch in range(params['num_epochs']):
            model.train()
            running_loss = 0.0
            for data, targets in dataloader:
                data, targets = data.to(device), targets.to(device)
                optimizer.zero_grad()
                outputs = model(data)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()

            print(f"Epoch {epoch+1}, Loss: {running_loss/len(dataloader)}")

        # Evaluate the model
        y_pred = model(testdata).detach().cpu().numpy().T
        y_true = np.array(NNtestlabels).T
        f1_score = 0.0
        for i in range(len(y_pred)):
            assert(len(y_pred[i]) == len(y_true[i]))
            tp = sum([true == pred == 1 for (true,pred) in zip(y_true[i], y_pred[i])])
            fp = sum([true == 0 and pred == 1 for (true,pred) in zip(y_true[i], y_pred[i])])
            fn = sum([true == 1 and pred == 0 for (true,pred) in zip(y_true[i], y_pred[i])])
            f1_score += (2*tp) / (2*tp + fp + fn)

        if f1_score > best_f1:
            best_f1 = f1_score
            best_params = params

    print(f"Best params for {lang}: {best_params} with F1 score: {best_f1}")

cpu
Starting: java


  torch.tensor(sentence_to_tensor(sentence, vocab)).tolist() +


Training with params: {'batch_size': 16, 'embed_dim': 50, 'hidden_dim1': 128, 'hidden_dim2': 64, 'lr': 0.0001, 'num_epochs': 10}
Epoch 1, Loss: 0.6065404353146794
Epoch 2, Loss: 0.43783108395438236
Epoch 3, Loss: 0.43305916764906477
Epoch 4, Loss: 0.43031921269012097
Epoch 5, Loss: 0.43299172768572797
Epoch 6, Loss: 0.4285357098619477
Epoch 7, Loss: 0.4255954043454483
Epoch 8, Loss: 0.4195183238812855
Epoch 9, Loss: 0.41023247914404426
Epoch 10, Loss: 0.40491175388588624
Training with params: {'batch_size': 16, 'embed_dim': 50, 'hidden_dim1': 128, 'hidden_dim2': 64, 'lr': 0.0001, 'num_epochs': 20}


  f1_score += (2*tp) / (2*tp + fp + fn)


Epoch 1, Loss: 0.5908104449886233
Epoch 2, Loss: 0.43594592033314106
Epoch 3, Loss: 0.430976827480212
Epoch 4, Loss: 0.43156654162316765
Epoch 5, Loss: 0.4277939515955308
Epoch 6, Loss: 0.4262971911235016
Epoch 7, Loss: 0.4178056012557334
Epoch 8, Loss: 0.40879538642759083
Epoch 9, Loss: 0.3996734605992542
Epoch 10, Loss: 0.39115314041616533
Epoch 11, Loss: 0.38140791360320164
Epoch 12, Loss: 0.3742677260096334
Epoch 13, Loss: 0.36515312817166834
Epoch 14, Loss: 0.35398825877854806
Epoch 15, Loss: 0.3480306798670472
Epoch 16, Loss: 0.33641489532564867
Epoch 17, Loss: 0.33103819034931037
Epoch 18, Loss: 0.3265273802045013
Epoch 19, Loss: 0.3188204631647643
Epoch 20, Loss: 0.31153234617174175
Training with params: {'batch_size': 16, 'embed_dim': 50, 'hidden_dim1': 128, 'hidden_dim2': 64, 'lr': 0.0001, 'num_epochs': 30}
Epoch 1, Loss: 0.6043270270363623
Epoch 2, Loss: 0.4370258952013585
Epoch 3, Loss: 0.4325047230520168
Epoch 4, Loss: 0.4304304194300115
Epoch 5, Loss: 0.4284043593817398
E

## Train the model

In [None]:
vocabs = {}
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
best_params = {"java":{'batch_size':16, 'embed_dim':100, 'hidden_dim1':256, 'hidden_dim2':128,'lr':0.001, 'epochs':30},
               "python":{'batch_size':16, 'embed_dim':100, 'hidden_dim1':128, 'hidden_dim2':128,'lr':0.001,'epochs':20},
               "pharo":{'batch_size':16, 'embed_dim':100, 'hidden_dim1':256, 'hidden_dim2':128,'lr':0.001,'epochs':30},
               }
print(device)

for lang in langs:
    print("Starting:", lang)
    
    #Proccess data
    NNlabels = ds[lang+"_train"]["labels"]
    data = ds[lang+"_train"]["combo"]
    words = [word for sentence in data for word in sentence.split()]
    vocab = {word: i+1 for i, (word, _) in enumerate(Counter(words).items())}
    vocab['<PAD>'] = 0

    NNlabels = torch.tensor(NNlabels, dtype=torch.float)

    dataset = TextDataset(data, NNlabels, vocab)

    #Over sample under represented classes
    class_counts = torch.sum(NNlabels, dim=0)  
    class_weights = 1.0 / class_counts
    sample_weights = torch.matmul(NNlabels, class_weights)  

    sampler = WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True)
    dataloader = DataLoader(dataset, batch_size=best_params[lang]['batch_size'], sampler=sampler)
    
    #Create the model
    vocab_size = len(vocab)
    vocabs[lang] = vocab
    num_classes = len(labels[lang])
    model = NN(vocab_size, best_params[lang]['embed_dim'], best_params[lang]['hidden_dim1'], best_params[lang]['hidden_dim2'], num_classes).to(device)

    criterion = nn.BCELoss()

    optimizer = optim.Adam(model.parameters(), lr=best_params[lang]['lr'], weight_decay=1e-5)

    #Train    
    num_epochs = best_params[lang]['epochs']
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for data, targets in dataloader:
            data, targets = data.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {running_loss/len(dataloader)}")
    torch.save(model.state_dict(), f"./nlbse25-{lang}.pth")
    

cpu
Starting: java
Epoch 1, Loss: 0.4051371455317786
Epoch 2, Loss: 0.2561589220554388
Epoch 3, Loss: 0.1580091057924413
Epoch 4, Loss: 0.09863924617417344
Epoch 5, Loss: 0.06657852349067185
Epoch 6, Loss: 0.0467405052926047
Epoch 7, Loss: 0.03805183685539259
Epoch 8, Loss: 0.029540555745836722
Epoch 9, Loss: 0.02674905417786379
Epoch 10, Loss: 0.022795689733456352
Epoch 11, Loss: 0.018802141314352854
Epoch 12, Loss: 0.017630435069758806
Epoch 13, Loss: 0.017743268942933904
Epoch 14, Loss: 0.015981698776067022
Epoch 15, Loss: 0.01434674356626777
Epoch 16, Loss: 0.01154558828047735
Epoch 17, Loss: 0.012376577891272438
Epoch 18, Loss: 0.017140357957750818
Epoch 19, Loss: 0.012622800902678716
Epoch 20, Loss: 0.01213285100119024
Epoch 21, Loss: 0.010620399338252913
Epoch 22, Loss: 0.00955474870061764
Epoch 23, Loss: 0.009822638846489703
Epoch 24, Loss: 0.009960699546079449
Epoch 25, Loss: 0.008878248133082745
Epoch 26, Loss: 0.008727609721391987
Epoch 27, Loss: 0.007495367488292625
Epoch 2

## Evaluate the model

In [None]:
total_flops = 0
total_time = 0
scores = []
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
for lang in langs:
  print("Starting:", lang)

  #Preproccess data
  NNlabels = data = ds[lang+"_test"]["labels"]
  data = ds[lang+"_test"]["combo"]

  vocab = vocabs[lang]
  NNlabels = torch.tensor(NNlabels, dtype=torch.float)

  maxLen = max([len(sentence.split()) for sentence in data])

  data = [
    torch.tensor(sentence_to_tensor(sentence, vocab)).tolist() +
    [vocab['<PAD>']] * (maxLen - len(sentence.split()))
    if len(sentence.split()) < maxLen
    else sentence_to_tensor(sentence, vocab)[:maxLen]
    for sentence in data
  ]

  data = torch.tensor(data).to(device)

  #Load the model
  vocab_size = len(vocab)
  num_classes = len(labels[lang])
  model = NN(vocab_size, best_params[lang]['embed_dim'], best_params[lang]['hidden_dim1'], best_params[lang]['hidden_dim2'], num_classes).to(device)
  model.load_state_dict(torch.load(f"./nlbse25-{lang}.pth"))
  model.eval()

  #Evaluate using the compettion given evaluation code 
  with torch.profiler.profile(with_flops=True) as p:
    begin = time.time()
    for i in range(10):
      y_pred = model(data).detach().cpu().numpy().T
      y_pred = np.round(y_pred)
      
    total = time.time() - begin
    total_time = total_time + total
  total_flops = total_flops + (sum(k.flops for k in p.key_averages()) / 1e9)

  y_true = np.array(NNlabels).T

  for i in range(len(y_pred)):
      assert(len(y_pred[i]) == len(y_true[i]))
      tp = sum([true == pred == 1 for (true,pred) in zip(y_true[i], y_pred[i])])
      tn = sum([true == pred == 0 for (true,pred) in zip(y_true[i], y_pred[i])])
      fp = sum([true == 0 and pred == 1 for (true,pred) in zip(y_true[i], y_pred[i])])
      fn = sum([true == 1 and pred == 0 for (true,pred) in zip(y_true[i], y_pred[i])])
      precision = tp / (tp + fp)
      recall = tp / (tp + fn)
      f1 = (2*tp) / (2*tp + fp + fn)
      scores.append({'lan': lang, 'cat': labels[lang][i],'precision': precision,'recall': recall,'f1': f1})

print("Compute in GFLOPs:", total_flops/10)
print("Avg runtime in seconds:", total_time/10)
scores = pd.DataFrame(scores)
scores

Starting: java


  torch.tensor(sentence_to_tensor(sentence, vocab)).tolist() +
  model.load_state_dict(torch.load(f"./nlbse25-{lang}.pth"))


Starting: python
Starting: pharo
Compute in GFLOPs: 0.26293248
Avg runtime in seconds: 0.029489469528198243


Unnamed: 0,lan,cat,precision,recall,f1
0,java,summary,0.850972,0.883408,0.866887
1,java,Ownership,0.978261,1.0,0.989011
2,java,Expand,0.396396,0.431373,0.413146
3,java,usage,0.901809,0.809745,0.853301
4,java,Pointer,0.748936,0.956522,0.840095
5,java,deprecation,0.666667,0.4,0.5
6,java,rational,0.220339,0.191176,0.204724
7,python,Usage,0.721739,0.68595,0.70339
8,python,Parameters,0.744526,0.796875,0.769811
9,python,DevelopmentNotes,0.265306,0.317073,0.288889
