In [1]:
pip show transformers datasets torch scikit-learn pandas tqdm

Name: transformers
Version: 4.46.3
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: c:\Users\senth\Desktop\code-comment-classification\.conda\Lib\site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 
---
Name: datasets
Version: 3.1.0
Summary: HuggingFace community-driven open-source library of datasets
Home-page: https://github.com/huggingface/datasets
Author: HuggingFace Inc.
Author-email: thomas@huggingface.co
License: Apache 2.0
Location: c:\Users\senth\Desktop\code-comment-classification\.conda\Lib\site-packages
Requires: aiohttp, dill, filelock, fsspec, huggingface-hub, multiprocess, numpy, 

In [1]:
import torch
import time
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import precision_recall_fscore_support
from datasets import load_dataset
import numpy as np
import pandas as pd
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Define the label dictionaries for each language
labels_dict = {
    "java": ["summary", "Ownership", "Expand", "usage", "Pointer", "deprecation", "rational"],
    "python": ["Usage", "Parameters", "DevelopmentNotes", "Expand", "Summary"],
    "pharo": ["Keyimplementationpoints", "Example", "Responsibilities", "Classreferences", "Intent", "Keymessages", "Collaborators"]
}

# Load the dataset from HuggingFace
dataset_name = "NLBSE/nlbse25-code-comment-classification"
dataset = load_dataset(dataset_name)

In [3]:
from torch.profiler import profile, ProfilerActivity

def prepare_data_loaders(data_split, tokenizer, max_length=128, batch_size=32):
    # Tokenize the data
    def tokenize_function(examples):
        return tokenizer(examples["comment_sentence"], truncation=True, padding="max_length", max_length=max_length)
    
    tokenized_data = data_split.map(tokenize_function, batched=True)
    
    # Ensure labels are correctly formatted (convert labels to Long type)
    tokenized_data = tokenized_data.map(lambda x: {'labels': torch.tensor(x['labels'], dtype=torch.float32)}, batched=True)
    tokenized_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    return DataLoader(tokenized_data, batch_size=batch_size)

# Train a language-specific model
def train_language_model(lang, dataset, num_labels, epochs=3, lr=5e-5):
    print(f"Training model for {lang}...")
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

    train_loader = prepare_data_loaders(dataset["train"], tokenizer)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=lr)

    # Loss Function for Multi-Label Classification
    criterion = torch.nn.BCEWithLogitsLoss()

    # Training loop
    total_flops = 0
    model.train()
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        for batch_idx, batch in enumerate(tqdm(train_loader)):
            inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
            labels = batch["labels"].to(device).float()  # Labels should be Float type now
            
            if batch_idx == 0:  # Skip first batch to warm up GPU
                continue

            with torch.profiler.profile(
                activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
                with_flops=True
            ) as p:
                optimizer.zero_grad()
                outputs = model(**inputs)
                logits = outputs.logits
            
                # Calculate the loss for multi-label classification
                loss = criterion(logits, labels)
                
                # Backpropagation
                loss.backward()
                optimizer.step()

            # Summing FLOPs for this batch (convert to GFLOPs)
            total_flops += sum(k.flops for k in p.key_averages()) / 1e12

    return model, total_flops

In [5]:
# Function to calculate metrics (precision, recall, f1 score)
from sklearn.metrics import precision_recall_fscore_support

# Evaluate the model and calculate metrics
def evaluate_model(lang, dataset, model):
    model.eval()
    test_loader = prepare_data_loaders(dataset["test"], AutoTokenizer.from_pretrained("bert-base-uncased"))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
            labels = batch["labels"].to(device)

            outputs = torch.sigmoid(model(**inputs).logits)
            preds = (outputs > 0.4).int()

            all_preds.append(preds.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    # Concatenate all predictions and true labels
    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    # Calculate per-label precision, recall, and F1 score
    label_list = labels_dict[lang]
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average=None)
    
    # Create a dataframe for better display
    metrics_df = pd.DataFrame({
        "Language": [lang] * len(label_list),
        "Category": label_list,
        "Precision": precision,
        "Recall": recall,
        "F1": f1
    })

    print(metrics_df)
    return metrics_df

def score(avg_f1, avg_runtime, avg_flops, max_avg_runtime=5, max_avg_flops=5000):
    return (0.6 * avg_f1 +
            0.2 * ((max_avg_runtime - avg_runtime) / max_avg_runtime) +
            0.2 * ((max_avg_flops - avg_flops) / max_avg_flops))

# Main workflow
results = []
total_time = 0
total_flops = 0

for lang in labels_dict.keys():
    print(f"Processing {lang}...")
    num_labels = len(labels_dict[lang])

    # Filter the dataset for the specific language
    language_dataset = {
        "train": dataset[f"{lang}_train"].map(lambda x: {"labels": torch.tensor(x["labels"], dtype=torch.float32)}),
        "test": dataset[f"{lang}_test"].map(lambda x: {"labels": torch.tensor(x["labels"], dtype=torch.float32)})
    }

    start_time = time.time()
    # Train and evaluate
    model, flops = train_language_model(lang, language_dataset, num_labels)
    elapsed_time = time.time() - start_time
    total_time += (elapsed_time/360)
    total_flops += (flops/3)

    lang_metrics = evaluate_model(lang, language_dataset, model)
    results.append(lang_metrics)

avg_f1 = pd.concat(results).F1.mean()
avg_runtime = total_time / (len(labels_dict))
avg_flops = total_flops / (len(labels_dict))

norm_avg_flops = avg_flops
norm_avg_runtime = avg_runtime

print("Average F1: ", avg_f1)
print("Average Runtime: ", norm_avg_runtime)
print("Average Flops: ", norm_avg_flops)

final_score = round(score(avg_f1, norm_avg_runtime, norm_avg_flops), 2)
print(f"Compute in GFLOPs: {norm_avg_flops}")
print(f"Avg runtime in seconds: {norm_avg_runtime}")
print(f"Final Score: {final_score}")

# Combine results into a single dataframe
final_results = pd.concat(results, ignore_index=True)
print(final_results)

# Save the results to a CSV file if needed
final_results.to_csv("classification_metrics.csv", index=False)

Processing java...
Training model for java...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


  warn("CUDA is not available, disabling CUDA profiling")
100%|██████████| 238/238 [32:08<00:00,  8.10s/it]


Epoch 2/3


100%|██████████| 238/238 [32:30<00:00,  8.19s/it]


Epoch 3/3


100%|██████████| 238/238 [37:36<00:00,  9.48s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  Language     Category  Precision    Recall        F1
0     java      summary   0.786517  0.863229  0.823089
1     java    Ownership   0.978261  1.000000  0.989011
2     java       Expand   0.205128  0.078431  0.113475
3     java        usage   0.960656  0.679814  0.796196
4     java      Pointer   0.737991  0.918478  0.818402
5     java  deprecation   0.000000  0.000000  0.000000
6     java     rational   0.100840  0.176471  0.128342
Processing python...
Training model for python...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


  warn("CUDA is not available, disabling CUDA profiling")
100%|██████████| 59/59 [06:33<00:00,  6.68s/it]


Epoch 2/3


100%|██████████| 59/59 [06:17<00:00,  6.40s/it]


Epoch 3/3


100%|██████████| 59/59 [06:16<00:00,  6.38s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  Language          Category  Precision    Recall        F1
0   python             Usage   0.758621  0.545455  0.634615
1   python        Parameters   0.632653  0.484375  0.548673
2   python  DevelopmentNotes   0.000000  0.000000  0.000000
3   python            Expand   0.318182  0.109375  0.162791
4   python           Summary   0.563107  0.707317  0.627027
Processing pharo...
Training model for pharo...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


  warn("CUDA is not available, disabling CUDA profiling")
100%|██████████| 41/41 [04:28<00:00,  6.56s/it]


Epoch 2/3


100%|██████████| 41/41 [04:10<00:00,  6.11s/it]


Epoch 3/3


100%|██████████| 41/41 [04:07<00:00,  6.05s/it]


  Language                 Category  Precision    Recall        F1
0    pharo  Keyimplementationpoints   0.000000  0.000000  0.000000
1    pharo                  Example   0.904762  0.638655  0.748768
2    pharo         Responsibilities   0.487805  0.769231  0.597015
3    pharo          Classreferences   0.000000  0.000000  0.000000
4    pharo                   Intent   0.870968  0.900000  0.885246
5    pharo              Keymessages   0.000000  0.000000  0.000000
6    pharo            Collaborators   0.000000  0.000000  0.000000
Average F1:  0.4143500123270708
Average Runtime:  7.4562011747448524
Average Flops:  239.18828599705384
Compute in GFLOPs: 239.18828599705384
Avg runtime in seconds: 7.4562011747448524
Final Score: 0.34
   Language                 Category  Precision    Recall        F1
0      java                  summary   0.786517  0.863229  0.823089
1      java                Ownership   0.978261  1.000000  0.989011
2      java                   Expand   0.205128  0.078431

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
