## Evaluation: Research Questions¹:
- **RQ1 (Developer eval.):** Do developers use a precise commit message in the fixing commit? Quantify the hit rate.
- **RQ2 (LLM eval.):** Does the LLM generate a precise commit message in the fixing commit? Quantify the hit rate.
- **RQ3 (Rectifier eval.):** To what extent were you able to rectify the message? Quantify the hit rate.

In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

import torch
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
print("PyTorch version:", torch.__version__)
if device.type == 'cuda':
    print("CUDA version:", torch.version.cuda)
    print("Device name:", torch.cuda.get_device_name(0))

def set_seed(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
set_seed(42)

Using device: cuda
PyTorch version: 2.8.0+cu129
CUDA version: 12.9
Device name: NVIDIA GeForce RTX 4060 Laptop GPU


In [2]:
from transformers import RobertaTokenizer, RobertaModel

MODEL_NAME = "microsoft/codebert-base"
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)
model = RobertaModel.from_pretrained(MODEL_NAME).to(device)

In [5]:
def get_code_embedding(code_snippet):
    tokens = tokenizer(code_snippet, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: val.to(device) for key, val in tokens.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the [CLS] token representation as the embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :]
    return cls_embedding

def cosine_sim(vec1, vec2):
    return F.cosine_similarity(vec1, vec2).item()

def score(code, msg):
    if not msg.strip() or not code.strip():
        return 0
    code_emb = get_code_embedding(code)
    msg_emb = get_code_embedding(msg)
    return cosine_sim(code_emb, msg_emb)

def evaluate_with_codebert(df):
    print("Evaluating with CodeBERT...")
    print("\nScoring original commit messages...")
    df["dev_score"] = df.progress_apply(lambda r: score(r["Diff"], r["Message"]), axis=1)
    print("\nScoring LLM inference messages...")
    df["llm_inference_score"] = df.progress_apply(lambda r: score(r["Diff"], r["LLM Inference (fix type)"]), axis=1)
    print("\nScoring rectified messages...")
    df["rectifier_score"] = df.progress_apply(lambda r: score(r["Diff"], r["Rectified Message"]), axis=1)
    return df

In [None]:
df = pd.read_csv("results/ollama_rectified_commits.csv")
df = evaluate_with_codebert(df)
scores_df = df[["Hash", "File Name", "dev_score", "llm_inference_score", "rectifier_score"]]
scores_df.to_csv("results/ollama_scores_codebert.csv", index=False)
display(scores_df.head(20))

Evaluating with CodeBERT...

Scoring original commit messages...


100%|██████████| 2041/2041 [01:52<00:00, 18.21it/s]



Scoring LLM inference messages...


100%|██████████| 2041/2041 [01:45<00:00, 19.28it/s]



Scoring rectified messages...


100%|██████████| 2041/2041 [01:46<00:00, 19.10it/s]


Unnamed: 0,Hash,File Name,dev_score,llm_inference_score,rectifier_score
0,014a277a97759bbc0e6ec8fba588bc6e6de65a86,constants.py,0.962658,0.942259,0.976323
1,014a277a97759bbc0e6ec8fba588bc6e6de65a86,displayer.py,0.918488,0.902997,0.94155
2,014a277a97759bbc0e6ec8fba588bc6e6de65a86,mobject.py,0.937761,0.921378,0.959316
3,2e074afb60d13262ce1e42e83bcf0ed28d95ad82,__init__.py,0.942789,0.933025,0.963519
4,2e074afb60d13262ce1e42e83bcf0ed28d95ad82,__init__.py,0.970627,0.962571,0.969677
5,2e074afb60d13262ce1e42e83bcf0ed28d95ad82,animation.py,0.9533,0.954102,0.969746
6,2e074afb60d13262ce1e42e83bcf0ed28d95ad82,meta_animations.py,0.897986,0.91128,0.977757
7,2e074afb60d13262ce1e42e83bcf0ed28d95ad82,simple_animations.py,0.892042,0.898379,0.90363
8,2e074afb60d13262ce1e42e83bcf0ed28d95ad82,transform.py,0.889994,0.870787,0.966823
9,2e074afb60d13262ce1e42e83bcf0ed28d95ad82,displayer.py,0.911511,0.92499,0.937381


In [None]:
scores_df = pd.read_csv("results/scores_codebert.csv")
threshold = 0.9

scores_df.loc[:, "dev_precise"] = scores_df["dev_score"] > threshold
scores_df.loc[:, "llm_precise"] = scores_df["llm_inference_score"] > threshold
scores_df.loc[:, "rectifier_precise"] = scores_df["rectifier_score"] > threshold

# RQ results
rq1 = scores_df["dev_precise"].mean() * 100
rq2 = scores_df["llm_precise"].mean() * 100
rq3 = scores_df["rectifier_precise"].mean() * 100

print(f"RQ1 - Developer precise hit rate: {rq1:.2f}%")
print(f"RQ2 - LLM precise hit rate: {rq2:.2f}%")
print(f"RQ3 - Rectifier precise hit rate: {rq3:.2f}%")

RQ1 - Developer precise hit rate: 91.28%
RQ2 - LLM precise hit rate: 93.39%
RQ3 - Rectifier precise hit rate: 100.00%
