In [8]:
import torch
import os
import glob
from sklearn.model_selection import train_test_split

class FinalCombinedDataset(torch.utils.data.Dataset):
    # This __init__ is slightly modified to accept a list of files directly,
    # which makes it more flexible for our splitting strategy.
    def __init__(self, all_files, small_dir, large_dir, small_chunk_size=100):
        self.all_files = all_files
        self.small_chunk_size = small_chunk_size
        
        # Determine the boundary between small and large files
        self.num_small_samples = 0
        for f in self.all_files:
            if small_dir in f:
                self.num_small_samples += self.small_chunk_size
        
        # Caching to improve speed
        self.last_chunk_idx = -1
        self.last_chunk_data = None

    def __len__(self):
        # Calculate total length based on file types
        num_large_samples = len(self.all_files) - (self.num_small_samples // self.small_chunk_size)
        return self.num_small_samples + num_large_samples

    def __getitem__(self, idx):
        if idx < self.num_small_samples:
            # This index belongs to a "small diff" chunk
            chunk_idx = idx // self.small_chunk_size
            local_idx = idx % self.small_chunk_size
            
            if chunk_idx != self.last_chunk_idx:
                self.last_chunk_data = torch.load(self.all_files[chunk_idx])
                self.last_chunk_idx = chunk_idx
            
            data = self.last_chunk_data
            return {key: val[local_idx] for key, val in data.items()}
        else:
            # This index belongs to a "large diff" file
            # The index into all_files is the number of small chunks plus the local large file index
            num_small_chunks = self.num_small_samples // self.small_chunk_size
            large_file_idx = num_small_chunks + (idx - self.num_small_samples)
            
            data = torch.load(self.all_files[large_file_idx])
            return {key: val.squeeze(0) for key, val in data.items()}


# --- Configuration ---
SMALL_DIR = "data/tokenized_data_test/small_diffs"
LARGE_DIR = "data/tokenized_data_test4/large_diffs"

# --- Step 1: Get all file paths and sort them ---
# This ensures that older files come before newer files.
small_files = sorted(glob.glob(os.path.join(SMALL_DIR, "*.pt")))
large_files = sorted(glob.glob(os.path.join(LARGE_DIR, "*.pt")))
all_files = small_files + large_files

# --- Step 2: Perform a single, correct chronological split on the list of files ---
train_files, eval_files = train_test_split(
    all_files, 
    test_size=0.2, 
    shuffle=False # <-- This is crucial
)

# --- Step 3: Create separate Dataset objects for training and evaluation ---
train_dataset = FinalCombinedDataset(train_files, SMALL_DIR, LARGE_DIR)
eval_dataset = FinalCombinedDataset(eval_files, SMALL_DIR, LARGE_DIR)


print(f"Chronological split is correct. ✅")
print(f"Created training dataset with {len(train_dataset)} samples.")
print(f"Created evaluation dataset with {len(eval_dataset)} samples.")

Chronological split is correct. ✅
Created training dataset with 25433 samples.
Created evaluation dataset with 146 samples.


In [12]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import mlflow

# --- Load a fresh model for training ---
model = AutoModelForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=2)

# --- Define the function to compute metrics ---
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary', zero_division=0)
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

# --- Start an MLflow Run ---
with mlflow.start_run(run_name="CodeBERT_FineTune_Run") as run:
    print(f"MLflow run started. Run ID: {run.info.run_id}")

    # --- Define Training Arguments with Corrected Parameters ---

    training_args = TrainingArguments(
        output_dir='./results_codebert_mlflow',
        run_name=run.info.run_name,
        num_train_epochs=1,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        logging_dir='./logs_codebert_mlflow',
        # --- THE KEY CHANGES ARE HERE (reverted to modern names) ---
        logging_strategy="steps",
        logging_steps=50,
        eval_strategy="steps",
        eval_steps=200,
        # -----------------------------------------------------------
        save_strategy="steps",
        save_steps=200,
        load_best_model_at_end=True,
        report_to="mlflow",
    )

    # Manually log key parameters for easy viewing
    mlflow.log_params({
        "model_name": "microsoft/codebert-base",
        "train_epochs": training_args.num_train_epochs,
        "batch_size": training_args.per_device_train_batch_size
    })

    # --- Initialize and Run the Trainer ---
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    print("\nStarting model fine-tuning with MLflow logging...")
    trainer.train()
    print("Fine-tuning complete. ✅")

    print("\nRunning final evaluation...")
    final_metrics = trainer.evaluate()
    mlflow.log_metrics({f"final_{k}": v for k, v in final_metrics.items()})

print("MLflow run finished.")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MLflow run started. Run ID: bc625410604149928fcef9ca18ed369e

Starting model fine-tuning with MLflow logging...




Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
200,0.6003,0.412333,0.883562,0.0,0.0,0.0
400,0.5627,0.455286,0.883562,0.0,0.0,0.0
600,0.5639,0.589432,0.80137,0.171429,0.166667,0.176471
800,0.6724,0.602583,0.883562,0.0,0.0,0.0
1000,0.6641,0.471059,0.883562,0.0,0.0,0.0
1200,0.6614,0.510045,0.883562,0.0,0.0,0.0
1400,0.6687,0.569059,0.883562,0.0,0.0,0.0
1600,0.65,0.468868,0.883562,0.0,0.0,0.0
1800,0.6544,0.412945,0.883562,0.0,0.0,0.0
2000,0.6365,0.489673,0.883562,0.0,0.0,0.0




Fine-tuning complete. ✅

Running final evaluation...




MLflow run finished.
