In [None]:
!pip install -q transformers==4.57.1 datasets sentence-transformers imblearn scikit-learn accelerate sentencepiece wandb

In [None]:
# Cell 1 - basic imports and reproducibility
import os, random, time
from pathlib import Path
import numpy as np
import pandas as pd
import torch

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Device:', device)

Device: cuda


In [None]:
# Cell: Load CodeXGLUE Defect Detection Dataset (Devign task)
from datasets import load_dataset

# The correct, working ID as of 2025:
HF_ID = "code_x_glue_cc_defect_detection"

print("📦 Loading dataset from Hugging Face:", HF_ID)
ds = load_dataset(HF_ID)

print(ds)

# Convert to Pandas
df_train = ds["train"].to_pandas()[["func", "target"]].rename(columns={"func": "code", "target": "label"})
df_val   = ds["validation"].to_pandas()[["func", "target"]].rename(columns={"func": "code", "target": "label"})
df_test  = ds["test"].to_pandas()[["func", "target"]].rename(columns={"func": "code", "target": "label"})

print("✅ Dataset loaded successfully!")
print(f"Train: {len(df_train)}, Val: {len(df_val)}, Test: {len(df_test)}")
df_train.head()


📦 Loading dataset from Hugging Face: code_x_glue_cc_defect_detection


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/2.21M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/2.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/21854 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2732 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'func', 'target', 'project', 'commit_id'],
        num_rows: 21854
    })
    validation: Dataset({
        features: ['id', 'func', 'target', 'project', 'commit_id'],
        num_rows: 2732
    })
    test: Dataset({
        features: ['id', 'func', 'target', 'project', 'commit_id'],
        num_rows: 2732
    })
})
✅ Dataset loaded successfully!
Train: 21854, Val: 2732, Test: 2732


Unnamed: 0,code,label
0,static av_cold int vdadec_init(AVCodecContext ...,False
1,static int transcode(AVFormatContext **output_...,False
2,"static void v4l2_free_buffer(void *opaque, uin...",False
3,"int av_opencl_buffer_write(cl_mem dst_cl_buf, ...",False
4,"static int r3d_read_rdvo(AVFormatContext *s, A...",True


In [None]:
# Cell 3 - small preprocessing helper
import re

def preprocess_code(code: str, remove_comments=False, normalize_whitespace=True):
    txt = str(code)
    if remove_comments:
        txt = re.sub(r'//.*?$|/\*.*?\*/', '', txt, flags=re.DOTALL|re.MULTILINE)
    if normalize_whitespace:
        txt = re.sub(r'\s+', ' ', txt).strip()
    return txt

# apply
REMOVE_COMMENTS = False
NORMALIZE_WHITESPACE = True
for df in (df_train, df_val, df_test):
    df['code'] = df['code'].apply(lambda x: preprocess_code(x, REMOVE_COMMENTS, NORMALIZE_WHITESPACE))


In [None]:
# Cell 4 - compatibility helper for TrainingArguments
import inspect
from transformers import TrainingArguments

def make_training_args(**kwargs):
    # sensible defaults, but allow overrides
    default = dict(
        output_dir=kwargs.pop('output_dir','./results'),
        per_device_train_batch_size=kwargs.pop('per_device_train_batch_size',8),
        per_device_eval_batch_size=kwargs.pop('per_device_eval_batch_size',16),
        learning_rate=kwargs.pop('learning_rate',5e-5),
        num_train_epochs=kwargs.pop('num_train_epochs',3),
        weight_decay=kwargs.pop('weight_decay',0.01),
        logging_steps=kwargs.pop('logging_steps',100),
        load_best_model_at_end=True,
        metric_for_best_model=kwargs.pop('metric_for_best_model','f1'),
        greater_is_better=kwargs.pop('greater_is_better',True),
        fp16=torch.cuda.is_available(),
    )
    # inspect signature
    sig = inspect.signature(TrainingArguments.__init__)
    params = {p.name for p in sig.parameters.values()}

    if 'evaluation_strategy' in params:
        ta = {**default, 'evaluation_strategy':'epoch', 'save_strategy':'epoch', 'save_total_limit':2}
        print('Using modern TrainingArguments API')
    else:
        ta = {**default, 'do_eval':True}
        if 'eval_strategy' in params:
            ta['eval_strategy'] = 'epoch'
        if 'save_strategy' in params:
            ta['save_strategy'] = 'epoch'
        if 'eval_steps' in params:
            ta['eval_steps'] = 500
        if 'save_steps' in params:
            ta['save_steps'] = 500
        print('Using legacy TrainingArguments API')

    filtered = {k:v for k,v in ta.items() if k in params}
    return TrainingArguments(**filtered)

In [None]:
# Cell 5 - Embedding SMOTE + paraphrase augmentation (CodeT5) to enrich minority class
# NOTE: this can take time; Colab GPU recommended. We'll generate synthetic minority examples and then
# use text-level oversampling of those synthetic outputs plus original minority samples.

from sentence_transformers import SentenceTransformer
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

EMBED_MODEL = 'all-MiniLM-L6-v2'  # fast embedder
PARA_MODEL = 'Salesforce/codet5-base'  # paraphraser
PARAPHRASE_PER_SAMPLE = 2
MAX_NEW_SAMPLES = None
K_NEIGHBORS = 3

print('Loading embedder...')
embedder = SentenceTransformer(EMBED_MODEL, device=device)
print('Loading paraphraser...')
par_tok = AutoTokenizer.from_pretrained(PARA_MODEL)
par_model = AutoModelForSeq2SeqLM.from_pretrained(PARA_MODEL).to(device)

# compute embeddings
train_texts = df_train['code'].astype(str).tolist()
train_labels = df_train['label'].values
print('Computing embeddings...')
emb = embedder.encode(train_texts, show_progress_bar=True, convert_to_numpy=True)

# attempt SMOTE to determine balanced target
try:
    sm = SMOTE(random_state=SEED, sampling_strategy='auto', k_neighbors=K_NEIGHBORS)
    Xr, yr = sm.fit_resample(emb, train_labels)
    target_count = int(np.bincount(yr).max())
    print('SMOTE target count:', target_count)
except Exception as e:
    print('SMOTE failed, falling back to majority upsample target')
    counts = np.bincount(train_labels)
    target_count = int(counts.max())

# prepare minority pool
minor_df = df_train[df_train['label']==1].reset_index(drop=True)
maj_df = df_train[df_train['label']==0]
print('Counts before:', len(maj_df), len(minor_df))

# paraphrase generator helper
import torch

def paraphrase_code(code_str, num_return=1, max_len=256):
    try:
        inputs = par_tok(code_str, return_tensors='pt', truncation=True, max_length=1024).to(device)
        gen = par_model.generate(
            **inputs,
            do_sample=True,
            top_k=50,
            top_p=0.92,
            temperature=0.95,
            num_return_sequences=num_return,
            max_length=max_len,
            early_stopping=True,
            no_repeat_ngram_size=3
        )
        outs = [par_tok.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in gen]
        return outs
    except Exception as e:
        return []

# identifier rename augmentation
import re, uuid

def identifier_rename(code_str, num_variants=1):
    ids = list(set(re.findall(r'\b[a-zA-Z_][a-zA-Z0-9_]{0,6}\b', code_str)))
    keywords = {"int","return","if","else","for","while","true","false","NULL","void","static","class","public","private","protected","def","import"}
    ids = [i for i in ids if i not in keywords and not i.isupper() and not i.isdigit()]
    out = []
    for _ in range(num_variants):
        s = code_str
        repl_map = {}
        for i in ids:
            new_name = 'v_' + uuid.uuid4().hex[:6]
            repl_map[i] = new_name
            s = re.sub(r'\b'+re.escape(i)+r'\b', new_name, s)
        out.append(s)
    return out

# iterate and generate
new_examples = []
cur_min = len(minor_df)
for idx,row in minor_df.iterrows():
    if cur_min >= target_count: break
    code = row['code']
    paras = paraphrase_code(code, num_return=PARAPHRASE_PER_SAMPLE, max_len=512)
    renames = identifier_rename(code, num_variants=2)
    candidates = [p for p in paras if isinstance(p,str) and len(p.strip())>10] + renames
    for cand in candidates:
        if cur_min >= target_count: break
        new_examples.append({'code':cand,'label':1})
        cur_min += 1
    if MAX_NEW_SAMPLES and len(new_examples) >= MAX_NEW_SAMPLES:
        break

# fallback to random oversample if still short
if cur_min < target_count:
    need = target_count - cur_min
    ups = resample(minor_df, replace=True, n_samples=need, random_state=SEED)
    for _,r in ups.iterrows():
        new_examples.append({'code':r['code'],'label':1})

print('Generated synthetic minority examples:', len(new_examples))

# build balanced train DF
if len(new_examples)>0:
    synth_df = pd.DataFrame(new_examples)
    df_train_bal = pd.concat([maj_df, minor_df, synth_df], ignore_index=True).sample(frac=1.0, random_state=SEED)
else:
    df_train_bal = pd.concat([maj_df, resample(minor_df, replace=True, n_samples=len(maj_df), random_state=SEED)], ignore_index=True).sample(frac=1.0, random_state=SEED)

print('Balanced counts:', df_train_bal['label'].value_counts())


Loading embedder...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loading paraphraser...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Computing embeddings...


Batches:   0%|          | 0/683 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


SMOTE target count: 11836
Counts before: 11836 10018
Generated synthetic minority examples: 1818
Balanced counts: label
0    11836
1    11836
Name: count, dtype: int64


In [None]:
print("Train samples after SMOTE:", len(df_train))
df_train['label'].value_counts()


Train samples after SMOTE: 21854


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
False,11836
True,10018


In [None]:
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict

MODEL_NAME = 'microsoft/graphcodebert-base'
print('Using model backbone:', MODEL_NAME)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(examples):
    return tokenizer(examples['code'], truncation=True, padding='max_length', max_length=256)

train_ds = Dataset.from_pandas(df_train_bal[['code','label']].rename(columns={'code':'code'}))
val_ds = Dataset.from_pandas(df_val[['code','label']].rename(columns={'code':'code'}))
test_ds = Dataset.from_pandas(df_test[['code','label']].rename(columns={'code':'code'}))

train_tok = train_ds.map(tokenize_fn, batched=True, remove_columns=['code'])
val_tok = val_ds.map(tokenize_fn, batched=True, remove_columns=['code'])
test_tok = test_ds.map(tokenize_fn, batched=True, remove_columns=['code'])

train_tok.set_format(type='torch', columns=['input_ids','attention_mask','label'])
val_tok.set_format(type='torch', columns=['input_ids','attention_mask','label'])
test_tok.set_format(type='torch', columns=['input_ids','attention_mask','label'])

dataset = DatasetDict({'train':train_tok,'validation':val_tok,'test':test_tok})
print(dataset)

Using model backbone: microsoft/graphcodebert-base


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/539 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Map:   0%|          | 0/23672 [00:00<?, ? examples/s]

Map:   0%|          | 0/2732 [00:00<?, ? examples/s]

Map:   0%|          | 0/2732 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 23672
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 2732
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 2732
    })
})


In [None]:
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix
import wandb

print('Loading model...')
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:,1]
    preds = np.argmax(logits, axis=1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary', zero_division=0)
    try:
        roc = roc_auc_score(labels, probs)
    except Exception:
        roc = float('nan')
    return {'accuracy':acc,'precision':precision,'recall':recall,'f1':f1,'roc_auc':roc}

# W&B login (you must paste your API key when prompted)
print('If you want to use wandb, run:')
print("import wandb; wandb.login()  # then enter your API key")
import wandb
wandb.login()


Loading model...


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


If you want to use wandb, run:
import wandb; wandb.login()  # then enter your API key


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnationnighcore[0m ([33mnationnighcore-federal-institute-of-science-and-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# =====================================================
# Compute class weights (run this before WeightedTrainer)
# =====================================================
import torch

counts = df_train_bal['label'].value_counts().sort_index().values.astype(float)
inv = 1.0 / counts
class_weights = inv / inv.sum()
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
print("✅ Class weights:", class_weights, class_weights_tensor)


✅ Class weights: [0.5 0.5] tensor([0.5000, 0.5000], device='cuda:0')


In [None]:
# =====================================================
# Define get_optimizer() helper again (LLRD - Layer-wise Learning Rate Decay)
# =====================================================
from torch.optim import AdamW

def get_optimizer(model, base_lr=2e-5, decay=0.9):
    """
    Create layer-wise learning rate decay (LLRD) optimizer.
    Top layers get higher LR; lower layers smaller LR.
    """
    layers = [f'encoder.layer.{i}.' for i in range(12)] + ['pooler.', 'classifier.']
    no_decay = ['bias', 'LayerNorm.weight']
    lr = base_lr
    optimizer_grouped_parameters = []

    for layer in reversed(layers):  # from top (11) down to 0
        params = [p for n, p in model.named_parameters() if layer in n and p.requires_grad]
        if params:
            optimizer_grouped_parameters.append({'params': params, 'lr': lr})
            lr *= decay

    print(f"✅ Created LLRD optimizer groups ({len(optimizer_grouped_parameters)} groups)")
    return AdamW(optimizer_grouped_parameters, lr=base_lr)
# =====================================================
# WeightedTrainer with Focal Loss (robust loss for imbalance)
# =====================================================
from transformers import Trainer
import torch.nn as nn
import torch

# Focal Loss definition (if not already defined)
class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.ce = nn.CrossEntropyLoss(weight=alpha)

    def forward(self, logits, labels):
        ce_loss = self.ce(logits, labels)
        pt = torch.exp(-ce_loss)
        return ((1 - pt) ** self.gamma * ce_loss).mean()


# WeightedTrainer — overrides compute_loss to use Focal Loss
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels").long().to(model.device)
        outputs = model(**{k: v.to(model.device) for k, v in inputs.items()})
        logits = outputs.get("logits", outputs[0])

        # Weighted focal loss
        loss_fct = FocalLoss(alpha=class_weights_tensor)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss



In [None]:
# ============================
# REPLACEMENT: Improved trainer + compatibility for legacy transformers
# Paste/replace the old training cell with this one
# ============================
import inspect
import os
import math
import torch
from torch.optim import AdamW
from transformers import TrainingArguments, get_cosine_schedule_with_warmup, TrainerCallback

# --- Quick helpers: compute num_train_steps safely
per_device_train_batch_size = 8
gradient_accumulation_steps = 2
num_epochs = 8

train_size = len(dataset['train'])
# number of update steps per epoch (rounded up)
steps_per_epoch = math.ceil(train_size / per_device_train_batch_size / gradient_accumulation_steps)
num_train_steps = steps_per_epoch * num_epochs
num_warmup_steps = int(0.1 * num_train_steps)

print(f"train_size={train_size}, steps_per_epoch={steps_per_epoch}, num_train_steps={num_train_steps}, warmup={num_warmup_steps}")

# --- Recreate optimizer with LLRD (reuse get_optimizer defined earlier)
optimizer = get_optimizer(model, base_lr=3e-5, decay=0.9)

# --- Create scheduler (we'll pass scheduler to Trainer as the "lr_scheduler")
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_train_steps
)

# --- Simple EarlyStopping callback (works with legacy & modern Trainer callbacks)
class SimpleEarlyStopping(TrainerCallback):
    def __init__(self, patience=2, metric_name='eval_f1'):
        self.patience = patience
        self.metric_name = metric_name
        self.best = None
        self.count = 0

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if metrics is None:
            return control
        score = metrics.get(self.metric_name)
        if score is None:
            return control
        if self.best is None or score > self.best:
            self.best = score
            self.count = 0
        else:
            self.count += 1
            print(f"[EarlyStopping] no improvement in {self.metric_name} for {self.count}/{self.patience}")
            if self.count >= self.patience:
                print("[EarlyStopping] stopping training")
                control.should_training_stop = True
        return control

early_stop_cb = SimpleEarlyStopping(patience=2, metric_name='eval_f1')

# --- Build TrainingArguments compatibly depending on transformers version
init_sig = inspect.signature(TrainingArguments.__init__)
params = {p.name for p in init_sig.parameters.values()}

if "evaluation_strategy" in params:
    # modern API
    training_args = TrainingArguments(
        output_dir='./results_graphcodebert_scheduler',
        evaluation_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=16,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=3e-5,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        logging_steps=200,
        metric_for_best_model="f1",
        greater_is_better=True,
        load_best_model_at_end=True,
        report_to=["wandb"],
        fp16=torch.cuda.is_available(),
        save_total_limit=2,
    )
    print("Using modern TrainingArguments API (evaluation_strategy supported).")
else:
    # legacy API: must set do_eval and make eval/save steps consistent
    # choose eval_steps so that evaluation happens ~once per epoch
    eval_steps = max(1, steps_per_epoch)  # steps per epoch (approx)
    save_steps = eval_steps
    training_args = TrainingArguments(
        output_dir='./results_graphcodebert_scheduler',
        do_eval=True,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=16,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=3e-5,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        logging_steps=200,
        eval_steps=eval_steps,
        save_steps=save_steps,
        load_best_model_at_end=False,   # avoid legacy mismatch
        report_to=["wandb"],
        fp16=torch.cuda.is_available(),
    )
    print("Using legacy TrainingArguments API (evaluation_strategy not supported). eval_steps and save_steps set to steps_per_epoch.")

# --- Data collator
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer)

# --- Build the Trainer using your WeightedTrainer (replace WeightedTrainer if necessary)
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),  # pass optimizer + lr scheduler
)

# Add early stopping callback
trainer.add_callback(early_stop_cb)

# --- Start training
print("Starting training with scheduler + early stopping...")
trainer.train()

# --- If legacy TrainingArguments didn't support load_best_model_at_end, pick latest checkpoint (optional)
if init_sig and "evaluation_strategy" not in params:
    from pathlib import Path
    ckpts = sorted(Path(training_args.output_dir).glob("checkpoint-*"), key=os.path.getmtime)
    if ckpts:
        best_ckpt = ckpts[-1]
        print("Legacy mode: latest checkpoint:", best_ckpt)
    else:
        print("No checkpoints found in", training_args.output_dir)


train_size=23672, steps_per_epoch=1480, num_train_steps=11840, warmup=1184
✅ Created LLRD optimizer groups (13 groups)
Using legacy TrainingArguments API (evaluation_strategy not supported). eval_steps and save_steps set to steps_per_epoch.
Starting training with scheduler + early stopping...


  trainer = WeightedTrainer(


Step,Training Loss
200,0.1743
400,0.1679
600,0.1587
800,0.1468
1000,0.1512
1200,0.1415
1400,0.1492
1600,0.1401
1800,0.1337
2000,0.1337


Legacy mode: latest checkpoint: results_graphcodebert_scheduler/checkpoint-11840


In [None]:
# =====================================================
# 8️⃣ Evaluation summary
# =====================================================
eval_results = trainer.evaluate()
print("\n📊 Final Evaluation Metrics:")
for k, v in eval_results.items():
    if k.startswith('eval_'):
        print(f"{k:20s}: {v:.4f}")




📊 Final Evaluation Metrics:
eval_loss           : 0.2477
eval_accuracy       : 0.6285
eval_precision      : 0.5725
eval_recall         : 0.5720
eval_f1             : 0.5723
eval_roc_auc        : 0.7032
eval_runtime        : 9.3369
eval_samples_per_second: 292.6030
eval_steps_per_second: 18.3140


In [None]:
# =====================================================
# Cell 10 - Evaluate model on test set
# =====================================================
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

print("🔍 Evaluating on test set...")
preds_output = trainer.predict(dataset['test'])
logits, labels = preds_output.predictions, preds_output.label_ids
probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:,1]
preds = np.argmax(logits, axis=1)

acc = accuracy_score(labels, preds)
prec = precision_score(labels, preds, zero_division=0)
rec = recall_score(labels, preds)
f1 = f1_score(labels, preds)
roc = roc_auc_score(labels, probs)
cm = confusion_matrix(labels, preds)

print(f"Test Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f} | ROC_AUC: {roc:.4f}")
print("Confusion Matrix:\n", cm)


🔍 Evaluating on test set...


Test Accuracy: 0.6230 | Precision: 0.5946 | Recall: 0.5633 | F1: 0.5786 | ROC_AUC: 0.6891
Confusion Matrix:
 [[995 482]
 [548 707]]


In [None]:
# =====================================================
# Cell 11 - Threshold optimization
# =====================================================
thresholds = np.linspace(0.1, 0.9, 17)
best_f1, best_t = 0, 0.5

for t in thresholds:
    preds_t = (probs >= t).astype(int)
    f1_t = f1_score(labels, preds_t)
    if f1_t > best_f1:
        best_f1, best_t = f1_t, t

print(f"Best threshold: {best_t:.2f} -> F1: {best_f1:.4f}")

final_preds = (probs >= best_t).astype(int)
acc_t = accuracy_score(labels, final_preds)
prec_t = precision_score(labels, final_preds, zero_division=0)
rec_t = recall_score(labels, final_preds)
roc_t = roc_auc_score(labels, probs)
cm_t = confusion_matrix(labels, final_preds)

print(f"\nOptimized Metrics (threshold={best_t:.2f})")
print(f"Accuracy: {acc_t:.4f} | Precision: {prec_t:.4f} | Recall: {rec_t:.4f} | F1: {best_f1:.4f} | ROC_AUC: {roc_t:.4f}")
print("Confusion Matrix:\n", cm_t)


Best threshold: 0.10 -> F1: 0.6679

Optimized Metrics (threshold=0.10)
Accuracy: 0.5900 | Precision: 0.5319 | Recall: 0.8972 | F1: 0.6679 | ROC_AUC: 0.6891
Confusion Matrix:
 [[ 486  991]
 [ 129 1126]]


In [None]:
# =====================================================
# Cell 12 - Save best model for inference
# =====================================================
SAVED_MODEL = "./final_graphcodebert_balanced_best"
trainer.save_model(SAVED_MODEL)
tokenizer.save_pretrained(SAVED_MODEL)
print(f"✅ Model saved to {SAVED_MODEL}")


✅ Model saved to ./final_graphcodebert_balanced_best


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# =====================================================
# Cell 13 - Inference test
# =====================================================
from transformers import pipeline

pipe = pipeline(
    'text-classification',
    model=SAVED_MODEL,
    tokenizer=SAVED_MODEL,
    return_all_scores=True,
    device=0 if torch.cuda.is_available() else -1
)

sample_code = '''
void swap(int *a, int *b){ int t=*a *a=*b; *b=t; }
'''
res = pipe(sample_code)
probs = {r['label']: r['score'] for r in res[0]}
pred_idx = int(np.argmax([r['score'] for r in res[0]]))
label = 'defective' if pred_idx == 1 else 'clean'

print("pipeline raw:", res)
print({'label': label, 'score': res[0][pred_idx]['score']})


In [None]:
# Zip the entire folder
!zip -r final_graphcodebert_balanced_best.zip /content/final_graphcodebert_balanced_best

# Download the zip file to your local Mac
from google.colab import files
files.download('final_graphcodebert_balanced_best.zip')


  adding: content/final_graphcodebert_balanced_best/ (stored 0%)
  adding: content/final_graphcodebert_balanced_best/special_tokens_map.json (deflated 85%)
  adding: content/final_graphcodebert_balanced_best/merges.txt (deflated 53%)
  adding: content/final_graphcodebert_balanced_best/training_args.bin (deflated 53%)
  adding: content/final_graphcodebert_balanced_best/model.safetensors (deflated 7%)
  adding: content/final_graphcodebert_balanced_best/tokenizer.json (deflated 82%)
  adding: content/final_graphcodebert_balanced_best/tokenizer_config.json (deflated 76%)
  adding: content/final_graphcodebert_balanced_best/config.json (deflated 49%)
  adding: content/final_graphcodebert_balanced_best/vocab.json (deflated 59%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>