<a href="https://colab.research.google.com/github/ShamaSharma/SVD/blob/main/Unixcoder_on_Devign(short)_from_paper_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# STEP 1: Fix Current Code and Get Baseline Results
!pip install transformers datasets evaluate
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import numpy as np
import evaluate
from sklearn.metrics import confusion_matrix
from google.colab import files

# Upload the three Devign JSON split files into the Colab working directory.
print("Upload devign files")
uploaded = files.upload()

# Build the train / validation / test splits from the uploaded JSON files.
dataset = load_dataset(
    "json",
    data_files=dict(
        train="devign_0-512_train.json",
        validation="devign_0-512_validate.json",
        test="devign_0-512_test.json",
    ),
)

# Pretrained checkpoint name; reused for the tokenizer here and for the
# sequence-classification model later in the script.
model_name = "microsoft/unixcoder-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the ``input`` field of a batch of examples.

    Sequences are truncated to at most 512 tokens. No padding is added
    here; batches are padded later by the data collator.

    Args:
        examples: A batch mapping that contains an ``"input"`` entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    encode_options = {
        "truncation": True,
        "max_length": 512,
        "padding": False,  # padding is deferred to collation time
    }
    source_texts = examples["input"]
    return tokenizer(source_texts, **encode_options)

# Tokenize every split in batches. All original columns except the label
# column ("output") are dropped, and that column is then renamed to "labels".
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=[name for name in dataset["train"].column_names if name != "output"],
).rename_column("output", "labels")

# Collator that pads the examples of each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Binary-classification metrics reported after every evaluation pass.
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall, F1 and FPR for binary predictions.

    Class ``1`` is the positive class. All five metrics are derived from a
    single set of confusion counts computed with numpy, so nothing has to be
    loaded per call and the function also works when only one class is
    present in the evaluated data. Any ratio whose denominator is zero is
    reported as ``0.0``.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class is the
            arg-max of ``logits`` over the last axis.

    Returns:
        dict with float values under the keys ``"accuracy"``,
        ``"precision"``, ``"recall"``, ``"f1"`` and ``"fpr"``.

    Raises:
        ValueError: If any label or prediction is not 0 or 1.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Binary metrics are only meaningful for 0/1 targets; fail loudly rather
    # than silently counting other values as negatives.
    if not (np.isin(labels, (0, 1)).all() and np.isin(predictions, (0, 1)).all()):
        raise ValueError(
            "compute_metrics expects binary labels and predictions (0 or 1)"
        )

    predicted_positive = predictions == 1
    actual_positive = labels == 1

    tp = int(np.sum(predicted_positive & actual_positive))
    fp = int(np.sum(predicted_positive & ~actual_positive))
    fn = int(np.sum(~predicted_positive & actual_positive))
    tn = int(np.sum(~predicted_positive & ~actual_positive))

    def _ratio(numerator, denominator):
        # Guard against empty denominators (e.g. no predicted positives).
        return numerator / denominator if denominator > 0 else 0.0

    return {
        "accuracy": _ratio(tp + tn, tp + fp + fn + tn),
        "precision": _ratio(tp, tp + fp),
        "recall": _ratio(tp, tp + fn),
        "f1": _ratio(2 * tp, 2 * tp + fp + fn),
        "fpr": _ratio(fp, fp + tn),
    }

# Model and training configuration.
# The classifier head gets one output per distinct label value found in the
# training split.
num_labels = len(set(dataset["train"]["output"]))
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate and checkpoint once per epoch so the best checkpoint (by
    # validation F1) can be reloaded when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    learning_rate=2e-5,              # Match paper
    per_device_train_batch_size=16,  # Larger if GPU allows
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=50,
    warmup_steps=500,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `tokenizer=` is deprecated for Trainer.__init__; `processing_class=` is
    # its replacement and takes the same tokenizer object.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train and evaluate
# Fine-tunes the model; validation metrics are computed at the end of every
# epoch because the training arguments set eval_strategy="epoch".
trainer.train()

# NOTE(review): everything below is wrapped in a bare triple-quoted string, so
# none of it executes -- the test split is never evaluated by this cell. As the
# last expression of the notebook cell, the string is only echoed back as the
# cell's output. Remove the surrounding quotes to actually report test-set
# metrics; confirm with the author whether disabling this was intentional.
"""
# CRITICAL FIX 4: Actually evaluate on test set!
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print("\n" + "="*50)
print("FINAL TEST RESULTS")
print("="*50)
for key, value in test_results.items():
    if key.startswith('eval_'):
        print(f"{key.replace('eval_', '').upper()}: {value:.4f}")

print(f"\nTarget from paper (UniXcoder/CodeBERT on Devign): F1~0.61")
print(f"Your result: F1={test_results['eval_f1']:.3f}")
"""


Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5
Upload devign files


Saving devign_0-512_validate.json to devign_0-512_validate.json
Saving devign_0-512_train.json to devign_0-512_train.json
Saving devign_0-512_test.json to devign_0-512_test.json


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Map:   0%|          | 0/15376 [00:00<?, ? examples/s]

Map:   0%|          | 0/1922 [00:00<?, ? examples/s]

Map:   0%|          | 0/1923 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/691 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/504M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/unixcoder-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/504M [00:00<?, ?B/s]

  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mshama-aus[0m ([33mshama-aus-nit-kurukshetra[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Fpr
1,0.6003,0.573206,0.67898,0.685413,0.471014,0.558339,0.16362
2,0.5247,0.556222,0.708117,0.704441,0.555556,0.621202,0.176417
3,0.4599,0.651686,0.701873,0.740113,0.474638,0.578366,0.126143
4,0.3188,0.779111,0.691467,0.631874,0.679952,0.655032,0.299817


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Fpr
1,0.6003,0.573206,0.67898,0.685413,0.471014,0.558339,0.16362
2,0.5247,0.556222,0.708117,0.704441,0.555556,0.621202,0.176417
3,0.4599,0.651686,0.701873,0.740113,0.474638,0.578366,0.126143
4,0.3188,0.779111,0.691467,0.631874,0.679952,0.655032,0.299817
5,0.2272,0.91254,0.693028,0.656992,0.601449,0.627995,0.23766


'\n# CRITICAL FIX 4: Actually evaluate on test set!\ntest_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])\nprint("\n" + "="*50)\nprint("FINAL TEST RESULTS")\nprint("="*50)\nfor key, value in test_results.items():\n    if key.startswith(\'eval_\'):\n        print(f"{key.replace(\'eval_\', \'\').upper()}: {value:.4f}")\n\nprint(f"\nTarget from paper (UniXcoder/CodeBERT on Devign): F1~0.61")\nprint(f"Your result: F1={test_results[\'eval_f1\']:.3f}")\n'