<a href="https://colab.research.google.com/github/ShamaSharma/SVD/blob/main/CodeBert_on_Devign(Short_samples).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# STEP 1: Fix Current Code and Get Baseline Results
!pip install transformers datasets evaluate scikit-learn accelerate

import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizer,  # Changed from AutoTokenizer for CodeBERT
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import numpy as np
import evaluate
from sklearn.metrics import confusion_matrix
from google.colab import files

# Pick the compute device: CUDA when PyTorch reports a usable GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load your data
import os
import re

print("Upload devign files")
uploaded = files.upload()

# Colab does not overwrite an existing file: a re-uploaded
# "devign_0-512_train.json" is stored as "devign_0-512_train (2).json", while
# load_dataset below reads the unsuffixed names and would silently pick up the
# stale earlier copy. Write every uploaded payload back to its canonical
# (unsuffixed) filename so the files just uploaded are the ones that get loaded.
# NOTE(review): relies on files.upload() returning a {filename: bytes} mapping,
# as described in the Colab documentation.
for uploaded_name, payload in uploaded.items():
    canonical_name = re.sub(
        r" \(\d+\)(?=\.[^./]+$)", "", os.path.basename(uploaded_name)
    )
    with open(canonical_name, "wb") as data_file:
        data_file.write(payload)

print("Loading dataset...")
try:
    dataset = load_dataset("json", data_files={
        "train": "devign_0-512_train.json",
        "validation": "devign_0-512_validate.json",
        # "test": "devign_0-512_test.json"  # COMMENTED OUT FOR INITIAL VALIDATION RUN
    })
    print("Dataset loaded successfully!")

    # Validate dataset structure
    print("Dataset structure:")
    print("Train columns:", dataset["train"].column_names)
    print("Sample from train:", dataset["train"][0])

    # Check if we have the right columns
    train_columns = dataset["train"].column_names

    # Handle different column naming conventions; code_column / label_column
    # are read by the tokenization and model-setup steps further down.
    if "func" in train_columns and "target" in train_columns:
        print("✓ Using standard Devign format: 'func' and 'target'")
        code_column = "func"
        label_column = "target"
    elif "input" in train_columns and "output" in train_columns:
        print("✓ Using alternative format: 'input' and 'output'")
        code_column = "input"
        label_column = "output"
    else:
        print("ERROR: Unsupported column format")
        print(f"Available columns: {train_columns}")
        print("Expected either ['func', 'target'] or ['input', 'output']")
        raise ValueError("Dataset format not supported")

    print("✓ Dataset validation passed")
    print(f"Using columns: code='{code_column}', labels='{label_column}'")
    print(f"Train samples: {len(dataset['train'])}")
    print(f"Validation samples: {len(dataset['validation'])}")

except Exception as e:
    # Report the failure in the cell output, then let it abort the run.
    print(f"Error loading dataset: {e}")
    raise

# Tokenizer setup: the CodeBERT checkpoint is loaded through RobertaTokenizer
# (this notebook previously used AutoTokenizer with unixcoder-base).
model_name = "microsoft/codebert-base"
print(f"Loading tokenizer: {model_name}")

try:
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
except Exception as load_error:
    # Surface the problem in the cell output, then abort the run.
    print(f"Error loading tokenizer: {load_error}")
    raise
else:
    print("✓ Tokenizer loaded successfully")

def tokenize_function(examples):
    """Tokenize one batch of examples with the module-level ``tokenizer``.

    The code strings are read from the column named by the module-level
    ``code_column``. Sequences are truncated to at most 512 tokens and are
    left unpadded here; padding is applied later, per batch, by the collator.
    """
    source_snippets = examples[code_column]
    encoding_options = {
        "truncation": True,
        "max_length": 512,
        "padding": False,
    }
    return tokenizer(source_snippets, **encoding_options)

print("Tokenizing datasets...")
try:
    # Tokenize every split in batches of 1000 examples, then expose the label
    # column under the name "labels".
    tokenized_datasets = dataset.map(
        tokenize_function,
        batched=True,
        batch_size=1000,
    ).rename_column(label_column, "labels")
    print("✓ Tokenization completed successfully")

    # Sanity-check the first training example.
    sample = tokenized_datasets["train"][0]
    input_length = len(sample["input_ids"])
    print(f"Tokenized sample - input_ids length: {input_length}")
    print(f"Labels: {sample['labels']}")

except Exception as tokenization_error:
    print(f"Error during tokenization: {tokenization_error}")
    raise

# Pads each batch when it is collated, since tokenization left sequences unpadded.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# CRITICAL FIX 4: Enhanced metrics for binary classification with FPR
def compute_metrics(eval_pred):
    """Compute binary-classification metrics, including FPR, for the Trainer.

    Class 1 is treated as the positive class. Every metric is derived from the
    four confusion counts using NumPy only, so no metric script is loaded on
    each evaluation, and an evaluation set that contains a single class still
    yields all four counts.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is reduced with
            ``argmax`` over its last axis to obtain the predicted class;
            ``labels`` holds the 0/1 reference labels.

    Returns:
        dict with the float entries ``accuracy``, ``precision``, ``recall``,
        ``f1``, ``fpr`` and ``tpr``, and the int entries ``true_positives``,
        ``false_positives``, ``true_negatives`` and ``false_negatives``.
        A ratio whose denominator is zero is reported as 0.0. If the
        computation itself raises, the error is printed and the same keys are
        returned with zero values.
    """
    def _ratio(numerator, denominator):
        # Undefined ratios (empty denominator) are reported as 0.0.
        return float(numerator) / float(denominator) if denominator > 0 else 0.0

    try:
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        references = np.asarray(labels)

        predicted_positive = predictions == 1
        actual_positive = references == 1

        tp = int(np.sum(predicted_positive & actual_positive))
        fp = int(np.sum(predicted_positive & ~actual_positive))
        fn = int(np.sum(~predicted_positive & actual_positive))
        tn = int(np.sum(~predicted_positive & ~actual_positive))

        precision = _ratio(tp, tp + fp)
        recall = _ratio(tp, tp + fn)

        return {
            "accuracy": _ratio(tp + tn, tp + fp + tn + fn),
            "precision": precision,
            "recall": recall,
            "f1": _ratio(2 * precision * recall, precision + recall),
            "fpr": _ratio(fp, fp + tn),
            "tpr": recall,  # true-positive rate is the same quantity as recall
            "true_positives": tp,
            "false_positives": fp,
            "true_negatives": tn,
            "false_negatives": fn,
        }
    except Exception as e:
        # Best-effort fallback: keep the run alive but report the failure, and
        # return every key so downstream code reading the metrics finds them.
        print(f"Error computing metrics: {e}")
        return {
            "accuracy": 0.0,
            "precision": 0.0,
            "recall": 0.0,
            "f1": 0.0,
            "fpr": 0.0,
            "tpr": 0.0,
            "true_positives": 0,
            "false_positives": 0,
            "true_negatives": 0,
            "false_negatives": 0,
        }

# Model setup: build a sequence-classification head sized to the label set.
print("Loading model...")
try:
    # The number of classes is the count of distinct values in the training
    # label column.
    distinct_labels = set(dataset["train"][label_column])
    num_labels = len(distinct_labels)
    print(f"Number of labels: {num_labels}")

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
    ).to(device)
    print(f"✓ Model loaded and moved to {device}")

    # Report how much GPU memory the model occupies, when a GPU is present.
    if torch.cuda.is_available():
        allocated_gb = torch.cuda.memory_allocated() / 1024**3
        print(f"GPU Memory - Allocated: {allocated_gb:.2f} GB")
        torch.cuda.empty_cache()

except Exception as model_error:
    print(f"Error loading model: {model_error}")
    raise

# CRITICAL FIX 6: Optimized training setup for GPU memory
# Mixed precision is only requested when a CUDA device is present: the device
# selection earlier in this cell explicitly allows a CPU fallback, and an
# unconditional fp16=True makes the run fail on a CPU-only runtime.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",           # Must match eval_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    metric_for_best_model="f1",      # "f1" is produced by compute_metrics
    learning_rate=2e-5,              # Match paper
    per_device_train_batch_size=8,   # Reduced from 16 for GPU memory
    per_device_eval_batch_size=16,   # Keep higher for evaluation
    gradient_accumulation_steps=2,   # Simulate batch size 16 (8*2)
    num_train_epochs=5,              # More epochs as specified
    weight_decay=0.01,
    logging_steps=50,
    warmup_steps=500,
    fp16=use_fp16,                   # Mixed precision for memory savings (GPU only)
    dataloader_pin_memory=False,     # Fix pin_memory warning
    dataloader_num_workers=0,        # Reduce CPU overhead
    report_to=[],                    # Disable wandb
)

print("Training Arguments:")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"Mixed precision: {training_args.fp16}")

# Initialize trainer
import inspect

print("Initializing trainer...")
try:
    # Recent transformers releases accept the tokenizer as `processing_class`
    # and deprecate the `tokenizer` keyword; older releases only know
    # `tokenizer`. The pip install at the top of this cell is unpinned, so use
    # whichever keyword the installed Trainer actually declares.
    trainer_parameters = inspect.signature(Trainer.__init__).parameters
    if "processing_class" in trainer_parameters:
        tokenizer_keyword = "processing_class"
    else:
        tokenizer_keyword = "tokenizer"

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        **{tokenizer_keyword: tokenizer},
    )
    print("✓ Trainer initialized successfully")
except Exception as e:
    # Report the failure in the cell output, then let it abort the run.
    print(f"Error initializing trainer: {e}")
    raise

# Run a quick sanity evaluation, then fine-tune the model.
banner = "=" * 50
print("\n" + banner)
print("STARTING CODEBERT TRAINING")
print(banner)

try:
    # Evaluate the untrained classifier on at most 100 validation examples to
    # confirm the evaluation path works before the long training run.
    print("Running pre-training validation...")
    validation_split = tokenized_datasets["validation"]
    subset_size = min(100, len(validation_split))
    small_val = validation_split.select(range(subset_size))
    pre_results = trainer.evaluate(eval_dataset=small_val)
    print(f"Pre-training F1: {pre_results.get('eval_f1', 0):.4f}")

    # Full fine-tuning run.
    print("Starting full training...")
    trainer.train()
    print("✓ Training completed successfully!")

except RuntimeError as runtime_error:
    ran_out_of_memory = "out of memory" in str(runtime_error).lower()
    if not ran_out_of_memory:
        print(f"Training error: {runtime_error}")
    else:
        print(f"GPU Out of Memory: {runtime_error}")
        print("Try reducing batch size further or using CPU")
        torch.cuda.empty_cache()
    raise
except Exception as training_error:
    print(f"Training failed: {training_error}")
    raise

# Final validation results
print("\n" + "="*50)
print("FINAL VALIDATION RESULTS")
print("="*50)

try:
    # Evaluate on the validation split the trainer was constructed with.
    val_results = trainer.evaluate()
    print("Validation Results:")
    for key, value in val_results.items():
        if key.startswith('eval_'):
            print(f"{key.replace('eval_', '').upper()}: {value:.4f}")

    print("\nCodeBERT on Devign - Validation Results:")
    print(f"F1-Score: {val_results['eval_f1']:.3f}")
    print(f"Accuracy: {val_results['eval_accuracy']:.3f}")
    print(f"FPR: {val_results['eval_fpr']:.3f}")

    # Save model and tokenizer side by side so the directory can be reloaded.
    print("\nSaving model...")
    trainer.save_model("./codebert_devign_model")
    tokenizer.save_pretrained("./codebert_devign_model")
    print("✓ Model saved successfully")

except Exception as e:
    # Re-raise like every other stage of this cell; otherwise a failed
    # evaluation or save would still be followed by the "completed" banner.
    print(f"Error during evaluation: {e}")
    raise

# The string literal below is disabled code (held-out test evaluation); it is
# an expression statement with no effect until its contents are uncommented.
"""
# CRITICAL FIX 7: Test evaluation commented out for initial run
# UNCOMMENT WHEN READY FOR FINAL TESTING:

print("\n" + "="*50)
print("LOADING TEST SET FOR FINAL EVALUATION")
print("="*50)

# Load test set
test_dataset = load_dataset("json", data_files={"test": "devign_0-512_test.json"})
test_tokenized = test_dataset.map(tokenize_function, batched=True)
test_tokenized = test_tokenized.rename_column(label_column, "labels")  # Use dynamic column name

# Evaluate on test set
test_results = trainer.evaluate(eval_dataset=test_tokenized["test"])
print("FINAL TEST RESULTS:")
for key, value in test_results.items():
    if key.startswith('eval_'):
        print(f"{key.replace('eval_', '').upper()}: {value:.4f}")

print(f"\nTarget from paper (CodeBERT on Devign): F1~0.62")
print(f"Your result: F1={test_results['eval_f1']:.3f}")
"""

print("\n" + "="*50)
print("TRAINING PIPELINE COMPLETED!")
print("Uncomment test evaluation section when ready for final testing")
print("="*50)

Using device: cuda
Upload devign files


Saving devign_0-512_validate.json to devign_0-512_validate (2).json
Saving devign_0-512_train.json to devign_0-512_train (2).json
Saving devign_0-512_test.json to devign_0-512_test (2).json
Loading dataset...
Dataset loaded successfully!
Dataset structure:
Train columns: ['input', 'output']
Sample from train: {'input': 'static void common_end(FFV1Context *s){\n\n    int i;\n\n\n\n    for(i=0; i<s->plane_count; i++){\n\n        PlaneContext *p= &s->plane[i];\n\n\n\n        av_freep(&p->state);\n\n    }\n\n}\n', 'output': 1}
✓ Using alternative format: 'input' and 'output'
✓ Dataset validation passed
Using columns: code='input', labels='output'
Train samples: 15376
Validation samples: 1922
Loading tokenizer: microsoft/codebert-base


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

✓ Tokenizer loaded successfully
Tokenizing datasets...


Map:   0%|          | 0/15376 [00:00<?, ? examples/s]

Map:   0%|          | 0/1922 [00:00<?, ? examples/s]

✓ Tokenization completed successfully
Tokenized sample - input_ids length: 91
Labels: 1
Loading model...
Number of labels: 2


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ Model loaded and moved to cuda
GPU Memory - Allocated: 0.47 GB
Training Arguments:
Effective batch size: 16
Mixed precision: True
Initializing trainer...
✓ Trainer initialized successfully

STARTING CODEBERT TRAINING
Running pre-training validation...


  trainer = Trainer(


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Pre-training F1: 0.0000
Starting full training...


Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy,Precision,Recall,F1,Fpr,Tpr,True Positives,False Positives,True Negatives,False Negatives
1,0.6238,0.630135,0.0045,0.633715,0.942857,0.15942,0.272727,0.007313,0.15942,132,8,1086,696
2,0.5916,0.583238,0.0045,0.671176,0.687023,0.434783,0.532544,0.149909,0.434783,360,164,930,468
3,0.5297,0.62621,0.0045,0.682102,0.725572,0.421498,0.533231,0.120658,0.421498,349,132,962,479
4,0.4399,0.725172,0.0045,0.688345,0.697755,0.487923,0.574271,0.159963,0.487923,404,175,919,424


Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy,Precision,Recall,F1,Fpr,Tpr,True Positives,False Positives,True Negatives,False Negatives
1,0.6238,0.630135,0.0045,0.633715,0.942857,0.15942,0.272727,0.007313,0.15942,132,8,1086,696
2,0.5916,0.583238,0.0045,0.671176,0.687023,0.434783,0.532544,0.149909,0.434783,360,164,930,468
3,0.5297,0.62621,0.0045,0.682102,0.725572,0.421498,0.533231,0.120658,0.421498,349,132,962,479
4,0.4399,0.725172,0.0045,0.688345,0.697755,0.487923,0.574271,0.159963,0.487923,404,175,919,424
