In [None]:
# Check GPU availability and specs
!nvidia-smi

import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")

In [None]:
# Fix numpy binary incompatibility first
!pip uninstall -y numpy
!pip install numpy==1.26.4 --force-reinstall

# Install required packages with compatible versions for Colab
!pip install -q transformers==4.46.0
!pip install -q datasets==3.2.0
!pip install -q accelerate==1.2.0
!pip install -q sentencepiece==0.2.0
!pip install -q scikit-learn==1.6.0
!pip install -q tokenizers==0.21.0

# For H100 optimizations
!pip install -q torch-tb-profiler
# Flash attention installation (may take a few minutes)
!pip install -q flash-attn --no-build-isolation

print("âœ“ All packages installed successfully!")

In [None]:
# Choose where checkpoints and results are written.
# On Colab, Google Drive is mounted so artifacts outlive the runtime; anywhere else
# a local folder is used, so later cells always find a valid OUTPUT_DIR.
import os

try:
    from google.colab import drive
except ImportError:
    # Not running on Colab: keep outputs in the current working directory.
    OUTPUT_DIR = os.path.abspath('code_plagiarism_model')
else:
    drive.mount('/content/drive')
    OUTPUT_DIR = '/content/drive/MyDrive/code_plagiarism_model'

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Models will be saved to: {OUTPUT_DIR}")

In [None]:
# Restart runtime after package installation to avoid conflicts
# Click Runtime > Restart Runtime in Colab menu, then run from this cell onwards

# Import libraries
# Deep-learning stack: tensors, layers, functional ops and batching utilities.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
# Hugging Face helpers: tokenizer/model/config loaders, LR schedule and seeding.
from transformers import (
    AutoTokenizer, 
    AutoModel,
    AutoConfig,
    get_linear_schedule_with_warmup,
    set_seed
)
from datasets import load_dataset
# Classification metrics used by the evaluation cell.
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import json
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation and numerical warnings raised by the libraries above.
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
set_seed(42)

print("All libraries imported successfully!")

## Configuration (GPU-Adaptive, Fast-Training Mode)

In [None]:
# GPU-adaptive configuration for fast training (30 min target)
import os


class Config:
    """Hyper-parameters and runtime switches read by the later cells.

    Defaults are declared as class attributes. ``__init__`` copies them onto the
    instance so that ``vars(config)`` returns the full settings dictionary; with
    class attributes alone ``vars(config)`` is empty, which made the printout
    below and every saved ``'config'`` entry empty.
    """

    # Model configuration
    model_name = "microsoft/codebert-base"  # Pre-trained on code
    max_length = 256  # Reduced from 512 for faster processing

    # Adaptive GPU optimizations - Works on T4 (15GB), V100 (16GB), A100 (40/80GB)
    batch_size = 32  # Conservative for 15GB GPU (T4/Colab)
    gradient_accumulation_steps = 4  # Effective batch = 128
    effective_batch_size = batch_size * gradient_accumulation_steps  # 128

    # Training configuration - FAST MODE
    learning_rate = 3e-5  # Slightly higher for faster convergence
    num_epochs = 2  # Reduced from 5 for speed
    warmup_ratio = 0.05  # Reduced warmup
    weight_decay = 0.01
    max_grad_norm = 1.0

    # Mixed precision optimizations
    fp16 = True   # Use FP16 for memory efficiency
    bf16 = False  # Alternative autocast dtype; takes precedence in the autocast calls when True
    tf32 = True   # Enable TF32 if available
    use_flash_attention = False  # Disabled for compatibility
    gradient_checkpointing = False  # Disabled for speed

    # Data loading optimization
    num_workers = 2  # Reduced to save memory
    prefetch_factor = 2  # Reduced prefetching
    pin_memory = True

    # Output
    # Falls back to a local folder when the Drive-mount cell (marked optional) was skipped.
    output_dir = globals().get('OUTPUT_DIR', './code_plagiarism_model')
    logging_steps = 50
    eval_steps = 500
    save_steps = 1000
    save_total_limit = 3

    # Advanced
    optim = "adamw_torch_fused"  # Fused optimizer when available
    ddp_find_unused_parameters = False

    def __init__(self):
        # Mirror every public, non-callable class-level default onto the instance
        # (base classes first, so subclass overrides win).
        for klass in reversed(type(self).__mro__):
            for name, value in vars(klass).items():
                if not name.startswith('_') and not callable(value):
                    setattr(self, name, value)


config = Config()

# Make sure the output location exists even when it is the local fallback.
os.makedirs(config.output_dir, exist_ok=True)

# Enable TF32 if available (H100/A100)
if config.tf32 and torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

# Clear CUDA cache to free memory
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("Configuration:")
for key, value in vars(config).items():
    if not key.startswith('_'):
        print(f"  {key}: {value}")

# GPU info
if torch.cuda.is_available():
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"\n💻 GPU Memory: {gpu_memory:.1f} GB")
    print("⚡ Optimized for your GPU size")

## Load Dataset from HuggingFace

We'll use the BigCloneBench dataset, which contains pairs of code snippets, each labeled as a clone or a non-clone.

In [None]:
# Load the code clone detection dataset
print("Loading dataset from HuggingFace...")

# Hub ids tried in order: the original id first, then the organisation-namespaced
# copy of the same BigCloneBench data.
# The former POJ-104 fallback was removed: it is not a pairwise dataset (no
# func1/func2 columns, non-binary labels), so the cells below could not use it.
DATASET_CANDIDATES = (
    "code_x_glue_cc_clone_detection_big_clone_bench",
    "google/code_x_glue_cc_clone_detection_big_clone_bench",
)

dataset = None
load_errors = []
for dataset_id in DATASET_CANDIDATES:
    try:
        dataset = load_dataset(dataset_id, "default")
        print(f"Loaded BigCloneBench dataset ({dataset_id})")
        break
    except Exception as e:  # hub, network or format failure: report it and try the next id
        print(f"Error loading {dataset_id}: {e}")
        load_errors.append(f"{dataset_id}: {e}")

if dataset is None:
    raise RuntimeError(
        "Could not load BigCloneBench from any known dataset id:\n  "
        + "\n  ".join(load_errors)
    )

# Fail early if the columns the Dataset wrapper reads are not present.
train_columns = set(dataset['train'].column_names)
has_pair = {'func1', 'func2'} <= train_columns or {'code1', 'code2'} <= train_columns
if 'label' not in train_columns or not has_pair:
    raise ValueError(
        f"Unexpected dataset schema {sorted(train_columns)}; "
        "expected 'label' plus 'func1'/'func2' (or 'code1'/'code2')."
    )

print(f"Train samples: {len(dataset['train'])}")
print(f"Validation samples: {len(dataset['validation'])}")
print(f"Test samples: {len(dataset['test'])}")

# Show example
print("\nDataset example:")
print(dataset['train'][0])

In [None]:
# FAST TRAINING MODE: Use subset for 30-minute training
USE_FULL_DATASET = False  # Set to True for full training (will take much longer)

# Upper bound on the number of examples kept per split in fast mode.
SUBSET_SIZES = {'train': 50000, 'validation': 5000, 'test': 5000}
SUBSET_SEED = 42  # fixed so the same subset is drawn on every run

if not USE_FULL_DATASET:
    print("⚡ FAST TRAINING MODE: Using subset for 30-min training...")
    for split_name, split_cap in SUBSET_SIZES.items():
        split_size = min(split_cap, len(dataset[split_name]))
        # Shuffle before selecting: taking the first N rows is only a fair sample
        # if the stored order is random, which is not guaranteed.
        dataset[split_name] = (
            dataset[split_name].shuffle(seed=SUBSET_SEED).select(range(split_size))
        )
    print(f"Reduced train samples: {len(dataset['train'])}")
    print(f"Reduced validation samples: {len(dataset['validation'])}")
    print(f"Reduced test samples: {len(dataset['test'])}")
    print("💡 To use full dataset, set USE_FULL_DATASET = True (will take hours)")

# Analyze label distribution
# Read only the label column instead of materialising every full row.
train_labels = [int(label) for label in dataset['train']['label']]
num_train = len(train_labels)
num_clones = sum(train_labels)
print("\nLabel distribution in training set:")
if num_train == 0:
    print("  (training split is empty)")
else:
    print(f"  Clones (1): {num_clones} ({num_clones/num_train*100:.2f}%)")
    print(f"  Non-clones (0): {num_train-num_clones} ({(1-num_clones/num_train)*100:.2f}%)")

## Load Tokenizer and Model

In [None]:
# Build the tokenizer that belongs to the configured pre-trained checkpoint
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(config.model_name)

# Register extra marker tokens on top of the base vocabulary.
# NOTE(review): none of the visible cells insert these markers into the code text;
# confirm they are still needed.
special_tokens = dict(
    additional_special_tokens=['<CODE>', '</CODE>', '<FUNC>', '</FUNC>'],
)
tokenizer.add_special_tokens(special_tokens)

_vocab_total = len(tokenizer)
print(f"Tokenizer loaded. Vocab size: {_vocab_total}")

In [None]:
# Clear GPU memory before model initialization
import gc

# Collect unreachable Python objects first so the tensors they hold are released;
# only then can empty_cache() hand those blocks back to the driver.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("🧹 Cleared GPU cache")

    # Show available memory (all values in GiB)
    gpu_memory_total = torch.cuda.get_device_properties(0).total_memory / 1024**3
    gpu_memory_reserved = torch.cuda.memory_reserved(0) / 1024**3
    gpu_memory_allocated = torch.cuda.memory_allocated(0) / 1024**3
    print(f"GPU Memory: {gpu_memory_allocated:.2f} GB allocated / {gpu_memory_total:.2f} GB total")
    print(f"GPU Memory reserved by PyTorch: {gpu_memory_reserved:.2f} GB")

# Define the Siamese Network for Code Clone Detection
class CodeCloneDetector(nn.Module):
    """Siamese network using CodeBERT for code clone detection.

    Both snippets go through the same encoder. The first-token embeddings
    ``e1`` and ``e2`` are combined as ``[e1, e2, |e1 - e2|, e1 * e2]`` and fed
    to an MLP that outputs two logits (non-clone / clone).

    Args:
        model_name: Hugging Face checkpoint id or path for the encoder.
        hidden_size: Encoder hidden width; must match the checkpoint's config.
        dropout: Dropout probability used inside the classification head.
        vocab_size: Size the token-embedding matrix is resized to. When omitted,
            ``len(tokenizer)`` of the notebook-level ``tokenizer`` is used, which
            is what the class did before this parameter existed.

    Raises:
        ValueError: If ``hidden_size`` disagrees with the checkpoint's config.
    """

    def __init__(self, model_name, hidden_size=768, dropout=0.1, vocab_size=None):
        super().__init__()

        # Load pre-trained CodeBERT
        model_config = AutoConfig.from_pretrained(model_name)
        encoder_width = getattr(model_config, 'hidden_size', hidden_size)
        if encoder_width != hidden_size:
            # Fail here with a clear message instead of a shape error in forward().
            raise ValueError(
                f"hidden_size={hidden_size} does not match the encoder's "
                f"hidden size {encoder_width} for '{model_name}'"
            )
        self.encoder = AutoModel.from_pretrained(model_name, config=model_config)

        # Resize embeddings if we added special tokens
        if vocab_size is None:
            vocab_size = len(tokenizer)  # notebook-level tokenizer (backward compatible)
        self.encoder.resize_token_embeddings(vocab_size)

        # Gradient checkpointing disabled for speed in fast training mode
        # if config.gradient_checkpointing:
        #     self.encoder.gradient_checkpointing_enable()

        # Classification head (layer order is part of the checkpoint format)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size * 4, 512),  # Concatenate + element-wise ops
            nn.LayerNorm(512),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(512, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 2)  # Binary classification
        )

    def encode(self, input_ids, attention_mask):
        """Return the encoder's first-position hidden state for each sequence."""
        outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Use [CLS] token representation
        return outputs.last_hidden_state[:, 0, :]

    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2, labels=None):
        """Score a batch of snippet pairs.

        Returns:
            dict with ``'logits'`` (one row of two scores per pair), ``'loss'``
            (cross-entropy against ``labels``, or ``None`` when no labels are
            given) and ``'embeddings'`` (the two pooled embeddings).
        """
        # Encode both code snippets with the shared encoder
        emb1 = self.encode(input_ids1, attention_mask1)
        emb2 = self.encode(input_ids2, attention_mask2)

        # Compute similarity features
        diff = torch.abs(emb1 - emb2)  # Element-wise difference
        prod = emb1 * emb2  # Element-wise product

        # Concatenate all features
        features = torch.cat([emb1, emb2, diff, prod], dim=1)

        # Classification
        logits = self.classifier(features)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)

        return {
            'loss': loss,
            'logits': logits,
            'embeddings': (emb1, emb2)
        }

# Initialize model
print("Initializing model...")
# Pick the device the same way the training cell does, so the notebook also runs
# (slowly) on a CPU-only runtime instead of failing in `.cuda()`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CodeCloneDetector(config.model_name)
model = model.to(device)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

## Data Preprocessing and Loading

In [None]:
class CodeCloneDataset(Dataset):
    """Custom Dataset for code clone detection.

    Wraps an indexable collection of records and tokenizes both snippets of a
    pair on access.

    Args:
        hf_dataset: Indexable collection of dict-like records. Each record needs
            a ``'label'`` and the two snippets under ``'func1'``/``'func2'`` (or
            ``'code1'``/``'code2'``).
        tokenizer: Callable accepting ``(text, max_length=..., padding=...,
            truncation=..., return_tensors=...)`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors whose first
            dimension is 1.
        max_length: Length every sequence is padded/truncated to.
    """

    # Accepted column names for the first and second snippet, in priority order.
    _CODE_KEYS = (('func1', 'code1'), ('func2', 'code2'))

    def __init__(self, hf_dataset, tokenizer, max_length=512):
        self.data = hf_dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _pick_code(item, candidates):
        """Return the first present snippet column; raise if none exists."""
        for key in candidates:
            if key in item:
                return item[key]
        # Previously a missing column silently became '' and the model trained on
        # empty inputs; surface the schema problem instead.
        raise KeyError(f"record has none of the expected code columns {candidates}")

    def _encode(self, code):
        """Tokenize one snippet to fixed-length tensors without the batch dimension."""
        encoding = self.tokenizer(
            code,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # squeeze(0) drops only the batch dimension (a bare squeeze() would also
        # collapse the sequence dimension when max_length == 1).
        return encoding['input_ids'].squeeze(0), encoding['attention_mask'].squeeze(0)

    def __getitem__(self, idx):
        """Return the tokenized pair and its label as a dict of tensors.

        Raises:
            KeyError: If the record lacks a snippet column or ``'label'``.
        """
        item = self.data[idx]

        first_keys, second_keys = self._CODE_KEYS
        input_ids1, attention_mask1 = self._encode(self._pick_code(item, first_keys))
        input_ids2, attention_mask2 = self._encode(self._pick_code(item, second_keys))

        return {
            'input_ids1': input_ids1,
            'attention_mask1': attention_mask1,
            'input_ids2': input_ids2,
            'attention_mask2': attention_mask2,
            # int() accepts both boolean and integer labels
            'labels': torch.tensor(int(item['label']), dtype=torch.long)
        }

# Create datasets
print("Creating datasets...")
train_dataset = CodeCloneDataset(dataset['train'], tokenizer, config.max_length)
val_dataset = CodeCloneDataset(dataset['validation'], tokenizer, config.max_length)
test_dataset = CodeCloneDataset(dataset['test'], tokenizer, config.max_length)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")


def _make_loader(split_dataset, shuffle, persistent):
    """Build a DataLoader for one split from the shared ``config`` settings.

    Args:
        split_dataset: Dataset to batch.
        shuffle: Whether to reshuffle every epoch.
        persistent: Keep worker processes alive between epochs (only applied
            when worker processes are used).
    """
    loader_kwargs = dict(
        batch_size=config.batch_size,
        shuffle=shuffle,
        num_workers=config.num_workers,
        # Pinned host memory only helps host-to-GPU copies.
        pin_memory=config.pin_memory and torch.cuda.is_available(),
    )
    if config.num_workers > 0:
        # DataLoader rejects these two options when num_workers == 0.
        loader_kwargs['prefetch_factor'] = config.prefetch_factor
        loader_kwargs['persistent_workers'] = persistent
    return DataLoader(split_dataset, **loader_kwargs)


# Create dataloaders
train_loader = _make_loader(train_dataset, shuffle=True, persistent=True)
val_loader = _make_loader(val_dataset, shuffle=False, persistent=True)
# The test loader is used once, so its workers do not need to persist.
test_loader = _make_loader(test_dataset, shuffle=False, persistent=False)

print("Dataloaders created successfully!")

## Training Setup

In [None]:
# Setup optimizer and scheduler
from torch.optim import AdamW
from torch.cuda.amp import GradScaler, autocast

# Calculate total steps.
# Optimizer updates happen per accumulation window within an epoch, so count the
# windows per epoch (rounded up) and multiply by the number of epochs. Rounding up
# keeps the schedule from ever being shorter than the updates actually performed.
accum_steps = config.gradient_accumulation_steps
updates_per_epoch = (len(train_loader) + accum_steps - 1) // accum_steps
total_steps = updates_per_epoch * config.num_epochs
warmup_steps = int(total_steps * config.warmup_ratio)

print(f"Total training steps: {total_steps}")
print(f"Warmup steps: {warmup_steps}")

# Initialize optimizer, preferring the fused implementation when it is supported
try:
    optimizer = AdamW(
        model.parameters(),
        lr=config.learning_rate,
        weight_decay=config.weight_decay,
        fused=True  # Fused optimizer for speed
    )
    print("✓ Using fused AdamW optimizer")
except (RuntimeError, TypeError, ValueError):
    # Fallback if fused is not available (unsupported device or torch version).
    # A bare `except:` would also swallow KeyboardInterrupt.
    optimizer = AdamW(
        model.parameters(),
        lr=config.learning_rate,
        weight_decay=config.weight_decay
    )
    print("✓ Using standard AdamW optimizer")

# Learning rate scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

# Loss scaling for FP16 mixed precision; a disabled scaler is a pass-through
scaler = GradScaler(enabled=config.fp16)

print("\n⚡ Training optimizations:")
print(f"  - Batch size: {config.batch_size}")
print(f"  - Sequence length: {config.max_length}")
print(f"  - Mixed precision: {'FP16' if config.fp16 else 'BF16' if config.bf16 else 'FP32'}")
print(f"  - Gradient checkpointing: {config.gradient_checkpointing}")
print("  - Expected time per epoch: ~15 minutes")
print("\nOptimizer and scheduler initialized!")

In [None]:
# Evaluation function
def evaluate(model, dataloader, device):
    """Evaluate model on validation/test set.

    Args:
        model: Pair classifier returning a dict with ``'loss'`` and ``'logits'``.
        dataloader: Yields batches with ``input_ids1/2``, ``attention_mask1/2``
            and ``labels`` tensors.
        device: Device the batches are moved to.

    Returns:
        dict of plain floats: ``loss`` (mean over batches), ``accuracy``,
        ``precision``, ``recall``, ``f1`` (positive class = 1) and ``auc``.
        ``auc`` is ``nan`` when the labels contain a single class, because
        ROC-AUC is undefined in that case.

    Raises:
        ValueError: If the dataloader yields no batches.
    """
    model.eval()
    total_loss = 0.0
    all_preds = []
    all_labels = []
    all_probs = []

    amp_dtype = torch.bfloat16 if config.bf16 else torch.float16
    amp_enabled = config.bf16 or config.fp16

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            # Move to device
            input_ids1 = batch['input_ids1'].to(device)
            attention_mask1 = batch['attention_mask1'].to(device)
            input_ids2 = batch['input_ids2'].to(device)
            attention_mask2 = batch['attention_mask2'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass under the same mixed-precision setting as training
            with autocast(dtype=amp_dtype, enabled=amp_enabled):
                outputs = model(
                    input_ids1, attention_mask1,
                    input_ids2, attention_mask2,
                    labels
                )

            total_loss += outputs['loss'].item()

            # Get predictions; softmax in float32 so half-precision logits do not
            # lose resolution in the probabilities used for AUC.
            logits = outputs['logits'].float()
            probs = F.softmax(logits, dim=-1)
            preds = torch.argmax(logits, dim=-1)

            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
            all_probs.extend(probs[:, 1].cpu().tolist())  # Probability of being a clone

    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError("evaluate() received an empty dataloader")

    # Calculate metrics
    avg_loss = total_loss / num_batches
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='binary', zero_division=0
    )
    # roc_auc_score raises when only one class is present in the labels.
    if len(set(all_labels)) < 2:
        auc = float('nan')
    else:
        auc = roc_auc_score(all_labels, all_probs)

    return {
        'loss': float(avg_loss),
        'accuracy': float(accuracy),
        'precision': float(precision),
        'recall': float(recall),
        'f1': float(f1),
        'auc': float(auc)
    }

print("Evaluation function ready!")

## Training Loop (H100 Optimized)

In [None]:
# Training function
def train_epoch(model, dataloader, optimizer, scheduler, device, epoch):
    """Train for one epoch with gradient accumulation and optional FP16 scaling.

    Reads ``config`` (accumulation steps, precision flags, clipping norm) and the
    notebook-level ``scaler``.

    Args:
        model: Pair classifier returning a dict with a ``'loss'`` entry.
        dataloader: Yields batches with ``input_ids1/2``, ``attention_mask1/2``
            and ``labels`` tensors.
        optimizer: Optimizer updated once per accumulation window.
        scheduler: LR scheduler stepped after every optimizer update.
        device: Device the batches are moved to.
        epoch: Zero-based epoch index (used for the progress-bar label).

    Returns:
        Mean un-scaled training loss over the batches of this epoch.

    Raises:
        ValueError: If the dataloader yields no batches.
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError("train_epoch() received an empty dataloader")

    model.train()
    total_loss = 0.0
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}")

    accum_steps = config.gradient_accumulation_steps
    use_scaler = config.fp16 and scaler is not None
    amp_dtype = torch.bfloat16 if config.bf16 else torch.float16
    amp_enabled = config.bf16 or config.fp16

    def _apply_update():
        """Clip the accumulated gradients, step optimizer and scheduler, reset grads."""
        if use_scaler:
            # Gradients must be unscaled before clipping against max_grad_norm.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
        else:
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
            optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    optimizer.zero_grad()

    for step, batch in enumerate(progress_bar):
        # Move to device
        input_ids1 = batch['input_ids1'].to(device, non_blocking=True)
        attention_mask1 = batch['attention_mask1'].to(device, non_blocking=True)
        input_ids2 = batch['input_ids2'].to(device, non_blocking=True)
        attention_mask2 = batch['attention_mask2'].to(device, non_blocking=True)
        labels = batch['labels'].to(device, non_blocking=True)

        # Forward pass with mixed precision
        with autocast(dtype=amp_dtype, enabled=amp_enabled):
            outputs = model(
                input_ids1, attention_mask1,
                input_ids2, attention_mask2,
                labels
            )
            # Scale so the gradients summed over a window average the batches
            loss = outputs['loss'] / accum_steps

        # Backward pass
        if use_scaler:
            scaler.scale(loss).backward()
        else:
            loss.backward()

        # Read the loss once per step (each .item() is a device synchronisation)
        batch_loss = loss.item() * accum_steps
        total_loss += batch_loss

        # Update weights at the end of every accumulation window, and also after
        # the final batch so a trailing partial window is not thrown away.
        window_complete = (step + 1) % accum_steps == 0
        last_batch = (step + 1) == num_batches
        if window_complete or last_batch:
            _apply_update()

        # Update progress bar
        progress_bar.set_postfix({
            'loss': f"{batch_loss:.4f}",
            'lr': f"{scheduler.get_last_lr()[0]:.2e}"
        })

    return total_loss / num_batches

print("Training function ready!")

In [None]:
# Main training loop
print("\n" + "="*50)
print("Starting Training")
print("="*50)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Settings stored with each best-model checkpoint. Class-level defaults are merged
# with instance attributes because vars(config) alone can be empty when the
# settings live on the class.
config_snapshot = {
    **{k: v for k, v in vars(type(config)).items()
       if not k.startswith('_') and not callable(v)},
    **vars(config),
}

# Start below any achievable F1 so the first epoch always produces a checkpoint;
# with an initial 0, a run whose F1 stays at 0 would leave best_model_path = None.
best_f1 = -1.0
best_model_path = None
training_history = {
    'train_loss': [],
    'val_loss': [],
    'val_accuracy': [],
    'val_f1': [],
    'val_auc': []
}

for epoch in range(config.num_epochs):
    print(f"\n{'='*50}")
    print(f"Epoch {epoch + 1}/{config.num_epochs}")
    print(f"{'='*50}")

    # Train
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, device, epoch)
    print(f"\nTrain Loss: {train_loss:.4f}")

    # Evaluate
    val_metrics = evaluate(model, val_loader, device)

    print("\nValidation Metrics:")
    print(f"  Loss: {val_metrics['loss']:.4f}")
    print(f"  Accuracy: {val_metrics['accuracy']:.4f}")
    print(f"  Precision: {val_metrics['precision']:.4f}")
    print(f"  Recall: {val_metrics['recall']:.4f}")
    print(f"  F1 Score: {val_metrics['f1']:.4f}")
    print(f"  AUC-ROC: {val_metrics['auc']:.4f}")

    # Save history
    training_history['train_loss'].append(train_loss)
    training_history['val_loss'].append(val_metrics['loss'])
    training_history['val_accuracy'].append(val_metrics['accuracy'])
    training_history['val_f1'].append(val_metrics['f1'])
    training_history['val_auc'].append(val_metrics['auc'])

    # Save best model
    if val_metrics['f1'] > best_f1:
        best_f1 = val_metrics['f1']
        best_model_path = os.path.join(config.output_dir, f'best_model_epoch_{epoch+1}.pt')
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'val_metrics': val_metrics,
            'config': config_snapshot
        }, best_model_path)
        print(f"\n✓ New best model saved! F1: {best_f1:.4f}")

    # Save checkpoint every second epoch
    if (epoch + 1) % 2 == 0:
        checkpoint_path = os.path.join(config.output_dir, f'checkpoint_epoch_{epoch+1}.pt')
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'training_history': training_history
        }, checkpoint_path)
        print(f"Checkpoint saved to {checkpoint_path}")

print("\n" + "="*50)
print("Training Complete!")
print("="*50)
print(f"Best F1 Score: {best_f1:.4f}")
print(f"Best model saved to: {best_model_path}")

## Visualize Training Progress

In [None]:
import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# One entry per panel: (axes, y-label, title, curves).
# Each curve is (history key, legend label, extra line keyword arguments).
_panels = [
    (axes[0, 0], 'Loss', 'Training and Validation Loss', [
        ('train_loss', 'Train Loss', {'marker': 'o'}),
        ('val_loss', 'Val Loss', {'marker': 's'}),
    ]),
    (axes[0, 1], 'Accuracy', 'Validation Accuracy', [
        ('val_accuracy', 'Accuracy', {'marker': 'o', 'color': 'green'}),
    ]),
    (axes[1, 0], 'F1 Score', 'Validation F1 Score', [
        ('val_f1', 'F1 Score', {'marker': 'o', 'color': 'orange'}),
    ]),
    (axes[1, 1], 'AUC-ROC', 'Validation AUC-ROC', [
        ('val_auc', 'AUC-ROC', {'marker': 'o', 'color': 'red'}),
    ]),
]

for _ax, _ylabel, _title, _curves in _panels:
    for _key, _legend, _line_kwargs in _curves:
        _ax.plot(training_history[_key], label=_legend, **_line_kwargs)
    _ax.set_xlabel('Epoch')
    _ax.set_ylabel(_ylabel)
    _ax.set_title(_title)
    _ax.legend()
    _ax.grid(True)

plt.tight_layout()
# Persist the figure next to the checkpoints before showing it inline.
_figure_path = os.path.join(config.output_dir, 'training_history.png')
plt.savefig(_figure_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"Training visualization saved!")

## Test Set Evaluation

In [None]:
# Load best model
print("Loading best model for final evaluation...")
if best_model_path is None:
    raise RuntimeError(
        "No best-model checkpoint is available (best_model_path is None); "
        "run the training cell so that a checkpoint is saved."
    )
# NOTE: weights_only=False unpickles arbitrary objects; only load checkpoints
# written by this notebook.
checkpoint = torch.load(best_model_path, map_location=device, weights_only=False)
model.load_state_dict(checkpoint['model_state_dict'])

# Evaluate on test set
print("\nEvaluating on test set...")
test_metrics = evaluate(model, test_loader, device)

print("\n" + "="*50)
print("Final Test Set Results")
print("="*50)
print(f"Loss: {test_metrics['loss']:.4f}")
print(f"Accuracy: {test_metrics['accuracy']:.4f}")
print(f"Precision: {test_metrics['precision']:.4f}")
print(f"Recall: {test_metrics['recall']:.4f}")
print(f"F1 Score: {test_metrics['f1']:.4f}")
print(f"AUC-ROC: {test_metrics['auc']:.4f}")


def _to_jsonable(value):
    """Recursively convert numpy scalars and containers into plain JSON types."""
    if isinstance(value, dict):
        return {str(k): _to_jsonable(v) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return [_to_jsonable(v) for v in value]
    if isinstance(value, np.generic):
        return value.item()
    return value


# Save test results
test_results = {
    'test_metrics': test_metrics,
    'training_history': training_history,
    # Merge class-level defaults with instance attributes: vars(config) alone can
    # be empty when the settings are declared on the class.
    'config': {
        **{k: v for k, v in vars(type(config)).items()
           if not k.startswith('_') and not callable(v)},
        **vars(config),
    },
    'best_epoch': checkpoint['epoch']
}

with open(os.path.join(config.output_dir, 'test_results.json'), 'w') as f:
    # The old top-level isinstance check never matched because every value is a
    # container; convert recursively and keep default=str as a last resort.
    json.dump(_to_jsonable(test_results), f, indent=2, default=str)

print(f"\nTest results saved to {config.output_dir}/test_results.json")

## Save Model and Tokenizer for Deployment

In [None]:
# Save final model in deployment format
from datetime import date

final_model_dir = os.path.join(config.output_dir, 'final_model')
os.makedirs(final_model_dir, exist_ok=True)

print("Saving model for deployment...")

# Save model weights together with the constructor arguments needed to rebuild it
torch.save({
    'model_state_dict': model.state_dict(),
    'model_config': {
        'model_name': config.model_name,
        'hidden_size': 768,
        'dropout': 0.1
    },
    'test_metrics': test_metrics
}, os.path.join(final_model_dir, 'model.pt'))

# Save tokenizer (includes the added special tokens)
tokenizer.save_pretrained(final_model_dir)

# Save configuration
config_dict = {
    'model_name': config.model_name,
    'max_length': config.max_length,
    'version': '1.0.0',
    # Record the actual run date instead of a hard-coded one.
    'training_date': date.today().isoformat(),
    'test_f1': float(test_metrics['f1']),
    'test_accuracy': float(test_metrics['accuracy'])
}

with open(os.path.join(final_model_dir, 'config.json'), 'w') as f:
    json.dump(config_dict, f, indent=2)

print(f"\n✓ Model saved to: {final_model_dir}")
print("\nDeployment files:")
print("  - model.pt (model weights)")
print("  - config.json (model configuration)")
print("  - tokenizer files")

## Inference Example

In [None]:
def predict_clone(code1, code2, model, tokenizer, device, max_length=512):
    """Classify a pair of code snippets as clone / not clone.

    Args:
        code1, code2: Source snippets to compare.
        model: Pair classifier; called with the two encodings' ids and masks and
            expected to return a dict containing ``'logits'``.
        tokenizer: Callable producing an encoding with ``input_ids`` and
            ``attention_mask`` that supports ``.to(device)``.
        device: Device the encodings are moved to.
        max_length: Length each snippet is padded/truncated to.

    Returns:
        dict with ``'prediction'`` (``'Clone'`` or ``'Not Clone'``),
        ``'clone_probability'`` and ``'confidence'`` (the larger of the two
        class probabilities).
    """
    model.eval()

    def _tokenize(snippet):
        # Same fixed-length settings for both sides of the pair
        encoded = tokenizer(
            snippet,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return encoded.to(device)

    first = _tokenize(code1)
    second = _tokenize(code2)

    with torch.no_grad():
        outputs = model(
            first['input_ids'],
            first['attention_mask'],
            second['input_ids'],
            second['attention_mask']
        )

    class_probs = F.softmax(outputs['logits'], dim=-1)
    clone_prob = class_probs[0, 1].item()

    if clone_prob > 0.5:
        verdict = 'Clone'
    else:
        verdict = 'Not Clone'

    return dict(
        prediction=verdict,
        clone_probability=clone_prob,
        confidence=max(clone_prob, 1 - clone_prob),
    )

# Example usage
code_snippet_1 = """
def calculate_sum(numbers):
    total = 0
    for num in numbers:
        total += num
    return total
"""

code_snippet_2 = """
def sum_numbers(arr):
    result = 0
    for x in arr:
        result = result + x
    return result
"""

code_snippet_3 = """
def factorial(n):
    if n <= 1:
        return 1
    return n * factorial(n - 1)
"""

# (heading, first snippet, second snippet)
inference_examples = [
    ("Example 1: Similar functions", code_snippet_1, code_snippet_2),
    ("\nExample 2: Different functions", code_snippet_1, code_snippet_3),
]

for heading, snippet_a, snippet_b in inference_examples:
    print(heading)
    # Use the sequence length the model was trained with rather than
    # predict_clone's default of 512.
    result = predict_clone(
        snippet_a, snippet_b, model, tokenizer, device,
        max_length=config.max_length,
    )
    print(f"Prediction: {result['prediction']}")
    print(f"Clone Probability: {result['clone_probability']:.4f}")
    print(f"Confidence: {result['confidence']:.4f}")

## Export to ONNX (Optional - for Production Deployment)

In [None]:
# Export model to ONNX format for faster inference
print("Exporting model to ONNX format...")


class _LogitsOnly(nn.Module):
    """Adapter exposing only the logits tensor of the pair classifier.

    The classifier's forward returns a dict that contains a ``None`` loss and a
    tuple of embeddings; exporting a single tensor output keeps the exported
    graph's outputs in line with ``output_names=['logits']``.
    """

    def __init__(self, detector):
        super().__init__()
        self.detector = detector

    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2):
        return self.detector(
            input_ids1, attention_mask1, input_ids2, attention_mask2
        )['logits']


try:
    # Export in inference mode so dropout is disabled in the traced graph
    model.eval()
    export_model = _LogitsOnly(model).to(device)

    # Create dummy inputs; the mask is integer-typed like the tokenizer's output
    dummy_input_ids = torch.randint(0, len(tokenizer), (1, config.max_length)).to(device)
    dummy_attention_mask = torch.ones((1, config.max_length), dtype=torch.long).to(device)

    # Export
    onnx_path = os.path.join(final_model_dir, 'model.onnx')
    with torch.no_grad():
        torch.onnx.export(
            export_model,
            (dummy_input_ids, dummy_attention_mask, dummy_input_ids, dummy_attention_mask),
            onnx_path,
            input_names=['input_ids1', 'attention_mask1', 'input_ids2', 'attention_mask2'],
            output_names=['logits'],
            dynamic_axes={
                'input_ids1': {0: 'batch_size'},
                'attention_mask1': {0: 'batch_size'},
                'input_ids2': {0: 'batch_size'},
                'attention_mask2': {0: 'batch_size'},
                'logits': {0: 'batch_size'}
            },
            opset_version=14
        )
    print(f"✓ ONNX model exported to: {onnx_path}")
except Exception as e:
    # Deliberately best-effort: the export is optional, so report and continue.
    print(f"ONNX export failed: {e}")
    print("This is optional and doesn't affect the trained model.")

## Summary and Next Steps

In [None]:
# Assemble the whole report first, then emit it line by line.
_rule = "=" * 70
_summary_lines = [
    _rule,
    "TRAINING SUMMARY",
    _rule,
    # Data and model
    f"\nModel: {config.model_name}",
    f"Training samples: {len(train_dataset):,}",
    f"Validation samples: {len(val_dataset):,}",
    f"Test samples: {len(test_dataset):,}",
    # Optimisation settings
    f"\nTotal epochs: {config.num_epochs}",
    f"Batch size: {config.batch_size}",
    f"Effective batch size: {config.effective_batch_size}",
    f"Learning rate: {config.learning_rate}",
    # Results
    f"\nBest validation F1: {best_f1:.4f}",
    "\nFinal Test Metrics:",
    f"  Accuracy: {test_metrics['accuracy']:.4f}",
    f"  Precision: {test_metrics['precision']:.4f}",
    f"  Recall: {test_metrics['recall']:.4f}",
    f"  F1 Score: {test_metrics['f1']:.4f}",
    f"  AUC-ROC: {test_metrics['auc']:.4f}",
    f"\nModel saved to: {final_model_dir}",
    "\n" + _rule,
    # Follow-up actions
    "\nNEXT STEPS:",
    "1. Download the model files from Google Drive",
    "2. Integrate with your backend API",
    "3. Test on your own code samples",
    "4. Fine-tune further if needed on domain-specific data",
    "5. Deploy to production environment",
    _rule,
]

for _summary_line in _summary_lines:
    print(_summary_line)