In [None]:
# Install the Hugging Face `datasets` package, which provides the `load_dataset` function imported below.
!pip install datasets



In [None]:
# Mount Google Drive at /content/drive; the training run below writes its
# checkpoints to a directory under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer
import numpy as np


In [None]:

class CodeGenerationDataset(Dataset):
    """Map-style dataset that turns (text, code) records into causal-LM examples.

    Each record is rendered as ``"{text} <CODE> {code}"``, tokenized, and padded
    or truncated to exactly ``max_length`` tokens.

    Args:
        data: Indexable, sized collection whose items expose ``'text'`` and
            ``'code'`` keys.
        tokenizer: Callable tokenizer that returns ``'input_ids'`` and
            ``'attention_mask'`` tensors with a leading batch dimension when
            called with ``return_tensors='pt'``.
        max_length: Fixed sequence length of every returned example.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of records in the underlying collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize record ``idx`` and return its tensors.

        Returns:
            dict: ``'input_ids'``, ``'attention_mask'`` and ``'labels'``, each a
            1-D tensor of length ``max_length``. ``labels`` equals ``input_ids``
            with every padding position replaced by ``-100``.
        """
        item = self.data[idx]
        text = item['text']
        code = item['code']

        # Prompt and target share one sequence, separated by a literal marker
        combined = f"{text} <CODE> {code}"

        encoding = self.tokenizer(
            combined,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # The tokenizer returns shape (1, max_length); drop the batch dimension
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        # Labels are deliberately NOT shifted here: the causal-LM head these
        # batches are fed to (AutoModelForCausalLM called with `labels=`)
        # shifts logits/labels by one position internally. Shifting here as
        # well would train the model to predict the token two steps ahead.
        labels = input_ids.clone()

        # -100 is the ignore index of the loss; padding must not contribute
        labels[attention_mask == 0] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

def create_dataloaders(dataset, tokenizer, batch_size=16, max_length=512, train_split=0.8):
    """Build the training and validation ``DataLoader`` pair from ``dataset['train']``.

    The first ``train_split`` fraction of ``dataset['train']`` (in stored order)
    becomes the training partition; the remaining rows become the validation
    partition.

    Args:
        dataset: Mapping with a ``'train'`` entry that supports ``len()`` and
            ``.select(indices)``.
        tokenizer: Tokenizer handed to each ``CodeGenerationDataset``.
        batch_size: Batch size used by both loaders.
        max_length: Token length handed to each ``CodeGenerationDataset``.
        train_split: Fraction of rows assigned to the training partition.

    Returns:
        tuple: ``(train_loader, val_loader)``; only the training loader shuffles.
    """
    full_split = dataset['train']
    total_rows = len(full_split)
    cutoff = int(total_rows * train_split)

    # Two contiguous, non-overlapping index ranges
    train_rows = full_split.select(range(cutoff))
    val_rows = full_split.select(range(cutoff, total_rows))

    def _make_loader(rows, shuffle):
        # Both loaders share every setting except shuffling
        return DataLoader(
            CodeGenerationDataset(rows, tokenizer, max_length),
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=2,
            pin_memory=True,
        )

    return _make_loader(train_rows, True), _make_loader(val_rows, False)

In [None]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from transformers import (
    AutoModelForCausalLM,
    get_scheduler,
    get_linear_schedule_with_warmup,
    AutoTokenizer
)
from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tqdm.auto import tqdm
import time
from datetime import datetime


In [None]:
class TrainingLogger:
    """Collects training metrics in memory and mirrors them to TensorBoard.

    Each instance writes to its own timestamped sub-directory of ``log_dir``.
    Values are kept in ``train_losses``, ``val_losses`` and ``learning_rates``;
    the step at which each value was logged is kept in the parallel lists
    ``train_steps``, ``val_steps`` and ``lr_steps`` so that series logged at
    different frequencies can be plotted on one shared step axis.

    Args:
        log_dir: Parent directory for the timestamped run directory.
    """

    def __init__(self, log_dir='runs'):
        self.log_dir = Path(log_dir) / datetime.now().strftime('%Y%m%d_%H%M%S')
        self.writer = SummaryWriter(self.log_dir)
        self.train_losses = []
        self.val_losses = []
        self.learning_rates = []
        # Step index recorded alongside each value in the lists above
        self.train_steps = []
        self.val_steps = []
        self.lr_steps = []

    def log_step(self, train_loss=None, val_loss=None, lr=None, step=0):
        """Record whichever metrics are not ``None`` at the given ``step``.

        Args:
            train_loss: Training loss to record, or ``None`` to skip.
            val_loss: Validation loss to record, or ``None`` to skip.
            lr: Learning rate to record, or ``None`` to skip.
            step: Step index passed to the writer and stored with each value.
        """
        if train_loss is not None:
            self.train_losses.append(train_loss)
            self.train_steps.append(step)
            self.writer.add_scalar('Loss/train', train_loss, step)

        if val_loss is not None:
            self.val_losses.append(val_loss)
            self.val_steps.append(step)
            self.writer.add_scalar('Loss/validation', val_loss, step)

        if lr is not None:
            self.learning_rates.append(lr)
            self.lr_steps.append(step)
            self.writer.add_scalar('Learning_rate', lr, step)

    @staticmethod
    def _x_axis(steps, values):
        """Return ``steps`` when it lines up with ``values``, else plain indices.

        The fallback keeps plotting usable if a value list was appended to
        directly instead of through ``log_step``.
        """
        if len(steps) == len(values):
            return steps
        return list(range(len(values)))

    def plot_metrics(self):
        """Save a three-panel summary figure as ``training_metrics.png`` in ``log_dir``.

        Panels: losses against step, learning rate against step, and a kernel
        density estimate of each loss series.
        """
        fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

        # Plot both losses against the step they were logged at, so a series
        # logged less often is not squeezed to the left of the shared axis.
        ax1.plot(self._x_axis(self.train_steps, self.train_losses),
                 self.train_losses, label='Train')
        ax1.plot(self._x_axis(self.val_steps, self.val_losses),
                 self.val_losses, marker='o', label='Validation')
        ax1.set_title('Losses')
        ax1.set_xlabel('Step')
        ax1.set_ylabel('Loss')
        ax1.legend()

        ax2.plot(self._x_axis(self.lr_steps, self.learning_rates), self.learning_rates)
        ax2.set_title('Learning Rate')
        ax2.set_xlabel('Step')

        # A density estimate needs at least two samples; skip shorter series
        drew_density = False
        for label, losses in (('Train', self.train_losses), ('Validation', self.val_losses)):
            if len(losses) > 1:
                sns.kdeplot(data=losses, ax=ax3, label=label)
                drew_density = True
        ax3.set_title('Loss Distribution')
        if drew_density:
            ax3.legend()

        fig.tight_layout()
        # Make sure the target directory exists before writing the image
        self.log_dir.mkdir(parents=True, exist_ok=True)
        fig.savefig(self.log_dir / 'training_metrics.png')
        plt.close(fig)

    def close(self):
        """Close the underlying summary writer."""
        self.writer.close()


In [None]:
class EarlyStopping:
    """Signals when validation loss has stopped improving and keeps the best weights.

    Args:
        patience: Number of consecutive non-improving calls after which
            ``__call__`` returns ``True``.
        min_delta: Amount by which the loss must drop below the best loss seen
            so far to count as an improvement.

    Attributes set during use:
        counter: Consecutive non-improving calls since the last improvement.
        best_loss: Lowest validation loss seen so far (``None`` before the first call).
        best_weights: Snapshot of the model's state dict taken at ``best_loss``.
    """

    def __init__(self, patience=7, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.best_weights = None

    @staticmethod
    def _snapshot(model):
        """Return an independent copy of ``model``'s state dict.

        ``state_dict()`` hands back tensors that share storage with the live
        parameters, and a shallow ``dict.copy()`` keeps that sharing, so the
        "best" weights would silently follow every later optimizer update.
        Cloning each tensor freezes the values as they are right now.
        """
        return {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

    def __call__(self, val_loss, model):
        """Update the tracker with a new validation loss.

        Args:
            val_loss: Validation loss of the current evaluation.
            model: Model whose weights are snapshotted on improvement.

        Returns:
            bool: ``True`` once ``patience`` consecutive calls have failed to
            improve on the best loss, otherwise ``False``.
        """
        improved = self.best_loss is None or val_loss < self.best_loss - self.min_delta
        if improved:
            self.best_loss = val_loss
            self.best_weights = self._snapshot(model)
            self.counter = 0
            return False

        self.counter += 1
        return self.counter >= self.patience

In [None]:
class CodeGenerationModel(nn.Module):
    """Wrapper around a pretrained causal language model with partial freezing.

    On construction the pretrained model is loaded and only the later
    transformer blocks plus the ``lm_head`` are left trainable.

    Args:
        model_name: Identifier passed to ``AutoModelForCausalLM.from_pretrained``.
        dropout_rate: Probability used to build ``self.dropout``.
    """

    def __init__(self, model_name="microsoft/CodeGPT-small-py", dropout_rate=0.1):
        super().__init__()
        self.model = AutoModelForCausalLM.from_pretrained(model_name)

        # NOTE(review): this layer is registered here but forward() never calls it
        self.dropout = nn.Dropout(dropout_rate)

        self._freeze_base_layers()

    def _freeze_base_layers(self, num_layers_to_freeze=6):
        """Leave only the later blocks and the output head trainable.

        Every parameter of ``self.model`` is frozen first; then the blocks
        ``self.model.transformer.h[num_layers_to_freeze:]`` and
        ``self.model.lm_head`` are switched back to trainable.
        """
        backbone = self.model

        for weight in backbone.parameters():
            weight.requires_grad = False

        # Re-enable gradients for the tail of the block stack, then the head
        trainable_parts = (
            backbone.transformer.h[num_layers_to_freeze:],
            backbone.lm_head,
        )
        for part in trainable_parts:
            for weight in part.parameters():
                weight.requires_grad = True

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Delegate to the wrapped model and return its output unchanged."""
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

In [None]:
def train_model(
    model,
    train_loader,
    val_loader,
    num_epochs=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    patience=7,
    save_dir='checkpoints',
    device=None
):
    """Fine-tune ``model`` with AdamW + OneCycleLR, checkpointing and early stopping.

    Each epoch runs one pass over ``train_loader`` (with gradient-norm clipping
    at 1.0 and a scheduler step per batch), then one no-grad pass over
    ``val_loader``. After every epoch a checkpoint is written to ``save_dir``;
    the checkpoint with the lowest average validation loss is additionally
    written as ``best_model.pt``.

    Args:
        model: Module whose forward accepts the batch dict as keyword arguments
            and returns an object with a ``.loss`` attribute.
        train_loader: Iterable of dict batches used for optimisation.
        val_loader: Iterable of dict batches used for validation.
        num_epochs: Maximum number of epochs.
        learning_rate: AdamW learning rate and OneCycleLR ``max_lr``.
        weight_decay: AdamW weight decay.
        patience: Patience passed to ``EarlyStopping``.
        save_dir: Directory for checkpoints; created (with parents) if missing.
        device: Target device; defaults to CUDA when available, else CPU.

    Returns:
        tuple: ``(model, logger)``. If early stopping triggers, the model is
        reloaded from ``early_stopping.best_weights`` before being returned.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # parents=True: a nested target (e.g. a fresh folder tree on a mounted
    # drive) must not fail just because intermediate directories are missing.
    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)

    logger = TrainingLogger()
    early_stopping = EarlyStopping(patience=patience)

    model = model.to(device)

    optimizer = AdamW(
        model.parameters(),
        lr=learning_rate,
        weight_decay=weight_decay
    )

    # One scheduler step per batch; total steps = num_epochs * len(train_loader)
    scheduler = OneCycleLR(
        optimizer,
        max_lr=learning_rate,
        epochs=num_epochs,
        steps_per_epoch=len(train_loader),
        pct_start=0.3,
        anneal_strategy='cos'
    )

    global_step = 0
    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")

        # ---- Training phase ----
        model.train()
        train_losses = []

        train_pbar = tqdm(train_loader, desc='Training')
        for batch in train_pbar:
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            scheduler.step()

            # Read the scalar once and reuse it for every consumer below
            loss_value = loss.item()
            train_losses.append(loss_value)
            current_lr = scheduler.get_last_lr()[0]

            logger.log_step(
                train_loss=loss_value,
                lr=current_lr,
                step=global_step
            )

            train_pbar.set_postfix({
                'loss': f"{loss_value:.4f}",
                'lr': f"{current_lr:.2e}"
            })

            global_step += 1

        # ---- Validation phase ----
        model.eval()
        val_losses = []

        with torch.no_grad():
            val_pbar = tqdm(val_loader, desc='Validation')
            for batch in val_pbar:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                loss_value = outputs.loss.item()
                val_losses.append(loss_value)

                val_pbar.set_postfix({'loss': f"{loss_value:.4f}"})

        avg_train_loss = np.mean(train_losses)
        avg_val_loss = np.mean(val_losses)

        logger.log_step(val_loss=avg_val_loss, step=global_step)

        print(f"\nAvg train loss: {avg_train_loss:.4f}")
        print(f"Avg validation loss: {avg_val_loss:.4f}")

        # Build the checkpoint payload once; it is shared by both save targets
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'train_loss': avg_train_loss,
            'val_loss': avg_val_loss,
        }

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(checkpoint, save_dir / 'best_model.pt')

        torch.save(checkpoint, save_dir / f'checkpoint_epoch_{epoch+1}.pt')

        if early_stopping(avg_val_loss, model):
            print("\nEarly stopping triggered!")
            model.load_state_dict(early_stopping.best_weights)
            break

    logger.plot_metrics()

    return model, logger

In [None]:
# Snippet-level Python configuration of the text-to-code dataset
dataset = load_dataset("codeparrot/xlcost-text-to-code", "Python-snippet-level")

# Tokenizer for the same checkpoint the model loads by default;
# the EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained("microsoft/CodeGPT-small-py")
tokenizer.pad_token = tokenizer.eos_token

# Train/validation loaders built from the dataset's 'train' split
train_loader, val_loader = create_dataloaders(
    dataset,
    tokenizer,
    batch_size=5,
    max_length=512,
)

# Build the model with its default checkpoint and fine-tune it;
# checkpoints are written to the mounted Drive folder.
model = CodeGenerationModel()
trained_model, logger = train_model(
    model,
    train_loader,
    val_loader,
    num_epochs=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    patience=7,
    save_dir='/content/drive/MyDrive/SIS421/NLP/CheckpointTranslator',
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



Epoch 1/10


Training:   0%|          | 0/8121 [00:00<?, ?it/s]