In [1]:
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import DataLoader


# Step 1: Load the Dataset
def load_arrow_dataset():
    """
    Load train and evaluation datasets stored in Hugging Face's dataset format (.arrow).
    """
    train_data = load_from_disk("./train")
    eval_data = load_from_disk("./eval")
    return train_data, eval_data


# Step 2: Initialize Labeled and Unlabeled Pools
def initialize_pools(train_data, initial_size=0.01):
    """
    Split the training dataset into an initial labeled pool and an unlabeled pool.

    Args:
    - train_data: The full training dataset.
    - initial_size: Proportion of data to use as the labeled pool initially.

    Returns:
    - labeled_data: The initial labeled pool.
    - unlabeled_data: The remaining unlabeled pool.
    """
    labeled_data = train_data.shuffle(seed=42).select(range(int(len(train_data) * initial_size)))
    unlabeled_data = train_data.select(range(int(len(train_data) * initial_size), len(train_data)))
    return labeled_data, unlabeled_data


# Step 3: Define Margin Sampling for Active Learning
def margin_sampling(model, tokenizer, unlabeled_data, n_samples=50):
    """
    Perform margin sampling to select the most informative samples from the unlabeled pool.

    Args:
    - model: The pre-trained model.
    - tokenizer: The tokenizer for text inputs.
    - unlabeled_data: The current unlabeled dataset.
    - n_samples: The number of samples to select.

    Returns:
    - selected_samples: A dataset containing the most informative samples.
    """
    model.eval()
    dataloader = DataLoader(
        unlabeled_data, batch_size=256, shuffle=False, num_workers=4, pin_memory=True
    )
    uncertainties = []

    with torch.no_grad():
        for batch in dataloader:
            inputs = tokenizer(batch["problem"], padding=True, truncation=True, return_tensors="pt").to("cuda")
            outputs = model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)
            margins = probs.topk(2, dim=1).values
            uncertainties.extend((margins[:, 0] - margins[:, 1]).cpu().numpy())

    selected_indices = sorted(range(len(uncertainties)), key=lambda i: uncertainties[i])[:n_samples]
    return unlabeled_data.select(selected_indices)


# Step 4: Define Dataset Class for Fine-Tuning
class RewardDataset(torch.utils.data.Dataset):
    """
    Custom PyTorch dataset class to handle tokenized data for fine-tuning.
    """
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        inputs = self.tokenizer(item["problem"], padding="max_length", truncation=True, return_tensors="pt")
        labels = 1  # Assuming "accepted" responses are the preferred ones
        return {**{k: v.squeeze(0) for k, v in inputs.items()}, "labels": torch.tensor(labels)}


# Step 5: Fine-Tune the Model
def fine_tune_model(model, tokenizer, labeled_data, eval_data):
    """
    Fine-tune the reward model on the labeled dataset.

    Args:
    - model: The pre-trained model.
    - tokenizer: The tokenizer for text inputs.
    - labeled_data: The labeled dataset for training.
    - eval_data: The evaluation dataset.

    Returns:
    - trainer: The Trainer object after training.
    """
    train_dataset = RewardDataset(labeled_data, tokenizer)
    eval_dataset = RewardDataset(eval_data, tokenizer)

    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=64,  # Increased for H100
        gradient_accumulation_steps=1,  # No accumulation needed on H100
        num_train_epochs=3,
        weight_decay=0.01,
        save_total_limit=2,
        fp16=True,  # Mixed precision for better performance
        logging_dir="./logs",
        report_to="tensorboard"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )
    trainer.train()
    return trainer


# Step 6: Iterative Active Learning Process
def active_learning_process(model, tokenizer, labeled_data, unlabeled_data, eval_data, iterations=5, n_samples=50):
    """
    Perform the active learning process with multiple iterations.

    Args:
    - model: The pre-trained model.
    - tokenizer: The tokenizer for text inputs.
    - labeled_data: The initial labeled dataset.
    - unlabeled_data: The initial unlabeled dataset.
    - eval_data: The evaluation dataset.
    - iterations: Number of active learning iterations to perform.
    - n_samples: Number of samples to select in each iteration.

    Returns:
    - trainer: The Trainer object after the final iteration.
    """
    for iteration in range(iterations):
        print(f"Active Learning Iteration {iteration + 1}/{iterations}")

        # Perform margin sampling to select the most informative samples
        selected_samples = margin_sampling(model, tokenizer, unlabeled_data, n_samples)

        # Update labeled and unlabeled pools
        labeled_data = Dataset.from_pandas(pd.concat([labeled_data.to_pandas(), selected_samples.to_pandas()]))
        unlabeled_data = unlabeled_data.filter(lambda example: example not in selected_samples)

        # Fine-tune the model with the updated labeled pool
        trainer = fine_tune_model(model, tokenizer, labeled_data, eval_data)

        # Evaluate the model
        metrics = trainer.evaluate()
        print(f"Metrics after iteration {iteration + 1}: {metrics}")

        # Save the model for this iteration
        trainer.save_model(f"./models/iteration_{iteration + 1}")

    return trainer


# Main Function to Run the Process
def main():
    """
    Main function to execute the entire active learning pipeline.
    """
    # Step 1: Load the dataset
    train_data, eval_data = load_arrow_dataset()

    # Step 2: Initialize labeled and unlabeled pools
    labeled_data, unlabeled_data = initialize_pools(train_data)

    # Step 3: Load the pre-trained model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")
    model = AutoModelForSequenceClassification.from_pretrained("bert-large-uncased")
    model.to("cuda")  # Ensure the model is on the GPU

    # Step 4: Perform the active learning process
    trainer = active_learning_process(model, tokenizer, labeled_data, unlabeled_data, eval_data)

    # Step 5: Save the final model
    model.save_pretrained("./final_model")
    tokenizer.save_pretrained("./final_model")


if __name__ == "__main__":
    main()

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
Loading cached shuffled indices for dataset at /Users/novaxs/Documents/Deep Learning/math-data/math-dataset/train/cache-b3d132b6767da97b.arrow


Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AssertionError: Torch not compiled with CUDA enabled