<a href="https://colab.research.google.com/github/OneFineStarstuff/OneFineStarstuff/blob/main/Data_Efficient_Training_with_Active_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer

def select_most_uncertain_samples(model, tokenizer, unlabeled_data, num_samples=10):
    """Return the tokenized inputs the model is least confident about.

    Every text is tokenized and scored on its own. The score is the negated
    mean, over all positions, of the largest softmax probability at that
    position, so a higher score means the model is less confident.

    Args:
        model: Callable taking ``input_ids`` and ``attention_mask`` keyword
            arguments and returning an object with a ``logits`` attribute.
            It must provide ``eval()``. If it exposes a ``device`` attribute,
            the input tensors are moved there before the forward pass.
        tokenizer: Callable turning a string into a mapping that holds
            ``"input_ids"`` and ``"attention_mask"`` tensors. It is called
            with ``return_tensors="pt"`` and ``truncation=True``.
        unlabeled_data: Iterable of raw text strings.
        num_samples: Maximum number of samples to return. ``None`` returns
            every scored sample; zero or a negative value returns ``[]``.

    Returns:
        A list of tokenizer outputs, most uncertain first. Samples with equal
        scores keep their input order. Texts that tokenize to zero tokens are
        skipped because they cannot be scored.

    Note:
        The model is switched to evaluation mode and left in that mode.
    """
    model.eval()

    # A negative value would otherwise be read as "drop the last k" by the
    # slice below and silently return the wrong subset.
    if num_samples is not None and num_samples <= 0:
        return []

    # Not every callable passed as ``model`` has a ``device``; skip the move
    # when it is absent.
    device = getattr(model, "device", None)

    scores = []
    candidates = []

    with torch.no_grad():
        for text in unlabeled_data:
            # Truncation keeps over-long texts within the tokenizer's
            # configured maximum length instead of failing in the model.
            encoded = tokenizer(text, return_tensors="pt", truncation=True)
            input_ids = encoded["input_ids"]
            attention_mask = encoded["attention_mask"]

            # Nothing to score: the mean over zero positions is undefined.
            if input_ids.numel() == 0:
                continue

            # Only the tensors used for scoring are moved; the stored
            # encoding is returned exactly as the tokenizer produced it.
            if device is not None:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            confidence = logits.softmax(dim=-1).max(dim=-1).values.mean()
            scores.append(-confidence.item())
            candidates.append(encoded)

    # Stable sort: equal scores keep their original relative order.
    ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
    return [candidates[i] for i in ranked[:num_samples]]

# Placeholder function for training the model on selected samples
def train_model_on_selected_samples(model, selected_samples, labels):
    """Skeleton of a training pass over the selected, labeled samples.

    Switches ``model`` to training mode and walks the samples paired
    position-by-position with ``labels``. Pairing stops at the shorter of the
    two sequences. No optimization step is performed yet; the loop body only
    pulls out the tensors a real training step would need.

    Args:
        model: Object providing ``train()``.
        selected_samples: Iterable of mappings holding ``"input_ids"`` and
            ``"attention_mask"`` entries.
        labels: Iterable of labels, matched to ``selected_samples`` by position.
    """
    model.train()
    for sample, target in zip(selected_samples, labels):
        token_ids, mask = sample["input_ids"], sample["attention_mask"]
        # TODO: run the forward pass, compute the loss against ``target``,
        # back-propagate and step the optimizer here.

# Example usage: run a fixed number of select-then-train rounds over a small
# in-memory pool of texts.
unlabeled_data = [
    "This is an example sentence.",
    "Another example sentence here.",
    # Add more unlabeled data here
]

# Manually assigned labels, listed in the same order as `unlabeled_data`.
# NOTE(review): select_most_uncertain_samples returns samples ordered by
# uncertainty, not by input order, and the training function pairs samples
# with labels by position, so a label here can end up attached to a different
# text than intended. Confirm how labels should be matched to the selection.
labels = [0, 1]  # Manually labeled examples

# Define the number of active learning iterations
active_learning_iterations = 5

# Define your model and tokenizer (e.g., GPT-2)
# from_pretrained loads the named checkpoint; "gpt2" is the default example.
model_name = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE(review): every round passes the same `unlabeled_data` and asks for 2
# samples from a pool of 2, so each round selects the whole pool again;
# selected samples are never removed from the pool.
for _ in range(active_learning_iterations):
    selected_samples = select_most_uncertain_samples(model, tokenizer, unlabeled_data, num_samples=2)
    train_model_on_selected_samples(model, selected_samples, labels)

print("Active learning complete!")