In [1]:
!pip install evaluate accelerate



In [2]:
import argparse
import json
import numpy as np
import torch
import gc
import os
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    default_data_collator,
    pipeline,
)
import evaluate
from transformers import logging
logging.set_verbosity_error()

# Force cleanup and set environment variables
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"

def cleanup_memory():
    """Aggressive memory cleanup"""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

cleanup_memory()
print("Libraries imported and memory cleaned")

2025-09-07 14:42:33.422423: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757256153.444794     352 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757256153.451435     352 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Libraries imported and memory cleaned


In [3]:
def prepare_train_features(examples, tokenizer, max_length=256, doc_stride=64):
    """
    Simplified tokenization with smaller sequences to reduce memory usage
    """
    # Tokenize with smaller sequences
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
        return_tensors=None,  # Don't return tensors to save memory
    )

    # Map features back to examples
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(i)
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        
        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Find context tokens
            token_start_index = 0
            while token_start_index < len(sequence_ids) and sequence_ids[token_start_index] != 1:
                token_start_index += 1

            token_end_index = len(input_ids) - 1
            while token_end_index >= 0 and sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            # Check bounds
            if (token_start_index >= len(offsets) or token_end_index >= len(offsets) or
                token_start_index > token_end_index or
                not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char)):
                start_positions.append(cls_index)
                end_positions.append(cls_index)
            else:
                # Find token positions
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_positions.append(max(0, token_start_index - 1))

                while token_end_index >= 0 and offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_positions.append(min(len(input_ids) - 1, token_end_index + 1))

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

def flatten_squad_data(dataset, max_samples=None):
    """
    Flatten SQuAD data with optional sample limiting
    """
    flattened_data = {
        "id": [],
        "title": [],
        "context": [],
        "question": [],
        "answers": []
    }
    
    sample_count = 0
    for article in dataset:
        if max_samples and sample_count >= max_samples:
            break
            
        title = article["title"]
        for paragraph in article["paragraphs"]:
            if max_samples and sample_count >= max_samples:
                break
                
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                if max_samples and sample_count >= max_samples:
                    break
                    
                flattened_data["id"].append(qa["id"])
                flattened_data["title"].append(title)
                flattened_data["context"].append(context)
                flattened_data["question"].append(qa["question"])
                flattened_data["answers"].append({
                    "text": [answer["text"] for answer in qa["answers"]],
                    "answer_start": [answer["answer_start"] for answer in qa["answers"]]
                })
                sample_count += 1
    
    return flattened_data

print("Data preparation functions defined")

Data preparation functions defined


In [4]:
print("Loading datasets...")

# Start with a smaller subset for testing
USE_SMALL_DATASET = True
MAX_TRAIN_SAMPLES = 5000 if USE_SMALL_DATASET else None
MAX_VAL_SAMPLES = 500 if USE_SMALL_DATASET else None

try:
    train_raw = load_dataset(
        "json",
        data_files="/kaggle/input/stanford-question-answering-dataset/train-v1.1.json",
        field="data"
    )["train"]

    val_raw = load_dataset(
        "json", 
        data_files="/kaggle/input/stanford-question-answering-dataset/dev-v1.1.json",
        field="data"
    )["train"]
    
    print(f"Loaded {len(train_raw)} training articles and {len(val_raw)} validation articles")
    
except Exception as e:
    print(f"Error loading Kaggle datasets: {e}")
    print("Falling back to HuggingFace SQuAD dataset...")
    squad_dataset = load_dataset("squad")
    
    if USE_SMALL_DATASET:
        train_raw = squad_dataset["train"].select(range(min(MAX_TRAIN_SAMPLES, len(squad_dataset["train"]))))
        val_raw = squad_dataset["validation"].select(range(min(MAX_VAL_SAMPLES, len(squad_dataset["validation"]))))
    else:
        train_raw = squad_dataset["train"]
        val_raw = squad_dataset["validation"]

# Flatten data
print("Flattening data structure...")
train_flattened = flatten_squad_data(train_raw, MAX_TRAIN_SAMPLES)
val_flattened = flatten_squad_data(val_raw, MAX_VAL_SAMPLES)

# Create datasets
raw_datasets = {
    "train": Dataset.from_dict(train_flattened),
    "validation": Dataset.from_dict(val_flattened)
}

print(f"Using {len(raw_datasets['train'])} training examples and {len(raw_datasets['validation'])} validation examples")

# Cleanup
del train_raw, val_raw, train_flattened, val_flattened
cleanup_memory()

Loading datasets...
Loaded 442 training articles and 48 validation articles
Flattening data structure...
Using 5000 training examples and 500 validation examples


In [5]:
MODEL_NAME = "distilbert-base-uncased"
MAX_LENGTH = 256  # Reduced from 384 to save memory
DOC_STRIDE = 64   # Reduced from 128
OUTPUT_DIR = "./distilbert-squad-iou"

print(f"Loading model and tokenizer: {MODEL_NAME}")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)

# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    total_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"Total GPU memory: {total_memory:.1f} GB")
    
    # Check initial memory usage
    cleanup_memory()
    allocated = torch.cuda.memory_allocated(0) / 1024**3
    print(f"Initial GPU memory allocated: {allocated:.2f} GB")

model.to(device)
cleanup_memory()

print("Model loaded successfully")

Loading model and tokenizer: distilbert-base-uncased
Using device: cuda
GPU: Tesla T4
Total GPU memory: 14.7 GB
Initial GPU memory allocated: 0.00 GB
Model loaded successfully


In [6]:
print("Preprocessing training data...")

# Very conservative batch size for preprocessing
PREPROCESS_BATCH_SIZE = 100

def monitor_memory():
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated(0) / 1024**3
        reserved = torch.cuda.memory_reserved(0) / 1024**3
        print(f"GPU Memory - Allocated: {allocated:.2f}GB, Reserved: {reserved:.2f}GB")

monitor_memory()

# Process training data
train_dataset = raw_datasets["train"].map(
    lambda examples: prepare_train_features(
        examples, tokenizer, max_length=MAX_LENGTH, doc_stride=DOC_STRIDE
    ),
    batched=True,
    batch_size=PREPROCESS_BATCH_SIZE,
    remove_columns=raw_datasets["train"].column_names,
    desc="Processing training data",
    writer_batch_size=PREPROCESS_BATCH_SIZE
)

print(f"Training dataset processed: {len(train_dataset)} examples")
monitor_memory()
cleanup_memory()

Preprocessing training data...
GPU Memory - Allocated: 0.25GB, Reserved: 0.27GB


Processing training data:   0%|          | 0/5000 [00:00<?, ? examples/s]

Training dataset processed: 5765 examples
GPU Memory - Allocated: 0.25GB, Reserved: 0.27GB


In [7]:
print("Setting up training with minimal resource usage...")

# Very conservative training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="no",
    save_strategy="no",  # Disable saving during training to save space
    logging_strategy="steps",
    logging_steps=100,
    learning_rate=5e-5,
    per_device_train_batch_size=2,  # Very small batch size
    gradient_accumulation_steps=8,  # Compensate with gradient accumulation
    num_train_epochs=1,  # Start with just 1 epoch
    weight_decay=0.01,
    warmup_steps=100,
    max_grad_norm=1.0,
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=0,
    dataloader_pin_memory=False,
    remove_unused_columns=True,
    report_to=None,
    push_to_hub=False,
    skip_memory_metrics=False,
    log_level="error",
)

print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    processing_class=tokenizer,
    data_collator=default_data_collator,
)

print("Trainer created successfully")
monitor_memory()

Setting up training with minimal resource usage...
Effective batch size: 16
Trainer created successfully
GPU Memory - Allocated: 0.25GB, Reserved: 0.27GB


In [8]:
print("Running pre-training memory and data checks...")

# Check if we can load a small batch
try:
    # Test with a tiny batch
    test_batch = trainer.get_train_dataloader()
    print(f"Train dataloader created successfully")
    print(f"Number of batches: {len(test_batch)}")
    
    # Try to get one batch
    batch_iter = iter(test_batch)
    first_batch = next(batch_iter)
    print(f"First batch keys: {first_batch.keys()}")
    print(f"Batch size: {first_batch['input_ids'].shape[0]}")
    print(f"Sequence length: {first_batch['input_ids'].shape[1]}")
    
    cleanup_memory()
    monitor_memory()
    
    print("✓ Data loading test successful")
    
except Exception as e:
    print(f"✗ Data loading test failed: {e}")
    print("Try reducing batch size further or using CPU")

# Test forward pass
try:
    with torch.no_grad():
        # Move batch to device
        test_inputs = {k: v.to(device) for k, v in first_batch.items()}
        outputs = model(**test_inputs)
        print(f"✓ Forward pass test successful")
        print(f"Loss: {outputs.loss.item():.4f}")
        
    cleanup_memory()
    monitor_memory()
    
except Exception as e:
    print(f"✗ Forward pass test failed: {e}")
    print("Model may be too large for available GPU memory")

Running pre-training memory and data checks...
Train dataloader created successfully
Number of batches: 1442
First batch keys: dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])
Batch size: 4
Sequence length: 256
GPU Memory - Allocated: 0.25GB, Reserved: 0.27GB
✓ Data loading test successful
✓ Forward pass test successful
Loss: 5.5433
GPU Memory - Allocated: 0.26GB, Reserved: 0.29GB


In [9]:
print("="*50)
print("STARTING TRAINING")
print("="*50)

try:
    # Clear everything before training
    cleanup_memory()
    monitor_memory()
    
    print(f"Training on {len(train_dataset)} examples...")
    print(f"This will be approximately {len(trainer.get_train_dataloader())} steps")
    
    # Start training with error recovery
    trainer.train()
    
    print("✓ Training completed successfully!")
    
except torch.cuda.OutOfMemoryError as e:
    print(f"✗ CUDA Out of Memory Error: {e}")
    print("Solutions:")
    print("1. Reduce per_device_train_batch_size to 1")
    print("2. Increase gradient_accumulation_steps")
    print("3. Reduce MAX_LENGTH further")
    print("4. Use CPU training (slower but more memory)")
    
    cleanup_memory()
    
except RuntimeError as e:
    print(f"✗ Runtime Error: {e}")
    print("This might be a CUDA or model loading issue")
    cleanup_memory()
    
except Exception as e:
    print(f"✗ Unexpected error: {e}")
    print(f"Error type: {type(e).__name__}")
    cleanup_memory()

STARTING TRAINING
GPU Memory - Allocated: 0.26GB, Reserved: 0.29GB
Training on 5765 examples...
This will be approximately 1442 steps




{'loss': 4.373, 'grad_norm': 664331.5625, 'learning_rate': 4.9500000000000004e-05, 'epoch': 0.5547850208044383}
{'train_runtime': 158.2547, 'train_samples_per_second': 36.429, 'train_steps_per_second': 1.144, 'train_loss': 3.589718897698334, 'init_mem_cpu_alloc_delta': 262144, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 753188864, 'train_mem_gpu_alloc_delta': 553178112, 'train_mem_cpu_peaked_delta': 53248, 'train_mem_gpu_peaked_delta': 810799104, 'before_init_mem_cpu': 1554771968, 'before_init_mem_gpu': 266594816, 'epoch': 1.0}
✓ Training completed successfully!


In [10]:
print("Saving model and cleaning up...")

try:
    # Save the model
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print(f"✓ Model saved to {OUTPUT_DIR}")
    
except Exception as e:
    print(f"✗ Error saving model: {e}")

# Final cleanup
del trainer, model
cleanup_memory()
monitor_memory()

print("Training session complete!")

Saving model and cleaning up...
✓ Model saved to ./distilbert-squad-iou
GPU Memory - Allocated: 0.03GB, Reserved: 0.04GB
Training session complete!


In [11]:
print("Testing saved model...")

try:
    # Load saved model
    model = AutoModelForQuestionAnswering.from_pretrained(OUTPUT_DIR)
    tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)
    
    # Create simple pipeline
    qa_pipeline = pipeline(
        "question-answering",
        model=model,
        tokenizer=tokenizer,
        device=0 if torch.cuda.is_available() else -1
    )
    
    # Test inference
    test_context = "The quick brown fox jumps over the lazy dog. The fox is very clever."
    test_question = "What jumps over the dog?"
    
    result = qa_pipeline(question=test_question, context=test_context)
    print(f"✓ Test successful!")
    print(f"Question: {test_question}")
    print(f"Answer: {result['answer']}")
    print(f"Confidence: {result['score']:.4f}")
    
except Exception as e:
    print(f"✗ Model testing failed: {e}")

print("All done!")

Testing saved model...
✓ Test successful!
Question: What jumps over the dog?
Answer: quick brown fox
Confidence: 0.2650
All done!


In [12]:
print("CPU FALLBACK TRAINING")
print("="*30)
print("Use this only if GPU training failed!")

def train_on_cpu():
    """Fallback training on CPU with very small dataset"""
    
    # Use even smaller dataset for CPU
    small_train = raw_datasets["train"].select(range(100))
    
    # Load model on CPU
    model_cpu = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)
    
    # Process data
    train_dataset_cpu = small_train.map(
        lambda examples: prepare_train_features(
            examples, tokenizer, max_length=128, doc_stride=32
        ),
        batched=True,
        batch_size=50,
        remove_columns=small_train.column_names,
    )
    
    # CPU training args
    training_args_cpu = TrainingArguments(
        output_dir=f"{OUTPUT_DIR}_cpu",
        eval_strategy="no",
        save_strategy="no",
        logging_steps=10,
        learning_rate=5e-5,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        num_train_epochs=1,
        dataloader_num_workers=0,
        report_to=None,
        fp16=False,  # No mixed precision on CPU
    )
    
    trainer_cpu = Trainer(
        model=model_cpu,
        args=training_args_cpu,
        train_dataset=train_dataset_cpu,
        processing_class=tokenizer,
        data_collator=default_data_collator,
    )
    
    print("Starting CPU training...")
    trainer_cpu.train()
    
    # Save CPU model
    trainer_cpu.save_model(f"{OUTPUT_DIR}_cpu")
    print("CPU training completed!")

# Uncomment the next line to run CPU training
# train_on_cpu()

CPU FALLBACK TRAINING
Use this only if GPU training failed!
