# RunPod Deployment & Validation Notebook
## Astrobiology AI Platform - Production Training Setup

**Date:** 2025-10-06  
**Environment:** RunPod with 2x Nvidia RTX A5000 GPUs (48GB total VRAM)  
**Model:** 13.14B parameter multi-modal AI platform  
**Training Duration:** 4 weeks  
**Target Accuracy:** 96%+

---

## Table of Contents
1. Environment Setup & Validation
2. Dependency Installation
3. GPU Configuration & Testing
4. Memory Optimization Tests
5. Production Readiness Tests
6. Production Training Launch
7. Monitoring & Checkpointing

*Note: a "100-Step Training Validation" step is not included in this notebook.*

---

## 1. Environment Setup & Validation

In [None]:
# Report basic facts about the host this notebook runs on:
# interpreter version, OS/platform string, CPU architecture and the
# current working directory, framed by a 70-character rule.
import sys
import os
import platform

rule = "="*70
report = [
    rule,
    "SYSTEM INFORMATION",
    rule,
    f"Python Version: {sys.version}",
    f"Platform: {platform.platform()}",
    f"Architecture: {platform.machine()}",
    f"Working Directory: {os.getcwd()}",
    rule,
]
for line in report:
    print(line)

In [None]:
# Check GPU availability
# The leading `!` hands the rest of the line to the system shell, so this
# runs the `nvidia-smi` command-line tool and shows its output in the cell.
!nvidia-smi

In [None]:
# Report the PyTorch build, its CUDA support, and the properties of every
# visible GPU, then require that exactly two GPUs are present.
import torch

rule = "="*70

print(rule)
print("PYTORCH & CUDA INFORMATION")
print(rule)
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"CUDA Version: {torch.version.cuda}")

gpu_count = torch.cuda.device_count()
print(f"Number of GPUs: {gpu_count}")

if torch.cuda.is_available():
    for idx in range(gpu_count):
        props = torch.cuda.get_device_properties(idx)
        print(f"\nGPU {idx}: {torch.cuda.get_device_name(idx)}")
        # total_memory is in bytes; shown in decimal gigabytes (1e9 bytes)
        print(f"  Memory: {props.total_memory / 1e9:.2f} GB")
        print(f"  Compute Capability: {torch.cuda.get_device_capability(idx)}")
print(rule)

# Only the number of GPUs is checked here; the GPU model is not verified.
assert gpu_count == 2, "Expected 2 GPUs"
print("\n✅ GPU Configuration Validated: 2 GPUs available")

## 2. Dependency Installation

In [None]:
# Install critical dependencies
print("Installing bitsandbytes (8-bit optimizer)...")
!pip install bitsandbytes

print("\nInstalling flash-attn (Linux only)...")
!pip install flash-attn --no-build-isolation

print("\nInstalling torch_geometric...")
!pip install torch_geometric

print("\n✅ All dependencies installed")

In [None]:
# Verify installations
# Each dependency is imported in its own try block so that a failure of one
# package is reported without preventing the checks for the others.
rule = "="*70

print(rule)
print("DEPENDENCY VERIFICATION")
print(rule)

try:
    import bitsandbytes as bnb
except ImportError as err:
    print(f"❌ bitsandbytes import failed: {err}")
else:
    print("✅ bitsandbytes imported successfully")

try:
    from flash_attn import flash_attn_func
except ImportError as err:
    print(f"❌ flash-attn import failed: {err}")
else:
    print("✅ flash-attn imported successfully")

try:
    import torch_geometric
except ImportError as err:
    print(f"❌ torch_geometric import failed: {err}")
else:
    print("✅ torch_geometric imported successfully")

print(rule)

## 3. GPU Configuration & Testing

In [None]:
# Export the environment variables used for a single-node, two-process
# distributed run, then echo the values back from os.environ.
import os

dist_env = {
    'MASTER_ADDR': 'localhost',
    'MASTER_PORT': '29500',
    'WORLD_SIZE': '2',
    'CUDA_VISIBLE_DEVICES': '0,1',
}
# NOTE(review): if CUDA has already been initialised in this kernel, changing
# CUDA_VISIBLE_DEVICES presumably only affects child processes - confirm.
os.environ.update(dist_env)

print("✅ Distributed training environment configured")
for key in dist_env:
    print(f"   {key}: {os.environ[key]}")

In [None]:
# Smoke-test GPU memory allocation: on every visible GPU, allocate a small
# random tensor, report what this process has allocated/reserved there,
# then free the tensor again.
import torch

BYTES_PER_GB = 1e9

print("Testing GPU memory allocation...")

for gpu_id in range(torch.cuda.device_count()):
    device = torch.device('cuda', gpu_id)

    # Make this GPU the current device and start from an empty cache with
    # fresh peak statistics.
    torch.cuda.set_device(gpu_id)
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

    # 1000 x 1000 tensor of random values created directly on the GPU
    probe = torch.randn((1000, 1000), device=device)

    allocated_gb = torch.cuda.memory_allocated(device) / BYTES_PER_GB
    reserved_gb = torch.cuda.memory_reserved(device) / BYTES_PER_GB

    print(f"\nGPU {gpu_id}:")
    print(f"  Allocated: {allocated_gb:.2f} GB")
    print(f"  Reserved: {reserved_gb:.2f} GB")

    # Drop the tensor and return the cached block before the next GPU
    del probe
    torch.cuda.empty_cache()

print("\n✅ GPU memory allocation test passed")

## 4. Memory Optimization Tests

In [None]:
# Run memory optimization tests
import sys
# Put the project checkout first on this kernel's import path. This only
# affects imports made inside the notebook; the test script below is run
# by the shell as a separate process.
sys.path.insert(0, '/workspace/astrobio_gen')

print("Running memory optimization tests...")
# Change into the project root and run the test script with `python`;
# its output is shown in this cell.
!cd /workspace/astrobio_gen && python tests/test_memory_optimizations.py

## 5. Production Readiness Tests

In [None]:
# Run production readiness tests
print("Running production readiness tests...")
# Run pytest from the project root: -v lists each test by name and -s
# disables output capturing so the tests' own prints appear in this cell.
!cd /workspace/astrobio_gen && python -m pytest tests/test_production_readiness.py -v -s

## 6. Production Training Launch

In [None]:
# Setup W&B logging
import wandb

print("Setting up Weights & Biases...")
# NOTE(review): presumably prompts for an API key when none is stored - confirm.
wandb.login()

# Printed as soon as login() returns; its return value is not checked here.
print("\n✅ W&B configured")

In [None]:
# Launch production training
# The banner below is informational only: the values it prints are literals
# and are not read from, or passed to, the training command.
print("="*70)
print("LAUNCHING PRODUCTION TRAINING")
print("="*70)
print("Model: RebuiltLLMIntegration (13.14B parameters)")
print("GPUs: 2x Nvidia RTX A5000 (48GB total VRAM)")
print("Duration: 4 weeks")
print("Target Accuracy: 96%+")
print("="*70)

# Run the training script from the project root through the shell. The flags
# are passed to train_unified_sota.py unchanged; their exact meaning is
# defined by that script, which is not visible here.
# NOTE(review): --batch-size 32 equals micro-batch 1 x 32 accumulation steps
# for a single process; with --gpus 2 the effective global batch may be 64 -
# confirm against the script.
# NOTE(review): the "4 weeks" printed above is not enforced by this command;
# run length appears to be governed by --epochs 100 - confirm.
!cd /workspace/astrobio_gen && python train_unified_sota.py \
    --model rebuilt_llm_integration \
    --epochs 100 \
    --batch-size 32 \
    --micro-batch-size 1 \
    --gradient-accumulation-steps 32 \
    --use-8bit-optimizer \
    --use-cpu-offloading \
    --use-mixed-precision \
    --use-gradient-checkpointing \
    --distributed \
    --gpus 2 \
    --log-every-n-steps 10 \
    --save-every-n-epochs 1 \
    --output-dir outputs/production_training \
    --wandb-project astrobiology-ai-platform \
    --wandb-name production-training-4week

## 7. Monitoring & Checkpointing

In [None]:
# Monitor training progress: list the most recent checkpoint files and
# their sizes.
import glob
import os

checkpoint_dir = '/workspace/astrobio_gen/outputs/production_training/checkpoints'


def checkpoints_by_mtime(directory):
    """Return the ``*.pt`` files in ``directory``, oldest first.

    Files are ordered by modification time rather than by name: a plain
    name sort is lexicographic, so ``epoch_10.pt`` would come before
    ``epoch_2.pt`` and the tail of the list would not be the newest files.
    The path is used as a tie-breaker so the order is deterministic.

    Args:
        directory: Directory to search (not recursive).

    Returns:
        A list of matching paths; empty when nothing matches or the
        directory does not exist.
    """
    paths = glob.glob(os.path.join(directory, "*.pt"))
    return sorted(paths, key=lambda path: (os.path.getmtime(path), path))


if os.path.exists(checkpoint_dir):
    checkpoints = checkpoints_by_mtime(checkpoint_dir)
    print(f"Found {len(checkpoints)} checkpoints:")
    for ckpt in checkpoints[-5:]:  # Show the 5 most recently written
        size_mb = os.path.getsize(ckpt) / 1e6
        print(f"  {os.path.basename(ckpt)}: {size_mb:.2f} MB")
else:
    print("No checkpoints found yet")

In [None]:
# Check GPU memory usage during training
import torch

# Memory budget in GB. It is compared against the usage summed over all
# GPUs: a per-GPU comparison against 45 GB could never trigger on the
# 2 x 24 GB setup this notebook targets.
# NOTE(review): assumes the 45 GB target is meant as a combined budget - confirm.
TARGET_TOTAL_GB = 45


def exceeds_memory_target(used_gb_per_gpu, target_gb=TARGET_TOTAL_GB):
    """Return True when the summed per-GPU usage is above ``target_gb``.

    Args:
        used_gb_per_gpu: Iterable of per-GPU memory usage values in GB.
        target_gb: Combined budget in GB.

    Returns:
        ``True`` if the total is strictly greater than the budget,
        ``False`` otherwise (including for an empty iterable).
    """
    return sum(used_gb_per_gpu) > target_gb


print("="*70)
print("CURRENT GPU MEMORY USAGE")
print("="*70)

device_used_gb = []
for gpu_id in range(torch.cuda.device_count()):
    # Allocator statistics of THIS Python process only. A training run
    # started from the shell is a separate process and is not included.
    allocated = torch.cuda.memory_allocated(gpu_id) / 1e9
    reserved = torch.cuda.memory_reserved(gpu_id) / 1e9
    max_allocated = torch.cuda.max_memory_allocated(gpu_id) / 1e9

    # Device-wide view (all processes): mem_get_info returns (free, total)
    # in bytes for the given device.
    free_bytes, total_bytes = torch.cuda.mem_get_info(gpu_id)
    used_gb = (total_bytes - free_bytes) / 1e9
    device_used_gb.append(used_gb)

    print(f"\nGPU {gpu_id}:")
    print(f"  Current Allocated: {allocated:.2f} GB")
    print(f"  Current Reserved: {reserved:.2f} GB")
    print(f"  Peak Allocated: {max_allocated:.2f} GB")
    print(f"  Device Used (all processes): {used_gb:.2f} / {total_bytes / 1e9:.2f} GB")

print(f"\nTotal Device Used: {sum(device_used_gb):.2f} GB")
print(f"  Target: <{TARGET_TOTAL_GB} GB")

if exceeds_memory_target(device_used_gb):
    print("  ⚠️ WARNING: Memory usage exceeds target!")
else:
    print("  ✅ Memory usage within target")

print("="*70)