# ⚠️ RUNTIME DISCONNECTION FIXES

This notebook includes comprehensive fixes to prevent Colab runtime disconnections:
- ✅ Keep-alive script (prevents idle timeout)
- ✅ Auto-reconnect on network drops
- ✅ Aggressive memory management (prevents OOM)
- ✅ Auto-resume from checkpoints
- ✅ Google Drive backup (prevents data loss)

**Run all cells in order - estimated time: 3-5 hours on T4**

## 🔄 Step 0A: Keep Session Alive (RUN FIRST!)

In [None]:
# This prevents Colab from disconnecting due to inactivity
from IPython.display import Javascript, display

def keep_alive():
    """Start a browser-side timer that clicks Colab's connect button every 60 s.

    The JavaScript is handed to the notebook front end via ``display`` and
    therefore only runs while the browser tab is open. Nothing is returned;
    two status lines are printed.
    """
    display(Javascript('''
        function KeepClicking(){
            console.log("Keeping session alive...");
            // querySelector returns null when no such element exists in this
            // document; skip the click instead of throwing every minute.
            var connectButton = document.querySelector("colab-toolbar-button#connect");
            if (connectButton) {
                connectButton.click();
            }
        }
        // Re-running the cell replaces the previous timer instead of stacking
        // a second one on top of it.
        if (window.keepAliveTimer) {
            clearInterval(window.keepAliveTimer);
        }
        window.keepAliveTimer = setInterval(KeepClicking, 60000);
    '''))
    print("✅ Keep-alive enabled - Runtime will stay connected!")
    print("⚠️ Keep this browser tab open (can be in background)")

keep_alive()

# Auto-reconnect on network drops
def setup_auto_reconnect():
    """Start a browser-side timer that reloads the page when kernel access is lost.

    Every 30 s the injected script inspects ``google.colab.kernel.accessAllowed``
    and calls ``location.reload()`` when it is falsy.

    NOTE(review): this treats ``accessAllowed == false`` as "disconnected";
    confirm against the Colab front-end API that the flag means that.
    """
    display(Javascript('''
        function CheckConnection(){
            // Referencing an undefined `google` global would throw a
            // ReferenceError on every tick, so bail out when the API is absent.
            if (typeof google === "undefined" || !google.colab || !google.colab.kernel) {
                return;
            }
            if(!google.colab.kernel.accessAllowed){
                console.log("Disconnected! Attempting reconnection...");
                location.reload();
            }
        }
        // Re-running the cell replaces the previous timer instead of stacking
        // a second one on top of it.
        if (window.autoReconnectTimer) {
            clearInterval(window.autoReconnectTimer);
        }
        window.autoReconnectTimer = setInterval(CheckConnection, 30000);
    '''))
    print("✅ Auto-reconnect enabled - will recover from network drops")

setup_auto_reconnect()

## 📋 Step 1: GPU Check & Memory Setup

In [None]:
import torch
import gc
import os

# Caching-allocator options for PyTorch's CUDA allocator.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True,max_split_size_mb:512'
# CUDA_LAUNCH_BLOCKING is deliberately NOT forced to '1': it is a debugging
# switch that makes every kernel launch synchronous and slows training; it
# does not reduce memory use. Export it manually when chasing a CUDA error.

# Share of total GPU memory this process is allowed to use.
MEMORY_FRACTION = 0.85

print("🔍 Checking GPU...")
if not torch.cuda.is_available():
    print("❌ NO GPU DETECTED!")
    print("   Fix: Runtime → Change runtime type → GPU → Save")
    raise RuntimeError("GPU required")

gpu_name = torch.cuda.get_device_name(0)
total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9  # bytes -> GB (10^9)
print(f"✅ GPU: {gpu_name}")
print(f"   Total Memory: {total_memory:.2f} GB")
print(f"   CUDA: {torch.version.cuda}")
print(f"   PyTorch: {torch.__version__}")

# Cap this process below the full device so a spike fails inside PyTorch
# rather than exhausting the card.
torch.cuda.set_per_process_memory_fraction(MEMORY_FRACTION)
print(f"   Memory Limit: {total_memory * MEMORY_FRACTION:.2f} GB ({MEMORY_FRACTION:.0%})")

# Memory clearing functions
def clear_gpu_memory():
    """Collect garbage, release PyTorch's cached CUDA blocks and report usage.

    Without a CUDA device only the garbage-collection pass runs and nothing
    is printed. Returns None.
    """
    gc.collect()
    if not torch.cuda.is_available():
        return

    torch.cuda.empty_cache()
    torch.cuda.synchronize()

    bytes_per_gb = 1e9
    used_gb = torch.cuda.memory_allocated() / bytes_per_gb
    reserved_gb = torch.cuda.memory_reserved() / bytes_per_gb
    print(f"   [MEM] GPU: {used_gb:.2f}GB used, {reserved_gb:.2f}GB cached")

def force_cleanup():
    """Run a garbage-collection pass and release PyTorch's cached CUDA memory.

    Only unreferenced objects can be reclaimed. Tensors still bound to a
    variable stay allocated, so callers must ``del`` their own references
    (models, optimizers, batches) before calling this. Returns None.
    """
    # Walking gc.get_objects() and `del`-ing the loop variable frees nothing:
    # it only unbinds the local name while every other reference survives.
    # A plain collection pass is all that can be done from here.
    gc.collect()
    # synchronize() initialises CUDA and raises on a machine without a GPU,
    # so both CUDA calls are guarded.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    print("   [MEM] Force cleanup complete")

# Summary of what this cell configured.
print("\n✅ Memory management configured!")
print("   - Conservative memory limits")
print("   - Auto-cleanup between models")
print("   - OOM protection enabled")

## 📂 Step 2: Mount Google Drive & Create Checkpoint Directories

In [None]:
# Mount Google Drive to prevent data loss on disconnect
from google.colab import drive
import os

def _prepare_results_dir(base_dir):
    """Create the checkpoints/plots/results layout under ``base_dir``.

    Also publishes ``base_dir`` through the DRIVE_RESULTS_DIR environment
    variable and returns it.
    """
    for subdir in ('checkpoints', 'plots', 'results'):
        os.makedirs(os.path.join(base_dir, subdir), exist_ok=True)
    os.environ['DRIVE_RESULTS_DIR'] = base_dir
    return base_dir


try:
    drive.mount('/content/drive', force_remount=True)
    print("✅ Google Drive mounted")

    drive_dir = _prepare_results_dir('/content/drive/MyDrive/FarmFederate_Results')

    print(f"✅ Results will auto-save to: {drive_dir}")
    print("   ⚠️ This prevents data loss if runtime disconnects!")

except Exception as e:
    # Deliberately broad: any mount or Drive failure falls back to local
    # storage instead of stopping the notebook.
    print(f"⚠️ Drive mount failed: {e}")
    print("   Results will only be saved locally")
    # The fallback directory has to exist, otherwise later cells that check
    # os.path.exists(DRIVE_RESULTS_DIR) silently skip every save.
    drive_dir = _prepare_results_dir('/content/results')

## 📦 Step 3: Install Dependencies

In [None]:
%%capture
!pip install -q transformers datasets peft accelerate evaluate scikit-learn
!pip install -q sentencepiece protobuf timm
!pip install -q paho-mqtt numpy pandas matplotlib seaborn

print("‚úÖ Dependencies installed")

## 📥 Step 4: Clone Repository

In [None]:
import os
import subprocess

REPO_URL = 'https://github.com/Solventerritory/FarmFederate-Advisor.git'
REPO_BRANCH = 'feature/dummy-sensor-data-clean'
REPO_DIR = '/content/FarmFederate-Advisor'

# Clone from GitHub. Skipped when a checkout already exists, because git
# refuses to clone into a non-empty directory and a re-run would fail.
if os.path.isdir(os.path.join(REPO_DIR, '.git')):
    print("✅ Repository already present")
else:
    clone = subprocess.run(
        ['git', 'clone', '-b', REPO_BRANCH, REPO_URL, REPO_DIR],
        capture_output=True,
        text=True,
    )
    if clone.returncode != 0:
        # Show git's own message rather than failing later with a bare
        # FileNotFoundError from os.chdir.
        print(clone.stderr)
        raise RuntimeError(f"git clone failed with exit code {clone.returncode}")
    print("✅ Repository cloned")

# Change to project directory; later cells use paths relative to it.
os.chdir(REPO_DIR)

## 🚀 Step 5: Train with Memory Management

In [None]:
import sys
import torch
import gc
import os
import json
import time
from datetime import datetime

# Add backend to path (only once, so re-running the cell does not pile up
# duplicate entries).
BACKEND_DIR = '/content/FarmFederate-Advisor/backend'
if BACKEND_DIR not in sys.path:
    sys.path.insert(0, BACKEND_DIR)

# Detect GPU and auto-configure
gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9  # bytes -> GB (10^9)
print(f"🔍 Detected GPU Memory: {gpu_memory:.2f} GB")

# (exclusive upper bound in GB, batch size, LoRA rank, label); first match wins.
GPU_PROFILES = [
    (16, 2, 4, "T4-optimized (Ultra Conservative)"),
    (24, 4, 8, "V100-optimized (Conservative)"),
    (float('inf'), 8, 16, "A100-optimized (Standard)"),
]
batch_size, lora_rank, profile_label = next(
    (profile_batch, profile_rank, label)
    for limit_gb, profile_batch, profile_rank, label in GPU_PROFILES
    if gpu_memory < limit_gb
)
print(f"   📊 {profile_label}")

print(f"   - Batch Size: {batch_size}")
print(f"   - LoRA Rank: {lora_rank}")

# Publish the chosen settings through environment variables.
# NOTE(review): presumably read by the training script - confirm.
os.environ['COLAB_GPU'] = '1'
os.environ['COLAB_BATCH_SIZE'] = str(batch_size)
os.environ['COLAB_LORA_RANK'] = str(lora_rank)

# Checkpoint management
checkpoint_file = '/content/training_checkpoint.json'


def _write_json_atomic(path, payload):
    """Write ``payload`` as JSON to ``path`` via a temp file and ``os.replace``."""
    tmp_path = f'{path}.tmp'
    with open(tmp_path, 'w') as f:
        json.dump(payload, f)
    # Swap the finished file in, so an interrupted write never leaves a
    # truncated checkpoint at ``path``.
    os.replace(tmp_path, path)


def save_checkpoint(model_index, model_name, checkpoint_path=None):
    """Record training progress locally and mirror it to the backup directory.

    Args:
        model_index: Index of the model being recorded.
        model_name: Name of that model.
        checkpoint_path: Local file to write; defaults to the module-level
            ``checkpoint_file``.

    Returns:
        The checkpoint dict that was written (timestamp, model_index,
        model_name).

    The mirror goes to ``$DRIVE_RESULTS_DIR/training_checkpoint.json`` when
    that directory exists. A failing mirror write is reported but not raised,
    so a backup problem cannot abort training.

    NOTE(review): no caller or matching loader is visible in this notebook.
    """
    checkpoint = {
        'timestamp': datetime.now().isoformat(),
        'model_index': model_index,
        'model_name': model_name
    }
    local_path = checkpoint_file if checkpoint_path is None else checkpoint_path
    _write_json_atomic(local_path, checkpoint)

    drive_dir = os.environ.get('DRIVE_RESULTS_DIR')
    if drive_dir and os.path.exists(drive_dir):
        try:
            _write_json_atomic(f'{drive_dir}/training_checkpoint.json', checkpoint)
        except OSError as err:
            print(f"   ⚠️ Could not mirror checkpoint to {drive_dir}: {err}")
    return checkpoint

# Import training
from federated_complete_training import main

print("\nüöÄ Starting training with auto-resume...")
print("   ‚è±Ô∏è Est. time: 3-5h (T4), 2-3h (V100), 1.5-2h (A100)")
print("   üí° Keep this tab open\n")

start_time = time.time()

try:
    main()
    elapsed = (time.time() - start_time) / 3600
    print(f"\n‚úÖ TRAINING COMPLETE! Time: {elapsed:.2f}h")
    
except Exception as e:
    print(f"\n‚ùå Training error: {e}")
    import traceback
    traceback.print_exc()
    
finally:
    # Backup to Drive
    drive_dir = os.environ.get('DRIVE_RESULTS_DIR')
    if drive_dir and os.path.exists(drive_dir):
        print("\nüíæ Backing up to Google Drive...")
        !cp -r /content/FarmFederate-Advisor/results/* {drive_dir}/results/ 2>/dev/null || true
        !cp -r /content/FarmFederate-Advisor/plots/* {drive_dir}/plots/ 2>/dev/null || true
        print("‚úÖ Backup complete")
    
    clear_gpu_memory()

## 📊 Step 6: Generate Plots

In [None]:
import subprocess
import sys

# Clear memory before plotting
gc.collect()
torch.cuda.empty_cache()

# Run the plotting script with the kernel's interpreter. The path is relative,
# so the working directory must still be the repository root.
plot_run = subprocess.run(
    [sys.executable, 'backend/comprehensive_plotting.py'],
    capture_output=True,
    text=True,
)
if plot_run.stdout:
    print(plot_run.stdout)
if plot_run.stderr:
    print(plot_run.stderr)
if plot_run.returncode != 0:
    # Report the failure instead of announcing success unconditionally.
    raise RuntimeError(f"Plotting script failed with exit code {plot_run.returncode}")

print("\n✅ Plots generated in ../plots/")

## 💾 Step 7: Download Results

In [None]:
# Zip results for download
import os
import shutil

from google.colab import files

print("📦 Packaging results...")

# (directory name under the repo, description shown to the user)
archive_specs = [
    ('results', 'Training results'),
    ('plots', 'Visualization plots'),
]

archives = []
for name, description in archive_specs:
    source_dir = f'/content/FarmFederate-Advisor/{name}'
    # A missing directory is skipped so it cannot block the other download.
    if not os.path.isdir(source_dir):
        print(f"   ⚠️ Skipping {name}: {source_dir} does not exist")
        continue
    # make_archive returns the path of the file it created.
    archive_path = shutil.make_archive(f'/content/{name}', 'zip', source_dir)
    archives.append((archive_path, description))

if not archives:
    raise FileNotFoundError("No results or plots directories found to package")

print("\n✅ Download these files:")
for archive_path, description in archives:
    print(f"   {archive_path} - {description}")

print("\n📥 Starting downloads...")
for archive_path, _ in archives:
    files.download(archive_path)
print("✅ Downloads started!")