# 🚀 FarmFederate Training - Complete Setup

## Includes All Disconnection Fixes

- ✅ Keep-alive (prevents 90-min timeout)
- ✅ Auto-reconnect (network recovery)
- ✅ Memory management (prevents OOM)
- ✅ Google Drive backup
- ✅ Auto-configuration

**Setup:** Runtime → Change runtime type → GPU → Save

**Time:** 3-5h (T4), 2-3h (V100), 1.5-2h (A100)

In [1]:
# STEP 1: Keep-Alive & Protection (RUN FIRST!)
import time
from IPython.display import Javascript, display

# Browser-side snippet: once a minute, look up the toolbar "connect" button
# and click it if it is present.
# NOTE(review): this runs in whatever document the notebook front-end gives to
# Javascript outputs; confirm the selector can actually reach the toolbar from
# there, otherwise the timer fires but never finds the button.
keepalive_js = '''
setInterval(function() {
  var btn = document.querySelector('colab-toolbar-button#connect');
  if (btn) btn.click();
}, 60000);
'''

# Hand the snippet to the front-end, then print the status banner.
display(Javascript(keepalive_js))
for status_line in (
    '‚úÖ Keep-alive enabled!',
    '‚ö†Ô∏è Keep this browser tab open (can be in background)',
    f'Started at: {time.strftime("%H:%M:%S")}',
):
    print(status_line)

<IPython.core.display.Javascript object>

‚úÖ Keep-alive enabled!
‚ö†Ô∏è Keep this browser tab open (can be in background)
Started at: 20:17:26


In [2]:
# STEP 2: GPU Check & Memory Management
import gc
import os

# Allocator tuning is passed to PyTorch through the environment. Export it
# BEFORE importing torch so it is already in place no matter how early the
# CUDA caching allocator reads its configuration.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True,max_split_size_mb:512'

import torch

# Share of the device memory this process is allowed to allocate.
GPU_MEMORY_FRACTION = 0.85

# Fail fast: the remaining cells read `gpu_memory`, which is only set when a
# CUDA device is present.
if not torch.cuda.is_available():
    raise RuntimeError('‚ùå NO GPU! Go to Runtime ‚Üí Change runtime type ‚Üí GPU')

gpu_name = torch.cuda.get_device_name(0)
# Total device memory in decimal gigabytes (bytes / 1e9).
gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
print(f'‚úÖ GPU: {gpu_name}')
print(f'   Memory: {gpu_memory:.1f} GB')
torch.cuda.set_per_process_memory_fraction(GPU_MEMORY_FRACTION)
print(f'   Limit: {gpu_memory * GPU_MEMORY_FRACTION:.1f} GB ({GPU_MEMORY_FRACTION:.0%})')

def clear_gpu():
    """Release Python garbage and, when a GPU is present, PyTorch's cached CUDA memory.

    Runs the Python garbage collector first so that unreachable tensors are
    dropped, then asks PyTorch to empty its CUDA cache and waits for the
    device to finish outstanding work.

    The CUDA calls are skipped when no CUDA device is available, so the helper
    can be called unconditionally (for example from a ``finally`` block)
    without raising on a CPU-only runtime.

    Returns:
        None
    """
    gc.collect()
    if not torch.cuda.is_available():
        # torch.cuda.synchronize() raises without a usable CUDA device.
        return
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

print('‚úÖ Memory management ready')

‚úÖ GPU: Tesla T4
   Memory: 15.8 GB
   Limit: 13.5 GB (85%)
‚úÖ Memory management ready


In [3]:
# STEP 3: Mount Google Drive (IMPORTANT!)
import os

from google.colab import drive

drive.mount('/content/drive')

results_dir = '/content/drive/MyDrive/FarmFederate_Results'

# Create the root folder and then each of its sub-folders; exist_ok makes the
# cell safe to re-run.
for folder in (
    results_dir,
    f'{results_dir}/checkpoints',
    f'{results_dir}/results',
    f'{results_dir}/plots',
):
    os.makedirs(folder, exist_ok=True)

# Later cells look the backup location up through this environment variable.
os.environ['DRIVE_RESULTS_DIR'] = results_dir

print('‚úÖ Google Drive mounted')
print(f'   Results will save to: {results_dir}')
print('   ‚ö†Ô∏è This prevents data loss if disconnected!')

Mounted at /content/drive
‚úÖ Google Drive mounted
   Results will save to: /content/drive/MyDrive/FarmFederate_Results
   ‚ö†Ô∏è This prevents data loss if disconnected!


In [4]:
# STEP 4: Install Dependencies
%%capture
!pip install -q transformers datasets peft accelerate evaluate scikit-learn
!pip install -q sentencepiece protobuf timm torch torchvision
!pip install -q matplotlib seaborn pandas pillow

print('‚úÖ Dependencies installed')

In [5]:
# STEP 5: Clone Repository & Checkout Correct Branch
import os

if not os.path.exists('/content/FarmFederate-Advisor'):
    print('üì• Cloning repository...')
    !git clone -b feature/dummy-sensor-data-clean https://github.com/Solventerritory/FarmFederate-Advisor.git
    print('‚úÖ Repository cloned (feature/dummy-sensor-data-clean branch)')
else:
    print('üì• Updating repository...')
    !cd /content/FarmFederate-Advisor && git checkout feature/dummy-sensor-data-clean && git pull
    print('‚úÖ Repository updated')

# Change to backend directory
os.chdir('/content/FarmFederate-Advisor/backend')
print(f'‚úÖ Working directory: {os.getcwd()}')

# Verify training script exists
if os.path.exists('federated_complete_training.py'):
    print('‚úÖ Training script found!')
else:
    print('‚ùå Training script missing!')
    print(f'   Files in backend: {os.listdir("."[:20])}')
    raise FileNotFoundError('Training script not found - repository may be corrupted')

üì• Cloning repository...
Cloning into 'FarmFederate-Advisor'...
remote: Enumerating objects: 2234, done.[K
remote: Counting objects: 100% (126/126), done.[K
remote: Compressing objects: 100% (99/99), done.[K
remote: Total 2234 (delta 61), reused 70 (delta 25), pack-reused 2108 (from 1)[K
Receiving objects: 100% (2234/2234), 183.00 MiB | 17.70 MiB/s, done.
Resolving deltas: 100% (416/416), done.
Downloading backend/checkpoints/global_central.pt (847 MB)
Error downloading object: backend/checkpoints/global_central.pt (8ca19bc): Smudge error: Error downloading backend/checkpoints/global_central.pt (8ca19bc8f9d655201a0c0530e2b825f5a481d576b9bc5677e223d3157f8e37aa): batch response: This repository exceeded its LFS budget. The account responsible for the budget should increase it to restore access.

Errors logged to /content/FarmFederate-Advisor/.git/lfs/logs/20260111T201826.723069955.log
Use `git lfs logs last` to view the log.
error: external filter 'git-lfs filter-process' failed
fa

In [6]:
# STEP 6: Auto-Configure for GPU
import sys

# Make the backend package importable from the notebook.
sys.path.insert(0, '/content/FarmFederate-Advisor/backend')

# Each tier: (exclusive upper bound on gpu_memory in GB, batch size, LoRA rank, banner).
# The first tier whose bound exceeds gpu_memory wins; anything larger falls
# through to the last-resort tier below.
_GPU_TIERS = (
    (16, 2, 4, 'üìä T4 Configuration (Conservative)'),
    (24, 4, 8, 'üìä V100 Configuration'),
)
_LARGEST_TIER = (8, 16, 'üìä A100 Configuration')

batch_size, lora_rank, _tier_banner = next(
    (
        (tier_batch, tier_rank, tier_banner)
        for tier_limit, tier_batch, tier_rank, tier_banner in _GPU_TIERS
        if gpu_memory < tier_limit
    ),
    _LARGEST_TIER,
)
print(_tier_banner)

print(f'   Batch size: {batch_size}')
print(f'   LoRA rank: {lora_rank}')

# Publish the chosen settings through environment variables.
os.environ.update({
    'COLAB_GPU': '1',
    'COLAB_BATCH_SIZE': str(batch_size),
    'COLAB_LORA_RANK': str(lora_rank),
})

print('‚úÖ Configuration complete')

üìä T4 Configuration (Conservative)
   Batch size: 2
   LoRA rank: 4
‚úÖ Configuration complete


In [None]:
# STEP 7: Run Training
import time
import subprocess
import sys
import os
import traceback

print('üöÄ Starting training...')
print(f'   Time: {time.strftime("%H:%M:%S")}')
print(f'   Working directory: {os.getcwd()}')
print('   Estimated duration: 3-5 hours (T4), 2-3 hours (V100)')
print('\n‚è≥ Training in progress...\n')

start = time.time()

try:
    # Import and run
    import federated_complete_training
    federated_complete_training.main()

    elapsed = (time.time() - start) / 3600
    print(f'\n‚úÖ COMPLETE! Time: {elapsed:.2f} hours')

except Exception as e:
    # Deliberately not re-raised: partial results are still backed up below
    # and the later cells can still be run on them.
    elapsed = (time.time() - start) / 3600
    print(f'\n‚ùå Error: {e}')
    print(f'   Failed after {elapsed:.2f} hours')
    traceback.print_exc()

finally:
    # Runs on success, on failure and on a manual interrupt.
    drive_dir = os.environ.get('DRIVE_RESULTS_DIR')
    if drive_dir:
        print('\nüíæ Backing up to Google Drive...')
        backup_failures = []
        for folder in ('results', 'plots'):
            src = os.path.join('..', folder)
            if not os.path.isdir(src):
                # Nothing was produced for this folder; not an error.
                continue
            dst = os.path.join(drive_dir, folder)
            os.makedirs(dst, exist_ok=True)
            # "src/." copies the directory contents into dst.
            copy = subprocess.run(
                ['cp', '-r', os.path.join(src, '.'), dst],
                capture_output=True,
                text=True,
            )
            if copy.returncode != 0:
                backup_failures.append(f'{folder}: {copy.stderr.strip()}')
        # Only claim success when every copy actually succeeded.
        if backup_failures:
            print('[WARN] Backup incomplete:')
            for failure in backup_failures:
                print(f'   {failure}')
        else:
            print('‚úÖ Backup complete')
    clear_gpu()

üöÄ Starting training...
   Time: 20:18:26
   Working directory: /content/FarmFederate-Advisor/backend
   Estimated duration: 3-5 hours (T4), 2-3 hours (V100)

‚è≥ Training in progress...

[INFO] Using device: cuda
[INFO] Detected Colab/Limited GPU - Using reduced batch sizes
FEDERATED LEARNING COMPLETE TRAINING SYSTEM

üìä Total Models: 39 (13 LLM + 13 ViT + 13 VLM)
üîÑ Training Paradigms: Federated (5 clients, 10 rounds) + Centralized (10 epochs)
‚ö° Total Training Runs: 78 (39 models √ó 2 paradigms)

‚è±Ô∏è  Estimated Time: 26-38 hours on CPU, 3-5 hours on GPU
üíæ Estimated Disk Space: 15-20 GB for models + 2-3 GB for results

üñ•Ô∏è  Device: cuda
üìÅ Checkpoint Dir: ../checkpoints
üìÅ Results Dir: ../results
üìÅ Plots Dir: ../plots

‚ÑπÔ∏è [Resume] No checkpoint found. Training will start from scratch.
‚úÖ All models trained. Checkpoint cleared.

[DATA] Loading datasets...
[Mix] loading argilla (<= 2000) ...


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/2.50M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1695 [00:00<?, ? examples/s]

[Mix] argilla added 0 rows
[Mix] loading agnews (<= 2000) ...


README.md: 0.00B [00:00, ?B/s]

In [None]:
# STEP 8: Generate Plots
clear_gpu()
print('üìä Generating plots...')

try:
    !python comprehensive_plotting.py
    print('‚úÖ Plots generated')
except:
    print('‚ö†Ô∏è Plotting skipped (results still saved)')

drive_dir = os.environ.get('DRIVE_RESULTS_DIR')
if drive_dir:
    !cp -r ../plots/* {drive_dir}/plots/ 2>/dev/null || true

In [None]:
# STEP 9: View Results
import json
import os
from IPython.display import Image, display

results_file = '../results/all_results.json'
plots_dir = '../plots'

if os.path.exists(results_file):
    with open(results_file, 'r') as f:
        results = json.load(f)

    # Rank by macro F1, best first; entries without the metric count as 0.
    sorted_results = sorted(results, key=lambda x: x.get('final_metrics', {}).get('f1_macro', 0), reverse=True)

    print('='*60)
    print('üèÜ TOP 10 MODELS')
    print('='*60)

    for i, model in enumerate(sorted_results[:10], 1):
        name = model.get('config', {}).get('name', 'Unknown')
        metrics = model.get('final_metrics', {})
        f1 = metrics.get('f1_macro', 0)
        acc = metrics.get('accuracy', 0)
        print(f'{i:2d}. {name:30s} F1: {f1:.4f} | Acc: {acc:.4f}')

    print('\nüìà Top 3 Plots:')
    # The plots folder may be missing when plotting was skipped; os.listdir
    # would raise in that case. Sorting makes the selection reproducible.
    if os.path.isdir(plots_dir):
        plots = sorted(entry for entry in os.listdir(plots_dir) if entry.endswith('.png'))[:3]
    else:
        plots = []
    for plot in plots:
        print(f'   - {plot}')
        try:
            display(Image(f'{plots_dir}/{plot}', width=700))
        except Exception as err:
            # Keep going with the next image, but say why this one is missing.
            print(f'     (could not display: {err})')
else:
    print('‚ö†Ô∏è No results found')

print(f'\nüíæ All results saved to: {os.environ.get("DRIVE_RESULTS_DIR")}')

In [None]:
# STEP 10: Download Results (Optional)
from google.colab import files
import shutil
import os

print('üì¶ Creating download packages...')

# Zip each output folder that exists and remember which archives were built
# in this run, so a stale zip from an earlier run is never offered.
archives = []
for folder in ('results', 'plots'):
    if os.path.exists(f'../{folder}'):
        archives.append(shutil.make_archive(f'/content/{folder}', 'zip', f'../{folder}'))
        print(f'‚úÖ {folder}.zip ready')

print('\nüì• Click to download:')
try:
    for archive in archives:
        files.download(archive)
except Exception as err:
    # Catch Exception (not a bare except) so a manual interrupt still stops
    # the cell, and surface the reason alongside the fallback hint.
    print('‚ö†Ô∏è Download manually from Files panel on left')
    print(f'   Reason: {err}')

print('\n‚úÖ TRAINING COMPLETE!')
print('üìä Results in: Google Drive + Downloads folder')