# 🚀 FarmFederate Training - Complete Setup

## Includes All Disconnection Fixes

- ✅ Keep-alive (prevents 90-min timeout)
- ✅ Auto-reconnect (network recovery)
- ✅ Memory management (prevents OOM)
- ✅ Google Drive backup
- ✅ Auto-configuration

**Setup:** Runtime → Change runtime type → GPU → Save

**Time:** 3-5h (T4), 2-3h (V100), 1.5-2h (A100)

In [1]:
# STEP 1: Keep-Alive & Protection (RUN FIRST!)
from IPython.display import Javascript, display
import time

# Keep-alive: click the Colab "connect" toolbar button every 60 seconds.
# The interval id is kept on `window` and cleared before a new timer is
# registered, so re-running this cell replaces the timer instead of stacking
# an additional one on top of the previous run.
keepalive_js = '''
if (window.farmFederateKeepAlive) {
  clearInterval(window.farmFederateKeepAlive);
}
window.farmFederateKeepAlive = setInterval(function() {
  var btn = document.querySelector('colab-toolbar-button#connect');
  if (btn) btn.click();
}, 60000);
'''

display(Javascript(keepalive_js))
print('✅ Keep-alive enabled!')
print('⚠️ Keep this browser tab open (can be in background)')
print(f'Started at: {time.strftime("%H:%M:%S")}')

<IPython.core.display.Javascript object>

✅ Keep-alive enabled!
⚠️ Keep this browser tab open (can be in background)
Started at: 13:25:44


In [2]:
# STEP 2: GPU Check & Memory Management
import gc
import os

# Memory optimization.
# Set the allocator options *before* importing torch so the variable is
# already in the environment whenever the CUDA caching allocator reads it.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True,max_split_size_mb:512'

import torch

# Fraction of total device memory this process is allowed to allocate.
GPU_MEMORY_FRACTION = 0.85

if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    # Total device memory in decimal gigabytes (bytes / 1e9).
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f'✅ GPU: {gpu_name}')
    print(f'   Memory: {gpu_memory:.1f} GB')
    torch.cuda.set_per_process_memory_fraction(GPU_MEMORY_FRACTION)
    print(f'   Limit: {gpu_memory * GPU_MEMORY_FRACTION:.1f} GB ({GPU_MEMORY_FRACTION:.0%})')
else:
    raise RuntimeError('❌ NO GPU! Go to Runtime → Change runtime type → GPU')

def clear_gpu():
    """Run Python garbage collection, then release cached CUDA memory.

    Calls ``gc.collect()``, ``torch.cuda.empty_cache()`` and
    ``torch.cuda.synchronize()`` in that order. Only defined when the GPU
    check above has passed, because the cell raises otherwise.
    """
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

print('✅ Memory management ready')

✅ GPU: Tesla T4
   Memory: 15.8 GB
   Limit: 13.5 GB (85%)
✅ Memory management ready


In [3]:
# STEP 3: Mount Google Drive (IMPORTANT!)
import os

from google.colab import drive

drive.mount('/content/drive')

# Root folder on Drive plus one sub-folder per artifact type.
results_dir = '/content/drive/MyDrive/FarmFederate_Results'
os.makedirs(results_dir, exist_ok=True)
for subfolder in ('checkpoints', 'results', 'plots'):
    os.makedirs(f'{results_dir}/{subfolder}', exist_ok=True)

# Exported so later cells can locate the Drive folder through the environment.
os.environ['DRIVE_RESULTS_DIR'] = results_dir

print('\n'.join([
    '✅ Google Drive mounted',
    f'   Results will save to: {results_dir}',
    '   ⚠️ This prevents data loss if disconnected!',
]))

Mounted at /content/drive
✅ Google Drive mounted
   Results will save to: /content/drive/MyDrive/FarmFederate_Results
   ⚠️ This prevents data loss if disconnected!


In [4]:
# STEP 4: Install Dependencies
%%capture
!pip install -q transformers datasets peft accelerate evaluate scikit-learn
!pip install -q sentencepiece protobuf timm torch torchvision
!pip install -q matplotlib seaborn pandas pillow

print('✅ Dependencies installed')

In [5]:
# STEP 5: Clone Repository & Checkout Correct Branch
import os

if not os.path.exists('/content/FarmFederate-Advisor'):
    print('📥 Cloning repository...')
    !git clone -b feature/dummy-sensor-data-clean https://github.com/Solventerritory/FarmFederate-Advisor.git
    print('✅ Repository cloned (feature/dummy-sensor-data-clean branch)')
else:
    print('📥 Updating repository...')
    !cd /content/FarmFederate-Advisor && git checkout feature/dummy-sensor-data-clean && git pull
    print('✅ Repository updated')

# Change to backend directory
os.chdir('/content/FarmFederate-Advisor/backend')
print(f'✅ Working directory: {os.getcwd()}')

# Verify training script exists
if os.path.exists('federated_complete_training.py'):
    print('✅ Training script found!')
else:
    print('❌ Training script missing!')
    print(f'   Files in backend: {os.listdir("."[:20])}')
    raise FileNotFoundError('Training script not found - repository may be corrupted')

📥 Cloning repository...
Cloning into 'FarmFederate-Advisor'...
remote: Enumerating objects: 2239, done.[K
remote: Counting objects: 100% (131/131), done.[K
remote: Compressing objects: 100% (103/103), done.[K
remote: Total 2239 (delta 62), reused 72 (delta 25), pack-reused 2108 (from 1)[K
Receiving objects: 100% (2239/2239), 183.01 MiB | 28.13 MiB/s, done.
Resolving deltas: 100% (417/417), done.
Updating files: 100% (2111/2111), done.
Downloading backend/checkpoints/global_central.pt (847 MB)
Error downloading object: backend/checkpoints/global_central.pt (8ca19bc): Smudge error: Error downloading backend/checkpoints/global_central.pt (8ca19bc8f9d655201a0c0530e2b825f5a481d576b9bc5677e223d3157f8e37aa): batch response: This repository exceeded its LFS budget. The account responsible for the budget should increase it to restore access.

Errors logged to /content/FarmFederate-Advisor/.git/lfs/logs/20260112T132641.061045055.log
Use `git lfs logs last` to view the log.
error: external fi

In [6]:
# STEP 6: Auto-Configure for GPU
import sys

# Make the backend package importable from this notebook.
sys.path.insert(0, '/content/FarmFederate-Advisor/backend')

# Tiers ordered by ascending memory ceiling (GB): the first tier whose ceiling
# exceeds `gpu_memory` wins; the last tier is the catch-all.
# Columns: (ceiling, batch size, LoRA rank, banner)
for mem_ceiling, batch_size, lora_rank, tier_banner in (
    (16, 2, 4, '📊 T4 Configuration (Conservative)'),
    (24, 4, 8, '📊 V100 Configuration'),
    (float('inf'), 8, 16, '📊 A100 Configuration'),
):
    if gpu_memory < mem_ceiling:
        break
print(tier_banner)

print(f'   Batch size: {batch_size}')
print(f'   LoRA rank: {lora_rank}')

# Publish the chosen settings through environment variables.
os.environ.update({
    'COLAB_GPU': '1',
    'COLAB_BATCH_SIZE': str(batch_size),
    'COLAB_LORA_RANK': str(lora_rank),
})

print('✅ Configuration complete')

📊 T4 Configuration (Conservative)
   Batch size: 2
   LoRA rank: 4
✅ Configuration complete


In [None]:
# STEP 7: Run Training
import os
import shutil
import sys
import time
import traceback

print('🚀 Starting training...')
print(f'   Time: {time.strftime("%H:%M:%S")}')
print(f'   Working directory: {os.getcwd()}')
print('   Estimated duration: 3-5 hours (T4), 2-3 hours (V100)')
print('\n⏳ Training in progress...\n')

start = time.time()

try:
    # Import and run
    import federated_complete_training
    federated_complete_training.main()

    elapsed = (time.time() - start) / 3600
    print(f'\n✅ COMPLETE! Time: {elapsed:.2f} hours')

except Exception as e:
    # Deliberately best-effort: report the failure but still fall through to
    # the backup below so partial results are not lost.
    print(f'\n❌ Error: {e}')
    traceback.print_exc()

finally:
    # Runs on success, failure and interruption alike.
    drive_dir = os.environ.get('DRIVE_RESULTS_DIR')
    if drive_dir:
        print('\n💾 Backing up to Google Drive...')
        backup_ok = True
        for artifact in ('results', 'plots'):
            local_dir = os.path.join('..', artifact)
            if not os.path.isdir(local_dir):
                print(f'   ⚠️ {local_dir} not found - nothing to back up')
                backup_ok = False
                continue
            try:
                # Merge the local folder into the existing Drive folder.
                shutil.copytree(local_dir, os.path.join(drive_dir, artifact), dirs_exist_ok=True)
            except OSError as backup_error:
                # Keep going so one failing folder does not block the other.
                print(f'   ⚠️ Could not back up {local_dir}: {backup_error}')
                backup_ok = False
        # Only claim success when every folder was actually copied.
        print('✅ Backup complete' if backup_ok else '⚠️ Backup incomplete - see messages above')
    clear_gpu()

🚀 Starting training...
   Time: 13:26:41
   Working directory: /content/FarmFederate-Advisor/backend
   Estimated duration: 3-5 hours (T4), 2-3 hours (V100)

⏳ Training in progress...

[INFO] Using device: cuda
[INFO] Detected Colab/Limited GPU - Using reduced batch sizes
FEDERATED LEARNING COMPLETE TRAINING SYSTEM

📊 Total Models: 39 (13 LLM + 13 ViT + 13 VLM)
🔄 Training Paradigms: Federated (5 clients, 10 rounds) + Centralized (10 epochs)
⚡ Total Training Runs: 78 (39 models × 2 paradigms)

⏱️  Estimated Time: 26-38 hours on CPU, 3-5 hours on GPU
💾 Estimated Disk Space: 15-20 GB for models + 2-3 GB for results

🖥️  Device: cuda
📁 Checkpoint Dir: ../checkpoints
📁 Results Dir: ../results
📁 Plots Dir: ../plots

ℹ️ [Resume] No checkpoint found. Training will start from scratch.
✅ All models trained. Checkpoint cleared.

[DATA] Loading datasets...
[Mix] loading argilla (<= 2000) ...


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/2.50M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1695 [00:00<?, ? examples/s]

[Mix] argilla added 0 rows
[Mix] loading agnews (<= 2000) ...


README.md: 0.00B [00:00, ?B/s]

[Mix] agnews added 746 rows
[Mix] source breakdown:
source
localmini    1428
agnews        586
Name: count, dtype: int64
[DATA] Text corpus: 2014 samples
[Images] trying to load BrandonFors/Plant-Diseases-PlantVillage-Dataset ({'split': 'train'}) ...


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00002.parquet:   0%|          | 0.00/321M [00:00<?, ?B/s]

data/train-00001-of-00002.parquet:   0%|          | 0.00/362M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/170M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43456 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10849 [00:00<?, ? examples/s]

[Images] BrandonFors/Plant-Diseases-PlantVillage-Dataset loaded: 6000 samples
[Images] trying to load Saon110/bd-crop-vegetable-plant-disease-dataset ({'split': 'train'}) ...


README.md:   0%|          | 0.00/10.8k [00:00<?, ?B/s]

[Images] failed to load Saon110/bd-crop-vegetable-plant-disease-dataset: Dataset 'Saon110/bd-crop-vegetable-plant-disease-dataset' is a gated dataset on the Hub. You must be authenticated to access it.
[Images] trying to load timm/plant-pathology-2021 ({'split': 'train'}) ...


README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/30 [00:00<?, ?files/s]

data/train-00000-of-00030.parquet:   0%|          | 0.00/489M [00:00<?, ?B/s]

data/train-00001-of-00030.parquet:   0%|          | 0.00/481M [00:00<?, ?B/s]

data/train-00002-of-00030.parquet:   0%|          | 0.00/488M [00:00<?, ?B/s]

data/train-00003-of-00030.parquet:   0%|          | 0.00/482M [00:00<?, ?B/s]

data/train-00004-of-00030.parquet:   0%|          | 0.00/482M [00:00<?, ?B/s]

data/train-00005-of-00030.parquet:   0%|          | 0.00/486M [00:00<?, ?B/s]

data/train-00006-of-00030.parquet:   0%|          | 0.00/481M [00:00<?, ?B/s]

data/train-00007-of-00030.parquet:   0%|          | 0.00/482M [00:00<?, ?B/s]

data/train-00008-of-00030.parquet:   0%|          | 0.00/485M [00:00<?, ?B/s]

data/train-00009-of-00030.parquet:   0%|          | 0.00/479M [00:00<?, ?B/s]

data/train-00010-of-00030.parquet:   0%|          | 0.00/489M [00:00<?, ?B/s]

data/train-00011-of-00030.parquet:   0%|          | 0.00/477M [00:00<?, ?B/s]

data/train-00012-of-00030.parquet:   0%|          | 0.00/479M [00:00<?, ?B/s]

data/train-00013-of-00030.parquet:   0%|          | 0.00/475M [00:00<?, ?B/s]

data/train-00014-of-00030.parquet:   0%|          | 0.00/482M [00:00<?, ?B/s]

data/train-00015-of-00030.parquet:   0%|          | 0.00/480M [00:00<?, ?B/s]

data/train-00016-of-00030.parquet:   0%|          | 0.00/488M [00:00<?, ?B/s]

data/train-00017-of-00030.parquet:   0%|          | 0.00/484M [00:00<?, ?B/s]

data/train-00018-of-00030.parquet:   0%|          | 0.00/484M [00:00<?, ?B/s]

data/train-00019-of-00030.parquet:   0%|          | 0.00/486M [00:00<?, ?B/s]

data/train-00020-of-00030.parquet:   0%|          | 0.00/481M [00:00<?, ?B/s]

data/train-00021-of-00030.parquet:   0%|          | 0.00/483M [00:00<?, ?B/s]

data/train-00022-of-00030.parquet:   0%|          | 0.00/477M [00:00<?, ?B/s]

data/train-00023-of-00030.parquet:   0%|          | 0.00/485M [00:00<?, ?B/s]

data/train-00024-of-00030.parquet:   0%|          | 0.00/484M [00:00<?, ?B/s]

data/train-00025-of-00030.parquet:   0%|          | 0.00/493M [00:00<?, ?B/s]

data/train-00026-of-00030.parquet:   0%|          | 0.00/477M [00:00<?, ?B/s]

data/train-00027-of-00030.parquet:   0%|          | 0.00/486M [00:00<?, ?B/s]

data/train-00028-of-00030.parquet:   0%|          | 0.00/485M [00:00<?, ?B/s]

data/train-00029-of-00030.parquet:   0%|          | 0.00/484M [00:00<?, ?B/s]

data/validation-00000-of-00004.parquet:   0%|          | 0.00/396M [00:00<?, ?B/s]

data/validation-00001-of-00004.parquet:   0%|          | 0.00/406M [00:00<?, ?B/s]

data/validation-00002-of-00004.parquet:   0%|          | 0.00/401M [00:00<?, ?B/s]

data/validation-00003-of-00004.parquet:   0%|          | 0.00/398M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16768 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1864 [00:00<?, ? examples/s]

In [None]:
# STEP 8: Generate Plots
clear_gpu()
print('📊 Generating plots...')

try:
    !python comprehensive_plotting.py
    print('✅ Plots generated')
except:
    print('⚠️ Plotting skipped (results still saved)')

drive_dir = os.environ.get('DRIVE_RESULTS_DIR')
if drive_dir:
    !cp -r ../plots/* {drive_dir}/plots/ 2>/dev/null || true

In [None]:
# STEP 9: View Results
import json
import os
from IPython.display import Image, display

results_file = '../results/all_results.json'
plots_dir = '../plots'

if os.path.exists(results_file):
    with open(results_file, 'r') as f:
        results = json.load(f)

    # Rank by macro F1, best first; entries with missing or null metrics count as 0.
    sorted_results = sorted(
        results,
        key=lambda x: (x.get('final_metrics') or {}).get('f1_macro', 0),
        reverse=True,
    )

    print('='*60)
    print('🏆 TOP 10 MODELS')
    print('='*60)

    for i, model in enumerate(sorted_results[:10], 1):
        name = (model.get('config') or {}).get('name', 'Unknown')
        metrics = model.get('final_metrics') or {}
        f1 = metrics.get('f1_macro', 0)
        acc = metrics.get('accuracy', 0)
        print(f'{i:2d}. {name:30s} F1: {f1:.4f} | Acc: {acc:.4f}')

    print('\n📈 Top 3 Plots:')
    # The plots folder may be absent if plotting failed; the listing is sorted
    # so the same three files are shown on every run.
    if os.path.isdir(plots_dir):
        plots = sorted(f for f in os.listdir(plots_dir) if f.endswith('.png'))[:3]
    else:
        plots = []
    for plot in plots:
        print(f'   - {plot}')
        try:
            display(Image(f'{plots_dir}/{plot}', width=700))
        except Exception as display_error:
            # Keep showing the remaining plots, but say why this one is missing.
            print(f'     ⚠️ Could not display {plot}: {display_error}')
else:
    print('⚠️ No results found')

print(f'\n💾 All results saved to: {os.environ.get("DRIVE_RESULTS_DIR")}')

In [None]:
# STEP 10: Download Results (Optional)
from google.colab import files
import shutil
import os

# Folders (relative to the backend directory) that are zipped into /content.
ARCHIVE_NAMES = ('results', 'plots')

print('📦 Creating download packages...')

for archive_name in ARCHIVE_NAMES:
    if os.path.exists(f'../{archive_name}'):
        shutil.make_archive(f'/content/{archive_name}', 'zip', f'../{archive_name}')
        print(f'✅ {archive_name}.zip ready')

print('\n📥 Click to download:')
try:
    for archive_name in ARCHIVE_NAMES:
        archive_path = f'/content/{archive_name}.zip'
        if os.path.exists(archive_path):
            files.download(archive_path)
except Exception as download_error:
    # Catch ordinary errors only (not KeyboardInterrupt) and show the cause.
    print(f'⚠️ Download manually from Files panel on left ({download_error})')

print('\n✅ TRAINING COMPLETE!')
print('📊 Results in: Google Drive + Downloads folder')