# 🎙️ KonkaniVani ASR Training - Google One + Colab

**Resume training from Epoch 15 using your Google One storage**

---

## ⚡ Quick Start

1. **Enable GPU**: Runtime → Change runtime type → GPU (T4)
2. **Run cells in order** (1 → 8)
3. **Keep browser tab open** during training (~12 hours)

---

## 📦 Cell 1: Install Dependencies

In [None]:
print("üì¶ Installing dependencies...\n")
!pip install -q torch torchaudio tensorboard jiwer pyyaml soundfile
print("‚úÖ Dependencies installed!\n")

# Verify GPU
print("üîç Checking GPU...\n")
!nvidia-smi --query-gpu=name,memory.total --format=csv,noheader
print("\n‚úÖ If you see 'Tesla T4' above, you're good to go!")
print("   If not, go to Runtime ‚Üí Change runtime type ‚Üí GPU")

## 💾 Cell 2: Mount Google Drive & Verify Files

In [None]:
from google.colab import drive
import os

# Mount Google Drive so the training assets under MyDrive become reachable.
drive.mount('/content/drive')

print("\n" + "="*70)
print("üìÇ VERIFYING YOUR GOOGLE ONE DRIVE FILES")
print("="*70 + "\n")

# Define paths
base_path = "/content/drive/MyDrive/konkanivani_training"
audio_path = f"{base_path}/konkani-asr-v0/data/processed_segments_diarized/audio_segments"
manifest_path = f"{base_path}/konkani-asr-v0/splits/manifests"
checkpoint_path = f"{base_path}/checkpoint_epoch_15.pt"
vocab_path = f"{base_path}/vocab.json"

# Manifests that must exist by name; test.json is reported but optional.
required_manifests = ['train.json', 'val.json']

# Flipped to False by any failed or suspicious check below.
all_good = True

# Check audio files (isdir, not exists: listdir would raise on a plain file)
if os.path.isdir(audio_path):
    audio_files = [f for f in os.listdir(audio_path) if f.endswith('.wav')]
    print(f"‚úÖ Audio files: {len(audio_files)} files")
    if len(audio_files) < 100:
        print(f"   ‚ö†Ô∏è  Warning: Expected ~2500+ files, found {len(audio_files)}")
        all_good = False
else:
    print("‚ùå Audio files NOT FOUND")
    print(f"   Expected at: {audio_path}")
    all_good = False

# Check manifests
if os.path.isdir(manifest_path):
    manifests = ['train.json', 'val.json', 'test.json']
    found_manifests = [m for m in manifests if os.path.exists(f"{manifest_path}/{m}")]
    # Check the required files by name: counting found files would accept
    # e.g. val.json + test.json even though train.json is missing.
    missing_manifests = [m for m in required_manifests if m not in found_manifests]
    if missing_manifests:
        print(f"‚ùå Manifests missing: {', '.join(missing_manifests)}")
        print("   ‚ö†Ô∏è  Warning: Need at least train.json and val.json")
        all_good = False
    else:
        print(f"‚úÖ Manifests: {', '.join(found_manifests)}")
else:
    print("‚ùå Manifests NOT FOUND")
    print(f"   Expected at: {manifest_path}")
    all_good = False

# Check checkpoint
if os.path.isfile(checkpoint_path):
    size_mb = os.path.getsize(checkpoint_path) / (1024*1024)
    print(f"‚úÖ Checkpoint: checkpoint_epoch_15.pt ({size_mb:.1f} MB)")
    # A file under 200 MB is treated as a likely truncated upload.
    if size_mb < 200:
        print("   ‚ö†Ô∏è  Warning: File seems too small, might be corrupted")
        all_good = False
else:
    print("‚ùå Checkpoint NOT FOUND")
    print(f"   Expected at: {checkpoint_path}")
    all_good = False

# Check vocab
if os.path.exists(vocab_path):
    print("‚úÖ Vocabulary: vocab.json")
else:
    print("‚ùå Vocab NOT FOUND")
    print(f"   Expected at: {vocab_path}")
    all_good = False

print("\n" + "="*70)
if all_good:
    print("‚úÖ ALL FILES VERIFIED! Ready to proceed.")
else:
    print("‚ùå SOME FILES ARE MISSING!")
    print("   Please upload missing files to your Google Drive.")
    print("   See: GOOGLE_ONE_SETUP_GUIDE.md for instructions.")
print("="*70)

## 📤 Cell 3: Upload Code Package

**Upload `konkani_code.zip` (38MB) from your Mac**

In [None]:
from google.colab import files
import os

print("üì§ Please upload: konkani_code.zip")
print("   Location on your Mac: /Volumes/data&proj/konkani/konkani_code.zip")
print("   Size: ~38MB\n")

uploaded = files.upload()

if 'konkani_code.zip' in uploaded:
    print("\nüìÇ Extracting code...")
    !unzip -q konkani_code.zip -d /content/
    print("‚úÖ Code extracted!\n")
    
    # Verify extraction
    print("üìã Verifying extracted files:\n")
    
    checks = [
        ("/content/train_konkanivani_asr.py", "Training script"),
        ("/content/models/konkanivani_asr.py", "Model definition"),
        ("/content/data/audio_processing/dataset.py", "Dataset loader"),
        ("/content/data/konkani-asr-v0/splits/manifests/train.json", "Train manifest"),
        ("/content/vocab.json", "Vocabulary")
    ]
    
    all_present = True
    for path, name in checks:
        if os.path.exists(path):
            print(f"‚úÖ {name}")
        else:
            print(f"‚ùå {name} - NOT FOUND: {path}")
            all_present = False
    
    if all_present:
        print("\n‚úÖ All code files present! Ready for next step.")
    else:
        print("\n‚ùå Some files missing! Check your konkani_code.zip")
else:
    print("\n‚ùå Please upload konkani_code.zip")

## 🔗 Cell 4: Link Data from Google Drive

**This creates symbolic links (instant, no copying!)**

In [None]:
import os

print("üîó Setting up data links...\n")

# Create directory structure
!mkdir -p /content/data/konkani-asr-v0/data/processed_segments_diarized
!mkdir -p /content/checkpoints
!mkdir -p /content/logs

base_path = "/content/drive/MyDrive/konkanivani_training"

# Link audio files (instant!)
drive_audio = f"{base_path}/konkani-asr-v0/data/processed_segments_diarized/audio_segments"
local_audio = "/content/data/konkani-asr-v0/data/processed_segments_diarized/audio_segments"

if os.path.exists(drive_audio):
    !ln -s {drive_audio} {local_audio}
    audio_count = len([f for f in os.listdir(drive_audio) if f.endswith('.wav')])
    print(f"‚úÖ Linked {audio_count} audio files (instant, no copying!)")
else:
    print(f"‚ùå Audio path not found: {drive_audio}")

# Copy checkpoint (small file)
drive_checkpoint = f"{base_path}/checkpoint_epoch_15.pt"
if os.path.exists(drive_checkpoint):
    !cp {drive_checkpoint} /content/checkpoints/
    size_mb = os.path.getsize('/content/checkpoints/checkpoint_epoch_15.pt') / (1024*1024)
    print(f"‚úÖ Copied checkpoint ({size_mb:.1f} MB)")
else:
    print(f"‚ùå Checkpoint not found: {drive_checkpoint}")

# Verify manifests from zip
manifest_path = "/content/data/konkani-asr-v0/splits/manifests"
if os.path.exists(f"{manifest_path}/train.json"):
    print(f"‚úÖ Manifests ready (from code zip)")
else:
    print(f"‚ùå Manifests not found in code zip")

# Verify vocab from zip
if os.path.exists("/content/vocab.json"):
    print(f"‚úÖ Vocabulary ready (from code zip)")
else:
    print(f"‚ùå Vocab not found in code zip")

print("\n" + "="*70)
print("‚úÖ SETUP COMPLETE! Ready to train.")
print("="*70)

## 🚀 Cell 5: Start Training (Resume from Epoch 15)

**This will run for ~12 hours. Keep this tab open!**

In [None]:
import os

print("="*70)
print("üöÄ STARTING KONKANIVANI ASR TRAINING")
print("="*70)

# Check for checkpoint
checkpoint_path = "/content/checkpoints/checkpoint_epoch_15.pt"
if os.path.exists(checkpoint_path):
    print("\n‚úÖ Found checkpoint: checkpoint_epoch_15.pt")
    print("   Resuming from Epoch 16")
    print("   Training epochs: 16 ‚Üí 50 (35 epochs remaining)")
    print("   Estimated time: ~12 hours\n")
    resume_flag = f"--resume {checkpoint_path}"
else:
    print("\n‚ö†Ô∏è  No checkpoint found, starting from scratch")
    print("   Training epochs: 1 ‚Üí 50")
    print("   Estimated time: ~20 hours\n")
    resume_flag = ""

print("üìä Training configuration:")
print("   ‚Ä¢ Device: CUDA (Tesla T4 GPU)")
print("   ‚Ä¢ Batch size: 16")
print("   ‚Ä¢ Model: d_model=256, 12 encoder layers, 6 decoder layers")
print("   ‚Ä¢ Checkpoints saved every 5 epochs")
print("   ‚Ä¢ Best model saved when validation improves\n")

print("‚è∞ Training will start in 5 seconds...\n")
import time
time.sleep(5)

print("="*70)
print("TRAINING STARTED - DO NOT CLOSE THIS TAB!")
print("="*70 + "\n")

# Start training
!python3 /content/train_konkanivani_asr.py \
    --train_manifest /content/data/konkani-asr-v0/splits/manifests/train.json \
    --val_manifest /content/data/konkani-asr-v0/splits/manifests/val.json \
    --vocab_file /content/vocab.json \
    --batch_size 16 \
    --num_epochs 50 \
    --learning_rate 0.0005 \
    --device cuda \
    --d_model 256 \
    --encoder_layers 12 \
    --decoder_layers 6 \
    --checkpoint_dir /content/checkpoints \
    --log_dir /content/logs \
    {resume_flag}

## 📊 Cell 6: Monitor Training Progress

**Run this in a separate cell while training is running**

In [None]:
import os
from datetime import datetime

print(f"üìä Training Status - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
print("="*70)

# Check checkpoints
print("\nüíæ Saved Checkpoints:\n")
!ls -lth /content/checkpoints/ | head -8

# Check GPU usage
print("\nüî• GPU Usage:\n")
!nvidia-smi --query-gpu=utilization.gpu,memory.used,memory.total,temperature.gpu --format=csv,noheader

# Check latest log
print("\nüìù Recent Training Log:\n")
!tail -30 /content/logs/training.log 2>/dev/null || echo "Log file not created yet"

print("\n" + "="*70)

## 💾 Cell 7: Backup Checkpoints to Drive

**Run this every 2-3 hours to save progress**

In [None]:
from datetime import datetime
import os

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
backup_path = f"/content/drive/MyDrive/konkanivani_training/backups/backup_{timestamp}"

print(f"üíæ Backing up to Google Drive...\n")
print(f"Backup location: {backup_path}\n")

!mkdir -p {backup_path}

# Copy checkpoints
!cp -r /content/checkpoints/* {backup_path}/ 2>/dev/null

# Copy logs
!cp -r /content/logs {backup_path}/ 2>/dev/null

print("‚úÖ Backup complete!\n")
print("üìã Backed up files:\n")
!ls -lh {backup_path}/

print(f"\nüí° Tip: Run this cell every 2-3 hours to save your progress!")

## 📥 Cell 8: Download Final Model

**Run this after training completes**

In [None]:
from google.colab import files
import os
from datetime import datetime

print("üì¶ Preparing final model package...\n")

# Create package directory
!mkdir -p /content/final_model

# Copy best model
if os.path.exists('/content/checkpoints/best_model.pt'):
    !cp /content/checkpoints/best_model.pt /content/final_model/
    print("‚úÖ Copied best_model.pt")
else:
    print("‚ö†Ô∏è  best_model.pt not found, using latest checkpoint")
    !cp /content/checkpoints/checkpoint_epoch_*.pt /content/final_model/ 2>/dev/null | tail -1

# Copy supporting files
!cp /content/vocab.json /content/final_model/
!cp -r /content/models /content/final_model/
!cp /content/inference_konkanivani.py /content/final_model/ 2>/dev/null

print("‚úÖ Copied supporting files\n")

# Create zip
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
zip_name = f"konkanivani_final_model_{timestamp}.zip"

!cd /content && zip -r {zip_name} final_model/

print(f"\nüì¶ Package created: {zip_name}")
print(f"   Size: ", end="")
!ls -lh /content/{zip_name} | awk '{print $5}'

# Also save to Drive
drive_path = "/content/drive/MyDrive/konkanivani_training/final_models"
!mkdir -p {drive_path}
!cp /content/{zip_name} {drive_path}/
print(f"\n‚úÖ Saved to Drive: {drive_path}/{zip_name}")

# Download
print(f"\nüì• Downloading to your computer...")
files.download(f'/content/{zip_name}')

print("\n" + "="*70)
print("‚úÖ TRAINING COMPLETE!")
print("="*70)
print("\nYour model is ready to use!")
print("Extract the zip and use inference_konkanivani.py to test it.")

---

## 🔧 Troubleshooting

### Training stopped / Runtime disconnected
```python
# Just re-run Cell 5 - it will resume from the latest checkpoint
```

### Out of memory error
```python
# In Cell 5, change:
--batch_size 16  →  --batch_size 8
```

### Can't find files in Drive
```python
# Check your folder structure:
!ls -la /content/drive/MyDrive/konkanivani_training/
```

---

## 💡 Tips

1. **Keep tab open**: Colab disconnects after 90 min of inactivity
2. **Backup regularly**: Run Cell 7 every 2-3 hours
3. **Monitor progress**: Run Cell 6 to check status
4. **GPU usage**: Should be 90-100% during training
5. **Checkpoints**: Saved every 5 epochs automatically

---