# 🚀 ML Phone Number Price Prediction - Colab Auto-Resume Training

**Features:**
- ✅ Auto-save checkpoints to Google Drive every 10 epochs
- ✅ Auto-resume from last checkpoint after disconnection
- ✅ Minimal data loss (progress is saved to Drive at every checkpoint, so at most the epochs since the last checkpoint are lost)
- ✅ Reconnect-proof training

**How to Use:**
1. Run Cells 1-6 sequentially (training itself starts in Cell 6)
2. Cell 4 will auto-detect if there's a checkpoint
3. If found → Resume from last epoch
4. If not found → Start fresh
5. Training auto-saves every 10 epochs to Drive

**If disconnected:**
- Just reconnect and run Cells 1-6 again
- Training will resume from last saved checkpoint!

---

## 📂 Cell 1: Mount Google Drive & Setup Directories

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Project directory in Google Drive
PROJECT_NAME = 'ML_Phone_Number_Project'
PROJECT_DIR = f'/content/drive/MyDrive/{PROJECT_NAME}'

# Create project directories
import os
directories = [
    f'{PROJECT_DIR}/checkpoints',
    f'{PROJECT_DIR}/models',
    f'{PROJECT_DIR}/logs',
    f'{PROJECT_DIR}/results',
    f'{PROJECT_DIR}/data'
]

for dir_path in directories:
    os.makedirs(dir_path, exist_ok=True)
    
print("✅ Google Drive mounted successfully!")
print(f"📁 Project directory: {PROJECT_DIR}")
print("\n📂 Created directories:")
for dir_path in directories:
    print(f"   - {dir_path}")

# Change to project directory
%cd /content
print(f"\n✅ Current directory: {os.getcwd()}")

## 📦 Cell 2: Upload & Extract Project Files

In [None]:
# Option 1: Upload ZIP file manually
print("📤 Upload Options:")
print("   Option A: Upload ZIP file from your computer")
print("   Option B: Download from GitHub/URL")
print("")

# Choose your option:
USE_MANUAL_UPLOAD = True  # Set to False to use GitHub URL

if USE_MANUAL_UPLOAD:
    # Upload manually
    from google.colab import files
    print("Please upload the project ZIP file...")
    uploaded = files.upload()
    
    # Get uploaded filename
    zip_filename = list(uploaded.keys())[0]
    print(f"✅ Uploaded: {zip_filename}")
else:
    # Download from URL
    GITHUB_URL = "YOUR_GITHUB_RELEASE_URL_HERE.zip"  # Update this!
    zip_filename = "project.zip"
    !wget -O {zip_filename} "{GITHUB_URL}"
    print(f"✅ Downloaded: {zip_filename}")

# Extract ZIP
print("\n📦 Extracting project files...")
!unzip -o {zip_filename} -d /content/

# Change to project directory
import os
if os.path.exists('/content/number-ML'):
    %cd /content/number-ML
    print("✅ Project extracted successfully!")
    print(f"📁 Current directory: {os.getcwd()}")
else:
    print("❌ Project directory not found. Check ZIP structure.")

# List files
print("\n📂 Project structure:")
!ls -lh

## 🔧 Cell 3: Install Dependencies

In [None]:
# Install required packages
print("📦 Installing dependencies...")
print("This may take 2-5 minutes...\n")

!pip install -q --upgrade pip
!pip install -q -r requirements.txt

# Verify imports
print("\n✅ Verifying imports...")
try:
    import numpy as np
    import pandas as pd
    import sklearn
    import xgboost as xgb
    import lightgbm as lgb
    import catboost
    import optuna
    print("   ✓ numpy:", np.__version__)
    print("   ✓ pandas:", pd.__version__)
    print("   ✓ scikit-learn:", sklearn.__version__)
    print("   ✓ xgboost:", xgb.__version__)
    print("   ✓ lightgbm:", lgb.__version__)
    print("   ✓ catboost:", catboost.__version__)
    print("   ✓ optuna:", optuna.__version__)
    print("\n✅ All dependencies installed successfully!")
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("Please check requirements.txt and try again.")

## 🔍 Cell 4: Auto-Detect Checkpoint & Setup Auto-Resume

In [None]:
# Cell 4: look for a saved checkpoint on Drive and derive the resume state
# (RESUME_MODE, START_EPOCH, PREVIOUS_METRICS) from it.
import sys

# Make the extracted project importable as `src.*`.
sys.path.insert(0, '/content/number-ML')

from src.checkpoint_manager import CheckpointManager

# Checkpoints live in the Drive project folder.
CHECKPOINT_DIR = f"{PROJECT_DIR}/checkpoints"
checkpoint_manager = CheckpointManager(
    checkpoint_dir=CHECKPOINT_DIR,
    max_checkpoints=5,
    save_every=10,
)

print("🔍 Checking for existing checkpoint...\n")
checkpoint = checkpoint_manager.load_latest_checkpoint()

separator = "=" * 60
RESUME_MODE = bool(checkpoint)

if not RESUME_MODE:
    # Nothing to resume from: start at the first epoch with no history.
    START_EPOCH = 0
    PREVIOUS_METRICS = {}

    print(separator)
    print("🆕 FRESH START MODE")
    print(separator)
    print("No checkpoint found.")
    print("Will start training from epoch 0.")
    print("Checkpoints will be saved to Google Drive every 10 epochs.")
    print(separator)
else:
    # Continue with the epoch after the last one recorded in the checkpoint.
    START_EPOCH = checkpoint['epoch'] + 1
    PREVIOUS_METRICS = checkpoint.get('metrics', {})

    print(separator)
    print("🔄 RESUME MODE ACTIVATED")
    print(separator)
    print(f"Last completed epoch: {checkpoint['epoch']}")
    print(f"Will resume from epoch: {START_EPOCH}")
    print(f"Checkpoint timestamp: {checkpoint['timestamp']}")
    print("\nPrevious metrics:")
    for metric_name, metric_value in PREVIOUS_METRICS.items():
        print(f"  {metric_name}: {metric_value}")
    print(separator)

    # Let the manager print whatever recovery details it has.
    print("\n")
    checkpoint_manager.print_recovery_info()

print("\n✅ Auto-resume setup complete!")
print(f"Resume mode: {RESUME_MODE}")
print(f"Start epoch: {START_EPOCH}")

## 📊 Cell 5: Load Data & Feature Engineering

In [None]:
# Cell 5: load the raw CSV, build the feature table and create the
# train/test split used by the training and evaluation cells.
from src.data_handler import load_and_clean_data
from src.features import create_masterpiece_features
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

print("📂 Loading data...")

# Data location (adjust as needed).
# Default: the CSV shipped inside the project folder (relative to the CWD).
DATA_PATH = 'data/raw/numberdata.csv'

# Alternative: read the CSV from Google Drive instead.
# DATA_PATH = f'{PROJECT_DIR}/data/numberdata.csv'

df = load_and_clean_data(DATA_PATH)
print(f"✅ Loaded {len(df)} records")
print(f"   Columns: {list(df.columns)}")

print("\n🔧 Creating features...")
df_features = create_masterpiece_features(df)
print(f"✅ Created {len(df_features.columns)} features")

# Every column except the identifier and the target is a model input.
non_feature_columns = ('phone_number', 'price')
feature_cols = [
    name for name in df_features.columns if name not in non_feature_columns
]
# Keep the column names so they can be stored alongside the model later.
FEATURE_NAMES = feature_cols

X = df_features[feature_cols].values
y = df_features['price'].values

print("\n📊 Dataset shape:")
print(f"   Features (X): {X.shape}")
print(f"   Target (y): {y.shape}")

# 80/20 split with a fixed seed so the split is reproducible across runs.
print("\n✂️ Splitting train/test...")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"   Train: {X_train.shape}")
print(f"   Test: {X_test.shape}")

print("\n✅ Data preparation complete!")

## 🚀 Cell 6: Train with Auto-Checkpoint & Auto-Resume

**This cell will:**
- Resume from checkpoint if found (Cell 4)
- Train model with auto-save every 10 epochs
- Save checkpoints to Google Drive
- If disconnected, just run Cells 1-6 again → training resumes automatically!


In [None]:
# Cell 6: configure and launch training through the project's
# train_with_auto_resume helper, then summarise the returned result.
from src.train_colab import train_with_auto_resume

# --- Training configuration (adjust as needed) ---
TOTAL_EPOCHS = 100
MODEL_TYPE = 'xgboost'  # Options: 'xgboost', 'lightgbm', 'catboost', 'random_forest'
SAVE_EVERY = 10  # Checkpoint interval, in epochs

# --- Model hyper-parameters (adjust as needed) ---
model_params = dict(
    objective='reg:squarederror',
    learning_rate=0.1,
    max_depth=6,
    n_estimators=500,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

rule = "=" * 70

print(rule)
print("🚀 TRAINING WITH AUTO-CHECKPOINT & AUTO-RESUME")
print(rule)
print(f"Model type: {MODEL_TYPE}")
print(f"Total epochs: {TOTAL_EPOCHS}")
print(f"Save every: {SAVE_EVERY} epochs")
print(f"Resume mode: {RESUME_MODE}")
if RESUME_MODE:
    print(f"Starting from epoch: {START_EPOCH}")
print(rule)
print("")

# NOTE(review): the held-out test split is also passed as the validation set,
# so any model selection based on validation scores has seen the test data.
# Confirm this is intended.
training_kwargs = {
    'X_train': X_train,
    'y_train': y_train,
    'X_val': X_test,
    'y_val': y_test,
    'model_type': MODEL_TYPE,
    'checkpoint_dir': CHECKPOINT_DIR,
    'project_dir': PROJECT_DIR,
    'total_epochs': TOTAL_EPOCHS,
    'save_every': SAVE_EVERY,
    'params': model_params,
}
result = train_with_auto_resume(**training_kwargs)

print("\n" + rule)
print("✅ TRAINING COMPLETE!")
print(rule)
final_metrics = result['metrics']
print(f"Final R² Score: {final_metrics['r2_score']:.4f}")
print(f"Best R² Score: {result['best_score']:.4f}")
print(f"Final Train Loss: {final_metrics['train_loss']:.4f}")
print(f"Final Val Loss: {final_metrics['val_loss']:.4f}")
print(rule)

# Keep the trained model and its metrics for the evaluation cell.
TRAINED_MODEL = result['model']
TRAINING_METRICS = result['metrics']

print("\n💾 Model and checkpoints saved to Google Drive:")
print(f"   {PROJECT_DIR}/checkpoints/")
print(f"   {PROJECT_DIR}/models/")
print("\n✅ Safe to disconnect now! Progress is saved.")

## 📈 Cell 7: Evaluate Model & Save Results

In [None]:
# Cell 7: score the trained model on the train and test splits, then persist
# a model package (joblib) and a plain-text evaluation report to Drive.
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import joblib
from datetime import datetime


def _regression_metrics(y_true, y_pred):
    """Return (R², MAE, RMSE) for one set of predictions."""
    return (
        r2_score(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
    )


def _metric_lines(r2, mae, rmse):
    """Format the three metric lines shared by the console summary and the report."""
    return [
        f"  R² Score:  {r2:.4f}",
        f"  MAE:       {mae:.4f}",
        f"  RMSE:      {rmse:.4f}",
    ]


print("📊 Evaluating model...\n")

# Make predictions
y_train_pred = TRAINED_MODEL.predict(X_train)
y_test_pred = TRAINED_MODEL.predict(X_test)

# Calculate metrics
train_r2, train_mae, train_rmse = _regression_metrics(y_train, y_train_pred)
test_r2, test_mae, test_rmse = _regression_metrics(y_test, y_test_pred)

train_lines = _metric_lines(train_r2, train_mae, train_rmse)
test_lines = _metric_lines(test_r2, test_mae, test_rmse)

# Print results
print("=" * 60)
print("📊 FINAL MODEL EVALUATION")
print("=" * 60)
print("\nTraining Set:")
for line in train_lines:
    print(line)
print("\nTest Set:")
for line in test_lines:
    print(line)
print("=" * 60)

# Save comprehensive model package
print("\n💾 Saving final model package...")

# Take the timestamp once so the package and the report agree.
evaluated_at = datetime.now()

model_package = {
    'model': TRAINED_MODEL,
    'model_type': MODEL_TYPE,
    'feature_names': FEATURE_NAMES,
    # Cast to plain floats so the stored metrics are not NumPy scalars.
    'metrics': {
        'train_r2': float(train_r2),
        'test_r2': float(test_r2),
        'train_mae': float(train_mae),
        'test_mae': float(test_mae),
        'train_rmse': float(train_rmse),
        'test_rmse': float(test_rmse)
    },
    'training_config': {
        'total_epochs': TOTAL_EPOCHS,
        # None when model_params was never defined in this session.
        'model_params': globals().get('model_params')
    },
    'timestamp': evaluated_at.isoformat(),
    'trained_on': 'Google Colab',
    'data_shape': {
        'n_samples': len(X),
        'n_features': len(FEATURE_NAMES),
        'train_size': len(X_train),
        'test_size': len(X_test)
    }
}

# Save to Google Drive
final_model_path = f"{PROJECT_DIR}/models/final_model_complete.pkl"
joblib.dump(model_package, final_model_path)

print(f"✅ Model package saved to: {final_model_path}")

# Save evaluation report
report_path = f"{PROJECT_DIR}/results/evaluation_report.txt"
report_lines = [
    "ML Phone Number Price Prediction - Evaluation Report",
    "=" * 60,
    "",
    f"Model Type: {MODEL_TYPE}",
    f"Training Date: {evaluated_at.strftime('%Y-%m-%d %H:%M:%S')}",
    "",
    "Training Set Metrics:",
    *train_lines,
    "",
    "Test Set Metrics:",
    *test_lines,
]
# Explicit UTF-8: the report contains the non-ASCII "R²" and must not depend
# on the platform's default encoding.
with open(report_path, 'w', encoding='utf-8') as f:
    f.write("\n".join(report_lines) + "\n")

print(f"✅ Report saved to: {report_path}")
print("\n✅ All results saved to Google Drive!")

## 📥 Cell 8: Download Final Model (Optional)

In [None]:
# Cell 8 (optional): send the saved model package to the local machine
# through the browser and recap where the artefacts live on Drive.
from google.colab import files

print("📥 Downloading final model...\n")

# Path of the packaged model inside the Drive project folder.
model_path = f"{PROJECT_DIR}/models/final_model_complete.pkl"
files.download(model_path)

print("✅ Model downloaded!")
print("\n📂 Files in Google Drive:")
# (display label, sub-folder) pairs, in listing order.
drive_folders = (
    ("Models", "models"),
    ("Checkpoints", "checkpoints"),
    ("Results", "results"),
    ("Logs", "logs"),
)
for label, folder in drive_folders:
    print(f"   {label}: {PROJECT_DIR}/{folder}/")

print("\n🎉 Training complete! All files safely stored in Google Drive.")
print("\nℹ️  You can reconnect anytime and resume training from last checkpoint!")