# 🚀 ML Phone Number Price Prediction - Kaggle Auto-Resume Training

**Optimized for Kaggle with P100 GPU!**

**Features:**
- ✅ Auto-save checkpoints to /kaggle/working every 10 epochs
- ✅ Auto-resume from last checkpoint after timeout
- ✅ Zero data loss (all progress saved)
- ✅ Optimized for P100 GPU (30% faster than Colab T4!)
- ✅ 16-30 GB RAM (more than Colab!)

**How to Use:**
1. Add the project as a Kaggle Dataset (one-time setup)
2. Run Cells 1-6 in order
3. Cell 4 auto-detects whether a checkpoint exists
4. If one is found → training resumes from the last saved epoch
5. If none is found → training starts fresh
6. Training auto-saves every 10 epochs to /kaggle/working

**If the session times out (9 hours):**
- Kaggle auto-commits the notebook version
- Fork this notebook (new version)
- Run Cells 1-6 again
- Training will resume from the last checkpoint!

**Advantages over Colab:**
- 🚀 Faster GPU (P100 vs T4)
- 💾 More RAM (16-30 GB vs 12 GB)
- 📦 Kaggle Datasets (unlimited vs 15 GB)
- 🔒 More stable (less disconnects)
- 📚 Pre-installed ML libraries

---

## 📂 Cell 1: Setup Paths & Environment (Kaggle-Specific)

In [None]:
# Cell 1: configure the Kaggle paths, create the output folders and report
# the runtime environment (mounted datasets, working directory, GPU).
import os
import subprocess
from pathlib import Path

# Kaggle-specific paths (mounted by the platform, nothing to mount manually)
KAGGLE_INPUT = Path('/kaggle/input')      # Read-only dataset location
KAGGLE_WORKING = Path('/kaggle/working')  # Read-write workspace (20 GB)

# Training artifacts are grouped in dedicated subfolders of /kaggle/working.
# NOTE(review): the messages printed below promise that these folders persist
# across sessions. Presumably that requires saving a notebook version (and
# possibly re-attaching its output) - confirm against the Kaggle docs.
CHECKPOINT_DIR = KAGGLE_WORKING / 'checkpoints'
MODELS_DIR = KAGGLE_WORKING / 'models'
LOGS_DIR = KAGGLE_WORKING / 'logs'
RESULTS_DIR = KAGGLE_WORKING / 'results'

# Create the output directories (idempotent: safe to re-run the cell)
for dir_path in [CHECKPOINT_DIR, MODELS_DIR, LOGS_DIR, RESULTS_DIR]:
    dir_path.mkdir(parents=True, exist_ok=True)

print("✅ Kaggle environment detected!")
print("\n📂 Paths configured:")
print(f"   Input (datasets): {KAGGLE_INPUT}")
print(f"   Working (read-write): {KAGGLE_WORKING}")
print(f"   Checkpoints: {CHECKPOINT_DIR} ⭐ PERSISTENT!")
print(f"   Models: {MODELS_DIR} ⭐ PERSISTENT!")
print(f"   Logs: {LOGS_DIR}")
print(f"   Results: {RESULTS_DIR}")

# List the datasets currently attached to the notebook
print("\n📊 Available datasets:")
if KAGGLE_INPUT.exists():
    datasets = list(KAGGLE_INPUT.glob('*'))
    for ds in datasets:
        print(f"   - {ds.name}")
else:
    print("   No datasets mounted yet.")
    print("   Add dataset: Data → Add data → Search or upload")

# Run everything relative to the writable workspace
os.chdir(KAGGLE_WORKING)
print(f"\n✅ Working directory: {os.getcwd()}")

# Check GPU. Only the failures nvidia-smi can actually produce are handled
# (binary missing -> OSError, non-zero exit -> CalledProcessError), so that
# e.g. a KeyboardInterrupt is no longer swallowed by a bare except.
try:
    gpu_info = subprocess.check_output(
        ['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader']
    ).decode()
    print(f"\n🎮 GPU: {gpu_info.strip()}")
except (OSError, subprocess.CalledProcessError):
    print("\n⚠️  No GPU detected. Enable: Settings → Accelerator → GPU")

print("\n💡 Checkpoint Strategy:")
print(f"   ✅ Files saved to {CHECKPOINT_DIR}")
print("   ✅ Kaggle auto-saves these on notebook commit")
print("   ✅ Session timeout → Fork notebook → Resumes from checkpoint!")
print("   ✅ NO DATA LOSS!")

## 📦 Cell 2: Load Project from Kaggle Dataset

**Note:** Kaggle automatically extracts ZIP files when they are uploaded as a dataset.
The extracted files are then available directly in `/kaggle/input/your-dataset-name/`.

**Setup:**
1. Upload `number-ML-kaggle-package-LATEST.zip` as Kaggle Dataset
2. Name it: `phone-number-ml-project-latest`
3. Add to this notebook: Data → Add data → Your dataset
4. Run this cell to copy files to working directory

In [None]:
# Cell 2: copy the project (src/, data/ and optional config files) from the
# attached Kaggle Dataset into the working directory, then verify the copy.
import shutil
import sys
from pathlib import Path

dataset_name = 'phone-number-ml-project-latest'  # ⚠️ Update to match your dataset name
dataset_path = KAGGLE_INPUT / dataset_name

print(f"📂 Loading from: {dataset_path}\n")

if not dataset_path.exists():
    # The dataset is not attached: show what is mounted so the name can be fixed.
    print(f"❌ Dataset not found: {dataset_name}")
    print("\n📋 Available datasets:")
    for mounted in KAGGLE_INPUT.glob('*'):
        print(f"   - {mounted.name}")
    print("\n💡 Update 'dataset_name' variable to match your dataset name")
else:
    print("✅ Dataset found! Copying files to working directory...\n")

    # --- src/ folder -------------------------------------------------------
    src_source, src_dest = dataset_path / 'src', KAGGLE_WORKING / 'src'
    if not src_source.exists():
        print("❌ src/ folder not found in dataset")
    else:
        shutil.copytree(src_source, src_dest, dirs_exist_ok=True)
        py_files = list(src_dest.glob('*.py'))
        print(f"✅ Copied src/: {len(py_files)} Python files")

    # --- data/ folder ------------------------------------------------------
    data_source, data_dest = dataset_path / 'data', KAGGLE_WORKING / 'data'
    if not data_source.exists():
        print("❌ data/ folder not found in dataset")
    else:
        shutil.copytree(data_source, data_dest, dirs_exist_ok=True)
        print("✅ Copied data/ folder")

    # --- optional config files (skipped silently when absent) ---------------
    for config_name in ('requirements.txt', 'CLAUDE.md', 'KAGGLE_SETUP.md'):
        src_file = dataset_path / config_name
        if not src_file.exists():
            continue
        shutil.copy2(src_file, KAGGLE_WORKING / config_name)
        print(f"✅ Copied {config_name}")

    print(f"\n{'=' * 70}")
    print("🔍 VERIFICATION")
    print("=" * 70)

    # List the Python modules that ended up in the working copy of src/
    src_path = KAGGLE_WORKING / 'src'
    if src_path.exists():
        py_files = sorted(src_path.glob('*.py'))
        print(f"\n✅ Project code ready ({len(py_files)} files):")
        for module_file in py_files:
            print(f"   - {module_file.name}")

    # Load the raw CSV once to confirm it is readable and report its shape
    data_csv = KAGGLE_WORKING / 'data' / 'raw' / 'numberdata.csv'
    if not data_csv.exists():
        print(f"\n❌ Data file not found at: {data_csv}")
    else:
        import pandas as pd
        df = pd.read_csv(data_csv)
        print(f"\n✅ Data loaded: {len(df)} rows, {len(df.columns)} columns")
        print(f"   Columns: {list(df.columns)}")
        print(f"   File size: {data_csv.stat().st_size / 1024:.1f} KB")

    # Make `src` importable as a package from the working directory
    sys.path.insert(0, str(KAGGLE_WORKING))
    print("\n✅ Python path updated")

    print("=" * 70)
    print("\n🎉 Project setup complete! Ready for training.\n")

## 🔧 Cell 3: Install Dependencies (Minimal - Most Pre-Installed)

Kaggle has most ML libraries pre-installed, so this cell only installs the ones that are missing.

In [None]:
# Cell 3: verify that the required libraries are importable and install the
# ones that are missing.
import importlib
import subprocess
import sys

print("🔍 Checking pre-installed libraries...\n")

# pip distribution name -> importable module name
required_libs = {
    'numpy': 'numpy',
    'pandas': 'pandas',
    'scikit-learn': 'sklearn',
    'xgboost': 'xgboost',
    'lightgbm': 'lightgbm',
    'catboost': 'catboost',
    'optuna': 'optuna',
    'psutil': 'psutil'  # Added for memory monitoring
}

missing = []
for name, import_name in required_libs.items():
    try:
        mod = importlib.import_module(import_name)
    except ImportError:
        print(f"   ✗ {name}: NOT INSTALLED")
        missing.append(name)
    else:
        version = getattr(mod, '__version__', 'unknown')
        print(f"   ✓ {name}: {version}")

# Install missing libraries
if missing:
    print(f"\n📦 Installing missing libraries: {', '.join(missing)}")
    # Install through the interpreter that runs this kernel so the packages
    # land in the environment that will import them. check_call raises
    # CalledProcessError when pip fails, so the success message below is only
    # printed after a successful install.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', *missing])
    # Let the import system notice the freshly installed packages
    importlib.invalidate_caches()
    print("✅ Installation complete!")
else:
    print("\n✅ All required libraries are already installed!")
    print("   (This is a Kaggle advantage - no installation time!)")

# Add project to Python path
sys.path.insert(0, str(KAGGLE_WORKING))
print(f"\n✅ Python path updated: {KAGGLE_WORKING}")

## 🔍 Cell 4: Auto-Detect Checkpoint & Setup Auto-Resume ⭐

**This is the magic cell!**
- Checks for existing checkpoints in /kaggle/working/checkpoints
- If found → RESUME MODE
- If not found → FRESH START

In [None]:
# Cell 4: look for an existing checkpoint and set RESUME_MODE / START_EPOCH /
# PREVIOUS_METRICS accordingly.
from src.checkpoint_manager import CheckpointManager, detect_environment

# Report which platform the checkpoint manager thinks it is running on
env = detect_environment()
print(f"🌍 Environment detected: {env.upper()}")
print("   (Auto-configured for Kaggle paths)\n")

# The manager is pointed at the checkpoint folder configured in Cell 1
checkpoint_dir_path = str(CHECKPOINT_DIR)
print(f"📂 Checkpoint directory: {checkpoint_dir_path}")

# Re-create the folder if it disappeared since Cell 1 ran
if not CHECKPOINT_DIR.exists():
    CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)
    print("✅ Created checkpoint directory")

checkpoint_manager = CheckpointManager(
    checkpoint_dir=checkpoint_dir_path,
    max_checkpoints=5,
    save_every=10
)

print("\n🔍 Checking for existing checkpoint...\n")
checkpoint = checkpoint_manager.load_latest_checkpoint()

if not checkpoint:
    # Nothing to resume from: start at epoch 0 with empty metrics
    RESUME_MODE = False
    START_EPOCH = 0
    PREVIOUS_METRICS = {}

    print("=" * 70)
    print("🆕 FRESH START MODE")
    print("=" * 70)
    print("Environment: KAGGLE (P100 GPU)")
    print("No checkpoint found.")
    print("Will start training from epoch 0.")
    print(f"Checkpoints will be saved to {checkpoint_dir_path}")
    print("=" * 70)
else:
    # Continue with the epoch after the last one recorded in the checkpoint
    RESUME_MODE = True
    START_EPOCH = checkpoint['epoch'] + 1
    PREVIOUS_METRICS = checkpoint.get('metrics', {})

    print("=" * 70)
    print("🔄 RESUME MODE ACTIVATED")
    print("=" * 70)
    print("Environment: KAGGLE (P100 GPU)")
    print(f"Last completed epoch: {checkpoint['epoch']}")
    print(f"Will resume from epoch: {START_EPOCH}")
    print(f"Checkpoint timestamp: {checkpoint['timestamp']}")
    print("\nPrevious metrics:")
    for key, value in PREVIOUS_METRICS.items():
        # Floats are shown with four decimals, everything else as-is
        shown = f"{value:.4f}" if isinstance(value, float) else value
        print(f"  {key}: {shown}")
    print("=" * 70)

    # Print recovery info if available
    print("\n")
    checkpoint_manager.print_recovery_info()

print("\n✅ Auto-resume setup complete!")
print(f"Resume mode: {RESUME_MODE}")
print(f"Start epoch: {START_EPOCH}")
print("\n💡 Tip: If Kaggle times out (9h), fork this notebook and run again.")
print("    Training will resume from last checkpoint automatically!")
print("\n🎯 Checkpoint persistence:")
print(f"   - Saved to: {checkpoint_dir_path}")
print("   - Auto-saved on notebook commit")
print("   - Survives session timeout!")
print("   - NO MORE RE-TRAINING FROM SCRATCH! 🎉")

## 📊 Cell 5: Load Data & Feature Engineering

Load data from Kaggle Dataset (much faster than Google Drive!)

In [None]:
# Cell 5: load and clean the data, split train/test BEFORE feature
# engineering, build the features and sanitise them into float32 matrices.
from src.data_handler import load_and_clean_data, calculate_market_statistics
from src.features import create_masterpiece_features
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

print("📂 Loading data from Kaggle Dataset...")

# Locate the data file. The exact file name is searched recursively because
# Cell 2 expects it under <dataset>/data/raw/; results are sorted so the pick
# is deterministic when several files match.
data_files = sorted(KAGGLE_INPUT.rglob('numberdata.csv'))
if not data_files:
    data_files = sorted(KAGGLE_INPUT.glob('*/*.csv'))
if not data_files:
    data_files = sorted((KAGGLE_WORKING / 'data' / 'raw').glob('*.csv'))

if data_files:
    DATA_PATH = str(data_files[0])
    print(f"✅ Found data: {DATA_PATH}")
else:
    # Fallback: the location Cell 2 copies the CSV to
    DATA_PATH = str(KAGGLE_WORKING / 'data' / 'raw' / 'numberdata.csv')
    print(f"⚠️  Using fallback: {DATA_PATH}")

# Load and clean - load_and_clean_data returns (df_raw, df_cleaned)
print("\n🔄 Loading and cleaning data...")
df_raw, df = load_and_clean_data(DATA_PATH)

# Data size check
print("\n📊 Data Validation:")
print(f"   Raw data: {len(df_raw)} rows")
print(f"   Cleaned data: {len(df)} rows")
print(f"   Columns: {list(df.columns)}")

if len(df) < 100:
    # The message states the threshold that is actually enforced
    raise ValueError(
        f"❌ INSUFFICIENT DATA! Got {len(df)} rows, need at least 100 "
        f"(1000+ recommended for good training)"
    )
elif len(df) < 1000:
    print(f"⚠️  WARNING: Only {len(df)} samples. Recommend 3000+ for best results.")
else:
    print("✅ Data size looks good!")

# Prevent DATA LEAKAGE: split INDICES first (before features!)
print("\n🔀 Step 1: Split train/test INDICES (before feature engineering)")
print("   This prevents market statistics from leaking between train/test!")

# Create stratified split based on price quintiles
try:
    price_bins = pd.qcut(df['price'], q=5, labels=False, duplicates='drop')
except ValueError:
    # Fallback if quantile binning is not possible: equal-width bins
    price_bins = pd.cut(df['price'], bins=5, labels=False)

train_idx, test_idx = train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    stratify=price_bins,
    random_state=42
)

print("✅ Split complete:")
print(f"   Train indices: {len(train_idx)} ({len(train_idx)/len(df)*100:.1f}%)")
print(f"   Test indices: {len(test_idx)} ({len(test_idx)/len(df)*100:.1f}%)")

# Step 2: Calculate market statistics from TRAIN ONLY
print("\n📊 Step 2: Calculate market statistics from TRAINING data only")
market_stats = calculate_market_statistics(df.iloc[train_idx])
print("✅ Market statistics calculated (NO data leakage!)")

# Step 3: Extract prices (y) BEFORE feature engineering
y_train = df.iloc[train_idx]['price'].values
y_test = df.iloc[test_idx]['price'].values

print("\n💰 Price statistics:")
print(f"   Train - Min: {y_train.min():.0f}, Max: {y_train.max():.0f}, Mean: {y_train.mean():.0f}")
print(f"   Test  - Min: {y_test.min():.0f}, Max: {y_test.max():.0f}, Mean: {y_test.mean():.0f}")

# Step 4: Create features separately for train and test (with same market_stats!)
# NOTE(review): assumes create_masterpiece_features does not return the
# 'price' column among the features - confirm, otherwise the target leaks.
print("\n🔧 Step 4: Creating features (250+ features)...")
print("   Using TRAIN market statistics for both sets (prevents leakage!)")
print("   This may take 2-3 minutes on Kaggle...")

df_train_features = create_masterpiece_features(
    df.iloc[train_idx],
    market_stats=market_stats
)
print(f"✅ Train features: {df_train_features.shape}")

df_test_features = create_masterpiece_features(
    df.iloc[test_idx],
    market_stats=market_stats  # Same stats from train!
)
print(f"✅ Test features: {df_test_features.shape}")

# Step 5: Convert all features to numeric (non-numeric entries become NaN)
print("\n🔧 Step 5: Converting all features to numeric...")
df_train_features = df_train_features.apply(pd.to_numeric, errors='coerce')
df_test_features = df_test_features.apply(pd.to_numeric, errors='coerce')
# X_train / X_test are built from raw .values below, so the test columns must
# be in exactly the train order; columns missing from test become NaN and are
# filled with the train median in Step 6.
df_test_features = df_test_features.reindex(columns=df_train_features.columns)
print("✅ All features converted to numeric")

# Step 6: Handle NaN and Inf
print("\n🔧 Step 6: Handling NaN and Inf values...")
# Train
train_inf = np.isinf(df_train_features.values).sum()
train_nan = df_train_features.isna().sum().sum()
df_train_features = df_train_features.replace([np.inf, -np.inf], np.nan)
# A column that is entirely NaN has a NaN median; fall back to 0 for it so no
# NaN survives the fill.
train_medians = df_train_features.median().fillna(0.0)
df_train_features = df_train_features.fillna(train_medians)

# Test
test_inf = np.isinf(df_test_features.values).sum()
test_nan = df_test_features.isna().sum().sum()
df_test_features = df_test_features.replace([np.inf, -np.inf], np.nan)
df_test_features = df_test_features.fillna(train_medians)  # Use TRAIN median for test

print(f"   Train - Inf: {train_inf}, NaN: {train_nan} → Replaced with train median")
print(f"   Test  - Inf: {test_inf}, NaN: {test_nan} → Replaced with train median")

# Step 7: Final preparation
X_train = df_train_features.values.astype(np.float32)  # Use float32 to save memory
X_test = df_test_features.values.astype(np.float32)

print("\n📊 Final dataset:")
print(f"   X_train: {X_train.shape}, dtype: {X_train.dtype}")
print(f"   X_test:  {X_test.shape}, dtype: {X_test.dtype}")
print(f"   y_train: {y_train.shape}, dtype: {y_train.dtype}")
print(f"   y_test:  {y_test.shape}, dtype: {y_test.dtype}")

# Save feature names for later
FEATURE_NAMES = list(df_train_features.columns)
print(f"\n✅ Total features: {len(FEATURE_NAMES)}")

# Final validation
print("\n🔍 Final validation:")
print(f"   X_train NaN: {np.isnan(X_train).any()}, Inf: {np.isinf(X_train).any()}")
print(f"   X_test NaN:  {np.isnan(X_test).any()}, Inf: {np.isinf(X_test).any()}")
print(f"   y_train NaN: {np.isnan(y_train).any()}, Inf: {np.isinf(y_train).any()}")
print(f"   y_test NaN:  {np.isnan(y_test).any()}, Inf: {np.isinf(y_test).any()}")

# Memory info
import psutil
mem = psutil.virtual_memory()
print("\n💾 Memory usage:")
print(f"   Total RAM: {mem.total / (1024**3):.1f} GB")
print(f"   Available: {mem.available / (1024**3):.1f} GB")
print(f"   Used: {mem.percent:.1f}%")

print("\n" + "="*80)
print("✅ DATA PREPARATION COMPLETE - NO DATA LEAKAGE!")
print("="*80)
print("🎯 Ready for ULTRA-POWER training!")


## 🚀 Cell 6: Train with Auto-Checkpoint & Auto-Resume ⭐

**This cell will:**
- Resume from checkpoint if found (Cell 4)
- Train model with auto-save every 10 epochs
- Save checkpoints to /kaggle/working/checkpoints
- If the session times out, just fork the notebook and run Cells 1-6 again → training resumes automatically!

**Optimized for Kaggle P100 GPU - ~30% faster than Colab T4!**

In [None]:
# Cell 6: detect the GPU, start the GPU monitor, run the production training
# pipeline and save the resulting models.
from src.train_production import train_production_pipeline
from src.gpu_monitor import GPUMonitor  # Real-time GPU monitoring
import time
import subprocess
from datetime import datetime


def _query_gpu(field):
    """Return nvidia-smi's stripped output for one --query-gpu field."""
    return subprocess.check_output(
        ['nvidia-smi', f'--query-gpu={field}', '--format=csv,noheader'],
        stderr=subprocess.DEVNULL
    ).decode().strip()


# ============================================================================
# 🎮 GPU DETECTION & REAL-TIME MONITORING
# ============================================================================
print("=" * 80)
print("🎮 GPU DETECTION & VERIFICATION")
print("=" * 80)

HAS_GPU = False
gpu_info = "CPU"

try:
    gpu_name = _query_gpu('name')
    gpu_memory = _query_gpu('memory.total')
    gpu_util = _query_gpu('utilization.gpu')
    gpu_mem_used = _query_gpu('memory.used')

    if gpu_name:
        HAS_GPU = True
        gpu_info = gpu_name

        print("\n✅ GPU DETECTED!")
        print(f"   Model: {gpu_name}")
        print(f"   Total Memory: {gpu_memory}")
        print(f"   Current Utilization: {gpu_util}")
        print(f"   Memory Used: {gpu_mem_used}")
        print("\n💡 GPU Status:")
        print("   - Cells 1-5: GPU will show 0% (normal - CPU tasks)")
        print("   - Cell 6 (this cell): GPU will activate during training!")
        print("   - Expected GPU usage: 70-95% during XGBoost/LightGBM/CatBoost")
        print("\n🔥 Watch for GPU activation during:")
        print("   ✅ XGBoost optimization → GPU 70-90%")
        print("   ✅ XGBoost training → GPU 85-95%")
        print("   ✅ LightGBM optimization → GPU 70-85%")
        print("   ✅ LightGBM training → GPU 80-90%")
        print("   ✅ CatBoost optimization → GPU 60-80%")
        print("   ✅ CatBoost training → GPU 70-85%")
        print("   ⚪ RandomForest/ExtraTrees → GPU 0% (CPU only - normal)")
    else:
        print("\n⚠️  No GPU found - using CPU")
        print("   This will be SLOWER but still works!")

except Exception as e:
    # Best effort: a missing or failing nvidia-smi just means CPU training
    print(f"\n⚠️  GPU detection failed: {e}")
    print("   Falling back to CPU")
    print("   Training will be slower but still functional")

print("=" * 80)

# ============================================================================
# 🎮 START REAL-TIME GPU MONITORING
# ============================================================================
if HAS_GPU:
    print("\n🎮 Starting real-time GPU monitoring...")
    print("   GPU stats will be printed every 30 seconds during training")
    print("   This helps verify GPU is actually being used!\n")

    gpu_monitor = GPUMonitor(interval=30, verbose=True)
    gpu_monitor.start()
else:
    print("\n⚠️  GPU monitoring disabled (no GPU detected)")
    gpu_monitor = None

# Configuration
OPTIMIZE = True  # ⚠️ Set to True for full Optuna optimization (SLOW but BEST!)
N_TRIALS = 100   # Optuna trials per model (50-150, higher = better but slower)
                 # 100 trials × 4 models × 2min ≈ 13 hours
                 # 50 trials × 4 models × 2min ≈ 7 hours (fits in 1 Kaggle session!)

print("\n" + "="*80)
print("🚀 ULTRA-POWER PRODUCTION TRAINING PIPELINE")
print("="*80)
print("")
print("📊 Dataset:")
print(f"   Training:   {len(X_train):,} samples × {X_train.shape[1]} features")
print(f"   Validation: {len(X_test):,} samples × {X_test.shape[1]} features")
print("")
print("⚙️  Configuration:")
print("   Platform: KAGGLE")
print(f"   Accelerator: {'GPU (' + gpu_info + ')' if HAS_GPU else 'CPU (all cores)'}")
print(f"   GPU Monitoring: {'ENABLED (every 30 sec)' if HAS_GPU else 'DISABLED'}")
print(f"   Optimization: {'FULL OPTUNA' if OPTIMIZE else 'DEFAULT PARAMS'}")
if OPTIMIZE:
    print(f"   Optuna trials: {N_TRIALS} per model")
    est_time = N_TRIALS * 4 * 2 / 60  # 4 models, 2 min/trial average
    print(f"   Estimated time: {est_time:.1f} hours")
    if est_time > 9:
        print("   ⚠️  May exceed Kaggle 9h limit! Consider reducing N_TRIALS to 50")
else:
    print("   Estimated time: 30-60 minutes")
print("")
print("🎯 Models to train:")
print("   - XGBoost      (tree-based, GPU-accelerated)")
print("   - LightGBM     (tree-based, fast)")
print("   - CatBoost     (tree-based, handles categoricals)")
print("   - RandomForest (ensemble of decision trees)")
print("   - ExtraTrees   (randomized decision trees)")
print("   - GradientBoosting (sequential boosting)")
print("")
print("🏗️  Ensemble methods:")
print("   - Voting Ensemble (top 5 models)")
print("   - Stacking Ensemble (meta-learner)")
print("   - Weighted Average (R²-weighted)")
print("   - Simple Average")
print("")
print("✨ Advanced features:")
print("   ✅ Progressive sample weights (10x for expensive numbers)")
print("   ✅ Cross-validation (10-fold)")
print("   ✅ Early stopping (prevent overfit)")
print("   ✅ Automatic best model selection")
print("   ✅ NO data leakage (split before features)")
print("   ✅ Real-time GPU monitoring (every 30 seconds)")
print("="*80)

# Give the user a moment to abort a very long run
if OPTIMIZE and N_TRIALS >= 100:
    print(f"\n⚠️  WARNING: This will take ~{est_time:.0f} hours!")
    print("   Make sure Kaggle notebook can run that long.")
    print("   Starting in 5 seconds...")
    time.sleep(5)

# Start training
print("\n🚀 STARTING ULTRA-POWER TRAINING...")
print(f"   {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"   GPU monitoring: {'ACTIVE' if gpu_monitor else 'DISABLED'}\n")

start_time = time.time()

# NOTE(review): RESUME_MODE / START_EPOCH / checkpoint_manager from Cell 4 are
# not passed to the pipeline below; whether train_production_pipeline resumes
# from checkpoints on its own cannot be seen from this notebook - confirm.
# NOTE(review): the held-out test split is passed as X_val/y_val. If the
# pipeline uses it for tuning or model selection, the "Test" scores reported
# below are optimistic - confirm.
try:
    try:
        # Run production pipeline
        result = train_production_pipeline(
            X_train=X_train,
            y_train=y_train,
            X_val=X_test,
            y_val=y_test,
            optimize=OPTIMIZE,
            n_trials=N_TRIALS,
            use_gpu=HAS_GPU,
            verbose=True
        )
    finally:
        # Stop the monitor exactly once, whether training finished, failed or
        # was interrupted from the notebook (KeyboardInterrupt).
        if gpu_monitor:
            gpu_monitor.stop()

    elapsed_time = time.time() - start_time

    print("\n" + "="*80)
    print("🎉 ULTRA-POWER TRAINING COMPLETE!")
    print("="*80)
    print(f"\n⏱️  Training time: {elapsed_time/3600:.2f} hours ({elapsed_time/60:.1f} minutes)")
    print(f"\n🏆 Best Model: {result['best_model_name']}")
    print(f"   Test R² Score:  {result['best_score']:.4f}")
    print(f"   Test MAE:       {result['best_mae']:.2f}")
    print(f"   Test RMSE:      {result['best_rmse']:.2f}")

    print("\n📊 Top 5 Models:")
    sorted_models = sorted(result['all_scores'].items(),
                           key=lambda x: x[1]['r2'], reverse=True)
    for i, (name, scores) in enumerate(sorted_models[:5], 1):
        print(f"   {i}. {name:25s} R²={scores['r2']:.4f}  MAE={scores['mae']:.2f}  RMSE={scores['rmse']:.2f}")

    # Check if target achieved
    if result['best_score'] >= 0.90:
        print(f"\n✅ TARGET ACHIEVED! R² = {result['best_score']:.4f} >= 0.90")
    else:
        print(f"\n⚠️  Target not quite reached. R² = {result['best_score']:.4f} < 0.90")
        print("   Consider: More trials, more data, or feature engineering")

    # Keep the results available for the following cells
    TRAINED_MODEL = result['best_model']
    TRAINING_RESULT = result

    print("\n💾 Saving models...")

    # Save best model together with the metadata needed to reuse it
    import joblib

    model_package = {
        'model': result['best_model'],
        'model_name': result['best_model_name'],
        'feature_names': FEATURE_NAMES,
        'metrics': {
            'test_r2': float(result['best_score']),
            'test_mae': float(result['best_mae']),
            'test_rmse': float(result['best_rmse'])
        },
        'training_config': {
            'optimize': OPTIMIZE,
            'n_trials': N_TRIALS if OPTIMIZE else None,
            'platform': 'Kaggle',
            'gpu': gpu_info if HAS_GPU else 'CPU'
        },
        'timestamp': datetime.now().isoformat(),
        'data_shape': {
            'train_samples': len(X_train),
            'test_samples': len(X_test),
            'n_features': X_train.shape[1]
        },
        'hyperparameters': result.get('hyperparameters'),
        'all_model_scores': result['all_scores']
    }

    model_path = MODELS_DIR / 'best_model_production.pkl'
    joblib.dump(model_package, model_path)
    print(f"✅ Best model saved: {model_path}")
    print(f"   Size: {model_path.stat().st_size / (1024*1024):.2f} MB")

    # Save all models
    all_models_path = MODELS_DIR / 'all_models_production.pkl'
    joblib.dump(result, all_models_path)
    print(f"✅ All models saved: {all_models_path}")
    print(f"   Includes: {len(result['trained_models'])} base + {len(result['ensemble_models'])} ensemble models")

    print("\n" + "="*80)
    print("🎯 Next: Run Cell 7 to evaluate and visualize results!")
    print("="*80)

except Exception as e:
    # The GPU monitor has already been stopped by the inner `finally`.
    print("\n" + "="*80)
    print("❌ TRAINING FAILED")
    print("="*80)
    print(f"Error Type: {type(e).__name__}")
    print(f"Error Message: {str(e)}")
    print("\n📋 Troubleshooting:")
    print("1. Check data has enough samples (6000+ ✓)")
    print("2. Check features are all numeric (should be ✓ from Cell 5)")
    print("3. Check GPU/CPU settings")
    print("4. Check memory - try reducing N_TRIALS if OOM")
    print("5. Check Kaggle time limit (9 hours)")

    import traceback
    print("\n🔍 Full traceback:")
    traceback.print_exc()

    # Re-raise so the notebook run is marked as failed
    raise

## 📈 Cell 7: Evaluate Model & Save Results

In [None]:
# Cell 7: evaluate the best model from Cell 6 on the train and test splits,
# then save the model package and a plain-text evaluation report.
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import joblib
import numpy as np
from datetime import datetime

print("📊 Evaluating model...\n")

# Make predictions
y_train_pred = TRAINED_MODEL.predict(X_train)
y_test_pred = TRAINED_MODEL.predict(X_test)

# Calculate metrics
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

# Print results
print("=" * 70)
print("📊 FINAL MODEL EVALUATION (Trained on Kaggle P100)")
print("=" * 70)
print("\nTraining Set:")
print(f"  R² Score:  {train_r2:.4f}")
print(f"  MAE:       {train_mae:.4f}")
print(f"  RMSE:      {train_rmse:.4f}")
print("\nTest Set:")
print(f"  R² Score:  {test_r2:.4f}")
print(f"  MAE:       {test_mae:.4f}")
print(f"  RMSE:      {test_rmse:.4f}")
print("=" * 70)

# MODEL_TYPE, TOTAL_EPOCHS, model_params and X are not defined by Cells 1-6,
# so referencing them directly raised NameError. The values are derived from
# what Cells 5-6 do provide; a same-named session variable, if one exists,
# still takes precedence.
model_type = globals().get('MODEL_TYPE', TRAINING_RESULT['best_model_name'])
total_epochs = globals().get('TOTAL_EPOCHS')  # None when no epoch count exists
model_params = globals().get('model_params', TRAINING_RESULT.get('hyperparameters'))
n_samples = len(X_train) + len(X_test)

# Save comprehensive model package
print("\n💾 Saving final model package...")

model_package = {
    'model': TRAINED_MODEL,
    'model_type': model_type,
    'feature_names': FEATURE_NAMES,
    'metrics': {
        'train_r2': float(train_r2),
        'test_r2': float(test_r2),
        'train_mae': float(train_mae),
        'test_mae': float(test_mae),
        'train_rmse': float(train_rmse),
        'test_rmse': float(test_rmse)
    },
    'training_config': {
        'total_epochs': total_epochs,
        'model_params': model_params,
        'platform': 'Kaggle',
        'gpu': 'P100'
    },
    'timestamp': datetime.now().isoformat(),
    'trained_on': 'Kaggle (P100 GPU)',
    'data_shape': {
        'n_samples': n_samples,
        'n_features': len(FEATURE_NAMES),
        'train_size': len(X_train),
        'test_size': len(X_test)
    }
}

# Save to /kaggle/working (will be saved with notebook commit)
final_model_path = MODELS_DIR / 'final_model_kaggle.pkl'
joblib.dump(model_package, final_model_path)

print(f"✅ Model package saved to: {final_model_path}")
print(f"   Size: {final_model_path.stat().st_size / (1024*1024):.2f} MB")

# Save evaluation report. The encoding is explicit because the report
# contains non-ASCII text ("R²").
report_path = RESULTS_DIR / 'evaluation_report_kaggle.txt'
with open(report_path, 'w', encoding='utf-8') as f:
    f.write("ML Phone Number Price Prediction - Kaggle P100 Evaluation\n")
    f.write("=" * 70 + "\n\n")
    f.write(f"Model Type: {model_type}\n")
    f.write("Training Platform: Kaggle (P100 GPU)\n")
    f.write(f"Training Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
    f.write("Training Set Metrics:\n")
    f.write(f"  R² Score:  {train_r2:.4f}\n")
    f.write(f"  MAE:       {train_mae:.4f}\n")
    f.write(f"  RMSE:      {train_rmse:.4f}\n\n")
    f.write("Test Set Metrics:\n")
    f.write(f"  R² Score:  {test_r2:.4f}\n")
    f.write(f"  MAE:       {test_mae:.4f}\n")
    f.write(f"  RMSE:      {test_rmse:.4f}\n")

print(f"✅ Report saved to: {report_path}")

print("\n✅ All results saved to /kaggle/working!")
print("\n💡 Tip: Commit this notebook to save all outputs permanently!")
print("   (Kaggle auto-commits every version)")

## 📥 Cell 8: Commit Notebook to Save Results

**Important:** A Kaggle notebook must be committed for its outputs to be saved permanently!

Click "Save Version" (top right) to commit:
- Notebook code
- All outputs in /kaggle/working
- Checkpoints
- Models
- Results

After commit, you can:
- Download outputs
- Share notebook
- Fork for next session (to resume training)

In [None]:
# Cell 8: print a tree of the files under /kaggle/working followed by the
# closing summary and next steps.
print("📦 Files in /kaggle/working (will be saved on commit):\n")

import os
from pathlib import Path

# Walk the workspace that is saved together with the notebook version
working_dir = Path('/kaggle/working')

for current_dir, subdirs, filenames in os.walk(working_dir):
    # Prune hidden folders and Python caches in place so os.walk skips them
    subdirs[:] = [
        name for name in subdirs
        if not (name.startswith('.') or name == '__pycache__')
    ]

    # Nesting depth relative to the workspace root drives the indentation
    depth = current_dir.replace(str(working_dir), '').count(os.sep)
    print(f"{'  ' * depth}{os.path.basename(current_dir) or 'working'}/")

    file_indent = '  ' * (depth + 1)
    # Only the first 10 entries (alphabetically) are shown; hidden ones are skipped
    for name in sorted(filenames)[:10]:
        if not name.startswith('.'):
            size_mb = os.path.getsize(os.path.join(current_dir, name)) / (1024 * 1024)
            print(f"{file_indent}{name} ({size_mb:.2f} MB)")

    hidden_count = len(filenames) - 10
    if hidden_count > 0:
        print(f"{file_indent}... and {hidden_count} more files")

# Closing banner, next steps and platform notes
for summary_line in (
    "\n" + "=" * 70,
    "🎉 TRAINING COMPLETE ON KAGGLE!",
    "=" * 70,
    "\n✅ Next steps:",
    "1. Click 'Save Version' (top right) to commit notebook",
    "2. All files in /kaggle/working will be saved",
    "3. You can download model after commit",
    "4. To continue training: Fork this notebook → Run Cell 1-6",
    "\n🏆 Advantages of Kaggle over Colab:",
    "   - P100 GPU (~30% faster than T4)",
    "   - More RAM (16-30 GB vs 12 GB)",
    "   - More stable (less disconnects)",
    "   - Auto-commit (permanent save)",
    "   - Unlimited dataset storage",
    "\n💾 Checkpoint Persistence:",
    "   ✅ Checkpoints saved to /kaggle/working/checkpoints/",
    "   ✅ Models saved to /kaggle/working/models/",
    "   ✅ Session timeout → Fork → Auto-resume from last checkpoint!",
    "   ✅ NO MORE RE-TRAINING FROM SCRATCH! 🎉",
    "\n" + "=" * 70,
    "\n🎯 Training completed successfully on Kaggle P100!",
):
    print(summary_line)

# The score is only shown when Cell 7 has been run in this session
if 'test_r2' in locals():
    print(f"   Final R² Score: {test_r2:.4f}")
print("\n🚀 Happy training!")