# Train Tabular Models (XGBoost, LightGBM, CatBoost)

**Purpose:** Train tabular models on processed datasets.

**Models:**
- XGBoost (gradient boosting)
- LightGBM (fast gradient boosting)
- CatBoost (categorical boosting)

**Expected Runtime:** 20-40 minutes per model

**Key Features:**
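- Checkpoint saved to Drive as soon as training completes (no mid-training checkpoints)
- W&B experiment tracking (optional; metrics are printed when no W&B run is active)
- Reload the saved checkpoint after a disconnect instead of retraining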
- GPU acceleration (if available)

## Setup Environment

In [None]:
# Prepare the Colab runtime.
# Either the scaffolded utils or the working notebooks/colab_setup.py can be
# used; this cell goes with the colab_setup module.
import sys

# The repository checkout must be importable before colab_setup is loaded.
sys.path.insert(0, '/content/research')

from notebooks.colab_setup import setup_colab_environment, is_colab

env_info = setup_colab_environment(mount_drive=True, use_gpu=True)

# Assemble the whole status report, then emit it in one go.
status_lines = [
    f"\nüìä Environment Info:",
    f"  Running in Colab: {env_info.get('is_colab', False)}",
    f"  GPU available: {env_info.get('gpu_available', False)}",
    f"  Drive mounted: {env_info.get('drive_mounted', False)}",
]
print("\n".join(status_lines))

# Remember when the session began (Colab sessions last roughly 12 hours)
import time

session_start = time.time()
print(f"\n‚è±Ô∏è  Session started - you have approximately 12 hours")

## Initialize Checkpoint Manager

In [None]:
# Configuration
# Run-wide settings: later cells read these to build the Drive data path,
# the checkpoint file name and the saved model name.
SYMBOL: str = "MES"  # selects the processed-data subfolder on Drive
HORIZON: int = 20  # embedded in file names as "h{HORIZON}"; presumably the label horizon - TODO confirm
MODEL: str = "xgboost"  # or "lightgbm", "catboost"; key passed to ModelRegistry.get()

# Simple checkpoint manager class (no external dependencies)
# Imported here because this cell runs before the data-loading cell, which is
# the first place the notebook otherwise imports Path.
from pathlib import Path


class SimpleCheckpointManager:
    """Minimal checkpoint manager that works without W&B.

    Owns a checkpoint directory and forwards metrics to W&B when a run has
    been attached via ``wandb_run``; otherwise metrics are printed.

    Attributes:
        drive_path: Directory (as a ``Path``) where checkpoints live. It is
            created on construction if it does not exist yet.
        wandb_run: Active W&B run object, or ``None`` when W&B is not in use.
            Nothing in this class sets it; assign it externally to enable
            W&B logging.
    """

    def __init__(self, drive_path):
        """Create the manager and make sure ``drive_path`` exists.

        Args:
            drive_path: Checkpoint directory, as a string or path-like.
        """
        self.drive_path = Path(drive_path)
        self.drive_path.mkdir(parents=True, exist_ok=True)
        self.wandb_run = None

    def log_metrics(self, metrics, step=None):
        """Log metrics (print if W&B not available).

        Args:
            metrics: Mapping of metric name to value.
            step: Optional step index forwarded to ``wandb.log``; ignored
                when metrics are only printed.
        """
        if self.wandb_run:
            # Imported lazily so the class stays usable without wandb installed.
            import wandb
            wandb.log(metrics, step=step)
        else:
            print(f"Metrics: {metrics}")

    def finish_wandb_run(self):
        """Finish W&B run if active; do nothing otherwise."""
        if self.wandb_run:
            self.wandb_run.finish()

# Build the checkpoint manager rooted at the Drive checkpoint folder
ckpt_mgr = SimpleCheckpointManager("/content/drive/MyDrive/ml_factory/checkpoints")

# Report where checkpoints will be written
print(
    f"‚úÖ Checkpoint manager initialized",
    f"   Checkpoint dir: {ckpt_mgr.drive_path}",
    sep="\n",
)

## Load Processed Datasets from Drive

In [None]:
import shutil
from pathlib import Path

import numpy as np

# Define paths
DRIVE_DATA = Path("/content/drive/MyDrive/ml_factory/data/processed") / SYMBOL
LOCAL_DATA = Path("/content/data/splits/scaled")
LOCAL_DATA.mkdir(parents=True, exist_ok=True)

# Every array this notebook loads. All of them are checked, not just X_train,
# so a copy that was interrupted part-way is repaired on the next run.
REQUIRED_ARRAYS = ("X_train", "y_train", "X_val", "y_val", "X_test", "y_test")
missing_arrays = [
    name for name in REQUIRED_ARRAYS if not (LOCAL_DATA / f"{name}.npy").exists()
]

# Copy data from Drive to local (faster I/O during training)
# NOTE(review): LOCAL_DATA is not symbol-specific, so changing SYMBOL within
# one session reuses whatever was copied first - clear the folder when switching.
if missing_arrays:
    if not DRIVE_DATA.is_dir():
        # Fail with an actionable message instead of copytree's generic error.
        raise FileNotFoundError(
            f"Processed data for {SYMBOL} not found on Drive: {DRIVE_DATA} "
            f"(missing locally: {missing_arrays})"
        )
    print("Copying data from Drive to local disk...")
    shutil.copytree(DRIVE_DATA, LOCAL_DATA, dirs_exist_ok=True)
    print("‚úÖ Data copied to local disk")

# Load data
X_train = np.load(LOCAL_DATA / "X_train.npy")
y_train = np.load(LOCAL_DATA / "y_train.npy")
X_val = np.load(LOCAL_DATA / "X_val.npy")
y_val = np.load(LOCAL_DATA / "y_val.npy")
X_test = np.load(LOCAL_DATA / "X_test.npy")
y_test = np.load(LOCAL_DATA / "y_test.npy")

print(f"\nDataset shapes:")
print(f"Train: {X_train.shape}, {y_train.shape}")
print(f"Val: {X_val.shape}, {y_val.shape}")
print(f"Test: {X_test.shape}, {y_test.shape}")

## Check for Existing Checkpoint

In [None]:
# Look for a previously saved model so training can be skipped or resumed
from pathlib import Path

model_checkpoint_dir = Path("/content/drive/MyDrive/ml_factory/checkpoints/models")
model_checkpoint_dir.mkdir(parents=True, exist_ok=True)

# One checkpoint file per (model, symbol, horizon) combination
checkpoint_path = model_checkpoint_dir / f"{MODEL}_{SYMBOL}_h{HORIZON}_checkpoint.pkl"

if not checkpoint_path.exists():
    print("\nüÜï No checkpoint found - will train from scratch")
    resume_training = False
else:
    print(f"\n‚úÖ Found model checkpoint: {checkpoint_path}")
    print("You can load this model and skip training, or retrain from scratch")
    # Flip this to True to load the saved model instead of retraining
    resume_training = False

## Train Model and Save Checkpoint

In [None]:
from src.models import ModelRegistry

# Initialize model
model_class = ModelRegistry.get(MODEL)
model = model_class()

# Load from checkpoint if resuming
if resume_training and checkpoint_path.exists():
    model.load(checkpoint_path)
    print(f"‚úÖ Model loaded from checkpoint: {checkpoint_path}")
    print("Skipping training - model already trained")
else:
    # Training configuration
    config = {
        "n_estimators": 1000,
        "learning_rate": 0.05,
        "max_depth": 8,
        "early_stopping_rounds": 50,
        # The setup cell reports the flat 'gpu_available' key while this cell
        # used to read only the nested 'gpu_info' -> 'available'. Accept either
        # so the GPU is not silently skipped when just one of them is present.
        "use_gpu": bool(
            env_info.get('gpu_available', False)
            or (env_info.get('gpu_info') or {}).get('available', False)
        ),
    }

    # Train model
    # Note: BaseModel.fit() does NOT support callbacks parameter
    # Checkpointing must be done manually after training completes
    print(f"\nüöÄ Training {MODEL}...")
    try:
        # Only fit() sits inside the try, so a later save or logging problem
        # is not misreported as a training failure.
        training_metrics = model.fit(
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            config=config,
        )
    except Exception as e:
        print(f"\n‚ùå Training failed: {e}")
        raise

    print("\n‚úÖ Training completed successfully!")
    print(f"Training metrics: {training_metrics}")

    # Save checkpoint to Drive immediately after training, before anything
    # else can go wrong
    model.save(checkpoint_path)
    print(f"üíæ Model checkpoint saved to: {checkpoint_path}")

    # Log to W&B if available
    if hasattr(ckpt_mgr, 'log_metrics'):
        ckpt_mgr.log_metrics({
            "train_loss": training_metrics.train_loss,
            "val_loss": training_metrics.val_loss,
            "val_accuracy": training_metrics.val_accuracy,
        })

## Evaluate on Test Set

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Run the model on the held-out split.
# model.predict() hands back a PredictionOutput carrying:
# - predictions: class predictions (alias: class_predictions)
# - probabilities: class probabilities
# - confidence: prediction confidence scores
pred_output = model.predict(X_test)
y_pred = pred_output.class_predictions  # or pred_output.predictions
y_proba = pred_output.probabilities

# Accuracy needs no averaging; the other three share the same weighted settings
weighted_scorers = {
    "test_precision": precision_score,
    "test_recall": recall_score,
    "test_f1": f1_score,
}
test_metrics = {"test_accuracy": accuracy_score(y_test, y_pred)}
test_metrics.update({
    key: scorer(y_test, y_pred, average='weighted', zero_division=0)
    for key, scorer in weighted_scorers.items()
})

print("\nTest Set Metrics:")
for metric, value in test_metrics.items():
    print(f"  {metric}: {value:.4f}")

# Forward the metrics to W&B when the manager supports it
if hasattr(ckpt_mgr, 'log_metrics'):
    ckpt_mgr.log_metrics(test_metrics)

## Save Model to Drive and W&B

In [None]:
from datetime import datetime

# Define paths
# A timestamp in the name keeps every run's model as a separate file.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_name = f"{MODEL}_{SYMBOL}_h{HORIZON}_{timestamp}"

DRIVE_MODELS = Path("/content/drive/MyDrive/ml_factory/models")
DRIVE_MODELS.mkdir(parents=True, exist_ok=True)

model_path = DRIVE_MODELS / f"{model_name}.pkl"

# Save model to Drive
model.save(model_path)
print(f"‚úÖ Model saved to: {model_path}")

# Upload to W&B
if ckpt_mgr.wandb_run:
    # The checkpoint manager holds only the run object, not the wandb module,
    # so import the module here (same lazy import as log_metrics uses).
    import wandb

    artifact = wandb.Artifact(
        name=model_name,
        type="model",
        metadata={"symbol": SYMBOL, "horizon": HORIZON, "model": MODEL, **test_metrics},
    )
    artifact.add_file(str(model_path))
    ckpt_mgr.wandb_run.log_artifact(artifact)
    print("‚òÅÔ∏è  Model uploaded to W&B")

## Finish W&B Run

In [None]:
# Close the W&B run if one is attached to the checkpoint manager
ckpt_mgr.finish_wandb_run()
print("\n‚úÖ Training complete! Model saved and logged.")

## Next Steps

1. Train other tabular models (LightGBM, CatBoost)
2. Proceed to sequence models (LSTM, GRU, TCN)
3. Build heterogeneous ensemble with stacking meta-learner