# Train Tabular Models (XGBoost, LightGBM, CatBoost)

**Purpose:** Train tabular models on processed datasets.

**Models:**
- XGBoost (gradient boosting)
- LightGBM (fast gradient boosting)
- CatBoost (categorical boosting)

**Expected Runtime:** 20-40 minutes per model

**Key Features:**
- Auto-checkpointing every 30 minutes or every 50 training rounds, whichever comes first
- W&B experiment tracking
- Resume from last epoch on disconnect
- GPU acceleration (if available)

## Setup Environment

In [None]:
from utils.colab_setup import setup_colab_environment, estimate_training_time_remaining
from utils.checkpoint_manager import CheckpointManager

# Run the project's Colab setup helper with the repository URL and the W&B
# project name. The returned mapping is kept because the training cell reads
# env_info['gpu_info']['available'] from it.
env_info = setup_colab_environment(
    repo_url="https://github.com/yourusername/ml-factory.git",
    wandb_project="ohlcv-ml-factory",
)

# Ask the helper for the estimated remaining time (printed in hours) so a
# long training run is not started on a runtime that is about to expire.
remaining = estimate_training_time_remaining()
print(f"⏱️  Estimated time remaining: {remaining:.1f} hours")

# Below two hours, warn the user before they commit to training.
if remaining < 2.0:
    print("⚠️  WARNING: Less than 2 hours remaining - consider restarting runtime")

## Initialize Checkpoint Manager

In [None]:
# Experiment identity: these three constants are reused by the cells below
# for paths, run names, checkpoint phases and metadata.
MODEL = "xgboost"  # or "lightgbm", "catboost"
SYMBOL = "MES"
HORIZON = 20

# Checkpoint manager writing to Google Drive and reporting to the W&B project.
ckpt_mgr = CheckpointManager(
    wandb_project="ohlcv-ml-factory",
    drive_path="/content/drive/MyDrive/ml_factory/checkpoints",
    auto_save_interval=30 * 60,  # 1800, i.e. 30 minutes
)

# Open the W&B run; its name combines model, symbol and horizon.
ckpt_mgr.init_wandb_run(
    run_name="{}_{}_h{}".format(MODEL, SYMBOL, HORIZON),
    config=dict(symbol=SYMBOL, horizon=HORIZON, model=MODEL),
    tags=["boosting", SYMBOL, MODEL],
)

## Load Processed Datasets from Drive

In [None]:
import numpy as np
from pathlib import Path

# Define paths
DRIVE_DATA = Path("/content/drive/MyDrive/ml_factory/data/processed") / SYMBOL
LOCAL_DATA = Path("/content/data/splits/scaled")
LOCAL_DATA.mkdir(parents=True, exist_ok=True)

# Every array file this cell loads below.
_required_files = [
    f"{kind}_{split}.npy"
    for split in ("train", "val", "test")
    for kind in ("X", "y")
]

# Copy data from Drive to local (faster I/O during training)
import shutil

# Re-copy when ANY required file is missing, not just X_train.npy: an
# interrupted earlier copy can leave some files behind without the rest.
_missing_files = [name for name in _required_files if not (LOCAL_DATA / name).exists()]
if _missing_files:
    if not DRIVE_DATA.is_dir():
        # Fail early with the path that needs attention instead of an
        # error raised from inside copytree.
        raise FileNotFoundError(
            f"Processed data folder not found on Drive: {DRIVE_DATA}"
        )
    print("Copying data from Drive to local disk...")
    shutil.copytree(DRIVE_DATA, LOCAL_DATA, dirs_exist_ok=True)
    print("✅ Data copied to local disk")

# Load data
X_train = np.load(LOCAL_DATA / "X_train.npy")
y_train = np.load(LOCAL_DATA / "y_train.npy")
X_val = np.load(LOCAL_DATA / "X_val.npy")
y_val = np.load(LOCAL_DATA / "y_val.npy")
X_test = np.load(LOCAL_DATA / "X_test.npy")
y_test = np.load(LOCAL_DATA / "y_test.npy")

# Print the shapes as a quick sanity check of what was loaded.
print("\nDataset shapes:")
print(f"Train: {X_train.shape}, {y_train.shape}")
print(f"Val: {X_val.shape}, {y_val.shape}")
print(f"Test: {X_test.shape}, {y_test.shape}")

## Check for Existing Checkpoint

In [None]:
# Try to resume from checkpoint
phase_name = f"train_{MODEL}"
checkpoint = ckpt_mgr.load_latest_checkpoint(phase=phase_name)

# Defaults for a fresh start; overwritten only when a usable checkpoint exists.
model_state = None
last_epoch = 0

if checkpoint:
    _ckpt_state = checkpoint['state']
    model_state = _ckpt_state.get('model_state')
    if model_state:
        # Usable checkpoint: the training cell restores the model from
        # model_state using the same truthiness test.
        last_epoch = _ckpt_state.get('epoch', 0)
        print(f"\n✅ Resuming from checkpoint: {checkpoint['timestamp']}")
        print(f"Last completed epoch: {last_epoch}")
    else:
        # The latest checkpoint can be the failure record written by the
        # training cell, which stores only "error" and "epoch". Nothing can
        # be restored from it, so last_epoch stays 0 and this is not
        # reported as a resume.
        print(
            f"\n⚠️  Checkpoint from {checkpoint['timestamp']} has no model state"
            " - starting from scratch"
        )
        if _ckpt_state.get('error'):
            print(f"Previous run failed with: {_ckpt_state['error']}")
else:
    print("\n🆕 No checkpoint found - starting from scratch")

## Train Model with Auto-Checkpointing

In [None]:
from src.models import ModelRegistry

# Look up the class registered under MODEL and instantiate it with no arguments.
model_class = ModelRegistry.get(MODEL)
model = model_class()

# A truthy model_state (set by the checkpoint cell) means there is something
# to restore into the fresh model.
if model_state:
    model.load_state(model_state)  # Implement load_state in BaseModel
    print(f"Model state restored from epoch {last_epoch}")

# Hyperparameters handed to model.fit(); GPU use follows what the
# environment setup reported.
config = dict(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=8,
    early_stopping_rounds=50,
    use_gpu=env_info['gpu_info']['available'],
)

# Custom callback for checkpointing
class CheckpointCallback:
    """Training callback that saves periodic checkpoints and logs metrics.

    Args:
        ckpt_mgr: Object providing ``should_auto_save()``,
            ``save_checkpoint(phase=..., state=..., metadata=...)`` and
            ``log_metrics(metrics, step=...)``.
        phase_name: Phase label under which checkpoints are stored.
        interval: Save whenever ``epoch`` is a multiple of this value.
            ``0`` or ``None`` disables epoch-based saving, leaving only the
            manager's own ``should_auto_save()`` trigger.
        model: Object whose ``get_state()`` result is stored in the
            checkpoint. When omitted, the notebook-level ``model`` variable
            is looked up at call time (the original behavior).
        metadata: Mapping stored with every checkpoint. When omitted, it is
            built from the notebook-level ``SYMBOL``, ``HORIZON`` and
            ``MODEL`` variables (the original behavior).
    """

    def __init__(self, ckpt_mgr, phase_name, interval=50, model=None, metadata=None):
        self.ckpt_mgr = ckpt_mgr
        self.phase_name = phase_name
        self.interval = interval
        self.model = model
        self.metadata = metadata

    def __call__(self, epoch, metrics):
        """Checkpoint if due, then log ``metrics`` for ``epoch``."""
        # Guard the modulo so interval=0 disables epoch-based saves instead
        # of raising ZeroDivisionError.
        due_by_epoch = bool(self.interval) and epoch % self.interval == 0

        # Auto-save every N epochs or if time interval elapsed
        if due_by_epoch or self.ckpt_mgr.should_auto_save():
            # Fall back to the notebook globals only when nothing was injected.
            target_model = self.model if self.model is not None else model
            if self.metadata is not None:
                checkpoint_metadata = dict(self.metadata)  # copy: keep ours unshared
            else:
                checkpoint_metadata = {"symbol": SYMBOL, "horizon": HORIZON, "model": MODEL}

            self.ckpt_mgr.save_checkpoint(
                phase=self.phase_name,
                state={
                    "epoch": epoch,
                    "model_state": target_model.get_state(),  # Implement get_state in BaseModel
                    "metrics": metrics,
                },
                metadata=checkpoint_metadata,
            )

        # Log to W&B
        self.ckpt_mgr.log_metrics(metrics, step=epoch)

# Train model
callback = CheckpointCallback(ckpt_mgr, phase_name, interval=50)

try:
    # Only the call that can fail belongs in the try body.
    training_metrics = model.fit(
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
        config=config,
        callbacks=[callback],  # Pass checkpoint callback
    )
except Exception as e:
    print(f"\n❌ Training failed: {e}")
    # Save checkpoint on failure. This is best-effort: an error while writing
    # the failure record must not replace the training error raised below.
    try:
        ckpt_mgr.save_checkpoint(
            phase=phase_name,
            state={"error": str(e), "epoch": last_epoch},
            # Same identifying keys as the periodic checkpoints, plus status.
            metadata={
                "symbol": SYMBOL,
                "horizon": HORIZON,
                "model": MODEL,
                "status": "failed",
            },
            force=True,
        )
    except Exception as save_error:
        print(f"⚠️  Could not save failure checkpoint: {save_error}")
    # Re-raise the original training error so the cell still fails visibly.
    raise
else:
    print("\n✅ Training completed successfully!")
    print(f"Final metrics: {training_metrics}")

## Evaluate on Test Set

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Run the trained model on the test split and unpack its output object.
pred_output = model.predict(X_test)
y_pred, y_proba = pred_output.predictions, pred_output.probabilities

# Accuracy takes no averaging argument; the other three scores are computed
# with average='weighted'. Insertion order fixes the print order below.
test_metrics = {"test_accuracy": accuracy_score(y_test, y_pred)}
test_metrics.update(
    (key, scorer(y_test, y_pred, average='weighted'))
    for key, scorer in (
        ("test_precision", precision_score),
        ("test_recall", recall_score),
        ("test_f1", f1_score),
    )
)

# One line per metric, four decimal places.
print("\nTest Set Metrics:")
print("\n".join(f"{name}: {score:.4f}" for name, score in test_metrics.items()))

# Send the test metrics to W&B through the checkpoint manager.
ckpt_mgr.log_metrics(test_metrics)

## Save Model to Drive and W&B

In [None]:
from datetime import datetime

# Define paths
# A timestamp in the name gives each run its own model file and artifact.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_name = f"{MODEL}_{SYMBOL}_h{HORIZON}_{timestamp}"

DRIVE_MODELS = Path("/content/drive/MyDrive/ml_factory/models")
DRIVE_MODELS.mkdir(parents=True, exist_ok=True)

model_path = DRIVE_MODELS / f"{model_name}.pkl"

# Save model to Drive
model.save(model_path)
print(f"✅ Model saved to: {model_path}")

# Upload to W&B (only when the checkpoint manager holds an active run)
if ckpt_mgr.wandb_run:
    # The artifact metadata carries the run identity plus the test metrics.
    artifact = ckpt_mgr.wandb.Artifact(
        name=model_name,
        type="model",
        metadata={"symbol": SYMBOL, "horizon": HORIZON, "model": MODEL, **test_metrics},
    )
    artifact.add_file(str(model_path))
    ckpt_mgr.wandb_run.log_artifact(artifact)
    print("☁️  Model uploaded to W&B")

## Finish W&B Run

In [None]:
# Close the W&B run through the checkpoint manager, then confirm completion.
ckpt_mgr.finish_wandb_run()
print("\n✅ Training complete! Model saved and logged.")

## Next Steps

1. Train other tabular models (LightGBM, CatBoost)
2. Proceed to sequence models (LSTM, GRU, TCN)
3. Build heterogeneous ensemble with stacking meta-learner