# Data Pipeline (Phases 1-5)

**Purpose:** Run the complete data pipeline from raw OHLCV to model-ready datasets.

**Phases:**
1. Ingestion: Load raw 1-min OHLCV
2. MTF Upscaling: Resample to 8 intraday timeframes
3. Features: 180+ indicators (momentum, wavelets, microstructure)
4. Labeling: Triple-barrier with Optuna optimization
5. Adapters: Model-family data preparation (2D, 3D)

**Outputs:** Processed datasets saved to Google Drive

**Expected Runtime:** 30-60 minutes (depending on symbol and data size)

## Setup Environment

In [None]:
# Auto-setup (mount Drive, clone repo, install deps)
from utils.colab_setup import setup_colab_environment, check_disk_space, estimate_training_time_remaining

# Bootstrap the Colab session and keep the returned info for later inspection.
# NOTE(review): repo_url still carries the "yourusername" placeholder — point
# it at the real repository before running.
env_info = setup_colab_environment(
    repo_url="https://github.com/yourusername/ml-factory.git",
    drive_mount_point="/content/drive",
    wandb_project="ohlcv-ml-factory",
    install_extra_deps=True,
)

# Check resources
disk = check_disk_space()
print(f"💾 Available disk space: {disk.get('available', 'unknown')}")

# NOTE(review): assumes the helper may return None when no estimate is
# available — confirm. Formatting None with ":.1f" would raise TypeError, so
# report "unknown" instead, mirroring the disk-space fallback above.
remaining = estimate_training_time_remaining()
if remaining is None:
    print("⏱️  Estimated time remaining: unknown")
else:
    print(f"⏱️  Estimated time remaining: {remaining:.1f} hours")

## Initialize Checkpoint Manager

In [None]:
from utils.checkpoint_manager import CheckpointManager

# Checkpoint manager settings: checkpoints are written under the Drive path
# below, with an auto-save interval of 1800 seconds (30 minutes).
checkpoint_settings = {
    "drive_path": "/content/drive/MyDrive/ml_factory/checkpoints",
    "wandb_project": "ohlcv-ml-factory",
    "auto_save_interval": 1800,  # 30 minutes
    "max_checkpoints": 3,
}
ckpt_mgr = CheckpointManager(**checkpoint_settings)

# Settings for the W&B run that tracks this data-pipeline session.
wandb_run_settings = {
    "run_name": "data_pipeline_MES",
    "config": {"symbol": "MES", "pipeline_version": "v1"},
    "tags": ["data_pipeline", "MES"],
}
ckpt_mgr.init_wandb_run(**wandb_run_settings)

## Load Raw Data from Drive

In [None]:
import pandas as pd
from pathlib import Path

# Define paths
SYMBOL = "MES"
DRIVE_DATA_PATH = Path("/content/drive/MyDrive/ml_factory/data/raw")
LOCAL_DATA_PATH = Path("/content/data/raw")
LOCAL_DATA_PATH.mkdir(parents=True, exist_ok=True)

# Copy data from Drive to local disk (faster I/O)
raw_data_file = DRIVE_DATA_PATH / f"{SYMBOL}_1m.parquet"
local_data_file = LOCAL_DATA_PATH / f"{SYMBOL}_1m.parquet"

if not local_data_file.exists():
    # Fail with an actionable message instead of a bare copy error.
    if not raw_data_file.exists():
        raise FileNotFoundError(
            f"Raw data file not found on Drive: {raw_data_file}"
        )

    print("Copying data from Drive to local disk...")
    import shutil

    # Copy to a temporary name and rename only once the copy has finished.
    # An interrupted session would otherwise leave a truncated parquet that
    # the exists() check above accepts as complete on the next run.
    partial_file = local_data_file.with_name(local_data_file.name + ".partial")
    shutil.copy(raw_data_file, partial_file)
    partial_file.replace(local_data_file)
    print(f"✅ Data copied to {local_data_file}")
else:
    print(f"✅ Data already exists at {local_data_file}")

# Load data
df_raw = pd.read_parquet(local_data_file)
print(f"\nRaw data shape: {df_raw.shape}")
print(df_raw.head())

## Check for Existing Checkpoint (Resume if Available)

In [None]:
# Try to resume from checkpoint
checkpoint = ckpt_mgr.load_latest_checkpoint(phase="data_pipeline")

if checkpoint:
    print(f"\n✅ Resuming from checkpoint: {checkpoint['timestamp']}")
    saved_state = checkpoint['state']
    # A checkpoint without 'last_phase' is treated as "nothing completed yet".
    last_completed_phase = saved_state.get('last_phase', 0)
    print(f"Last completed phase: {last_completed_phase}")
    # The pipeline cell stores an "error" entry when it checkpoints a failed
    # run; surface it so the user knows the previous attempt did not finish.
    if saved_state.get('error'):
        print(f"Previous run ended with an error: {saved_state['error']}")
else:
    print("\n🆕 No checkpoint found - starting from scratch")
    last_completed_phase = 0

## Run Pipeline (with Checkpointing)

In [None]:
# Import pipeline runner
from src.pipeline.runner import PipelineRunner
from src.pipeline.config import PipelineConfig

# Configure pipeline
config = PipelineConfig(
    symbols=[SYMBOL],
    data_dir=Path("/content/data"),
    output_dir=Path("/content/output"),
    checkpoint_dir=Path("/content/drive/MyDrive/ml_factory/checkpoints/pipeline"),
)

# Initialize pipeline runner
runner = PipelineRunner(config)

# Phase most recently reported complete. Starts at the resume point and is
# advanced by the checkpoint callback, so a failure checkpoint never claims
# the failing phase as done.
pipeline_progress = {"last_phase": last_completed_phase}


def _checkpoint_after_phase(phase, state):
    """Record *phase* as completed and force-save a checkpoint for it."""
    pipeline_progress["last_phase"] = phase
    return ckpt_mgr.save_checkpoint(
        phase="data_pipeline",
        # Tolerate a runner that passes no state for a phase.
        state={"last_phase": phase, **(state or {})},
        metadata={"symbol": SYMBOL},
        force=True,  # Force save after each phase
    )


# Run pipeline with auto-checkpointing
try:
    result = runner.run(
        start_phase=last_completed_phase + 1,  # Resume from next phase
        checkpoint_callback=_checkpoint_after_phase,
    )
except Exception as e:
    print(f"\n❌ Pipeline failed: {e}")
    # Save checkpoint on failure. "last_phase" must stay the last *completed*
    # phase because the resume cell restarts at last_phase + 1; the runner's
    # own phase marker is kept separately for diagnosis.
    try:
        ckpt_mgr.save_checkpoint(
            phase="data_pipeline",
            state={
                "error": str(e),
                "last_phase": pipeline_progress["last_phase"],
                "failed_phase": runner.current_phase,
            },
            metadata={"symbol": SYMBOL, "status": "failed"},
            force=True,
        )
    except Exception as save_error:
        # Do not let a checkpointing problem hide the pipeline error.
        print(f"Could not save failure checkpoint: {save_error}")
    raise
else:
    print("\n✅ Pipeline completed successfully!")
    print(f"Results: {result}")

## Copy Results to Google Drive (Permanent Storage)

In [None]:
import shutil

# Define paths
# NOTE(review): the pipeline cell configures output_dir="/content/output",
# while this cell copies from /content/data/splits/scaled — confirm this is
# where the pipeline writes the scaled splits.
LOCAL_OUTPUT = Path("/content/data/splits/scaled")
DRIVE_OUTPUT = Path("/content/drive/MyDrive/ml_factory/data/processed")

# Fail early with an actionable message if there is nothing to copy.
if not LOCAL_OUTPUT.exists():
    raise FileNotFoundError(
        f"Processed datasets not found at {LOCAL_OUTPUT}; "
        "run the pipeline cell first."
    )

DRIVE_OUTPUT.mkdir(parents=True, exist_ok=True)
drive_symbol_dir = DRIVE_OUTPUT / SYMBOL

# Copy processed datasets to Drive; dirs_exist_ok lets a re-run copy into an
# existing per-symbol folder instead of failing.
print("Copying processed datasets to Google Drive...")
shutil.copytree(LOCAL_OUTPUT, drive_symbol_dir, dirs_exist_ok=True)

print(f"\n✅ Processed datasets saved to: {drive_symbol_dir}")

## Finish W&B Run

In [None]:
# Log final metrics, then close the W&B run. The run is finished in `finally`
# so a logging failure does not leave it open.
try:
    ckpt_mgr.log_metrics({
        "pipeline_status": "completed",
        # Row count of the raw 1-minute frame loaded earlier, not of the
        # processed datasets.
        "num_samples": len(df_raw),
        "symbol": SYMBOL,
    })
finally:
    # Finish W&B run
    ckpt_mgr.finish_wandb_run()

print("\n✅ Data pipeline complete! Proceed to model training.")