# In [None]:
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd
from scipy.stats import ks_2samp
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


class DriftPointDetector:
    """
    Flag row positions where adjacent windows of the data differ.

    For each candidate position, the ``window_size`` rows before it are
    compared with the ``window_size`` rows starting at it using a two-sample
    Kolmogorov-Smirnov test, one column at a time. A position is recorded as
    a drift point as soon as any column yields a p-value below ``threshold``.
    """
    def __init__(self, window_size: int = 20, threshold: float = 0.05):
        self.window_size = window_size
        self.threshold = threshold
        # Positions found by the most recent detect() call.
        self.drift_points_: List[int] = []

    def _has_drift(self, before: pd.DataFrame, after: pd.DataFrame) -> bool:
        """Return True if any column's KS p-value falls below the threshold."""
        # any() short-circuits, so later columns are not tested once one fails.
        return any(
            ks_2samp(before[name], after[name]).pvalue < self.threshold
            for name in before.columns
        )

    def detect(self, X: pd.DataFrame) -> List[int]:
        """
        Scan ``X`` and return the positional indices flagged as drift points.

        The result is also stored on ``self.drift_points_``, replacing whatever
        a previous call left there.
        """
        width = self.window_size
        found = self.drift_points_ = []
        for position in range(width, len(X) - width):
            preceding = X.iloc[position - width:position]
            following = X.iloc[position:position + width]
            if self._has_drift(preceding, following):
                found.append(position)
        return found


class AdaptiveFoldGenerator:
    """
    Generate train/test folds from the segments between drift points.

    The row range ``[0, len(X))`` is cut at each drift point. Every resulting
    segment that has at least ``min_fold_size`` rows becomes one fold: its
    leading ``1 - test_ratio`` share of rows is the training set and the
    remainder is the test set. Shorter segments are dropped.
    """
    def __init__(self, min_fold_size: int = 30, test_ratio: float = 0.2):
        self.min_fold_size = min_fold_size
        self.test_ratio = test_ratio

    def split(self, X: pd.DataFrame, drift_points: List[int]) -> List[Tuple[np.ndarray, np.ndarray]]:
        """
        Build positional train/test index arrays for each usable segment.

        Args:
            X: Data to be split; only its length is used.
            drift_points: Positional row indices at which to cut. They may be
                unsorted or contain duplicates; points outside the open
                interval ``(0, len(X))`` are ignored.

        Returns:
            A list of ``(train_idx, test_idx)`` pairs of integer arrays, in
            ascending segment order.
        """
        n_samples = len(X)
        # Sanitise the cut points: without this, unsorted input produces
        # overlapping segments and out-of-range input produces indices that
        # only fail later when passed to ``iloc``.
        interior = sorted({int(p) for p in drift_points if 0 < p < n_samples})
        boundaries = [0] + interior + [n_samples]

        folds = []
        for start, end in zip(boundaries, boundaries[1:]):
            length = end - start
            if length < self.min_fold_size:
                continue
            # Chronological split inside the segment: train first, test last.
            cut = start + int((1 - self.test_ratio) * length)
            folds.append((np.arange(start, cut), np.arange(cut, end)))
        return folds


class DriftAdaptiveTimeSeriesCV:
    """
    Cross-validate a model on folds derived from detected drift points.

    Folds come from ``AdaptiveFoldGenerator`` with its default settings. A
    fresh model is fitted on each fold's training rows and scored on its test
    rows with RMSE and MAE.
    """
    def __init__(self, pipeline: Optional[Pipeline] = None):
        """
        Args:
            pipeline: Estimator to evaluate. A fresh unfitted clone is used for
                every fold, so the object passed in is never fitted. When
                ``None``, a StandardScaler + RandomForestRegressor pipeline is
                used.
        """
        self.pipeline = pipeline

    def _make_model(self):
        """Return a new unfitted model for one fold."""
        if self.pipeline is not None:
            return clone(self.pipeline)
        return Pipeline([
            ('scale', StandardScaler()),
            ('regressor', RandomForestRegressor(n_estimators=50, random_state=42))
        ])

    def run(self, X: pd.DataFrame, y: pd.Series, drift_points: List[int]) -> Tuple[List[float], List[float]]:
        """
        Fit and score one model per adaptive fold, printing the metrics.

        Args:
            X: Feature rows, indexed positionally.
            y: Targets aligned with ``X`` by position.
            drift_points: Positional indices at which to cut the data.

        Returns:
            ``(rmse_per_fold, mae_per_fold)``. Both lists are empty when no
            segment is large enough to form a fold.
        """
        fold_gen = AdaptiveFoldGenerator()
        metrics_rmse, metrics_mae = [], []

        for i, (train_idx, test_idx) in enumerate(fold_gen.split(X, drift_points)):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            model = self._make_model()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            # Take the square root explicitly: the ``squared=False`` keyword
            # was deprecated in scikit-learn 1.4 and removed in 1.6.
            rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
            mae = mean_absolute_error(y_test, y_pred)
            print(f"[Adaptive Fold {i+1}] RMSE={rmse:.4f}, MAE={mae:.4f}")

            metrics_rmse.append(rmse)
            metrics_mae.append(mae)

        return metrics_rmse, metrics_mae


class BaselineTimeSeriesCV:
    """
    Baseline cross-validation using scikit-learn's ``TimeSeriesSplit``.

    A StandardScaler + LinearRegression pipeline is fitted on each fold's
    training rows and scored on its test rows with RMSE and MAE.
    """
    def __init__(self, n_splits: int = 5):
        self.n_splits = n_splits

    def run(self, X: pd.DataFrame, y: pd.Series) -> Tuple[List[float], List[float]]:
        """
        Fit and score one model per ``TimeSeriesSplit`` fold, printing the metrics.

        Args:
            X: Feature rows, indexed positionally.
            y: Targets aligned with ``X`` by position.

        Returns:
            ``(rmse_per_fold, mae_per_fold)``, one entry per split.
        """
        tscv = TimeSeriesSplit(n_splits=self.n_splits)
        metrics_rmse, metrics_mae = [], []

        for i, (train_idx, test_idx) in enumerate(tscv.split(X)):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            model = Pipeline([
                ('scale', StandardScaler()),
                ('regressor', LinearRegression())
            ])
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            # Take the square root explicitly: the ``squared=False`` keyword
            # was deprecated in scikit-learn 1.4 and removed in 1.6.
            rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
            mae = mean_absolute_error(y_test, y_pred)
            print(f"[Baseline Fold {i+1}] RMSE={rmse:.4f}, MAE={mae:.4f}")

            metrics_rmse.append(rmse)
            metrics_mae.append(mae)

        return metrics_rmse, metrics_mae


# Example usage: compare drift-aware folds against plain TimeSeriesSplit
if __name__ == "__main__":
    np.random.seed(42)

    # Synthetic data. The random draws are made in a fixed order
    # (feature1, feature2, noise) so the seeded values are reproducible.
    n_samples = 200
    feature1 = np.random.randn(n_samples)
    # feature2 carries a linear upward trend on top of the noise.
    feature2 = np.random.randn(n_samples) + np.linspace(0, 5, n_samples)
    X = pd.DataFrame({'feature1': feature1, 'feature2': feature2})
    noise = np.random.randn(n_samples) * 0.2
    y = X['feature1'] * 1.5 + X['feature2'] * 0.5 + noise
    target = pd.Series(y)

    # Drift-aware adaptive CV
    detector = DriftPointDetector(window_size=20, threshold=0.01)
    drift_points = detector.detect(X)
    drift_cv = DriftAdaptiveTimeSeriesCV(None)
    print("\n--- Drift-Aware Adaptive CV ---")
    drift_rmse, drift_mae = drift_cv.run(X, target, drift_points)

    # Baseline TimeSeriesSplit CV
    baseline_cv = BaselineTimeSeriesCV(n_splits=5)
    print("\n--- Baseline TimeSeriesSplit CV ---")
    base_rmse, base_mae = baseline_cv.run(X, target)