In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller


df = pd.read_csv("nvidia_10yr_data.csv", parse_dates=["Date"])
df = df.sort_values("Date").reset_index(drop=True)



In [11]:
df['Return'] = df['Close'].pct_change()
df['Volatility'] = df['Close'].rolling(10).std()
df['Price_Diff'] = df['High'] - df['Low']
df['Volume_Log'] = np.log1p(df['Volume'])

# Drop NaN หลัง rolling
df.dropna(inplace=True)

X = df[['Return', 'Volatility', 'Price_Diff', 'Volume_Log']]
y = df['Close']


In [13]:
print(X)

        Return  Volatility  Price_Diff  Volume_Log
18    1.350392   40.330823    1.905525   19.998356
19   -0.454397   39.761560    0.629440   19.970320
20    1.730180   40.143048    1.012461   19.727166
21   -0.984707   40.950648    0.010734   19.484991
22    2.926464   41.477991    0.104923   20.195640
...        ...         ...         ...         ...
2511  2.067152   32.229865    0.102976   19.849853
2512  1.823627   31.864406    0.575775   20.417012
2513  0.745986   31.424410    0.406209   19.097983
2514 -0.350745   30.883297    0.347602   20.002978
2515  6.406565   40.302903    2.859097   19.428317

[2498 rows x 4 columns]


In [16]:
from typing import Tuple, List
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression



class DriftPointDetector:
    """
    ตรวจจับจุดเกิด concept drift ในข้อมูล time series ด้วยการใช้
    Kolmogorov-Smirnov test (KS test) บน sliding windows
    """
    def __init__(self, window_size: int = 100, threshold: float = 0.05):
        self.window_size = window_size
        self.threshold = threshold
        self.drift_points_: List[int] = []

    def detect(self, X: pd.DataFrame) -> List[int]:
        self.drift_points_ = []
        n = len(X)
        for i in range(self.window_size, n - self.window_size):
            window1 = X.iloc[i - self.window_size:i]
            window2 = X.iloc[i:i + self.window_size]

            drift_detected = False
            for col in X.columns:
                stat, p_value = ks_2samp(window1[col], window2[col])
                if p_value < self.threshold:
                    drift_detected = True
                    break

            if drift_detected:
                self.drift_points_.append(i)

        # กรองจุด drift ที่อยู่ใกล้กันเกินไป (ลด false positives)
        filtered_points = []
        last_point = -self.window_size
        for p in self.drift_points_:
            if p - last_point >= self.window_size:
                filtered_points.append(p)
                last_point = p
        self.drift_points_ = filtered_points

        return self.drift_points_

class AdaptiveFoldGenerator:
    """
    สร้าง train/test folds โดยแบ่งตาม drift points ที่ตรวจจับได้
    โดยจะข้ามช่วงที่สั้นเกินไป เพื่อให้แต่ละ fold มีขนาดพอเหมาะ
    """
    def __init__(self, min_fold_size: int = 100, test_ratio: float = 0.15):
        self.min_fold_size = min_fold_size
        self.test_ratio = test_ratio

    def split(self, X: pd.DataFrame, drift_points: List[int]) -> List[Tuple[np.ndarray, np.ndarray]]:
        folds = []
        points = [0] + drift_points + [len(X)]
        for i in range(len(points) - 1):
            start, end = points[i], points[i + 1]
            fold_length = end - start

            if fold_length < self.min_fold_size:
                continue  # ข้าม fold เล็ก ๆ

            split = int(start + (1 - self.test_ratio) * fold_length)
            train_idx = np.arange(start, split)
            test_idx = np.arange(split, end)

            folds.append((train_idx, test_idx))
        return folds

class DriftAdaptiveTimeSeriesCV:
    """
    ทำ cross-validation โดยใช้ fold ที่แบ่งตาม drift points
    ใช้ pipeline ที่กำหนดไว้ภายนอก (รับ parameter pipeline)
    """
    def __init__(self, pipeline: Pipeline):
        self.pipeline = pipeline

    def run(self, X: pd.DataFrame, y: pd.Series, drift_points: List[int]) -> Tuple[List[float], List[float]]:
        fold_gen = AdaptiveFoldGenerator()
        metrics_rmse, metrics_mae = [], []

        folds = fold_gen.split(X, drift_points)
        if not folds:
            print("Warning: No valid folds generated by AdaptiveFoldGenerator!")
            return [], []

        for i, (train_idx, test_idx) in enumerate(folds):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            # ใช้ pipeline ที่ส่งเข้ามา ไม่สร้างใหม่
            self.pipeline.fit(X_train, y_train)
            y_pred = self.pipeline.predict(X_test)

            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            mae = mean_absolute_error(y_test, y_pred)
            print(f"[Adaptive Fold {i+1}] RMSE={rmse:.4f}, MAE={mae:.4f}")

            metrics_rmse.append(rmse)
            metrics_mae.append(mae)

        return metrics_rmse, metrics_mae

class BaselineTimeSeriesCV:
    """
    ทำ cross-validation แบบ TimeSeriesSplit ปกติ
    โดยใช้ pipeline ที่กำหนดไว้ภายนอก (รับ parameter pipeline)
    """
    def __init__(self, pipeline: Pipeline, n_splits: int = 5):
        self.pipeline = pipeline
        self.n_splits = n_splits

    def run(self, X: pd.DataFrame, y: pd.Series) -> Tuple[List[float], List[float]]:
        tscv = TimeSeriesSplit(n_splits=self.n_splits)
        metrics_rmse, metrics_mae = [], []

        for i, (train_idx, test_idx) in enumerate(tscv.split(X)):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            self.pipeline.fit(X_train, y_train)
            y_pred = self.pipeline.predict(X_test)

            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            mae = mean_absolute_error(y_test, y_pred)
            print(f"[Baseline Fold {i+1}] RMSE={rmse:.4f}, MAE={mae:.4f}")

            metrics_rmse.append(rmse)
            metrics_mae.append(mae)

        return metrics_rmse, metrics_mae

# --------------------------------------------------------
# ตัวอย่างการใช้งาน
if __name__ == "__main__":
    np.random.seed(42)

    # โหลดข้อมูลหุ้น (ตัวอย่าง)
    df = pd.read_csv("nvidia_10yr_data.csv", parse_dates=["Date"])
    df = df.sort_values("Date")

    # Feature selection (เลือก column ที่ใช้เป็น X)
    #X = df[['Open', 'High', 'Low', 'Volume']]
    #y = df['Close']

    # Pipeline สำหรับ Baseline TimeSeriesSplit CV (Linear Model)
    baseline_pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('regressor', LinearRegression())
])

    # Pipeline สำหรับ Adaptive CV (Random Forest)
    adaptive_pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('regressor', RandomForestRegressor(
    n_estimators=100,
    max_depth=5,              # ลดความลึก
    min_samples_leaf=10,      # ป้องกัน overfit
    random_state=42
)) 
])
    
    # 1) Detect drift points
    detector = DriftPointDetector(window_size = 200, threshold = 0.05)
    drift_points = detector.detect(X)
    print(f"Detected drift points at indices: {drift_points}")

    # 2) Run Adaptive CV
    drift_cv = DriftAdaptiveTimeSeriesCV(adaptive_pipeline)
    print("\n--- Drift-Aware Adaptive CV ---")
    drift_rmse, drift_mae = drift_cv.run(X, y, drift_points)

    # 3) Run Baseline TimeSeriesSplit CV
    baseline_cv = BaselineTimeSeriesCV(baseline_pipeline, n_splits = 5)
    print("\n--- Baseline TimeSeriesSplit CV ---")
    base_rmse, base_mae = baseline_cv.run(X, y)

    # 4) สรุปเปรียบเทียบผลลัพธ์
    print("\n===== Summary Metrics =====")
    print(f"Adaptive CV - Avg RMSE: {np.mean(drift_rmse):.4f}, Avg MAE: {np.mean(drift_mae):.4f}")
    print(f"Baseline CV - Avg RMSE: {np.mean(base_rmse):.4f}, Avg MAE: {np.mean(base_mae):.4f}")


Detected drift points at indices: [200, 412, 612, 812, 1026, 1271, 1471, 1671, 1979, 2180]

--- Drift-Aware Adaptive CV ---
[Adaptive Fold 1] RMSE=16.4151, MAE=6.4669
[Adaptive Fold 2] RMSE=7.0911, MAE=4.1311
[Adaptive Fold 3] RMSE=14.9139, MAE=7.1951
[Adaptive Fold 4] RMSE=14.0176, MAE=6.9817
[Adaptive Fold 5] RMSE=12.4674, MAE=5.9566
[Adaptive Fold 6] RMSE=15.0647, MAE=7.6404
[Adaptive Fold 7] RMSE=12.1248, MAE=6.2419
[Adaptive Fold 8] RMSE=12.5859, MAE=7.0948
[Adaptive Fold 9] RMSE=7.2333, MAE=4.4437
[Adaptive Fold 10] RMSE=9.3082, MAE=4.7859
[Adaptive Fold 11] RMSE=6.8446, MAE=4.0072

--- Baseline TimeSeriesSplit CV ---
[Baseline Fold 1] RMSE=13.0893, MAE=8.7354
[Baseline Fold 2] RMSE=14.8187, MAE=8.6275
[Baseline Fold 3] RMSE=13.2611, MAE=8.2821
[Baseline Fold 4] RMSE=16.4087, MAE=8.8815
[Baseline Fold 5] RMSE=17.3526, MAE=8.4517

===== Summary Metrics =====
Adaptive CV - Avg RMSE: 11.6424, Avg MAE: 5.9041
Baseline CV - Avg RMSE: 14.9861, Avg MAE: 8.5956
