A custom split() method allows you to:

- Preserve temporal order.

- Do walk-forward validation:

    Train on earlier time steps.

    Validate on slightly later ones.

    Keep expanding the training window step by step.

- Avoid data leakage.
- And i have more control over splitting

In [15]:

import optuna
import numpy as np
import pandas as pd
import warnings
from typing import Generator, Tuple, Sequence
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
import catboost as cb
import lightgbm as lgb
import tensorflow as tf
from sklearn.metrics import mean_squared_log_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

warnings.filterwarnings("ignore")

def rmsle(y_true, y_pred):
    """
    Calculate Root Mean Squared Logarithmic Error (RMSLE).
    
    Measures error on a log scale, which means it penalizes relative differences rather than absolute differences."""
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

def log_transform(y):
    """Apply logarithmic transformation to the target variable.
     log(0) is undefined; adding 1 handles zero targets gracefully
    """
    return np.log1p(y)

def inverse_log_transform(y):
    """
    Inverse the logarithmic transformation applied to the target variable.
    Converts the predictions back to the original scale."""
    return np.expm1(y)

class TimeSeriesCV:
    """Custom Time Series Cross-Validation for Walk-forward Validation.
        
        This splitter generates train/test indices by expanding the training set
        and sliding the test set forward in time.
    """
    
    def __init__(self, n_splits=5, test_size=0.2):
        """
        Initialize TimeSeriesCV.

        Parameters:
            n_splits (int): Number of walk-forward splits. Must be > 0.
            test_size (float): Proportion of the dataset to use for validation in each split.
                               Must be between 0 and 1 (exclusive).

        Raises:
            ValueError: If parameters are outside of valid range.
        """
        
        if not isinstance(n_splits, int) or n_splits <= 0:
            raise ValueError("n_splits must be a positive integer.")
        if not (0 < test_size < 1):
            raise ValueError("test_size must be a float between 0 and 1 (exclusive).")
        
        self.n_splits = n_splits
        self.test_size = test_size

    def split(self, X: Sequence) -> Generator[Tuple[np.ndarray, np.ndarray], None, None]:
        """
        Generate (train, validation) indices for each split.

        Parameters:
            X (Sequence): The input data (e.g., list, NumPy array, pandas DataFrame).
                          Must support len() and indexing (X[i]).

        Returns:
            Generator yielding tuples of (train_indices, validation_indices) as numpy arrays.

        Raises:
            ValueError: If dataset is too small to be split according to given parameters.
        """
        n_samples = len(X)
        test_size = int(n_samples * self.test_size)
        if test_size == 0:
            raise ValueError("test_size too small; resulted in 0 validation samples.")
        if n_samples <= test_size:
            raise ValueError("Dataset too small relative to test_size.")
        
        step_size = (n_samples - test_size) // self.n_splits
        if step_size <= 0:
            raise ValueError("Number of splits too large for dataset size and test_size.")

        for i in range(self.n_splits):
            start_idx = i * step_size
            train_end = start_idx + n_samples - test_size - step_size
            test_start = train_end
            test_end = test_start + test_size

            if test_end > n_samples:
                break

            yield np.arange(start_idx, train_end), np.arange(test_start, test_end)


def create_sequences(data, seq_length, target_col):
    """Create sequences of data for LSTM input.
    :param data: Input data as a NumPy array or DataFrame.
    :param seq_length: Length of the sequences to create.
    :param target_col: Index of the target column in the data.
    :return: Tuple of sequences (X) and targets (y).
    """
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:(i + seq_length)])
        y.append(data[i + seq_length, target_col])
    return np.array(X), np.array(y)



def train_catboost(X_train, y_train, X_val, y_val, params, cat_features=None):
    """
    Train a CatBoostRegressor model and predict on validation data.

    Parameters:
        X_train (array-like or DataFrame): Training features.
        y_train (array-like or Series): Training target values.
        X_val (array-like or DataFrame): Validation features.
        y_val (array-like or Series): Validation target values (used for shape checking).
        params (dict): Parameters for CatBoostRegressor.

    Returns:
        np.ndarray: Predictions for X_val.

    Raises:
        ValueError: If input data dimensions do not match.
        Exception: If model training or prediction fails.
    """
    try:
    
        if len(X_train) != len(y_train):
            raise ValueError("X_train and y_train must have the same number of samples.")
        if len(X_val) != len(y_val):
            raise ValueError("X_val and y_val must have the same number of samples.")
        model = cb.CatBoostRegressor(**params)
        model.fit(X_train, y_train, cat_features=cat_features, verbose=0)
        preds = model.predict(X_val)
        return preds
    
    except Exception as e:
        print(f"⚠️ Error in train_catboost: {e}")
        raise

def train_lightgbm(X_train, y_train, X_val, y_val, params):
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
    model = lgb.train(params, train_data, valid_sets=[val_data], num_boost_round=200, verbose_eval=False)
    return model.predict(X_val)


def train_lstm(X_train, y_train, X_val, y_val, input_shape, params):
    model = Sequential([
        LSTM(params['lstm_units'], return_sequences=True, input_shape=input_shape),
        Dropout(params['dropout_rate']),
        LSTM(params['lstm_units'] // 2, return_sequences=False),
        Dropout(params['dropout_rate']),
        Dense(params['dense_units'], activation='relu'),
        Dense(1)
    ])
    
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=params['learning_rate']), loss='mse')
    
    early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
    
    model.fit(X_train, y_train,
              validation_data=(X_val, y_val),
              epochs=15,  # Reduce during tuning
              batch_size=params['batch_size'],
              verbose=0,
              callbacks=[early_stopping])
    
    pred = model.predict(X_val, verbose=0)
    
    tf.keras.backend.clear_session()  # better after training to fully release resources
    
    return pred.flatten(), model


def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))


def evaluate_model(model_func, X, y, params:dict, cv, is_lstm=False, seq_length=10):
    """Evaluate a model using cross-validation.
    parameters:
        model_func: Function to train and predict with the model.
        X: Features as a DataFrame or NumPy array.
        y: Target values as a Series or NumPy array.
        params: Dictionary of model parameters.
        cv: Cross-validation splitter (e.g., TimeSeriesCV).
        is_lstm: Boolean indicating if the model is LSTM.
        seq_length: Length of sequences for LSTM (default is 10).
        Returns:
        Mean RMSE across all folds.
    """
    scores = []


    for train_idx, val_idx in cv.split(X):
        try:
            if is_lstm:
                X_tr = X[train_idx]
                y_tr = y[train_idx]
                X_val = X[val_idx]
                y_val = y[val_idx]

                
                y_pred_log = model_func(X_tr, y_tr, X_val, y_val, (X_tr.shape[1], X_tr.shape[2]), params)

                y_pred = np.expm1(y_pred_log)  
                y_true = np.expm1(y_val)
                score = rmse(y_true, y_pred)
            else:
                X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
                y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
                
                y_tr_log = log_transform(y_tr)
                y_val_log = log_transform(y_val)
            
                y_pred_log = model_func(X_tr, y_tr_log, X_val, y_val_log, params)
                y_pred = inverse_log_transform(y_pred_log)
                y_true = y_val.values
                score = rmsle(y_true, y_pred)

            scores.append(score)

        except Exception as e:
            print(f" Error during fold: {e}")
            continue

    return np.mean(scores) if scores else float('inf')


def get_objective(model_type, X, y, cv, seq_length):
    """Get the objective function for Optuna optimization based on model type.
    Parameters:
        model_type: Type of model to optimize ('catboost', 'lightgbm', 'lstm').
        X: Features as a DataFrame or NumPy array.
        y: Target values as a Series or NumPy array.
        cv: Cross-validation splitter (e.g., TimeSeriesCV).
        seq_length: Length of sequences for LSTM (default is 10).
        Returns:
        Objective function for Optuna optimization.
    """
    def objective(trial):
        if model_type == 'catboost':
            params = {
                'iterations': trial.suggest_int('iterations', 200, 1000),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
                'depth': trial.suggest_int('depth', 4, 10),
                'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
                'border_count': trial.suggest_int('border_count', 32, 128),
                'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
                'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
                'random_strength': trial.suggest_float('random_strength', 0, 1),
                'verbose': False,
                'random_seed': 42
            }
            cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
            return evaluate_model(lambda X_train, y_train, X_val, y_val, p: train_catboost(X_train, y_train, X_val, y_val, p, cat_features), X, y, params, cv)

        elif model_type == 'lightgbm':
            params = {
                'objective': 'regression',
                'metric': 'rmse',
                'boosting_type': 'gbdt',
                'num_leaves': trial.suggest_int('num_leaves', 10, 300),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
                'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
                'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
                'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
                'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
                'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
                'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
                'verbose': -1,
                'random_state': 42
            }
            return evaluate_model(train_lightgbm, X, y, params, cv)

        elif model_type == 'lstm':
            params = {
                'lstm_units': trial.suggest_int('lstm_units', 32, 128),
                'dense_units': trial.suggest_int('dense_units', 16, 128),
                'dropout_rate': trial.suggest_float('dropout_rate', 0.1, 0.5),
                'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.01, log=True),
                'batch_size': trial.suggest_categorical('batch_size', [16, 32, 64]),

            }
            return evaluate_model(train_lstm, X, y, params, cv, is_lstm=True, seq_length=seq_length)

        else:
            raise ValueError("Unsupported model type")

    return objective


def run_optimization(model_type, X, y, n_trials=50, cv_folds=5, seq_length=10):
    """Run Optuna optimization for the specified model type.
    Parameters:
        model_type: Type of model to optimize ('catboost', 'lightgbm', 'lstm').
        X: Features as a DataFrame or NumPy array.
        y: Target values as a Series or NumPy array.
        n_trials: Number of trials for Optuna optimization.
        cv_folds: Number of folds for cross-validation.
        seq_length: Length of sequences for LSTM (default is 10).
        Returns:
        Optuna study object with the best parameters and RMSE.
    """
    cv = TimeSeriesCV(n_splits=cv_folds)
    base_objective = get_objective(model_type, X, y, cv, seq_length)

    def printing_objective(trial):
        score = base_objective(trial)
        print(f"Trial {trial.number}: RMSE={score:.4f}, params={trial.params}")
        return score

    study = optuna.create_study(direction='minimize')
    study.optimize(printing_objective, n_trials=n_trials, show_progress_bar=True)

    print(f"\n Best parameters for {model_type}:")
    print(study.best_params)
    print(f"RMSLE: {study.best_value:.4f}")
    return study






In [5]:
import sys
import numpy as np
from pathlib import Path

current_dir = Path().resolve()
python_files_dir = current_dir.parent / "Python_Files"
if str(python_files_dir) not in sys.path:
    sys.path.append(str(python_files_dir))

from config import CFG
from helpers import load_all_data

In [3]:
data = load_all_data(CFG)

df = data['df_new']
df

FileNotFoundError: File not found: C/Users/seydaaybar/Desktop/ntt_data/data/transaction.csv

In [None]:
run_optimization('catboost', X_train, y_train, n_trials=5)

[I 2025-07-10 16:13:35,588] A new study created in memory with name: no-name-f559f2ce-2aa8-4146-a1b4-26dd817c53e4
Best trial: 0. Best value: 0.583548:  20%|██        | 1/5 [02:08<08:35, 128.80s/it]

Trial 0: RMSE=0.5835, params={'iterations': 855, 'learning_rate': 0.015255813626123587, 'depth': 8, 'l2_leaf_reg': 7.616317063084429, 'border_count': 40, 'min_data_in_leaf': 22, 'bagging_temperature': 0.7525659190873778, 'random_strength': 0.6677000851423287}
[I 2025-07-10 16:15:44,386] Trial 0 finished with value: 0.5835481268794027 and parameters: {'iterations': 855, 'learning_rate': 0.015255813626123587, 'depth': 8, 'l2_leaf_reg': 7.616317063084429, 'border_count': 40, 'min_data_in_leaf': 22, 'bagging_temperature': 0.7525659190873778, 'random_strength': 0.6677000851423287}. Best is trial 0 with value: 0.5835481268794027.


Best trial: 1. Best value: 0.578825:  40%|████      | 2/5 [02:45<03:43, 74.53s/it] 

Trial 1: RMSE=0.5788, params={'iterations': 383, 'learning_rate': 0.08225703868286412, 'depth': 5, 'l2_leaf_reg': 9.944359434041193, 'border_count': 69, 'min_data_in_leaf': 54, 'bagging_temperature': 0.5419965715599584, 'random_strength': 0.17570423922635958}
[I 2025-07-10 16:16:20,933] Trial 1 finished with value: 0.5788253039370908 and parameters: {'iterations': 383, 'learning_rate': 0.08225703868286412, 'depth': 5, 'l2_leaf_reg': 9.944359434041193, 'border_count': 69, 'min_data_in_leaf': 54, 'bagging_temperature': 0.5419965715599584, 'random_strength': 0.17570423922635958}. Best is trial 1 with value: 0.5788253039370908.


Best trial: 1. Best value: 0.578825:  60%|██████    | 3/5 [04:38<03:04, 92.37s/it]

Trial 2: RMSE=0.5831, params={'iterations': 646, 'learning_rate': 0.11548799955502696, 'depth': 9, 'l2_leaf_reg': 6.581490358154209, 'border_count': 106, 'min_data_in_leaf': 26, 'bagging_temperature': 0.10457319284521527, 'random_strength': 0.861878792459167}
[I 2025-07-10 16:18:14,538] Trial 2 finished with value: 0.5830688956429297 and parameters: {'iterations': 646, 'learning_rate': 0.11548799955502696, 'depth': 9, 'l2_leaf_reg': 6.581490358154209, 'border_count': 106, 'min_data_in_leaf': 26, 'bagging_temperature': 0.10457319284521527, 'random_strength': 0.861878792459167}. Best is trial 1 with value: 0.5788253039370908.


Best trial: 1. Best value: 0.578825:  80%|████████  | 4/5 [05:36<01:18, 78.53s/it]

Trial 3: RMSE=0.5798, params={'iterations': 324, 'learning_rate': 0.14643256283839626, 'depth': 9, 'l2_leaf_reg': 9.822442463860943, 'border_count': 86, 'min_data_in_leaf': 80, 'bagging_temperature': 0.9225697336777582, 'random_strength': 0.6142576783266187}
[I 2025-07-10 16:19:11,848] Trial 3 finished with value: 0.5798414443288518 and parameters: {'iterations': 324, 'learning_rate': 0.14643256283839626, 'depth': 9, 'l2_leaf_reg': 9.822442463860943, 'border_count': 86, 'min_data_in_leaf': 80, 'bagging_temperature': 0.9225697336777582, 'random_strength': 0.6142576783266187}. Best is trial 1 with value: 0.5788253039370908.


Best trial: 1. Best value: 0.578825: 100%|██████████| 5/5 [06:19<00:00, 75.91s/it]

Trial 4: RMSE=0.5789, params={'iterations': 571, 'learning_rate': 0.24444877777267898, 'depth': 4, 'l2_leaf_reg': 6.122687932600137, 'border_count': 126, 'min_data_in_leaf': 90, 'bagging_temperature': 0.7829617145050419, 'random_strength': 0.11694542461367563}
[I 2025-07-10 16:19:55,153] Trial 4 finished with value: 0.5788794007949043 and parameters: {'iterations': 571, 'learning_rate': 0.24444877777267898, 'depth': 4, 'l2_leaf_reg': 6.122687932600137, 'border_count': 126, 'min_data_in_leaf': 90, 'bagging_temperature': 0.7829617145050419, 'random_strength': 0.11694542461367563}. Best is trial 1 with value: 0.5788253039370908.

 Best parameters for catboost:
{'iterations': 383, 'learning_rate': 0.08225703868286412, 'depth': 5, 'l2_leaf_reg': 9.944359434041193, 'border_count': 69, 'min_data_in_leaf': 54, 'bagging_temperature': 0.5419965715599584, 'random_strength': 0.17570423922635958}
RMSE: 0.5788





<optuna.study.study.Study at 0x1a98ae39f30>

In [6]:
y

0        10716.0
1        42864.0
2        10716.0
3        10716.0
4        10716.0
          ...   
63768     1065.0
63769     1356.0
63770     1421.0
63771     1746.0
63772      582.0
Name: sales, Length: 63773, dtype: float64

In [7]:
y.max()

np.float64(1467090.0)

In [8]:
y.min()

np.float64(2.0)

In [10]:
from sklearn.metrics import mean_squared_log_error
import numpy as np

y_true = y.values  # original target
y_pred_baseline = np.full_like(y_true, y_true.mean())

baseline_rmsle = np.sqrt(mean_squared_log_error(y_true, y_pred_baseline))
print(f"Baseline RMSLE: {baseline_rmsle:.4f}")


Baseline RMSLE: 1.6395


### LSTM

In [11]:
X_train_lstm = np.load('X_train_lstm.npy')
y_train_lstm = np.load('y_train_lstm.npy')
X_val_lstm = np.load('X_val_lstm.npy')
y_val_lstm = np.load('y_val_lstm.npy')

#### Check for NaN or Inf values in  input

In [12]:
import numpy as np

print("X_train_lstm:")
print("  NaNs:", np.isnan(X_train_lstm).sum())
print("  Infs:", np.isinf(X_train_lstm).sum())

print("y_train_lstm:")
print("  NaNs:", np.isnan(y_train_lstm).sum())
print("  Infs:", np.isinf(y_train_lstm).sum())


X_train_lstm:
  NaNs: 0
  Infs: 0
y_train_lstm:
  NaNs: 0
  Infs: 0


#### Check the actual max values

In [13]:
print("Max X:", np.max(X_train_lstm))
print("Min X:", np.min(X_train_lstm))
print("Max y:", np.max(y_train_lstm))
print("Min y:", np.min(y_train_lstm))


Max X: 21326.0
Min X: -6.118417947713259
Max y: 5.3349249274041455
Min y: -5.05778085876837


In [16]:
run_optimization('lstm', X_train_lstm, y_train_lstm, n_trials=5)

[I 2025-07-12 15:12:26,544] A new study created in memory with name: no-name-ed819684-20f5-4f65-9582-10bc006849a6
Best trial: 0. Best value: 4.47272:  20%|██        | 1/5 [01:01<04:06, 61.50s/it]

Trial 0: RMSE=4.4727, params={'lstm_units': 81, 'dense_units': 70, 'dropout_rate': 0.2600919967351602, 'learning_rate': 0.002802301797380156, 'batch_size': 32}
[I 2025-07-12 15:13:28,048] Trial 0 finished with value: 4.472720763849033 and parameters: {'lstm_units': 81, 'dense_units': 70, 'dropout_rate': 0.2600919967351602, 'learning_rate': 0.002802301797380156, 'batch_size': 32}. Best is trial 0 with value: 4.472720763849033.


Best trial: 0. Best value: 4.47272:  40%|████      | 2/5 [02:24<03:41, 73.86s/it]

Trial 1: RMSE=4.4736, params={'lstm_units': 80, 'dense_units': 53, 'dropout_rate': 0.2538710786217493, 'learning_rate': 0.002765593325579184, 'batch_size': 16}
[I 2025-07-12 15:14:50,552] Trial 1 finished with value: 4.473612143581377 and parameters: {'lstm_units': 80, 'dense_units': 53, 'dropout_rate': 0.2538710786217493, 'learning_rate': 0.002765593325579184, 'batch_size': 16}. Best is trial 0 with value: 4.472720763849033.


Best trial: 2. Best value: 4.46337:  60%|██████    | 3/5 [05:10<03:52, 116.37s/it]

Trial 2: RMSE=4.4634, params={'lstm_units': 116, 'dense_units': 59, 'dropout_rate': 0.14724679263742732, 'learning_rate': 0.004834746313664063, 'batch_size': 16}
[I 2025-07-12 15:17:37,510] Trial 2 finished with value: 4.463369775623137 and parameters: {'lstm_units': 116, 'dense_units': 59, 'dropout_rate': 0.14724679263742732, 'learning_rate': 0.004834746313664063, 'batch_size': 16}. Best is trial 2 with value: 4.463369775623137.


Best trial: 2. Best value: 4.46337:  80%|████████  | 4/5 [05:57<01:28, 88.75s/it] 

Trial 3: RMSE=4.4710, params={'lstm_units': 80, 'dense_units': 30, 'dropout_rate': 0.361799688179356, 'learning_rate': 0.007790321593912226, 'batch_size': 64}
[I 2025-07-12 15:18:23,931] Trial 3 finished with value: 4.4710492897735215 and parameters: {'lstm_units': 80, 'dense_units': 30, 'dropout_rate': 0.361799688179356, 'learning_rate': 0.007790321593912226, 'batch_size': 64}. Best is trial 2 with value: 4.463369775623137.


Best trial: 2. Best value: 4.46337: 100%|██████████| 5/5 [07:27<00:00, 89.57s/it]

Trial 4: RMSE=4.4667, params={'lstm_units': 70, 'dense_units': 102, 'dropout_rate': 0.23497516097044602, 'learning_rate': 0.004881999292116275, 'batch_size': 16}
[I 2025-07-12 15:19:54,374] Trial 4 finished with value: 4.466659214328545 and parameters: {'lstm_units': 70, 'dense_units': 102, 'dropout_rate': 0.23497516097044602, 'learning_rate': 0.004881999292116275, 'batch_size': 16}. Best is trial 2 with value: 4.463369775623137.

 Best parameters for lstm:
{'lstm_units': 116, 'dense_units': 59, 'dropout_rate': 0.14724679263742732, 'learning_rate': 0.004834746313664063, 'batch_size': 16}
RMSLE: 4.4634





<optuna.study.study.Study at 0x1302eab30>

In [17]:
print("Log sales stats:", y_train_lstm.min(), y_train_lstm.max(), y_train_lstm.mean())


Log sales stats: -5.05778085876837 5.3349249274041455 0.35551618838141263
