In [9]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.interpolate import interp1d

In [10]:
# splits and restructures the timeseries dataframe into a training features array and target feature array with some lookback period and a shift step
# can be used as input for CNN and RNN (LSTM, GRU, ...)




def lookback_arr(df: pd.DataFrame, training_feats: list, target_feats: list, lookback = 24, step = 1, n_ahead = 1):

  """
    Creates fixed-length lookback windows and the matching future targets from a time series DataFrame.

    Sample ``i`` is anchored at row ``t = lookback + i * step``: its input window is rows
    ``[t - lookback, t)`` of the training columns and its target is rows ``[t, t + n_ahead)``
    of the target columns. Every window therefore has exactly ``lookback`` rows and never
    overlaps the rows it is asked to predict.

    Args:
        df: A Pandas DataFrame with time series data, ordered oldest to newest.
        training_feats: A list of column names to use as input features (must include the target columns,
            because past target values are part of the lookback window).
        target_feats: A list of column names to use as target variables for prediction.
        lookback: The number of past time steps to include in each lookback window.
        step: The number of time steps to move the window forward for each sample (default: 1).
        n_ahead: The number of future time steps to predict (default: 1).

    Returns:
        A tuple of two float32 NumPy arrays:
            - X: shape (num_samples, lookback, num_features).
            - y: shape (num_samples, n_ahead, num_targets) with singleton axes removed, i.e.
                 (num_samples,) for one target and n_ahead == 1,
                 (num_samples, n_ahead) for one target and n_ahead > 1,
                 (num_samples, num_targets) for several targets and n_ahead == 1.

    Raises:
        AssertionError: If the input is not a DataFrame, a requested column is missing, a target is not
            among the training features, a window parameter is not positive, or there are too few rows
            to build a single sample.
  """

  assert isinstance(df, pd.DataFrame), "Input timeseries data must be a pandas DataFrame."
  # A non-positive step would never advance the window; non-positive lookback/n_ahead give empty slices.
  assert lookback >= 1 and step >= 1 and n_ahead >= 1, "lookback, step and n_ahead must all be positive integers."
  assert len(target_feats) >= 1, "At least one target column is required."
  assert len(df) >= lookback + n_ahead, "There are fewer observations than lookback + n_ahead requires. Add more observations or reduce the lookback period."

  for name in training_feats:
    assert name in df.columns, f"{name} training column is not present in the timeseries dataframe"

  for target in target_feats:
    assert target in df.columns, f"{target} target column is not present in the timeseries dataframe"
    assert target in training_feats, f"{target} target column not in training column list. Required for lookback window. Past target values will be used for training to predict future target values"

  all_training_values = df[training_feats].to_numpy()  # includes target values from the past
  all_target_values = df[target_feats].to_numpy()

  # Each anchor t is the first predicted row; the last valid anchor is len(df) - n_ahead (inclusive).
  anchors = range(lookback, len(df) - n_ahead + 1, step)

  X = np.stack([all_training_values[t - lookback:t] for t in anchors])
  y = np.stack([all_target_values[t:t + n_ahead] for t in anchors])

  # Drop singleton axes so the common single-target / one-step case yields a flat vector.
  if len(target_feats) == 1:
    y = y[:, :, 0]
  if n_ahead == 1:
    y = y[:, 0]

  return X.astype(np.float32), y.astype(np.float32)



In [11]:
# interpolates missing values in a timeseries dataframe using polynomial interpolation
# ONLY FOR NUMERICAL DATA (IGNORES CATEGORICALS)

def poly_interpolate(df, order = 2, time_col_name = "Date", freq = "D"):

  """
    Performs polynomial interpolation on a time series DataFrame, handling missing timestamps.

    The frame is first passed to ``fill_missing_timestamps`` so that it is indexed by a complete
    datetime index at the given frequency. Polynomial interpolation is then applied to the
    numerical columns only; non-numerical (e.g. categorical / string) columns are returned untouched.

    Args:
        df: A Pandas DataFrame with a time column and numerical columns to interpolate.
        order: The order of the polynomial used for interpolation (default: 2, quadratic). Must be a positive integer.
        time_col_name: The name of the column containing datetime values (default: "Date").
        freq: The frequency of the time series for filling missing timestamps (e.g., 'H' for hourly, 'D' for daily). Defaults to "D" (daily).

    Returns:
        A new DataFrame, indexed by the completed timestamps, with interpolated values in the numerical columns.

    Raises:
        AssertionError: If 'df' is not a DataFrame or 'order' is not a positive integer
                        (the time column is validated by ``fill_missing_timestamps``).
  """


  assert isinstance(df, pd.DataFrame), "Input must be a Pandas DataFrame"
  assert isinstance(order, int) and order >= 1, "Order must be a positive integer"

  new_df = fill_missing_timestamps(df, time_col_name, freq)

  # Inserting all-NaN rows can leave numeric data in object-dtype columns; recover the numeric dtype where possible.
  new_df = new_df.infer_objects()

  df_interpolated = new_df.copy()

  # Only numerical columns can be interpolated; everything else is passed through unchanged.
  numeric_cols = new_df.select_dtypes(include="number").columns
  if len(numeric_cols) > 0:
    # order specifies the order of the polynomial (linear (1), quadratic (2), cubic (3))
    df_interpolated[numeric_cols] = new_df[numeric_cols].interpolate(method='polynomial', order = order)

  return df_interpolated




In [12]:
# interpolates time
# make sure the time data is passed in as a column, not as the index

def fill_missing_timestamps(df: pd.DataFrame, date_column_name: str, freq: str) -> pd.DataFrame:
    """Completes the time axis of a DataFrame and returns it indexed by that time axis.

    Two kinds of gaps are handled:

    * Rows whose timestamp is missing (NaN/NaT) receive the previous row's timestamp plus one
      ``freq`` period (consecutive missing timestamps keep incrementing).
    * Timestamps that are absent from the data but lie on the regular ``freq`` grid between the
      earliest and latest timestamp are inserted as new rows whose other columns are NaN.

    Existing rows are never shifted or dropped, and the input DataFrame is not modified.

    Args:
        df (pd.DataFrame): The DataFrame containing the datetime column (as a column, not the index).
        date_column_name (str): The name of the column containing datetime values.
        freq (str): The frequency of the time series (e.g., 'H' for hourly, 'T' for minute, 'D' for daily).

    Returns:
        pd.DataFrame: A new DataFrame sorted by time, with a DatetimeIndex named ``date_column_name``
        and without the original time column.

    Raises:
        AssertionError: If 'df' is not a DataFrame or the time column is not present.
        ValueError: If a missing timestamp has no earlier valid timestamp to increment from.
    """

    # Input validation
    assert isinstance(df, pd.DataFrame), "Input must be a Pandas DataFrame."
    assert date_column_name in df.columns, f"Column '{date_column_name}' not found."

    timestamps = pd.to_datetime(df[date_column_name])

    # Impute missing timestamps by stepping forward from the last known one.
    if timestamps.isna().any():
        one_period = pd.tseries.frequencies.to_offset(freq)
        imputed = []
        previous = pd.NaT
        for stamp in timestamps:
            if pd.isna(stamp):
                if pd.isna(previous):
                    raise ValueError(
                        f"Column '{date_column_name}' starts with a missing timestamp; "
                        "there is no previous value to increment from."
                    )
                stamp = previous + one_period
            imputed.append(stamp)
            previous = stamp
        timestamps = imputed

    # drop() returns a new frame, so the caller's DataFrame stays untouched.
    df_filled = df.drop(columns=[date_column_name])
    df_filled.index = pd.DatetimeIndex(timestamps, name=date_column_name)

    if len(df_filled.index) == 0:
        return df_filled

    # Regular grid spanning the observed time range
    complete_dates = pd.date_range(start=df_filled.index.min(),
                                   end=df_filled.index.max(),
                                   freq=freq)

    # Sorted union of observed and grid timestamps; a left join onto it inserts NaN rows for the
    # missing grid points while keeping every observed row (including off-grid or duplicated ones).
    full_index = df_filled.index.unique().union(complete_dates)
    df_filled = pd.DataFrame(index=full_index).join(df_filled, how="left")
    df_filled.index.name = date_column_name

    return df_filled




In [13]:
# Z-score normalization of timeseries data

# to prevent data leakage the values will only be normalized based on past values, determined by some lookback period

# best use is for price and volume data, not necessarily for indicators like (RSI, MACD, ...)

# ideally the lookback should be the same as the lookback period of the training array used to train the LSTM (RNN) or CNN




def normalize_rolling_mean(df: pd.DataFrame, columns_to_normalize: list, lookback: int, date_column_name:str = "Date", freq:str = "D")->pd.DataFrame:

  """
    Z-score normalizes specified columns of a time series DataFrame using statistics of the preceding
    `lookback` rows only (useful for price and volume data).

    For row t the value is transformed as ``(x_t - mean) / std`` where mean and (population) standard
    deviation are computed over rows ``t - lookback .. t - 1``. The current and all future rows never
    enter the statistics, so no information leaks backwards in time. The first `lookback` rows have no
    complete history and are returned as NaN. A window with zero variance uses a scale of 1, so the
    result is the plain difference from the window mean instead of a division by zero.

    Args:
        df: The DataFrame to normalize. Must have a datetime index or a datetime column specified by 'date_column_name'.
        columns_to_normalize: List of column names to normalize.
        lookback: The number of past periods to use for calculating the rolling statistics.
        date_column_name: The name of the datetime column (default is 'Date'). Used if the DataFrame does not have a datetime index.
        freq: The frequency of the time series (e.g., 'H' for hourly, 'D' for daily) if missing timestamps need to be filled. Defaults to "D" (daily).

    Returns:
        A new DataFrame with the specified columns normalized; all other columns keep their original values.

    Raises:
        ValueError: If `lookback` is smaller than 1 or a requested column is not present.
  """

  # Ensure datetime index if not already set; if not, impute the timestamps and use them as the index
  if not isinstance(df.index, pd.DatetimeIndex):
    df = fill_missing_timestamps(df, date_column_name, freq)

  if lookback < 1:
    raise ValueError("lookback must be a positive integer")

  absent = [col for col in columns_to_normalize if col not in df.columns]
  if absent:
    raise ValueError(f"Columns {absent} not found in DataFrame")

  # Start from a copy so untouched columns keep their values and dtypes and the input is not mutated
  df_normalized = df.copy()

  for col in columns_to_normalize:
    values = df[col].astype(np.float64)

    # shift(1) excludes the current row, so the window covers strictly past observations
    past_window = values.shift(1).rolling(window=lookback)
    past_mean = past_window.mean()
    past_std = past_window.std(ddof=0)

    # Constant history: avoid dividing by zero (NaN entries are left as NaN)
    past_std = past_std.mask(past_std == 0, 1.0)

    df_normalized[col] = (values - past_mean) / past_std

  return df_normalized  # Return the normalized DataFrame




In [14]:
def normalize_indicators(df:pd.DataFrame, columns_to_normalize: list)->pd.DataFrame:

  """
    Z-score normalizes specified columns in a DataFrame (good for indicator normalization).

    Each listed column is standardized independently with its own mean and standard deviation
    (StandardScaler works column-wise on a 2-D input). The statistics are computed over the whole
    column, so fit on training data only if look-ahead matters for the use case.

    Args:
        df (pd.DataFrame): The DataFrame containing the indicators.
        columns_to_normalize (list): A list of column names to normalize (a single name is also accepted).

    Returns:
        pd.DataFrame: A new DataFrame with the normalized columns; the input is not modified.
  """

  # make a copy
  df_normalized = df.copy()

  # accept a single column name as well as a list of names
  if isinstance(columns_to_normalize, str):
    columns = [columns_to_normalize]
  else:
    columns = list(columns_to_normalize)

  # nothing to scale
  if not columns:
    return df_normalized

  # Keep the (n_rows, n_columns) layout: flattening it would pool every column into one
  # distribution and produce an array that no longer fits the columns being assigned.
  scaler = StandardScaler()
  df_normalized[columns] = scaler.fit_transform(df_normalized[columns].to_numpy(dtype=np.float64))

  # return the frame with the normalized columns
  return df_normalized


In [15]:
# mostly for price and volume data

def log_returns(df:pd.DataFrame, column_names: list)->pd.DataFrame:

  """
    Calculates log returns for specified columns in a DataFrame.

    For every requested column ``c`` a new column ``log_ret_c`` holding ``log(c_t / c_{t-1})`` is added.
    The first row has no predecessor and is set to 0. Any other undefined result (e.g. from zero or
    negative values) is left as NaN/inf rather than being silently replaced.

    Args:
        df: The DataFrame containing the columns to calculate log returns for. It is not modified.
        column_names: A list of column names to calculate log returns for.

    Returns:
        A new DataFrame with the log returns columns added (the source columns are cast to float64).

    Raises:
        AssertionError: If any specified column is not found in the DataFrame or contains missing values.

  """

  # Validate everything up front so no partial result is built for bad input
  for column in column_names:
    assert column in df.columns, f"{column} column not found in provided DataFrame"
    assert not df[column].isna().any(), f"{column} column has missing values (NAN)"

  # Work on a copy: the docstring promises a new DataFrame
  result = df.copy()

  for column in column_names:
    # convert to float64 if not already float
    if result[column].dtype != np.float64:
      result[column] = result[column].astype(np.float64)

    returns = np.log(result[column]/result[column].shift(1))
    # Only the first row is undefined by construction; do not mask other NaNs
    returns.iloc[:1] = 0.0
    result[f'log_ret_{column}'] = returns

  return result

In [16]:
def perc_change(df:pd.DataFrame, column_names: list, inplace: bool=False)-> pd.DataFrame:

  """
  Calculates the percentage change for specified columns in a Pandas DataFrame.

    For every requested column ``c`` a new column ``perc_change_c`` holding
    ``(c_t - c_{t-1}) / c_{t-1}`` (decimal form, not multiplied by 100) is added.
    The first row has no predecessor and is NaN.

    Args:
        df: The DataFrame containing the columns.
        column_names: A list of column names to calculate percentage changes for.
        inplace: If True, modifies the original DataFrame. If False (default), the original is left
            untouched and a new DataFrame is returned.

    Returns:
        If inplace is True, returns None (original DataFrame is modified).
        If inplace is False, returns a new DataFrame with the calculated percentage change columns.

    Raises:
        ValueError: If a specified column is not found or contains missing values.
        ZeroDivisionError: If a column contains zero values.
  """

  # Validate every column before touching any data, so an in-place call never half-modifies the frame
  for column in column_names:

    if column not in df.columns:
      raise ValueError(f'Column {column} not found in DataFrame')
    if df[column].isna().any():
      raise ValueError(f"Column {column} has missing values (NaN)")
    if (df[column]==0).any():
      raise ZeroDivisionError(f"Column {column} contains zero values. Division by zero is not possible.")

  # Honor the inplace flag: only write into the caller's frame when explicitly requested
  target = df if inplace else df.copy()

  for column in column_names:

    # convert to float64 if not already float
    if target[column].dtype != np.float64:
      target[column] = target[column].astype(np.float64)

    # calculate the percent change (keeping it in decimal form)
    previous = target[column].shift(1)
    target[f'perc_change_{column}'] = (target[column] - previous) / previous

  return None if inplace else target

In [17]:

def spline_interpolate_features(df: pd.DataFrame, training_features: list, date_column_name: str = "Date", freq: str = "D") -> pd.DataFrame:
    """
    Imputes missing values in specified training features of a time series DataFrame using cubic spline interpolation.

    The frame is first passed to ``fill_missing_timestamps`` to obtain a complete datetime index.
    For each feature a cubic interpolant is fitted through its observed values and used to fill only
    the missing entries; observed values are kept exactly as they are. Missing values before the first
    or after the last observation are left as NaN, because the interpolant is not extrapolated.

    Args:
        df (pd.DataFrame): The DataFrame with potential missing values.
        training_features (list): A list of column names representing the features to interpolate.
        date_column_name (str, optional): The name of the datetime column (default: "Date").
        freq (str, optional): The frequency of the time series for filling missing timestamps (default: "D").

    Returns:
        pd.DataFrame: A new DataFrame, indexed by the completed timestamps, with imputed values in the
        specified training features.

    Raises:
        ValueError: If 'df' is not a DataFrame, the datetime column is not found, any specified training
                    feature is not found in the DataFrame, or a feature has fewer than four non-NaN
                    values (too few for a cubic spline).
    """

    # Input Validation:
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input must be a Pandas DataFrame.")
    if date_column_name not in df.columns:
        raise ValueError(f"Column '{date_column_name}' not found in DataFrame.")

    for feature in training_features:
        if feature not in df.columns:
            raise ValueError(f"Training feature '{feature}' not found in DataFrame.")

    # Create a complete and filled datetime index
    df_filled = fill_missing_timestamps(df, date_column_name, freq)

    # Convert the DatetimeIndex to a plain numeric axis for interp1d
    x = df_filled.index.astype(np.int64).to_numpy() / 10 ** 9

    # Interpolate only the specified training features
    for col in training_features:
        values = df_filled[col].astype(np.float64).to_numpy()
        observed = ~np.isnan(values)

        if observed.sum() <= 3:
            raise ValueError(f"Not enough non-NaN values in column '{col}' for spline interpolation.")

        if observed.all():
            # Nothing to impute; keep the column as a clean float column
            df_filled[col] = values
            continue

        # bounds_error=False: points outside the observed range yield NaN instead of raising
        spline = interp1d(x[observed], values[observed], kind='cubic',
                          bounds_error=False, fill_value=np.nan)

        # Keep observed values verbatim and use the spline only where data is missing
        df_filled[col] = np.where(observed, values, spline(x))

    return df_filled  # Return the DataFrame with imputed training features
