# 02.- Feature Engineering

In [64]:
import pandas as pd
import numpy as np
import pickle as pkl
from sklearn import preprocessing

from utils.plotting import candlestick_plot, local_maxima_minima_plot

## Raw Data Loading

In [None]:
# Read the pickled raw benchmark data from disk.
# NOTE: unpickling can execute arbitrary code, so only load files
# that come from a trusted source.
with open('../data/raw/benchmark_data.pkl', 'rb') as raw_file:
    bm = pkl.load(raw_file)

# Keep the entry stored under the 'ibex' key and preview its
# first rows.
data = bm['ibex']
data.head()

Unnamed: 0,close,high,low,open,vol
2000-01-03,11610.0,11881.8,11574.4,11846.6,60107000.0
2000-01-04,11206.6,11530.0,11159.8,11499.5,62539000.0
2000-01-05,10863.1,11068.1,10824.9,11206.6,68153000.0
2000-01-07,11102.4,11137.9,10882.7,10963.4,144207000.0
2000-01-10,11173.3,11364.3,11120.6,11363.8,133817000.0


In [68]:
window_size = 50

# Draw a random start position such that a full window still fits
# inside the data (the upper bound of randint is exclusive).
start_index = np.random.randint(0, len(data) - window_size + 1)

# Take `window_size` consecutive rows beginning at that position
# and display them as a candlestick chart.
random_window = data.iloc[slice(start_index, start_index + window_size)]
candlestick_plot(random_window, 'Random Window')

FigureWidget({
    'data': [{'close': [9439.9, 9500.7, 9399.4, 9561.5, 9601.8, 9530.7, 9516.8,
                        9607.3, 9606.8, 9444.0, 9522.5, 9476.6, 9363.3, 9018.7,
                        9047.9, 9021.5, 8920.9, 8925.2, 8964.3, 8792.7, 8787.6,
                        8855.9, 8878.4, 9073.3, 9015.9, 8885.0, 8787.6, 8501.9,
                        8526.2, 8465.1, 8266.1, 8166.7, 8303.9, 8296.1, 8372.7,
                        8210.1, 8354.2, 8304.1, 8378.9, 8373.8, 8153.5, 8231.0,
                        8217.0, 8417.9, 8480.0, 8557.3, 8608.7, 8564.7, 8636.2,
                        8567.7],
              'high': [9526.3, 9514.5, 9498.9, 9561.5, 9601.8, 9637.1, 9600.5,
                       9663.9, 9657.6, 9616.6, 9538.5, 9496.9, 9481.8, 9307.7,
                       9105.4, 9038.8, 9086.7, 9051.2, 9026.3, 8956.2, 8890.7,
                       8895.0, 8913.1, 9084.0, 9104.9, 8994.2, 8893.7, 8738.8,
                       8577.1, 8648.9, 8370.5, 8424.6, 8313.8, 8349.7, 8387.

In [40]:
def local_min_max(data: pd.DataFrame,
                  degree: int = 15) -> tuple[pd.Series,
                                             np.ndarray,
                                             np.ndarray]:
    """
    Locate the local minima and maxima of a window's 'low' values.

    The 'low' column is min-max scaled, a polynomial is fitted to
    the scaled values against their row position, and the turning
    points of the fitted curve are returned.

    Args:
        data (pd.DataFrame): A DataFrame containing a 'low'
        column that represents the lowest values (e.g., stock lows).
        degree (int): Degree of the fitted polynomial. Defaults
        to 15. Higher degrees follow local variations more closely;
        on short windows the fit may be poorly conditioned (see the
        NumPy `polyfit` notes on RankWarning).

    Returns:
        tuple[pd.Series, np.ndarray, np.ndarray]:
            - A Series of the scaled 'low' values with a fresh
              0-based integer index.
            - Positions (in that 0-based index) of the local minima
              of the fitted curve.
            - Positions (in that 0-based index) of the local maxima
              of the fitted curve.
    """
    # Scale the 'low' values with min-max scaling. The result is
    # deliberately indexed by row position (0..n-1) rather than by
    # the original index of `data`.
    data_scaled = pd.Series(preprocessing.minmax_scale(data['low']))

    # Row positions serve both as the fitting abscissa and as the
    # points at which the fitted polynomial is evaluated.
    x = np.arange(len(data_scaled), dtype=float)

    # Fit the polynomial and evaluate it at every row position.
    pol = np.polyfit(x, data_scaled, degree)
    y_pol = np.polyval(pol, x)

    # Sign of the first difference (the discrete slope) of the
    # fitted curve, differenced once more: a positive entry marks
    # a slope turning upwards (local minimum), a negative entry a
    # slope turning downwards (local maximum). Computed once and
    # reused for both kinds of extrema.
    slope_sign_change = np.diff(np.sign(np.diff(y_pol)))

    # The two successive diffs shift positions by one, hence the +1
    # to map back to positions in `y_pol` / `data_scaled`.
    local_min = (slope_sign_change > 0).nonzero()[0] + 1
    local_max = (slope_sign_change < 0).nonzero()[0] + 1

    return data_scaled, local_min, local_max


## Features Definition

### Feature 1: Absolute Extrema Duration
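This feature measures how many rows separate the window's highest high from its lowest low, divided by the window length. It could highlight relative compression or elongation of trends. For instance: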

1. **Rapid Changes**:
   A small time difference could hint at erratic or impulsive market behavior, potentially filtering out broader, more gradual trends that might not fit a double-bottom scenario.

2. **Structure of Support and Resistance**:
   Longer time differences between the extremes could indicate a more stable consolidation, aligning with the characteristics of a well-formed double bottom, where the two lows and the interim peak occur over a longer timeframe.


In [41]:
def absolute_extrema_duration(data: pd.DataFrame) -> float:
    """Return the normalized distance between the highest high
    and the lowest low of a window.

    Args:
        data (pd.DataFrame): DataFrame containing OHLC data; the
        'high' and 'low' columns are used.

    Returns:
        float: Number of rows between the row holding the maximum
        'high' and the row holding the minimum 'low', divided by
        the window length.

        This normalization can make your model more robust
        when handling different time windows or comparing
        different patterns
    """
    # Work with row positions directly. This avoids formatting the
    # index label as a date string and looking it up again, which
    # only works for a unique datetime index.
    max_position = data['high'].argmax()
    min_position = data['low'].argmin()

    # Distance in rows between both extremes, normalized by the
    # window length; float() keeps the plain-float return type.
    return float(abs(max_position - min_position) / len(data))


In [42]:
# Evaluate the feature on the randomly sampled window.
absolute_extrema_duration(random_window)

0.5

### Feature 2: Low Threshold Minima Count

1. **Capturing Potential "Bottoms"**:
   The double-bottom pattern is characterized by two distinct troughs. By counting local minima that fall below a threshold, this feature highlights regions where the price exhibits pronounced dips, which are critical to identifying the pattern.

2. **Indicator of Significant Downward Movements**:
   A higher count of minima below the threshold suggests strong downward pressures within the window, increasing the likelihood of the presence of one or more "bottoms." This metric helps focus the analysis on windows with sufficient depth for the pattern to exist.

3. **Normalization Across Windows**:
   Dividing the count by the length of the data window ensures that the feature is scale-invariant and comparable across windows of different sizes. This is essential for ensuring that the feature is meaningful in varying contexts, regardless of the specific duration of the window.

4. **Threshold as a Proxy for Pattern Shape**:
   The chosen threshold (e.g., 0.20) is a proxy for the depth of price movements. Adjusting this value can help fine-tune the feature to be sensitive to the specific characteristics of double bottoms in the dataset being analyzed.


In [43]:
def low_threshold_minima_count(data: pd.DataFrame,
                               threshold: float = 0.20) -> float:
    """
    Calculate the proportion of local minima below a specified
    threshold within a time window.

    Args:
        data (pd.DataFrame): DataFrame containing OHLC data.
        threshold (float): Upper bound, on the min-max scaled
        'low' values, that a local minimum must stay strictly
        below to be counted. Defaults to 0.20.

    Returns:
        float: Number of local minima whose scaled 'low' value is
        below the threshold, divided by the data length.
    """
    # Scale the 'low' column and find the positions of the
    # local minima
    data_scaled, local_min, _ = local_min_max(data)

    # Scaled 'low' values observed at those positions
    local_min_values = data_scaled.loc[local_min]

    # Count the minima strictly below the threshold and normalize
    # by the window length so windows of different sizes compare
    proportion = (local_min_values < threshold).sum() / len(data)

    return proportion

In [44]:
# Evaluate the feature on the randomly sampled window.
low_threshold_minima_count(random_window)

np.float64(0.04)

### Feature 3: Maxima Between Minima

This feature is critical because it analyzes the relationship between two significant minima and the local maxima that might lie between them.

1. **Pattern Recognition in Double Bottoms**: 
   The double-bottom pattern is characterized by two nearly equal lows (minima) separated by a peak (maximum). This feature captures the core structure of this pattern by checking if a maximum exists between two significant, low-valued minima.

2. **Validation of Reversal Potential**:
   By ensuring the minima are below a specified threshold (e.g., 0.20), the function confirms that these lows are significant enough to suggest potential market exhaustion. If a high lies between these lows, it indicates a potential reversal setup.

3. **Highlighting Key Support and Resistance Levels**:
   The relationship between the minima and maxima offers insights into market dynamics. The presence of a maximum between two significant lows suggests a potential neckline resistance, which is critical for validating the double-bottom breakout signal.

4. **Temporal and Spatial Arrangement**:
   This feature ensures that the minima and the intervening maximum occur in the correct sequence, an essential requirement for confirming the double-bottom shape.


In [None]:
def max_between_min(data: pd.DataFrame,
                    threshold: float = 0.20) -> bool:
    """
    Determines if any local maximum lies between the two lowest
    local minima, provided both minima are below a threshold,
    indicating potential double-bottom behavior.

    Args:
        data (pd.DataFrame): A DataFrame containing OHLC data.
        threshold (float): Upper bound, on the min-max scaled
        'low' values, that both minima must stay strictly below.
        Defaults to 0.20.

    Returns:
        bool: True if at least one local maximum is positioned
        strictly between the two lowest minima and both minima
        are below the threshold, False otherwise.
    """
    # Identify scaled data and the positions of the local
    # minima and maxima
    data_scaled, local_minima, local_maxima = local_min_max(data)

    # Keep the two lowest minima, then restore chronological
    # order so the first entry is the earlier bottom
    lowest_minima = (
        data_scaled.loc[local_minima]
        .sort_values()
        .iloc[:2]
        .sort_index()
    )

    # A double bottom needs two minima, both below the threshold
    if len(lowest_minima) != 2:
        return False
    if not (lowest_minima < threshold).all():
        return False

    # Only the positions of the maxima matter here, so they are
    # compared directly without looking up or sorting their values.
    # NOTE(review): two distinct minima of a smooth fitted curve
    # are normally separated by a maximum, so this check is
    # expected to fail only rarely - confirm this is intended.
    first_min, second_min = lowest_minima.index
    return any(
        first_min < max_idx < second_min
        for max_idx in local_maxima
    )



In [None]:
# Evaluate the feature on the randomly sampled window.
max_between_min(random_window)

True