# 02.- Feature Engineering

In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
from sklearn import preprocessing

from utils.plotting import candlestick_plot, local_maxima_minima_plot

## Raw Data Loading

In [2]:
# Open the pickle file containing raw benchmark data.
# NOTE: unpickling can execute arbitrary code, so this file must come
# from a trusted source.
with open('../data/raw/benchmark_data.pkl', 'rb') as handle:
    bm = pkl.load(handle)

# Keep the 'ibex' entry as the working dataset and preview its first rows.
data = bm['ibex']
data.head()

Unnamed: 0,close,high,low,open,vol
2000-01-03,11610.0,11881.8,11574.4,11846.6,60107000.0
2000-01-04,11206.6,11530.0,11159.8,11499.5,62539000.0
2000-01-05,10863.1,11068.1,10824.9,11206.6,68153000.0
2000-01-07,11102.4,11137.9,10882.7,10963.4,144207000.0
2000-01-10,11173.3,11364.3,11120.6,11363.8,133817000.0


In [3]:
window_size = 30

# Seeded generator: without a fixed seed every "Restart & Run All"
# draws a different window, so none of the outputs below would be
# reproducible.
SEED = 42
rng = np.random.default_rng(SEED)

# Randomly select a starting row position. The upper bound is
# exclusive, so the last admissible start is len(data) - window_size.
start_index = int(rng.integers(0, len(data) - window_size + 1))

# Select the consecutive window of rows
random_window = data.iloc[start_index:start_index + window_size]
candlestick_plot(random_window, 'Random Window')

FigureWidget({
    'data': [{'close': [9606.5, 9569.5, 9467.6, 9399.1, 9376.1, 9376.3, 9301.3,
                        9208.7, 9171.2, 9270.8, 9284.1, 9306.8, 9329.2, 9365.3,
                        9404.6, 9447.5, 9486.3, 9583.7, 9590.4, 9512.8, 9493.6,
                        9524.8, 9527.5, 9389.2, 9407.0, 9305.5, 9361.1, 9314.5,
                        9253.9, 9199.2],
              'high': [9671.1, 9597.1, 9574.1, 9470.6, 9421.6, 9436.1, 9377.6,
                       9291.8, 9237.9, 9300.9, 9302.5, 9319.7, 9387.1, 9378.3,
                       9434.4, 9453.6, 9502.9, 9607.0, 9665.4, 9586.0, 9568.0,
                       9530.3, 9542.8, 9468.9, 9475.2, 9356.1, 9424.7, 9378.7,
                       9318.1, 9243.0],
              'low': [9598.4, 9502.3, 9445.1, 9375.1, 9356.1, 9308.7, 9301.3,
                      9208.2, 9111.1, 9161.0, 9211.8, 9248.7, 9296.7, 9324.2,
                      9349.9, 9385.8, 9422.4, 9488.3, 9563.3, 9499.9, 9480.4,
                      9471.1, 9406

In [25]:
def local_min_max(data: pd.DataFrame,
                  degree: int = 15) -> tuple[pd.Series,
                                             np.ndarray,
                                             np.ndarray]:
    """
    Identifies local minima and maxima of the 'low' values of a
    time-series window on a polynomial-smoothed version of the series.

    The 'low' column is min-max scaled, a polynomial is least-squares
    fitted against the row positions (0, 1, ..., n - 1), and the
    turning points of the fitted curve are reported.

    Args:
        data (pd.DataFrame): A DataFrame containing a 'low'
        column that represents the lowest values (e.g., stock lows).
        degree (int): Degree of the fitted polynomial. Defaults to
        15, the value that used to be hard-coded.

    Returns:
        tuple[pd.Series, np.ndarray, np.ndarray]:
            - A Series of the normalized 'low' values, indexed by
              row position (0..n-1) rather than by the original index.
            - Row positions of the local minima of the fitted curve.
            - Row positions of the local maxima of the fitted curve.
    """

    # Normalize the 'low' values using min-max scaling, bringing
    # them into the range [0, 1]. Wrapping the resulting array in a
    # Series yields a positional 0..n-1 index.
    data_scaled = pd.Series(preprocessing.minmax_scale(data['low']))

    # Row positions act as the x axis for both fitting and evaluation.
    positions = np.arange(len(data_scaled), dtype=float)

    # Fit the polynomial and evaluate it at every row position.
    # NOTE(review): np.polyfit may emit a RankWarning when the fit is
    # poorly conditioned (high degree relative to the window length).
    coefficients = np.polyfit(positions, data_scaled, degree)
    fitted = np.polyval(coefficients, positions)

    # Sign of the first difference (slope) of the fitted curve and
    # where that sign flips. Computed once and shared by both
    # extrema searches.
    slope_sign_change = np.diff(np.sign(np.diff(fitted)))

    # Local minima: slope goes from negative to positive.
    # The +1 maps the double-diff offset back to a row position.
    local_min = (slope_sign_change > 0).nonzero()[0] + 1

    # Local maxima: slope goes from positive to negative.
    local_max = (slope_sign_change < 0).nonzero()[0] + 1

    return data_scaled, local_min, local_max


In [28]:
# Scaled lows plus the row positions of the fitted curve's local
# minima / maxima for the sampled window.
data_scaled, local_min, local_max = local_min_max(random_window)

## Features Definition

### Feature 1: Absolute Extrema Duration
This feature could highlight relative compression or elongation of trends. For instance:

1. **Rapid Changes**:
   A small time difference could hint at erratic or impulsive market behavior, potentially filtering out broader, more gradual trends that might not fit a double-bottom scenario.

2. **Structure of Support and Resistance**:
   Longer time differences between the extremes could indicate a more stable consolidation, aligning with the characteristics of a well-formed double bottom, where the two lows and the interim peak occur over a longer timeframe.


In [29]:
def absolute_extrema_duration(data: pd.DataFrame) -> float:
    """Calculate the normalized distance, in rows, between the
    highest 'high' and the lowest 'low' of a window.

    Args:
        data (pd.DataFrame): DataFrame containing OHLC data
        ('high' and 'low' columns are read).

    Returns:
        float: Number of rows between the highest and
        lowest values divided by the window length.

        This normalization can make your model more robust
        when handling different time windows or comparing
        different patterns.

    Raises:
        ValueError: If the window is empty.
    """
    window_length = len(data)
    if window_length == 0:
        raise ValueError(
            "absolute_extrema_duration requires a non-empty window")

    # Work with row positions directly instead of converting the
    # index label to a string and looking it up again: that round
    # trip only works for a unique, day-resolution DatetimeIndex.
    max_index = int(data['high'].argmax())
    min_index = int(data['low'].argmin())

    # Distance in rows, normalized by the window length
    duration = abs(max_index - min_index) / window_length

    return duration


In [30]:
# Example: Feature 1 for the sampled window (shown as the cell output).
absolute_extrema_duration(random_window)

0.26666666666666666

### Feature 2: Low Threshold Minima Count

1. **Capturing Potential "Bottoms"**:
   The double-bottom pattern is characterized by two distinct troughs. By counting local minima that fall below a threshold, this feature highlights regions where the price exhibits pronounced dips, which are critical to identifying the pattern.

2. **Indicator of Significant Downward Movements**:
   A higher count of minima below the threshold suggests strong downward pressures within the window, increasing the likelihood of the presence of one or more "bottoms." This metric helps focus the analysis on windows with sufficient depth for the pattern to exist.

3. **Normalization Across Windows**:
   Dividing the count by the length of the data window ensures that the feature is scale-invariant and comparable across windows of different sizes. This is essential for ensuring that the feature is meaningful in varying contexts, regardless of the specific duration of the window.

4. **Threshold as a Proxy for Pattern Shape**:
   The chosen threshold (e.g., 0.20) is a proxy for the depth of price movements. Adjusting this value can help fine-tune the feature to be sensitive to the specific characteristics of double bottoms in the dataset being analyzed.


In [31]:
def low_threshold_minima_count(data: pd.DataFrame,
                               threshold: float = 0.20) -> float:
    """
    Calculate the proportion of local minima below a specified
    threshold within a time window.

    The local minima are derived from `data` itself through
    `local_min_max`, so the result never depends on variables left
    over from another cell or another window.

    Args:
        data (pd.DataFrame): DataFrame containing OHLC data.
        threshold (float): Upper bound (on the min-max scaled
        'low' values) for a minimum to be counted. Defaults to 0.20.

    Returns:
        float: Number of local minima strictly below the threshold
        divided by the data length.
    """
    # Scaled lows and positions of the local minima for THIS window
    scaled_lows, minima_positions, _ = local_min_max(data)

    # Extract the values of the local minima
    local_min_values = scaled_lows.iloc[minima_positions]

    # Count minima below the threshold and normalize
    # by the data length
    below_threshold = int((local_min_values < threshold).sum())

    return below_threshold / len(data)

In [32]:
# Example: Feature 2 for the sampled window (shown as the cell output).
low_threshold_minima_count(random_window)

np.float64(0.03333333333333333)

### Feature 3: Maxima Between Minima

This feature is critical because it analyzes the relationship between two significant minima and the local maxima that might lie between them.

1. **Pattern Recognition in Double Bottoms**: 
   The double-bottom pattern is characterized by two nearly equal lows (minima) separated by a peak (maximum). This feature captures the core structure of this pattern by checking if a maximum exists between two significant, low-valued minima.

2. **Validation of Reversal Potential**:
   By ensuring the minima are below a specified threshold (e.g., 0.20), the function confirms that these lows are significant enough to suggest potential market exhaustion. If a high lies between these lows, it indicates a potential reversal setup.

3. **Highlighting Key Support and Resistance Levels**:
   The relationship between the minima and maxima offers insights into market dynamics. The presence of a maximum between two significant lows suggests a potential neckline resistance, which is critical for validating the double-bottom breakout signal.

4. **Temporal and Spatial Arrangement**:
   This feature ensures that the minima and the intervening maximum occur in the correct sequence, an essential requirement for confirming the double-bottom shape.


In [33]:
def max_between_min(data: pd.DataFrame,
                    threshold: float = 0.20) -> bool:
    """
    Determines if any local maximum lies between the two lowest
    local minima, provided both minima are below a threshold,
    indicating potential double-bottom behavior.

    The extrema are derived from `data` itself through
    `local_min_max`, so the result never depends on variables left
    over from another cell or another window.

    Args:
        data (pd.DataFrame): A DataFrame containing OHLC data.
        threshold (float): Upper bound (on the min-max scaled
        'low' values) both minima must stay under. Defaults to 0.20.

    Returns:
        bool: True if there are at least two local minima, the two
        lowest ones are both below the threshold, and at least one
        local maximum is positioned strictly between them.
        False otherwise.
    """
    scaled_lows, minima_positions, maxima_positions = local_min_max(data)

    # Two lowest minima, re-ordered chronologically (by position)
    lowest_minima = (
        scaled_lows.iloc[minima_positions]
        .sort_values()
        .iloc[:2]
        .sort_index()
    )

    # Guard: need exactly two minima, both below the threshold
    if len(lowest_minima) != 2 or not (lowest_minima < threshold).all():
        return False

    first_bottom, second_bottom = lowest_minima.index

    # Only the positions of the maxima matter here, so no sorting
    # by value is required.
    return any(
        first_bottom < max_idx < second_bottom
        for max_idx in maxima_positions
    )



In [34]:
# Example: Feature 3 for the sampled window (shown as the cell output).
max_between_min(random_window)

False

### Feature 4: Pattern Extrema Duration

1. **Accurate Temporal Relationship Between Extremes**:
   The duration between the minima and maxima within a potential Double Bottom is critical for understanding the timing of the pattern's formation. A shorter duration between the minima and maxima could suggest a tight price action, potentially indicating a quick reversal or confirmation of the pattern. On the other hand, a longer duration may imply a slower development but could still be indicative of a valid reversal. This feature provides a time window that refines the model's ability to distinguish between a valid Double Bottom and other similar patterns.

2. **Supports the Continuity of the Price Action**:
   Even though the polynomial fit (`np.polyfit`) smooths out noise, the duration of the pattern captures how the market moves between key points. A short duration between the local minimum and maximum might indicate a swift bounce after a fall, which is characteristic of some strong reversals. This feature helps the model understand how quickly or slowly the market can form a valid reversal, aligning with the characteristic behavior of Double Bottoms, where price typically drops, forms a bottom, rises, and revisits that bottom before reversing upwards.

3. **Model’s Temporal Sensitivity to Pattern Development**:
   This feature fine-tunes the model’s ability to learn from historical instances of Double Bottoms with similar timing structures. It helps ensure that the model recognizes patterns that form over reasonable periods as valid instances. By leveraging this feature, the model is equipped to make decisions about the timing and context of potential Double Bottoms in future data, which is essential for trading decisions or trend predictions.


In [35]:
def pattern_extrema_duration(data: pd.DataFrame) -> float:
    """Calculate the duration between the potential
    minimum and maximum of a Double Bottom pattern.

    The extrema are derived from `data` itself through
    `local_min_max`, so the result never depends on variables left
    over from another cell or another window.

    Args:
        data (pd.DataFrame): A DataFrame containing OHLC data.

    Returns:
        float: The distance (in rows, relative to the window
        length) between the lowest-valued local minimum and the
        lowest-valued local maximum.

    Raises:
        IndexError: If the fitted curve has no local minimum or
        no local maximum.
    """
    scaled_lows, minima_positions, maxima_positions = local_min_max(data)

    # Position of the local minimum with the smallest scaled value
    min_index = scaled_lows.loc[
        minima_positions].sort_values().index[0]

    # Position of the local maximum with the smallest scaled value.
    # NOTE(review): the ascending sort selects the LOWEST maximum,
    # not the highest; kept as is, confirm this is the intended peak.
    max_index = scaled_lows.loc[
        maxima_positions].sort_values().index[0]

    # Distance in rows between both positions, normalized
    # by the length of the data
    days_between = abs(max_index - min_index) / len(data)

    return days_between


In [36]:
# Example: Feature 4 for the sampled window (shown as the cell output).
pattern_extrema_duration(random_window)

np.float64(0.6333333333333333)

### Feature 5: Minima Deviation

The standard deviation of the y-values at local minima can be a key indicator for identifying the likelihood of a double bottom pattern because:

1. **Consistency of the Low Points**:
   A low standard deviation suggests that the minima are close in value, indicating a greater likelihood that the lows are aligned. This is a critical characteristic of a double bottom pattern where the two troughs are expected to be at similar levels.

2. **Noise Reduction and Relevance**:
   High variability in the y-values of minima could imply erratic price behavior or a lack of clear structure in the local lows, which is less likely to form a recognizable double bottom pattern.

3. **Alignment and Pattern Validity**:
   When minima values are tightly clustered, it supports the hypothesis that the price reached similar levels during the two dips, reinforcing the validity of the potential pattern.

In [42]:
def minima_deviation(data: pd.DataFrame) -> float:
    """
    Computes the standard deviation of the y-values (scaled)
    corresponding to the local minima within the data.

    The minima are derived from `data` itself through
    `local_min_max`, so the result never depends on variables left
    over from another cell or another window.

    Args:
        data (pd.DataFrame): A DataFrame containing OHLC data.
    Returns:
        float: Standard deviation of the y-values at the
        identified local minima. NaN when fewer than two local
        minima are found (pandas' sample standard deviation).
    """
    scaled_lows, minima_positions, _ = local_min_max(data)

    # Standard deviation of the scaled values at the local minima
    return scaled_lows.loc[minima_positions].std()

In [43]:
# Example: Feature 5 for the sampled window (shown as the cell output).
minima_deviation(random_window)

np.float64(0.27526812310914717)

### Feature 6: Minima Mean

The mean value of the two smallest local minima is crucial for determining the potential for a double bottom pattern. Here's why:

1. **Consistency of Lows**:
A key characteristic of a double bottom pattern is that the two troughs are relatively close in value, forming a consistent support level. The mean of these two minima provides a measure of this consistency.

2. **Validation of Similar Depth**:
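Because the lows are min-max scaled to [0, 1] within each window, a mean close to 0 can only occur when both minima sit near the bottom of the window, i.e. at a similar price level, which strengthens the case for a double bottom. A larger mean indicates that at least one of the two minima lies well above the window's lowest low.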

In [44]:
def minima_mean(data: pd.DataFrame) -> float:
    """
    Computes the mean of the scaled values at the two
    smallest local minima.

    The minima are derived from `data` itself through
    `local_min_max`, so the result never depends on variables left
    over from another cell or another window.

    Args:
        data (pd.DataFrame): A DataFrame containing OHLC data.

    Returns:
        float: Mean of the scaled values at the two
        smallest local minima. With a single local minimum its
        value is returned; with none the result is NaN.
    """
    scaled_lows, minima_positions, _ = local_min_max(data)

    # Two smallest local minima (by scaled value), then their mean
    two_lowest = scaled_lows.loc[
        minima_positions].sort_values().iloc[0:2]

    return two_lowest.mean()

In [45]:
# Example: Feature 6 for the sampled window (shown as the cell output).
minima_mean(random_window)

np.float64(0.19464395649497135)