# Data Normalisation

In [1]:
#import required libraries
from utils import aws # used to create aws session and load parquet 
import pandas as pd
import numpy as np
import ast 
import dask.dataframe as dd

## Load Sample data from S3

Make sure to update credentials

In [2]:
#load sample lob from s3 to a dask dataframe
samp_lob_ddf = aws.load_s3_file_as_ddf("s3://dsmp-ol2/processed-data/lob_sample_data.parquet")

In [3]:
# Compute the dask datafram to a pandas dataframe
samp_lob = samp_lob_ddf.compute()

In [4]:
samp_lob.head()

Unnamed: 0,Timestamp,Exchange,Bid,Ask,Date,Mid_Price
0,0.0,Exch0,[],[],2025-01-02,
1,0.279,Exch0,"[[1, 6]]",[],2025-01-02,
2,1.333,Exch0,"[[1, 6]]","[[800, 1]]",2025-01-02,400.5
3,1.581,Exch0,"[[1, 6]]","[[799, 1]]",2025-01-02,400.0
4,1.643,Exch0,"[[1, 6]]","[[798, 1]]",2025-01-02,399.5


## Normalise the Mid-Price using z-score

Based on the research done on normaliation, a common technique used when analysing LOB data is z-score. As trends change across time in finace, it is common place to use dynamic normalisation. In the case of z-score this means taking the mean and standard deviation of a previous period (often the previous day) and using that as the measure to caluclate z-score. The draw back of this is that the first day's data is lost as there is no previous day to normalise against.

Below uses a dynamic z-score to normalise the Mid-Price. Additional features can be added for normalisation.

In [5]:
# Replace 'features_to_normalize' with the list of feature column names we want to normalize
# Starting with just mid-price as an example
features_to_normalize = ['Mid_Price']  # Replace with feature column names in the future

for date in samp_lob['Date'].unique():
    if date == samp_lob['Date'].min():  # Skip the first date since there's no previous day
        continue
    
    prev_date = samp_lob[samp_lob['Date'] < date]['Date'].max()  # Find the most recent previous date
    
    # Calculate mean and standard deviation for each feature for the previous day
    prev_day_stats = samp_lob[samp_lob['Date'] == prev_date][features_to_normalize].mean()
    prev_day_std = samp_lob[samp_lob['Date'] == prev_date][features_to_normalize].std()
    
    # Normalize the features for the current date using z-score with stats from the previous day
    for feature in features_to_normalize:
        normalized_feature = (samp_lob[samp_lob['Date'] == date][feature] - prev_day_stats[feature]) / prev_day_std[feature]
        samp_lob.loc[samp_lob['Date'] == date, feature + 'z-score_normalized'] = normalized_feature

In [6]:

# Now the df contains normalized values for features based on z-score with stats from the previous day
samp_lob

Unnamed: 0,Timestamp,Exchange,Bid,Ask,Date,Mid_Price,Mid_Pricez-score_normalized
0,0.000,Exch0,[],[],2025-01-02,,
1,0.279,Exch0,"[[1, 6]]",[],2025-01-02,,
2,1.333,Exch0,"[[1, 6]]","[[800, 1]]",2025-01-02,400.5,
3,1.581,Exch0,"[[1, 6]]","[[799, 1]]",2025-01-02,400.0,
4,1.643,Exch0,"[[1, 6]]","[[798, 1]]",2025-01-02,399.5,
...,...,...,...,...,...,...,...
1037929,30599.418,Exch0,"[[323, 2], [104, 3], [63, 1], [44, 6]]","[[338, 1], [343, 2], [507, 4], [659, 1], [749,...",2025-01-06,330.5,0.893131
1037930,30599.449,Exch0,"[[323, 2], [99, 3], [63, 1], [44, 6]]","[[338, 1], [343, 2], [507, 4], [659, 1], [749,...",2025-01-06,330.5,0.893131
1037931,30599.635,Exch0,"[[323, 2], [99, 3], [63, 1], [44, 6]]","[[338, 1], [341, 2], [507, 4], [659, 1], [749,...",2025-01-06,330.5,0.893131
1037932,30599.697,Exch0,"[[323, 2], [249, 1], [99, 3], [44, 6]]","[[338, 1], [341, 2], [507, 4], [659, 1], [749,...",2025-01-06,330.5,0.893131


Although z-score is a common normalisation technique a number of papers suggest that different models perform better with different normalisation techniques. Given data normalisation has such and impact on model performance it can be considered during model tuning to ensure the most performant method is selected.

## Create a Normalisation Function

Below takes the code above and uses it in a function where different normalisation techniques can be utilised.

In [27]:
# TODO Extract z-score and min-max into separate functions that are called within normalisae features
# TODO Add window size to determine the number of previous days to use in the normalisation
# TODO Migrate this notebook to the model pipeline notebook

def normalize_features(df, features_to_normalize, method='z-score'):
    """
    Normalize features in the DataFrame based on the chosen method.

    Parameters:
        df (DataFrame): DataFrame containing the LOB data and features.
        features_to_normalize (list): List of feature column names to normalize.
        method (str): Normalization method. Supported methods: 'z-score', 'min-max'. Defaults to 'z-score'.

    Returns:
        DataFrame: DataFrame with normalized feature columns.
    """
    for date in df['Date'].unique():
        if date == df['Date'].min():  # Skip the first date since there's no previous day
            continue
        
        prev_date = df[df['Date'] < date]['Date'].max()  # Find the most recent previous date
        
        # Normalize the features for the current date using the chosen method
        for feature in features_to_normalize:
            if method == 'z-score':
                # Calculate mean and standard deviation for each feature for the previous day
                prev_day_mean = df[df['Date'] == prev_date][feature].mean()
                prev_day_std = df[df['Date'] == prev_date][feature].std()
                # Calculate z-score for each feature
                normalized_feature = (df[df['Date'] == date][feature] - prev_day_mean) / prev_day_std
            elif method == 'min-max':
                # Calculate min and max for each feature for the previous day
                min_val = df[df['Date'] == prev_date][feature].min()
                max_val = df[df['Date'] == prev_date][feature].max()
                # Calculate min-max normalisation for each feature
                normalized_feature = (df[df['Date'] == date][feature] - min_val) / (max_val - min_val)
            elif method == 'decimal_scaling':
                # Calculate max for each feature for the previous day
                max_val = df[df['Date'] == prev_date][feature].max()
                # Calculate the number of digits in the max_val
                digits = len(str(abs(max_val)).split('.')[0])
                # Calculate decimal scaling normalisation for each feature
                normalized_feature = df[df['Date'] == date][feature] / 10**digits
            elif method == 'robust_scaling':
                # Calculate median and iqr for each feature for the previous day
                prev_day_median = df[df['Date'] == prev_date][feature].median()
                prev_day_quartiles = df[df['Date'] == prev_date][feature].quantile([0.25, 0.75]) # Upper and lower quartiles
                prev_day_iqr = prev_day_quartiles[0.75] - prev_day_quartiles[0.25] # Inter quartile range
                # Calculate z-score for each feature
                normalized_feature = (df[df['Date'] == date][feature] - prev_day_median) / prev_day_iqr
            elif method == 'sigmoid': # TODO check this calculation
                # Calculate mean and standard deviation for each feature for the previous day
                prev_day_mean = df[df['Date'] == prev_date][feature].mean()
                prev_day_std = df[df['Date'] == prev_date][feature].std()
                # Calculate sigmoid normalisation for each feature
                normalized_feature = 1 / (1 + np.exp((-df[df['Date'] == date][feature] - prev_day_mean) / prev_day_std))
            elif method == 'tanh': 
                # Calculate mean and standard deviation for each feature for the previous day
                prev_day_mean = df[df['Date'] == prev_date][feature].mean()
                prev_day_std = df[df['Date'] == prev_date][feature].std()
                # Calculate tanh estimation for each feature
                normalized_feature = 0.5*(np.tanh((0.01*(df[df['Date'] == date][feature] - prev_day_mean)) / prev_day_std))
            elif method == 'mean':
                # Calculate min, max and median for each feature for the previous day
                prev_day_mean = df[df['Date'] == prev_date][feature].mean()
                min_val = df[df['Date'] == prev_date][feature].min()
                max_val = df[df['Date'] == prev_date][feature].max()
                # Calculate min-max normalisation for each feature
                normalized_feature = (df[df['Date'] == date][feature] - prev_day_mean) / (max_val - min_val)
            elif method == 'median':
                # Calculate median for each feature for the previous day
                prev_day_median = df[df['Date'] == prev_date][feature].median()
                # Calculate min-max normalisation for each feature
                normalized_feature = (df[df['Date'] == date][feature] - min_val) / prev_day_median
            elif method == 'max_absolute':
                # Calculate max for each feature for the previous day
                max_val = df[df['Date'] == prev_date][feature].max()
                # Calculate min-max normalisation for each feature
                normalized_feature = (df[df['Date'] == date][feature]) / max_val
            # Store the normalized values in new columns
            df.loc[df['Date'] == date, feature + '_' + method + '_normalized'] = normalized_feature
    
    return df