# Data Normalisation

In [1]:
#import required libraries
from utils import aws # used to create aws session and load parquet 
import pandas as pd
import numpy as np
import ast 
import dask.dataframe as dd

## Load Sample data from S3

Make sure to update credentials

In [2]:
# Load the sample LOB parquet file from S3 into a Dask DataFrame (AWS credentials must be configured first)
samp_lob_ddf = aws.load_s3_file_as_ddf("s3://dsmp-ol2/processed-data/lob_sample_data.parquet")

In [3]:
# Compute the Dask DataFrame into a pandas DataFrame
samp_lob = samp_lob_ddf.compute()

In [4]:
# Preview the first rows of the sample LOB data
samp_lob.head()

Unnamed: 0,Timestamp,Exchange,Bid,Ask,Date,Mid_Price
0,0.0,Exch0,[],[],2025-01-02,
1,0.279,Exch0,"[[1, 6]]",[],2025-01-02,
2,1.333,Exch0,"[[1, 6]]","[[800, 1]]",2025-01-02,400.5
3,1.581,Exch0,"[[1, 6]]","[[799, 1]]",2025-01-02,400.0
4,1.643,Exch0,"[[1, 6]]","[[798, 1]]",2025-01-02,399.5


## Normalise the Mid-Price using z-score

Based on the research done on normalisation, a common technique used when analysing LOB data is the z-score. As trends change over time in finance, it is commonplace to use dynamic normalisation. In the case of the z-score, this means taking the mean and standard deviation of a previous period (often the previous day) and using them to calculate the z-score. The drawback is that the first day's data is lost, as there is no previous day to normalise against.

Below uses a dynamic z-score to normalise the Mid-Price. Additional features can be added for normalisation.

In [5]:
# Replace 'features_to_normalise' with the list of feature column names we want to normalise
# Starting with just mid-price as an example
features_to_normalise = ['Mid_Price']  # Replace with feature column names in the future

# The earliest date has no previous day to take statistics from, so it is skipped below.
# Looked up once here because it does not change between iterations.
first_date = samp_lob['Date'].min()

for date in samp_lob['Date'].unique():
    if date == first_date:
        continue

    # Most recent date strictly before the current one
    prev_date = samp_lob.loc[samp_lob['Date'] < date, 'Date'].max()

    # Mean and standard deviation of every feature over the previous day.
    # These names deliberately differ from the prev_day_mean / prev_day_std helper
    # functions defined further down, so re-running this cell cannot overwrite them.
    prev_rows = samp_lob.loc[samp_lob['Date'] == prev_date, features_to_normalise]
    prev_means = prev_rows.mean()
    prev_stds = prev_rows.std()

    # z-score of the current day's values against the previous day's statistics
    current_rows = samp_lob['Date'] == date
    for feature in features_to_normalise:
        normalised_feature = (samp_lob.loc[current_rows, feature] - prev_means[feature]) / prev_stds[feature]
        # Column is named '<feature>_z-score_normalised', matching the naming used by normalise_features
        samp_lob.loc[current_rows, feature + '_z-score_normalised'] = normalised_feature

In [6]:

# Now the df contains normalised values for features based on z-score with stats from the previous day
samp_lob

Unnamed: 0,Timestamp,Exchange,Bid,Ask,Date,Mid_Price,Mid_Pricez-score_normalized
0,0.000,Exch0,[],[],2025-01-02,,
1,0.279,Exch0,"[[1, 6]]",[],2025-01-02,,
2,1.333,Exch0,"[[1, 6]]","[[800, 1]]",2025-01-02,400.5,
3,1.581,Exch0,"[[1, 6]]","[[799, 1]]",2025-01-02,400.0,
4,1.643,Exch0,"[[1, 6]]","[[798, 1]]",2025-01-02,399.5,
...,...,...,...,...,...,...,...
1037929,30599.418,Exch0,"[[323, 2], [104, 3], [63, 1], [44, 6]]","[[338, 1], [343, 2], [507, 4], [659, 1], [749,...",2025-01-06,330.5,0.893131
1037930,30599.449,Exch0,"[[323, 2], [99, 3], [63, 1], [44, 6]]","[[338, 1], [343, 2], [507, 4], [659, 1], [749,...",2025-01-06,330.5,0.893131
1037931,30599.635,Exch0,"[[323, 2], [99, 3], [63, 1], [44, 6]]","[[338, 1], [341, 2], [507, 4], [659, 1], [749,...",2025-01-06,330.5,0.893131
1037932,30599.697,Exch0,"[[323, 2], [249, 1], [99, 3], [44, 6]]","[[338, 1], [341, 2], [507, 4], [659, 1], [749,...",2025-01-06,330.5,0.893131


Although z-score is a common normalisation technique, a number of papers suggest that different models perform better with different normalisation techniques. Given that data normalisation has such an impact on model performance, the choice of technique can be considered during model tuning to ensure the most performant method is selected.

## Create a Normalisation Function

Below takes the code above and uses it in a function where different normalisation techniques can be utilised.

## Extract the normalisation techniques into their own functions

In [5]:
# TODO Add window size to determine the number of previous days to use in the normalisation
# TODO Migrate this notebook to the model pipeline notebook

# Define functions for the common measures required in the normalisation techniques

def prev_day_mean(df, feature, prev_date):
    """
    Return the average of one feature over the rows belonging to a given day.

    Parameters:
        df (DataFrame): Data holding a 'Date' column and the feature column.
        feature (str): Column whose mean is wanted.
        prev_date: Value matched against df['Date'] to select the previous day's rows.

    Returns:
        float: Mean of `feature` over the rows where 'Date' equals `prev_date`.
    """
    on_prev_day = df['Date'] == prev_date  # boolean mask for the previous day's rows
    return df.loc[on_prev_day, feature].mean()

def prev_day_std(df, feature, prev_date):
    """
    Return the standard deviation of one feature over the rows belonging to a given day.

    Parameters:
        df (DataFrame): Data holding a 'Date' column and the feature column.
        feature (str): Column whose standard deviation is wanted.
        prev_date: Value matched against df['Date'] to select the previous day's rows.

    Returns:
        float: Standard deviation (pandas default `Series.std`) of `feature` over the
        rows where 'Date' equals `prev_date`.
    """
    on_prev_day = df['Date'] == prev_date  # boolean mask for the previous day's rows
    return df.loc[on_prev_day, feature].std()

def prev_day_max(df, feature, prev_date):
    """
    Return the largest value of one feature over the rows belonging to a given day.

    Parameters:
        df (DataFrame): Data holding a 'Date' column and the feature column.
        feature (str): Column whose maximum is wanted.
        prev_date: Value matched against df['Date'] to select the previous day's rows.

    Returns:
        float: Maximum of `feature` over the rows where 'Date' equals `prev_date`.
    """
    on_prev_day = df['Date'] == prev_date  # boolean mask for the previous day's rows
    return df.loc[on_prev_day, feature].max()

def prev_day_min(df, feature, prev_date):
    """
    Return the smallest value of one feature over the rows belonging to a given day.

    Parameters:
        df (DataFrame): Data holding a 'Date' column and the feature column.
        feature (str): Column whose minimum is wanted.
        prev_date: Value matched against df['Date'] to select the previous day's rows.

    Returns:
        float: Minimum of `feature` over the rows where 'Date' equals `prev_date`.
    """
    on_prev_day = df['Date'] == prev_date  # boolean mask for the previous day's rows
    return df.loc[on_prev_day, feature].min()

def prev_day_median(df, feature, prev_date):
    """
    Return the median of one feature over the rows belonging to a given day.

    Parameters:
        df (DataFrame): Data holding a 'Date' column and the feature column.
        feature (str): Column whose median is wanted.
        prev_date: Value matched against df['Date'] to select the previous day's rows.

    Returns:
        float: Median of `feature` over the rows where 'Date' equals `prev_date`.
    """
    on_prev_day = df['Date'] == prev_date  # boolean mask for the previous day's rows
    return df.loc[on_prev_day, feature].median()

def prev_day_quartiles(df, feature, prev_date):
    """
    Return the lower and upper quartiles of one feature over the rows belonging to a given day.

    Parameters:
        df (DataFrame): Data holding a 'Date' column and the feature column.
        feature (str): Column whose quartiles are wanted.
        prev_date: Value matched against df['Date'] to select the previous day's rows.

    Returns:
        pandas.Series: Two values indexed by 0.25 (lower quartile) and 0.75 (upper quartile),
        computed over the rows where 'Date' equals `prev_date`.
    """
    on_prev_day = df['Date'] == prev_date  # boolean mask for the previous day's rows
    quartile_levels = [0.25, 0.75]
    return df.loc[on_prev_day, feature].quantile(quartile_levels)


In [6]:
# Define the normalisation functions to be called in the normalise_features function

def z_score_normalise(df, feature, prev_date, date):
    """
    Z-score the current day's values of a feature against the previous day's statistics.

    Parameters:
        df (DataFrame): Data holding a 'Date' column and the feature column.
        feature (str): Column to normalise.
        prev_date: Date whose rows supply the mean and standard deviation.
        date: Date whose rows are normalised.

    Returns:
        Series: (value - previous-day mean) / previous-day std for the rows of `date`,
        keeping those rows' original index.
    """
    todays_values = df.loc[df['Date'] == date, feature]
    centre = prev_day_mean(df, feature, prev_date)  # previous day's mean
    spread = prev_day_std(df, feature, prev_date)  # previous day's standard deviation
    return (todays_values - centre) / spread

def min_max_normalise(df, feature, prev_date, date):
    """
    Min-max scale the current day's values of a feature using the previous day's range.

    Parameters:
        df (DataFrame): Data holding a 'Date' column and the feature column.
        feature (str): Column to normalise.
        prev_date: Date whose rows supply the minimum and maximum.
        date: Date whose rows are normalised.

    Returns:
        Series: (value - previous-day min) / (previous-day max - previous-day min) for
        the rows of `date`, keeping those rows' original index.
    """
    todays_values = df.loc[df['Date'] == date, feature]
    lowest = prev_day_min(df, feature, prev_date)  # previous day's minimum
    highest = prev_day_max(df, feature, prev_date)  # previous day's maximum
    value_range = highest - lowest
    return (todays_values - lowest) / value_range

def decimal_scaling_normalise(df, feature, prev_date, date):
    """
    Normalise the given feature using decimal scaling normalisation.

    Each value of the current day is divided by 10**digits, where `digits` is the
    number of digits in the integer part of the previous day's maximum (at least 1).

    Parameters:
        df (DataFrame): DataFrame containing the data.
        feature (str): Name of the feature to normalise.
        prev_date (datetime): Previous date to calculate max value.
        date (datetime): Current date of the feature to be normalised.

    Returns:
        Series: Normalised feature values for the rows of `date`. All values are NaN
        when the previous day's maximum is missing or infinite, because no scale
        can be derived from it.
    """
    current_values = df[df['Date'] == date][feature]
    max_val = prev_day_max(df, feature, prev_date)  # Maximum of the previous day

    # Without a finite maximum there is nothing to scale by. Counting the characters of
    # 'nan' / 'inf' (as a string-based digit count would) silently picks an arbitrary scale.
    if not np.isfinite(max_val):
        return current_values * np.nan

    # Count digits on the truncated integer rather than on str(float): large floats print
    # in scientific notation ('1e+16'), which a string split on '.' miscounts.
    digits = len(str(int(abs(max_val))))  # Number of digits in front of the decimal
    normalised_feature = current_values / 10**digits  # decimal scaling
    return normalised_feature

def robust_scaling_normalise(df, feature, prev_date, date):
    """
    Normalise the given feature using robust scaling normalisation.

    Each value of the current day has the previous day's median subtracted and is
    then divided by the previous day's interquartile range.

    Parameters:
        df (DataFrame): DataFrame containing the data.
        feature (str): Name of the feature to normalise.
        prev_date (datetime): Previous date to calculate median and interquartile range.
        date (datetime): Current date of the feature to be normalised.

    Returns:
        Series: Normalised feature values for the rows of `date`.
    """
    # Local names must differ from the prev_day_* helpers: assigning to a name that
    # matches a helper makes it local to this function, and calling it before the
    # assignment completes raises UnboundLocalError.
    median = prev_day_median(df, feature, prev_date)  # Median of the previous day
    quartiles = prev_day_quartiles(df, feature, prev_date)  # Upper and lower quartiles of the previous day
    iqr = quartiles[0.75] - quartiles[0.25]  # Inter quartile range
    normalised_feature = (df[df['Date'] == date][feature] - median) / iqr  # robust scaling
    return normalised_feature

def sigmoid_normalise(df, feature, prev_date, date):
    """
    Normalise the given feature using sigmoid normalisation.

    The current day's values are z-scored against the previous day's mean and
    standard deviation and passed through the logistic function:
    1 / (1 + exp(-(x - mean) / std)). A value equal to the previous day's mean
    maps to 0.5; results lie between 0 and 1.

    Parameters:
        df (DataFrame): DataFrame containing the data.
        feature (str): Name of the feature to normalise.
        prev_date (datetime): Previous date to calculate mean and standard deviation.
        date (datetime): Current date of the feature to be normalised.

    Returns:
        Series: Normalised feature values for the rows of `date`.
    """
    # Local names must differ from the prev_day_* helpers, otherwise the helper call
    # raises UnboundLocalError because the name becomes local to this function.
    mean = prev_day_mean(df, feature, prev_date)  # Mean of the previous day
    std = prev_day_std(df, feature, prev_date)  # Standard deviation of the previous day
    current_values = df[df['Date'] == date][feature]
    # The minus sign applies to the whole z-score: -(x - mean) / std, not (-x - mean) / std
    normalised_feature = 1 / (1 + np.exp(-(current_values - mean) / std))  # sigmoid
    return normalised_feature

def tanh_normalise(df, feature, prev_date, date):
    """
    Normalise the given feature using tanh normalisation.

    Applies the tanh-estimator form 0.5 * (tanh(0.01 * (x - mean) / std) + 1) using
    the previous day's mean and standard deviation. A value equal to the previous
    day's mean maps to 0.5; results lie between 0 and 1.

    Parameters:
        df (DataFrame): DataFrame containing the data.
        feature (str): Name of the feature to normalise.
        prev_date (datetime): Previous date to calculate mean and standard deviation.
        date (datetime): Current date of the feature to be normalised.

    Returns:
        Series: Normalised feature values for the rows of `date`.
    """
    # Local names must differ from the prev_day_* helpers, otherwise the helper call
    # raises UnboundLocalError because the name becomes local to this function.
    mean = prev_day_mean(df, feature, prev_date)  # Mean of the previous day
    std = prev_day_std(df, feature, prev_date)  # Standard deviation of the previous day
    current_values = df[df['Date'] == date][feature]
    # The "+ 1" shifts tanh's (-1, 1) output so the halved result is in (0, 1)
    normalised_feature = 0.5 * (np.tanh(0.01 * (current_values - mean) / std) + 1)  # tanh estimation
    return normalised_feature

def mean_normalise(df, feature, prev_date, date):
    """
    Normalise the given feature using mean normalisation.

    Each value of the current day has the previous day's mean subtracted and is
    then divided by the previous day's range (max - min).

    Parameters:
        df (DataFrame): DataFrame containing the data.
        feature (str): Name of the feature to normalise.
        prev_date (datetime): Previous date to calculate mean, min, and max.
        date (datetime): Current date of the feature to be normalised.

    Returns:
        Series: Normalised feature values for the rows of `date`.
    """
    # The local name must differ from the prev_day_mean helper, otherwise the helper call
    # raises UnboundLocalError because the name becomes local to this function.
    mean = prev_day_mean(df, feature, prev_date)  # Mean of the previous day
    min_val = prev_day_min(df, feature, prev_date)  # Minimum of the previous day
    max_val = prev_day_max(df, feature, prev_date)  # Maximum of the previous day
    normalised_feature = (df[df['Date'] == date][feature] - mean) / (max_val - min_val)  # mean normalisation
    return normalised_feature

def median_normalise(df, feature, prev_date, date):
    """
    Normalise the given feature using median normalisation.

    Each value of the current day has the previous day's minimum subtracted and is
    then divided by the previous day's median.

    Parameters:
        df (DataFrame): DataFrame containing the data.
        feature (str): Name of the feature to normalise.
        prev_date (datetime): Previous date to calculate median and min.
        date (datetime): Current date of the feature to be normalised.

    Returns:
        Series: Normalised feature values for the rows of `date`.
    """
    # The local name must differ from the prev_day_median helper, otherwise the helper call
    # raises UnboundLocalError because the name becomes local to this function.
    median = prev_day_median(df, feature, prev_date)  # Median of the previous day
    min_val = prev_day_min(df, feature, prev_date)  # Minimum of the previous day
    normalised_feature = (df[df['Date'] == date][feature] - min_val) / median  # median normalisation
    return normalised_feature

def max_absolute_normalise(df, feature, prev_date, date):
    """
    Normalise the given feature using max absolute normalisation.

    Each value of the current day is divided by the largest absolute value the
    feature took on the previous day. For features that are never negative this
    is the same as dividing by the previous day's maximum.

    Parameters:
        df (DataFrame): DataFrame containing the data.
        feature (str): Name of the feature to normalise.
        prev_date (datetime): Previous date to calculate max absolute value.
        date (datetime): Current date of the feature to be normalised.

    Returns:
        Series: Normalised feature values for the rows of `date`.
    """
    max_val = prev_day_max(df, feature, prev_date)  # Maximum of the previous day
    min_val = prev_day_min(df, feature, prev_date)  # Minimum of the previous day
    # The largest magnitude is at one of the two extremes; taking only the signed maximum
    # would under-scale a feature whose most extreme value is negative.
    max_abs = max(abs(max_val), abs(min_val))
    normalised_feature = (df[df['Date'] == date][feature]) / max_abs  # max absolute
    return normalised_feature


In [7]:
# Define the normalise_features function

def normalise_features(df, features_to_normalise, method='z-score'):
    """
    Normalise features in the DataFrame based on the chosen method.

    For every date except the earliest, each feature's values are normalised with
    statistics taken from the most recent earlier date. Results are written to a
    new column named '<feature>_<method>_normalised'; rows of the earliest date
    are left as NaN in that column because they have no previous day.

    Parameters:
        df (DataFrame): DataFrame containing the LOB data and features. Must have a
            'Date' column. It is modified in place.
        features_to_normalise (list): List of feature column names to normalise.
        method (str): Normalisation method. Supported methods: 'z-score', 'min-max',
            'decimal_scaling', 'robust_scaling', 'sigmoid', 'tanh', 'mean', 'median',
            'max_absolute'. Defaults to 'z-score'.

    Returns:
        DataFrame: The same DataFrame object that was passed in, with the normalised
        feature columns added.

    Raises:
        ValueError: If `method` is not one of the supported methods.
    """
    # Method name -> function implementing it. Built inside the function so the
    # functions are looked up when normalise_features is called.
    normalisers = {
        'z-score': z_score_normalise,
        'min-max': min_max_normalise,
        'decimal_scaling': decimal_scaling_normalise,
        'robust_scaling': robust_scaling_normalise,
        'sigmoid': sigmoid_normalise,
        'tanh': tanh_normalise,
        'mean': mean_normalise,
        'median': median_normalise,
        'max_absolute': max_absolute_normalise,
    }
    if method not in normalisers:
        raise ValueError(
            "Unsupported normalisation method '" + str(method) + "'. "
            "Supported methods: " + ", ".join(normalisers)
        )
    normalise = normalisers[method]
    output_suffix = '_' + method + '_normalised'

    first_date = df['Date'].min()  # the earliest date has no previous day; computed once
    for date in df['Date'].unique():
        if date == first_date:  # Skip the first date since there's no previous day
            continue

        prev_date = df.loc[df['Date'] < date, 'Date'].max()  # Find the most recent previous date
        current_rows = df['Date'] == date

        # Normalise the features for the current date and store them in new columns
        for feature in features_to_normalise:
            df.loc[current_rows, feature + output_suffix] = normalise(df, feature, prev_date, date)

    return df

In [8]:
# Features to normalise; only the mid-price for this example
features_to_normalise = ['Mid_Price']

# Apply decimal scaling. normalise_features adds the new column to samp_lob itself and
# returns that same DataFrame, so normalised_df and samp_lob refer to one object.
normalised_df = normalise_features(df=samp_lob, features_to_normalise=features_to_normalise, method='decimal_scaling')

In [9]:
# Display the DataFrame with the decimal-scaling normalised column added
normalised_df

Unnamed: 0,Timestamp,Exchange,Bid,Ask,Date,Mid_Price,Mid_Price_decimal_scaling_normalised
0,0.000,Exch0,[],[],2025-01-02,,
1,0.279,Exch0,"[[1, 6]]",[],2025-01-02,,
2,1.333,Exch0,"[[1, 6]]","[[800, 1]]",2025-01-02,400.5,
3,1.581,Exch0,"[[1, 6]]","[[799, 1]]",2025-01-02,400.0,
4,1.643,Exch0,"[[1, 6]]","[[798, 1]]",2025-01-02,399.5,
...,...,...,...,...,...,...,...
1037929,30599.418,Exch0,"[[323, 2], [104, 3], [63, 1], [44, 6]]","[[338, 1], [343, 2], [507, 4], [659, 1], [749,...",2025-01-06,330.5,0.3305
1037930,30599.449,Exch0,"[[323, 2], [99, 3], [63, 1], [44, 6]]","[[338, 1], [343, 2], [507, 4], [659, 1], [749,...",2025-01-06,330.5,0.3305
1037931,30599.635,Exch0,"[[323, 2], [99, 3], [63, 1], [44, 6]]","[[338, 1], [341, 2], [507, 4], [659, 1], [749,...",2025-01-06,330.5,0.3305
1037932,30599.697,Exch0,"[[323, 2], [249, 1], [99, 3], [44, 6]]","[[338, 1], [341, 2], [507, 4], [659, 1], [749,...",2025-01-06,330.5,0.3305
