# Correcting the y-metric used for calculating MSE

In [2]:
import pandas as pd
from tensorflow.keras.saving import load_model
import numpy as np
from sklearn.preprocessing import StandardScaler

In [3]:
# Name of the sub-directory under ../../reports_log/ whose loss CSVs this notebook reads and writes.
report = "reports_mid_price"

In [4]:
def get_dates_for_training_scheme(
    df: pd.DataFrame,
    num_days_training: int = 9,
    num_days_val_test: int = 1,
) -> list[tuple[np.ndarray, np.ndarray]]:
    """
    Build the rolling-window schedule of (training days, validation/test days).

    The distinct day-of-month values found in ``df["sip_timestamp"]`` are walked with a window
    of ``num_days_training`` consecutive days followed by ``num_days_val_test`` held-out days;
    the window then advances by ``num_days_val_test`` days. With the defaults the model is
    trained on 9 days and validated/tested on the 10th. The last windows of the returned list
    can be reserved for testing by the caller.

    NOTE: days are identified through ``.dt.day`` (day of month), so the schedule is only
    unambiguous while the data stays within a single calendar month.

    Args:
        df (pd.DataFrame): dataframe with a datetime-typed "sip_timestamp" column.
        num_days_training (int): number of consecutive days in every training window.
        num_days_val_test (int): number of days held out after every training window; also
            the step by which the window advances.

    Returns:
        list[tuple[np.ndarray, np.ndarray]]: one tuple per window containing the training
        days and the validation/test days.

    Raises:
        ValueError: if either window size is smaller than 1.
    """
    if num_days_training < 1 or num_days_val_test < 1:
        raise ValueError("num_days_training and num_days_val_test must both be at least 1")

    dates = df["sip_timestamp"].dt.day.unique()

    return [
        (
            dates[start: start + num_days_training],
            dates[start + num_days_training: start + num_days_training + num_days_val_test],
        )
        for start in range(0, len(dates) - num_days_training, num_days_val_test)
    ]

def train_val_test(df: pd.DataFrame, days: tuple[np.ndarray, np.ndarray]):
    """
    Partition a dataframe into a training part and a validation/test part by day.

    Rows are selected by comparing the day of month of "sip_timestamp" with the given days.

    Args:
        df (pd.DataFrame): dataframe to partition; needs a datetime "sip_timestamp" column.
        days (tuple[np.ndarray, np.ndarray]): training days first, validation/test days second.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: the training rows and the validation/test rows.
    """
    day_of_month = df["sip_timestamp"].dt.day

    in_training_window = day_of_month.isin(days[0])
    in_holdout_window = day_of_month.isin(days[1])

    return df[in_training_window], df[in_holdout_window]

def create_sequences(data: np.ndarray, seq_length: int):
    """
    Cut a 2-D array into overlapping windows of fixed length for LSTM input.

    Window ``i`` consists of rows ``i .. i + seq_length - 1`` with all columns (the last
    column included) as features; its target is the last column of the row that directly
    follows the window.

    Args:
        data (np.ndarray): complete data containing both x and y, rows ordered in time.
        seq_length (int): the length of the sequence considered in the lstm.

    Returns:
        (np.ndarray, np.ndarray): x with shape (n, seq_length, n_columns) and y with shape
        (n,), where n = len(data) - seq_length. When the data is too short for a single
        window, correctly shaped empty arrays are returned so that the results of several
        calls can still be concatenated.
    """
    n_sequences = len(data) - seq_length

    if n_sequences <= 0:
        # Keep the trailing dimensions; a bare np.array([]) would be 1-D and could not be
        # concatenated with the 3-D output of other (non-empty) calls.
        empty_x = np.empty((0, seq_length) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0,), dtype=data.dtype)
        return empty_x, empty_y

    x = np.array([data[start:start + seq_length, :] for start in range(n_sequences)])
    # The target of window i is row i + seq_length, i.e. every row from seq_length onwards.
    y = data[seq_length:, -1].copy()

    return x, y

def create_sequences_modified(data, seq_length):
    """
    Build LSTM sequences separately for every date so that no sequence mixes rows from
    different dates.

    Args:
        data (pd.DataFrame): complete data containing both x and y plus a "date" column; the
            "date" column is only used for grouping and is removed before windowing.
        seq_length (int): the length of the sequence considered in the lstm.

    Returns:
        (np.array, np.array): x and y datasets with the sequences of all dates concatenated.
    """
    per_date = [
        create_sequences(day_frame.drop(columns=["date"]).values, seq_length)
        for _, day_frame in data.groupby("date")
    ]

    x_parts = [x_day for x_day, _ in per_date]
    y_parts = [y_day for _, y_day in per_date]

    return np.concatenate(x_parts), np.concatenate(y_parts)

def get_best_config(ticker: str):
    """
    Look up the hyper-parameter configuration with the lowest MSE for a ticker.

    Reads ``config_space_loss.csv`` from the report directory, keeps the rows of the given
    ticker and picks the one whose "mse" is minimal.

    Args:
        ticker (str): ticker whose best configuration is wanted.

    Returns:
        dict: the selected row as a dictionary, without its "ticker" and "mse" entries.
    """
    losses = pd.read_csv(f"../../reports_log/{report}/config_space_loss.csv")
    ticker_losses = losses.loc[losses["ticker"] == ticker]

    best_row = ticker_losses.loc[ticker_losses["mse"].idxmin(), :]

    return best_row.drop(index=["ticker", "mse"]).to_dict()

In [None]:
# Load the dataset that correct_y() later filters per ticker through its "ticker" column.
df = pd.read_csv("../../data/processed/tq_data_gridded/df_tot_gridded.csv")

In [35]:
def correct_y(df: pd.DataFrame, ticker: str, num_days_testing: int = 6):
    """
    Recompute the test error of the saved LSTM models of one ticker on the original
    (un-standardized) scale of the target.

    For each of the last ``num_days_testing`` rolling windows the columns of the test day
    are standardized with statistics fitted on the non-zero training values, the stored
    model ``../../reports/models/{ticker}-{i}.keras`` (``i`` counting the test windows from
    0) predicts the test day, and the predictions are mapped back with the scaler of the
    target column. The true values are read from the test day *before* any scaling, so they
    do not depend on a scale / inverse-scale round trip.

    Args:
        df (pd.DataFrame): data of all tickers with "ticker" and "sip_timestamp" columns.
            After those two are removed, the last remaining column is the target.
        ticker (str): ticker to evaluate.
        num_days_testing (int): number of trailing windows (and saved models) to evaluate.

    Returns:
        tuple[float, np.ndarray, np.ndarray]: the RMSE divided by the standard deviation of
        the true values, the concatenated predictions and the concatenated true values.

    Raises:
        ValueError: if ``num_days_testing`` is smaller than 1, if fewer rolling windows than
            ``num_days_testing`` are available, or if a model returns a different number of
            predictions than there are true values for its test day.
    """
    if num_days_testing < 1:
        # date_scheme[-0:] would silently select *every* window.
        raise ValueError("num_days_testing must be at least 1")

    config = get_best_config(ticker)
    seq_length = int(config["seq_length"])

    ticker_data = df[df["ticker"] == ticker].copy()
    ticker_data["sip_timestamp"] = pd.to_datetime(ticker_data["sip_timestamp"])
    ticker_data["date"] = ticker_data["sip_timestamp"].dt.date

    date_scheme = get_dates_for_training_scheme(ticker_data)
    if len(date_scheme) < num_days_testing:
        raise ValueError(
            f"{ticker}: only {len(date_scheme)} rolling windows available, "
            f"{num_days_testing} required for testing"
        )
    date_scheme_test = date_scheme[-num_days_testing:]

    predictions, truths = [], []

    for test_day, date_set in enumerate(date_scheme_test):
        train, test = train_val_test(ticker_data, date_set)

        train = train.drop(columns=["ticker", "sip_timestamp"])
        test = test.drop(columns=["ticker", "sip_timestamp"])

        # Standardize every column except the trailing "date"; the last of them is the target.
        columns_to_standardize = train.columns[:-1]
        target_column = columns_to_standardize[-1]

        # The first seq_length rows only serve as history for the first prediction.
        y_true = test.iloc[seq_length:, -2].to_numpy(dtype=float)

        target_scaler = None
        for col in columns_to_standardize:
            # Zeros are excluded from fitting and left untouched when transforming.
            scaler = StandardScaler()
            scaler.fit(train.loc[train[col] != 0, [col]])

            test_nonzero = test[col] != 0
            if test_nonzero.any():
                test.loc[test_nonzero, col] = scaler.transform(test.loc[test_nonzero, [col]])

            if col == target_column:
                target_scaler = scaler

        X_test, _ = create_sequences_modified(test, seq_length)

        model = load_model(f"../../reports/models/{ticker}-{test_day}.keras")

        # scaling back to the original units of the target
        y_pred = target_scaler.inverse_transform(model.predict(X_test, verbose=False))
        y_pred = y_pred.flatten().astype(np.float64)

        if len(y_pred) != len(y_true):
            raise ValueError(
                f"{ticker}, test window {test_day}: {len(y_pred)} predictions "
                f"but {len(y_true)} true values"
            )

        truths.append(y_true)
        predictions.append(y_pred)

    tot_predictions = np.concatenate(predictions)
    tot_true = np.concatenate(truths)

    normalized_rmse = np.sqrt(np.mean((tot_predictions - tot_true) ** 2)) / np.std(tot_true)

    return normalized_rmse, tot_predictions, tot_true


In [None]:
# Recompute the normalized RMSE of every ticker in the loss table on the un-scaled target
# and collect the per-ticker predictions next to the true values.
tot_loss_df = pd.read_csv(f"../../reports_log/{report}/tot_loss_lstm.csv")

corrected_mse = []  # holds RMSE / std(true) per ticker; the name is kept for later cells
prediction_frames = []

for ticker in tot_loss_df["ticker"]:
    print(ticker)
    corr, tot_predictions, tot_true = correct_y(df, ticker)

    corrected_mse.append(corr)
    prediction_frames.append(
        pd.DataFrame({"ticker": ticker, "predictions": tot_predictions, "true": tot_true})
    )

# Concatenate once: growing a frame inside the loop copies all previous rows every time and
# starts from an empty frame, which newer pandas versions warn about.
if prediction_frames:
    tot_predictions_df = pd.concat(prediction_frames)
else:
    tot_predictions_df = pd.DataFrame(columns=["ticker", "predictions", "true"])

tot_loss_df["corrected_normalized_rmse"] = corrected_mse

tot_loss_df

ADSK
AEP
AHH
ALEC
AMED
APYX
ARTNA
ARVN
BANF
BELFB
BKD
BTU
BV
CB
CFBK
CHE
CIVB
CLDX
CNA
COHU
CRBP
CRMD
CRMT
CWT
DOMO
DVAX
EAF
EPM
ESRT
EVTC
FDS
FGBI
FITB
FULT
FWONA
GCI
GDDY
GNE
HAL
HLF
HTBI
IBCP
ICAD
IPG
JELD
KEY
LYTS
MARA
MEDP
MESA


Unnamed: 0,ticker,mse,rmse,normalized_rmse,mae,corrected_mse,corrected_normalized_rmse
0,ADSK,0.232169,0.481839,1.00606,0.17647,1.005853,1.005853
1,AEP,0.461674,0.679466,1.005545,0.394341,1.005507,1.005507
2,AHH,0.083131,0.288324,1.002126,0.063943,1.002659,1.002659
3,ALEC,0.055981,0.236604,1.009996,0.057942,1.017372,1.017372
4,AMED,0.016985,0.130326,1.00144,0.027108,1.011832,1.011832
5,APYX,0.005579,0.074692,0.999921,0.006062,1.274508,1.274508
6,ARTNA,0.027399,0.165526,0.999185,0.021984,1.003972,1.003972
7,ARVN,0.166975,0.408625,1.097384,0.121215,1.095669,1.095669
8,BANF,0.128041,0.357828,1.023674,0.084184,1.022182,1.022182
9,BELFB,0.052289,0.228667,0.988091,0.041869,0.991406,0.991406


In [None]:
# Write the loss table, now including "corrected_normalized_rmse", back to the CSV it was loaded from.
tot_loss_df.to_csv(f"../../reports_log/{report}/tot_loss_lstm.csv", index = False)