In [10]:
import pandas as pd
import numpy as np
import pickle

# Clustering threshold used by process_data: a row starts a new cluster when it
# arrives at least this many milliseconds after the previous row; rows closer
# together than this are aggregated into a single row.
MIN_TIME_BETWEEN_UPDATES_IN_MILLISECONDS = 1

In [11]:
def process_data(ticker, path_to_data, sequence_length, num_sequences, num_eval_points, no_eval_fraction, output_path, seed=None):
    """Sample fixed-length sequences for one ticker from a CSV and pickle them.

    The CSV is filtered to ``ticker`` and sorted by ``sip_timestamp``. Rows that
    arrive less than ``MIN_TIME_BETWEEN_UPDATES_IN_MILLISECONDS`` after the
    previous row are merged into one row (mean of the price/size columns, last
    timestamp of the group). Random windows of ``sequence_length`` consecutive
    merged rows are then drawn; for each window, ``num_eval_points`` evaluation
    timestamps are sampled and labelled with ``find_y_value``.

    Args:
        ticker: Value of the ``ticker`` column to keep.
        path_to_data: Path of the CSV file to read.
        sequence_length: Number of consecutive merged rows per sequence.
        num_sequences: Number of sequences to produce. Twice as many candidate
            windows are drawn, so fewer may be produced if many are skipped.
        num_eval_points: Number of evaluation timestamps per sequence.
        no_eval_fraction: Fraction of a window's time span, measured from its
            first timestamp, in which no evaluation timestamp is drawn.
        output_path: Path of the pickle file to write.
        seed: Optional seed for the sampling. With ``None`` (the default) the
            global NumPy random state is used, as before. With a value, a
            private ``RandomState(seed)`` makes the output reproducible.

    Returns:
        None. A list of ``(ts, xs, ts_eval, ys_eval)`` tuples is pickled to
        ``output_path``. ``xs`` holds the columns ``ask_price``, ``ask_size``,
        ``bid_price``, ``bid_size``, ``last_trade_price``, ``last_trade_size``
        in that order.

    Raises:
        ValueError: If fewer than ``sequence_length`` merged rows are available.
    """
    # Same generator interface either way; the global module keeps the old
    # behaviour (including any np.random.seed set by the caller).
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Read the CSV data
    print("Reading CSV data...")
    raw_df = pd.read_csv(path_to_data)

    # Keep only the requested ticker, in chronological order
    ticker_df = (
        raw_df[raw_df['ticker'] == ticker]
        .drop(columns=['ticker'])
        .sort_values(by='sip_timestamp')
        .reset_index(drop=True)
    )

    # A row opens a new cluster when it is at least the minimum gap after the
    # previous row. Differencing the timestamps themselves (instead of float
    # "milliseconds since midnight") stays correct across midnight and avoids
    # float rounding right at the threshold.
    timestamps = pd.to_datetime(ticker_df["sip_timestamp"])
    min_gap = pd.Timedelta(milliseconds=MIN_TIME_BETWEEN_UPDATES_IN_MILLISECONDS)
    cluster_ids = (timestamps.diff() >= min_gap).cumsum()

    # Collapse each cluster into a single row
    df = ticker_df.groupby(cluster_ids).agg({
        "sip_timestamp": "last",
        "ask_price": "mean",
        "ask_size": "mean",
        "bid_price": "mean",
        "bid_size": "mean",
        "last_trade_price": "mean",
        "last_trade_size": "mean"
    }).reset_index(drop=True)

    N = len(df)
    if N < sequence_length:
        raise ValueError("Not enough data to create sequences of the given length.")

    # Draw twice as many window starts as needed so skipped windows can be replaced
    max_start_index = N - sequence_length
    start_indices = rng.randint(0, max_start_index + 1, size=num_sequences * 2)

    sequences = []
    for idx in start_indices:
        if len(sequences) >= num_sequences:
            break  # We have enough sequences

        print(f"Processing sequence {len(sequences)+1}/{num_sequences}")

        # Extract the sequence of data
        seq_df = df.iloc[idx:idx + sequence_length]
        ts = seq_df['sip_timestamp'].values
        xs = seq_df.drop(columns=['sip_timestamp']).values

        # Verify the sequence length
        if len(ts) != sequence_length or len(xs) != sequence_length:
            print(f"Warning: Sequence at index {idx} has incorrect length {len(ts)}. Skipping this sequence.")
            continue  # Skip this sequence

        # Evaluation timestamps are drawn from the tail of the window only.
        # NOTE(review): the arithmetic assumes sip_timestamp is numeric - confirm
        # against the CSV schema.
        min_ts_seq = ts.min()
        max_ts_seq = ts.max()
        min_ts_eval = min_ts_seq + no_eval_fraction * (max_ts_seq - min_ts_seq)

        ts_set = set(ts)
        ts_eval = []
        attempts = 0
        max_attempts = num_eval_points * 10  # To prevent infinite loop
        while len(ts_eval) < num_eval_points and attempts < max_attempts:
            t_candidate = rng.uniform(min_ts_eval, max_ts_seq)
            # Reject observed timestamps and duplicates
            if t_candidate not in ts_set and t_candidate not in ts_eval:
                ts_eval.append(t_candidate)
            attempts += 1

        if len(ts_eval) < num_eval_points:
            print(f"Warning: Could not generate enough evaluation timestamps for sequence at index {idx}. Skipping this sequence.")
            continue  # Skip this sequence

        ts_eval.sort()
        ts_eval = np.array(ts_eval)

        # Label every evaluation timestamp against the full merged frame
        ys_eval = np.array([find_y_value(df, t) for t in ts_eval])

        # Verify that ys_eval has the correct length and contains valid values
        if len(ys_eval) != num_eval_points or np.isnan(ys_eval).any():
            print(f"Warning: ys_eval for sequence at index {idx} has incorrect length or contains NaN values. Skipping this sequence.")
            continue  # Skip this sequence

        # Append the sequence
        sequences.append((ts, xs, ts_eval, ys_eval))

    # Check if we have enough sequences
    if len(sequences) < num_sequences:
        print(f"Warning: Only {len(sequences)} sequences were generated out of requested {num_sequences}.")

    # Save the sequences to a file
    with open(output_path, 'wb') as f:
        pickle.dump(sequences, f)
    print(f"Saved {len(sequences)} sequences to {output_path}")

def find_y_value(df, t):
    """Return the ``last_trade_price`` in effect at the first row after ``t``.

    Args:
        df: DataFrame with ``sip_timestamp`` and ``last_trade_price`` columns.
            Any index is accepted; rows are addressed by position.
        t: Timestamp to look up, comparable with the ``sip_timestamp`` values.

    Returns:
        The ``last_trade_price`` of the first row (in row order) whose
        ``sip_timestamp`` is strictly greater than ``t``. If no such row
        exists, the ``last_trade_price`` of the final row.

    Raises:
        IndexError: If ``df`` has no rows.
    """
    prices = df['last_trade_price']
    is_after_t = df['sip_timestamp'].to_numpy() > t

    if not is_after_t.any():
        # Nothing happens after t: fall back to the last known price.
        return prices.iloc[-1]

    # argmax on a boolean array gives the position of the first True.
    first_after = int(is_after_t.argmax())

    # The previous implementation compared this row with the preceding one and,
    # when the price was unchanged, returned the final price of the whole frame.
    # The row comparison included sip_timestamp, so it never matched, which
    # leaked a future value into the label. With an unchanged price, the price
    # at this row already is the last known price, so it is returned directly.
    return prices.iloc[first_after]


In [12]:
# Build the sequence dataset for ticker 'V' from the 2024-09-06 CSV:
# 8192 sequences of 52 merged rows, each with 12 evaluation timestamps drawn
# from the last 80% of the sequence's time span, pickled to output_path.
process_data(
    ticker='V',
    path_to_data='../data/processed/2024-09-06.csv',
    sequence_length=52,
    num_sequences=8192,
    num_eval_points=12,
    no_eval_fraction=0.2,
    output_path='../data/processed/Visa_2024-09-06_1ms_limit_large.pkl'
)

Reading CSV data...
Processing sequence 1/8192
Processing sequence 2/8192
Processing sequence 3/8192
Processing sequence 4/8192
Processing sequence 5/8192
Processing sequence 6/8192
Processing sequence 7/8192
Processing sequence 8/8192
Processing sequence 9/8192
Processing sequence 10/8192
Processing sequence 11/8192
Processing sequence 12/8192
Processing sequence 13/8192
Processing sequence 14/8192
Processing sequence 15/8192
Processing sequence 16/8192
Processing sequence 17/8192
Processing sequence 18/8192
Processing sequence 19/8192
Processing sequence 20/8192
Processing sequence 21/8192
Processing sequence 22/8192
Processing sequence 23/8192
Processing sequence 24/8192
Processing sequence 25/8192
Processing sequence 26/8192
Processing sequence 27/8192
Processing sequence 28/8192
Processing sequence 29/8192
Processing sequence 30/8192
Processing sequence 31/8192
Processing sequence 32/8192
Processing sequence 33/8192
Processing sequence 34/8192
Processing sequence 35/8192
Processin