In [240]:
import polars as pl
import os
from pathlib import Path
import pandas as pd
import hvplot.polars
import datetime
import matplotlib.pyplot as plt
import numpy as np
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay

data_path = Path.cwd().parent.parent / "Data"

#Restrictions
timeperiod = [datetime.datetime(2018, 12, 30), datetime.datetime(2021, 12, 31)]
train_period = [datetime.datetime(2019, 1, 1), datetime.datetime(2020, 12, 31)]
trainperiod1 = [datetime.datetime(2019, 1, 1), datetime.datetime(2019, 12, 31)]
trainperiod2 = [datetime.datetime(2020, 1, 1), datetime.datetime(2020, 12, 31)]
volume_r = 0
open_interest_r = 0
delta_r = 0
vega_r = 0
theta_r = 0
gamma_r = 0
midprice_r = 0.3
bid_price_r = 0.0
days_till_exp_r1 = 7
days_till_exp_r2 = 252
moneyness_min = -2.5
moneyness_max = 2.5


In [241]:
firm_stock_data = Path.cwd().parent.parent / "Data/updated_standardization/firm_stock_dataset_tech.parquet"
option_data = Path.cwd().parent.parent / "Data/updated_standardization/data_set_option_tech.parquet"
macro_data = Path.cwd().parent.parent / "Data/updated_standardization/data_set_macro.parquet"

firm_stock_data = pd.read_parquet(firm_stock_data)
option_data = pd.read_parquet(option_data)
macro_data = pd.read_parquet(macro_data)

# Add the following columns for ADHOC
option_data['moneyness_squared'] = option_data['moneyness'] ** 2
option_data['tau_squared'] = option_data['T'] ** 2
option_data['moneyness_tau'] = option_data['moneyness'] * option_data['T']
option_data['prc_option'] = (option_data['best_offer_option'] + option_data['best_bid_option']) / 2

# List of columns to drop
columns_to_drop = ['divi', 'divo', 'sin', 'RETX','BIDLO', 'ASKHI', 'cumulative_return', 'prev_day_iv', 'prev2_day_iv', 'trading_days_till_exp', 'HIGH_vix', 'LOW_vix', 'best_bid_option', 'best_offer_option']

# Drop columns from datasets if they exist
firm_stock_data = firm_stock_data.drop(columns=columns_to_drop, errors='ignore')
option_data1 = option_data.drop(columns=columns_to_drop, errors='ignore')
macro_data = macro_data.drop(columns=columns_to_drop, errors='ignore')

# Rename column
macro_data = macro_data.rename(columns={'spread_vix': 'hi-lo_vix'})


In [242]:
option_data1

Unnamed: 0,cp_flag,Ticker,date,moneyness,impl_volatility,T,volume_option,spread_option,moneyness_squared,tau_squared,moneyness_tau,prc_option
64493,C,AAPL,2018-12-31,-0.228,0.266853,0.015873,4965,0.05,0.051984,0.000252,-0.003619,3.475
64494,C,AAPL,2018-12-31,0.211,0.289520,0.015873,12609,0.04,0.044521,0.000252,0.003349,2.050
64495,C,AAPL,2018-12-31,0.643,0.297505,0.015873,21841,0.02,0.413449,0.000252,0.010206,1.060
64496,C,AAPL,2018-12-31,1.068,0.303342,0.015873,14485,0.01,1.140624,0.000252,0.016952,0.485
64497,C,AAPL,2018-12-31,1.486,0.315189,0.015873,13253,0.01,2.208196,0.000252,0.023587,0.215
...,...,...,...,...,...,...,...,...,...,...,...,...
428784,P,TSLA,2021-12-30,2.062,1.012930,0.007937,6,1.65,4.251844,0.000063,0.016365,115.275
428785,P,TSLA,2021-12-30,2.161,1.047433,0.007937,4,1.65,4.669921,0.000063,0.017151,120.275
428786,P,TSLA,2021-12-30,2.261,1.088575,0.007937,9,1.50,5.112121,0.000063,0.017944,125.300
428787,P,TSLA,2021-12-30,2.360,1.129544,0.007937,3235,1.55,5.569600,0.000063,0.018730,130.325


In [243]:
option_data = option_data1.copy()
firm_stock_data_short = firm_stock_data.iloc[:, :12]

option_data = pd.merge(option_data, firm_stock_data_short, on=['date', 'Ticker'], how='left')

In [244]:
firm_stock_data_short

Unnamed: 0,Ticker,date,PRC,vol_stock,RET,BID,ASK,PRC_actual,daily_return_indicator_stock,5_day_rolling_return_stock,hi-lo_stock,spread_stock
0,AAPL,2019-01-02,156.23000,42291347.0,0.000512,156.22000,156.24001,39.057500,1.0,-0.003826,3.97000,0.02001
1,AAPL,2019-01-03,157.74001,35003466.0,0.009665,157.92999,157.94000,39.435003,1.0,0.046507,2.88000,0.01001
2,AAPL,2019-01-04,157.92000,37066356.0,0.001141,157.91000,157.92999,39.480000,1.0,0.075529,4.62001,0.01999
3,AAPL,2019-01-07,142.19000,91373695.0,-0.099607,142.08000,142.09000,35.547500,-1.0,-0.095311,3.72000,0.01000
4,AAPL,2019-01-08,148.25999,58603001.0,0.042689,148.25000,148.25999,37.064997,1.0,-0.050529,4.74990,0.00999
...,...,...,...,...,...,...,...,...,...,...,...,...
6051,TSLA,2021-12-27,1008.87000,31046563.0,0.074947,1008.50000,1008.87000,336.290000,1.0,0.033689,58.60992,0.37000
6052,TSLA,2021-12-28,1067.00000,30735097.0,0.057619,1067.45996,1067.58997,355.666667,1.0,0.151125,75.41668,0.13001
6053,TSLA,2021-12-29,1093.93994,23695249.0,0.025248,1093.81995,1093.83997,364.646647,1.0,0.173038,46.28479,0.02002
6054,TSLA,2021-12-30,1088.46997,20025526.0,-0.005000,1088.68005,1089.19995,362.823323,-1.0,0.209492,40.57984,0.51990


In [245]:
not_to_move = firm_stock_data_short.iloc[:,2:12].columns
not_to_move

Index(['PRC', 'vol_stock', 'RET', 'BID', 'ASK', 'PRC_actual',
       'daily_return_indicator_stock', '5_day_rolling_return_stock',
       'hi-lo_stock', 'spread_stock'],
      dtype='object')

In [246]:
option_data

Unnamed: 0,cp_flag,Ticker,date,moneyness,impl_volatility,T,volume_option,spread_option,moneyness_squared,tau_squared,...,PRC,vol_stock,RET,BID,ASK,PRC_actual,daily_return_indicator_stock,5_day_rolling_return_stock,hi-lo_stock,spread_stock
0,C,AAPL,2018-12-31,-0.228,0.266853,0.015873,4965,0.05,0.051984,0.000252,...,,,,,,,,,,
1,C,AAPL,2018-12-31,0.211,0.289520,0.015873,12609,0.04,0.044521,0.000252,...,,,,,,,,,,
2,C,AAPL,2018-12-31,0.643,0.297505,0.015873,21841,0.02,0.413449,0.000252,...,,,,,,,,,,
3,C,AAPL,2018-12-31,1.068,0.303342,0.015873,14485,0.01,1.140624,0.000252,...,,,,,,,,,,
4,C,AAPL,2018-12-31,1.486,0.315189,0.015873,13253,0.01,2.208196,0.000252,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364291,P,TSLA,2021-12-30,2.062,1.012930,0.007937,6,1.65,4.251844,0.000063,...,1088.46997,20025526.0,-0.005,1088.68005,1089.19995,362.823323,-1.0,0.209492,40.57984,0.5199
364292,P,TSLA,2021-12-30,2.161,1.047433,0.007937,4,1.65,4.669921,0.000063,...,1088.46997,20025526.0,-0.005,1088.68005,1089.19995,362.823323,-1.0,0.209492,40.57984,0.5199
364293,P,TSLA,2021-12-30,2.261,1.088575,0.007937,9,1.50,5.112121,0.000063,...,1088.46997,20025526.0,-0.005,1088.68005,1089.19995,362.823323,-1.0,0.209492,40.57984,0.5199
364294,P,TSLA,2021-12-30,2.360,1.129544,0.007937,3235,1.55,5.569600,0.000063,...,1088.46997,20025526.0,-0.005,1088.68005,1089.19995,362.823323,-1.0,0.209492,40.57984,0.5199


In [247]:
import pandas as pd
import numpy as np

# Assuming option_data contains both call and put option data
# List of specific tickers to count
tickers = ['AAPL', 'AMZN', 'META', 'MSFT', 'NVDA', 'TSLA']

# Function to process options dynamically, whether puts or calls
def process_options(option_data, option_type):
    # Define bins depending on whether it's a put or call
    if option_type == 'P':
        bins = np.arange(-2, 0.51, 0.25)  # Bins for Puts
    elif option_type == 'C':
        bins = np.arange(-0.5, 2.01, 0.25)  # Bins for Calls
    
    # Filter based on option type
    option_filtered = option_data[option_data['cp_flag'] == option_type]
    
    # Categorize moneyness into groups
    option_filtered['moneyness_group'] = pd.cut(option_filtered['moneyness'], bins=bins)

    # Convert moneyness_group to the first number of the range (lower bound of the interval)
    option_filtered['moneyness_group'] = option_filtered['moneyness_group'].apply(lambda x: x.left if pd.notnull(x) else np.nan)

    # Dynamically identify columns to calculate mean and std for
    numeric_columns = option_filtered.select_dtypes(include=np.number).columns

    # Create a dictionary for aggregating the mean and std for each numeric column
    agg_dict = {col: ['mean', 'std'] for col in numeric_columns if col not in ['date', 'Ticker', 'cp_flag']}

    # Group by 'date' and 'moneyness_group' and apply aggregation dynamically
    grouped_options = option_filtered.groupby(['date', 'moneyness_group']).agg(agg_dict)

    # Flatten the MultiIndex columns produced by aggregation
    grouped_options.columns = ['_'.join(col).strip() for col in grouped_options.columns.values]

    # Reset index to have 'date' and 'moneyness_group' as columns
    grouped_options = grouped_options.reset_index()

    # Add separate columns for each ticker and count occurrences
    for ticker in tickers:
        grouped_options[ticker] = option_filtered[option_filtered['Ticker'] == ticker].groupby(['date', 'moneyness_group']).size().reindex(grouped_options.set_index(['date', 'moneyness_group']).index, fill_value=0).values

    return grouped_options

# Apply the function for puts and calls
grouped_puts = process_options(option_data, 'P')
grouped_calls = process_options(option_data, 'C')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  option_filtered['moneyness_group'] = pd.cut(option_filtered['moneyness'], bins=bins)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  option_filtered['moneyness_group'] = option_filtered['moneyness_group'].apply(lambda x: x.left if pd.notnull(x) else np.nan)
  grouped_options = option_filtered.groupby(['date', 'moneyness_group']).agg(agg_dict)
  grouped_options[ticker] = option_filtered[option_filtered['Ticker'] == ticker].groupby(['date', 'moneyness_group']).size().reindex(grouped_options.set_index(['date', 'mone

In [248]:
grouped_puts

Unnamed: 0,date,moneyness_group,moneyness_mean,moneyness_std,impl_volatility_mean,impl_volatility_std,T_mean,T_std,volume_option_mean,volume_option_std,...,hi-lo_stock_mean,hi-lo_stock_std,spread_stock_mean,spread_stock_std,AAPL,AMZN,META,MSFT,NVDA,TSLA
0,2018-12-31,-2.00,-1.860643,0.065937,0.627174,0.134024,0.015873,0.0,339.357143,337.590240,...,,,,,1,6,1,2,2,2
1,2018-12-31,-1.75,-1.623500,0.071893,0.608392,0.126452,0.015873,0.0,392.285714,407.838661,...,,,,,1,7,2,1,1,2
2,2018-12-31,-1.50,-1.382812,0.078221,0.606167,0.138904,0.015873,0.0,661.625000,1012.993114,...,,,,,2,6,1,2,2,3
3,2018-12-31,-1.25,-1.116625,0.075100,0.576477,0.114992,0.015873,0.0,1042.750000,1824.079074,...,,,,,1,7,2,2,2,2
4,2018-12-31,-1.00,-0.872462,0.068266,0.608202,0.127452,0.015873,0.0,480.076923,491.725442,...,,,,,0,6,1,1,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7535,2021-12-30,-0.75,-0.640250,0.067576,0.311621,0.125146,0.007937,0.0,12390.250000,19335.348981,...,35.381803,25.536958,0.696204,0.643714,1,3,0,1,1,2
7536,2021-12-30,-0.50,-0.394333,0.072075,0.305530,0.114938,0.007937,0.0,10658.222222,9210.655063,...,38.730224,24.560596,0.788841,0.639694,0,4,1,1,1,2
7537,2021-12-30,-0.25,-0.121556,0.087805,0.364117,0.122689,0.007937,0.0,11827.777778,8405.850504,...,37.559571,21.622621,0.694392,0.585283,0,3,1,0,2,3
7538,2021-12-30,0.00,0.135444,0.072908,0.315706,0.144618,0.007937,0.0,10513.777778,19188.990807,...,38.206893,25.346586,0.778841,0.651178,1,4,0,1,1,2


In [249]:
not_to_move

Index(['PRC', 'vol_stock', 'RET', 'BID', 'ASK', 'PRC_actual',
       'daily_return_indicator_stock', '5_day_rolling_return_stock',
       'hi-lo_stock', 'spread_stock'],
      dtype='object')

In [250]:
import pandas as pd

# Step 1: Fill missing values with 0
grouped_puts_filled = grouped_puts.fillna(0)

# Step 2: Shift all columns down by 10 steps except 'impl_volatility_mean' and 'impl_volatility_std'
columns_to_shift = grouped_puts_filled.columns.difference(['date', 'moneyness_group', 'impl_volatility_mean', 'impl_volatility_std'] + [f"{col}_mean" for col in not_to_move] + [f"{col}_std" for col in not_to_move])
grouped_puts_shifted = grouped_puts_filled.copy()

# Shift the columns by 10 rows down
grouped_puts_shifted[columns_to_shift] = grouped_puts_filled[columns_to_shift].shift(10, fill_value=0)

# Step 3: Create 'prev_iv', 'prev2_iv', 'prev_iv_std', 'prev2_iv_std' columns based on lagged 'impl_volatility_mean' and 'impl_volatility_std'

# Create new columns 'prev_iv' (10 rows back) and 'prev2_iv' (20 rows back)
grouped_puts_shifted['prev_iv'] = grouped_puts_filled['impl_volatility_mean'].shift(10, fill_value=0)
grouped_puts_shifted['prev2_iv'] = grouped_puts_filled['impl_volatility_mean'].shift(20, fill_value=0)

# Create new columns 'prev_iv_std' (10 rows back) and 'prev2_iv_std' (20 rows back)
grouped_puts_shifted['prev_iv_std'] = grouped_puts_filled['impl_volatility_std'].shift(10, fill_value=0)
grouped_puts_shifted['prev2_iv_std'] = grouped_puts_filled['impl_volatility_std'].shift(20, fill_value=0)

# Step 1: Fill missing values with 0
grouped_calls_filled = grouped_calls.fillna(0)

# Step 2: Shift all columns down by 10 steps except 'impl_volatility_mean', 'impl_volatility_std', 'date', and 'moneyness_group'
columns_to_shift = grouped_puts_filled.columns.difference(['date', 'moneyness_group', 'impl_volatility_mean', 'impl_volatility_std'] + [f"{col}_mean" for col in not_to_move] + [f"{col}_std" for col in not_to_move])
grouped_calls_shifted = grouped_calls_filled.copy()

# Shift the columns by 10 rows down
grouped_calls_shifted[columns_to_shift] = grouped_calls_filled[columns_to_shift].shift(10, fill_value=0)

# Step 3: Create 'prev_iv', 'prev2_iv', 'prev_iv_std', 'prev2_iv_std' columns based on lagged 'impl_volatility_mean' and 'impl_volatility_std'

# Create new columns 'prev_iv' (10 rows back) and 'prev2_iv' (20 rows back)
grouped_calls_shifted['prev_iv'] = grouped_calls_filled['impl_volatility_mean'].shift(10, fill_value=0)
grouped_calls_shifted['prev2_iv'] = grouped_calls_filled['impl_volatility_mean'].shift(20, fill_value=0)

# Create new columns 'prev_iv_std' (10 rows back) and 'prev2_iv_std' (20 rows back)
grouped_calls_shifted['prev_iv_std'] = grouped_calls_filled['impl_volatility_std'].shift(10, fill_value=0)
grouped_calls_shifted['prev2_iv_std'] = grouped_calls_filled['impl_volatility_std'].shift(20, fill_value=0)

# The DataFrame 'grouped_calls_shifted' now contains the shifted columns and the new columns for lagged volatility values
# import ace_tools as tools; tools.display_dataframe_to_user(name="Grouped Calls Shifted", dataframe=grouped_calls_shifted)



In [251]:
# Step 1: Add the 'Ticker' column to each dataset
grouped_calls_shifted['cp_flag'] = 'C'  # Add 'C' for calls
grouped_puts_shifted['cp_flag'] = 'P'   # Add 'P' for puts

# Merge with the option data & macro data
merge_data_calls = pd.merge(grouped_calls_shifted, macro_data, on=['date'], how='left')
merge_data_puts = pd.merge(grouped_puts_shifted, macro_data, on=['date'], how='left')

options_total = pd.concat([merge_data_calls, merge_data_puts], axis=0)

In [252]:
options_total

Unnamed: 0,date,moneyness_group,moneyness_mean,moneyness_std,impl_volatility_mean,impl_volatility_std,T_mean,T_std,volume_option_mean,volume_option_std,...,cp_flag,FF_rate,gold_price,reces_indi,10Y_RIR,1Y_bond,2Y_bond,OPEN_vix,CLOSE_vix,hi-lo_vix
0,2018-12-31,-0.50,0.000000,0.000000,0.378736,0.143804,0.000000,0.0,0.000000,0.000000,...,C,,,,,,,,,
1,2018-12-31,-0.25,0.000000,0.000000,0.421619,0.131420,0.000000,0.0,0.000000,0.000000,...,C,,,,,,,,,
2,2018-12-31,0.00,0.000000,0.000000,0.431959,0.125325,0.000000,0.0,0.000000,0.000000,...,C,,,,,,,,,
3,2018-12-31,0.25,0.000000,0.000000,0.440552,0.112838,0.000000,0.0,0.000000,0.000000,...,C,,,,,,,,,
4,2018-12-31,0.50,0.000000,0.000000,0.433034,0.110295,0.000000,0.0,0.000000,0.000000,...,C,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7535,2021-12-30,-0.75,-0.619100,0.075434,0.311621,0.125146,0.011905,0.0,10448.900000,13477.640434,...,P,0.08,632.874194,-0.206774,1.55,0.4878,0.8182,14.36,12.85,2.04
7536,2021-12-30,-0.50,-0.382750,0.075645,0.305530,0.114938,0.011905,0.0,9691.333333,13760.416275,...,P,0.08,632.874194,-0.206774,1.55,0.4878,0.8182,14.36,12.85,2.04
7537,2021-12-30,-0.25,-0.131000,0.066126,0.364117,0.122689,0.011905,0.0,8233.454545,8609.453285,...,P,0.08,632.874194,-0.206774,1.55,0.4878,0.8182,14.36,12.85,2.04
7538,2021-12-30,0.00,0.122083,0.065604,0.315706,0.144618,0.011905,0.0,8505.500000,14957.514235,...,P,0.08,632.874194,-0.206774,1.55,0.4878,0.8182,14.36,12.85,2.04


In [253]:
option_train = options_total[(options_total["date"] >= train_period[0]) & (options_total["date"] <= train_period[1])]
option_train1 = options_total[(options_total["date"] >= trainperiod1[0]) & (options_total["date"] <= trainperiod1[1])]
option_train2 = options_total[(options_total["date"] >= trainperiod2[0]) & (options_total["date"] <= trainperiod2[1])]

option_test = options_total[(options_total["date"] > train_period[1])]

In [254]:
option_test

Unnamed: 0,date,moneyness_group,moneyness_mean,moneyness_std,impl_volatility_mean,impl_volatility_std,T_mean,T_std,volume_option_mean,volume_option_std,...,cp_flag,FF_rate,gold_price,reces_indi,10Y_RIR,1Y_bond,2Y_bond,OPEN_vix,CLOSE_vix,hi-lo_vix
5040,2021-01-04,-0.50,-0.368294,0.069495,0.479429,0.167744,0.023810,0.0,1433.588235,2970.442540,...,C,0.09,672.012903,3.036452,0.93,0.1074,0.1400,21.29,21.20,1.92
5041,2021-01-04,-0.25,-0.133632,0.073882,0.479940,0.149235,0.023810,0.0,3058.631579,6475.478749,...,C,0.09,672.012903,3.036452,0.93,0.1074,0.1400,21.29,21.20,1.92
5042,2021-01-04,0.00,0.121900,0.073019,0.445119,0.125227,0.023810,0.0,7935.650000,11046.893706,...,C,0.09,672.012903,3.036452,0.93,0.1074,0.1400,21.29,21.20,1.92
5043,2021-01-04,0.25,0.376944,0.076069,0.421343,0.099053,0.023810,0.0,5469.944444,7154.944555,...,C,0.09,672.012903,3.036452,0.93,0.1074,0.1400,21.29,21.20,1.92
5044,2021-01-04,0.50,0.629267,0.069784,0.437297,0.099304,0.023810,0.0,3285.266667,4529.093641,...,C,0.09,672.012903,3.036452,0.93,0.1074,0.1400,21.29,21.20,1.92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7535,2021-12-30,-0.75,-0.619100,0.075434,0.311621,0.125146,0.011905,0.0,10448.900000,13477.640434,...,P,0.08,632.874194,-0.206774,1.55,0.4878,0.8182,14.36,12.85,2.04
7536,2021-12-30,-0.50,-0.382750,0.075645,0.305530,0.114938,0.011905,0.0,9691.333333,13760.416275,...,P,0.08,632.874194,-0.206774,1.55,0.4878,0.8182,14.36,12.85,2.04
7537,2021-12-30,-0.25,-0.131000,0.066126,0.364117,0.122689,0.011905,0.0,8233.454545,8609.453285,...,P,0.08,632.874194,-0.206774,1.55,0.4878,0.8182,14.36,12.85,2.04
7538,2021-12-30,0.00,0.122083,0.065604,0.315706,0.144618,0.011905,0.0,8505.500000,14957.514235,...,P,0.08,632.874194,-0.206774,1.55,0.4878,0.8182,14.36,12.85,2.04


# Nonscaled dataset train for put and calls

In [255]:
# Save the parquet file

data_set_train_nonscaled_new = Path.cwd().parent.parent / "Data/tech_data/data_set_train_tech_nonscaled.parquet"

save_parquet = True
if save_parquet or not os.path.exists(data_set_train_nonscaled_new):
    option_train.to_parquet(data_set_train_nonscaled_new)

# Nonscaled dataset test for put and calls

In [256]:
# Save the parquet file

data_set_test_nonscaled_new = Path.cwd().parent.parent / "Data/tech_data/data_set_test_tech_nonscaled.parquet"

save_parquet = True
if save_parquet or not os.path.exists(data_set_test_nonscaled_new):
    option_test.to_parquet(data_set_test_nonscaled_new)

# Call and Put standardization train

In [257]:
import pandas as pd

def standardize_data(option_train, option_type, exclude_columns):
    """
    Filter by option type and standardize numeric columns.
    
    Parameters:
    option_train (pd.DataFrame): The option data.
    option_type (str): 'C' for calls, 'P' for puts.
    exclude_columns (list): List of columns to exclude from standardization.
    
    Returns:
    pd.DataFrame: The standardized DataFrame.
    """
    # Filter based on option type
    option_filtered = option_train[option_train["cp_flag"] == option_type].copy()
    
    # Identify numeric columns to standardize (excluding specified columns)
    numeric_columns = option_filtered.select_dtypes(include=['float64', 'int64']).columns.difference(exclude_columns)
    
    # Standardize the numeric columns (only for numeric data)
    option_filtered[numeric_columns] = (option_filtered[numeric_columns] - option_filtered[numeric_columns].mean()) / option_filtered[numeric_columns].std()

    # Return the standardized DataFrame, including excluded columns
    return option_filtered

# List of columns to exclude from standardization
exclude_columns = ['date', 'cp_flag', 'moneyness_group', 'impl_volatility_mean', 'impl_volatility_std']

# Standardize the data for calls (C) and puts (P)
standardized_data_opt_c = standardize_data(option_train, 'C', exclude_columns)
standardized_data_opt_p = standardize_data(option_train, 'P', exclude_columns)

# Display the standardized DataFrames
# print(standardized_data_opt_c.head())
# print(standardized_data_opt_p.head())


In [258]:
# Save the parquet file

data_set_standardized_tot_p_new = Path.cwd().parent.parent / "Data/tech_data/data_set_train_val_tech_scaled_p.parquet"
data_set_standardized_tot_c_new = Path.cwd().parent.parent / "Data/tech_data/data_set_train_val_tech_scaled_c.parquet"

save_parquet = True
if save_parquet or not os.path.exists(data_set_standardized_tot_p_new):
    standardized_data_opt_p.to_parquet(data_set_standardized_tot_p_new)

save_parquet = True
if save_parquet or not os.path.exists(data_set_standardized_tot_c_new):
    standardized_data_opt_c.to_parquet(data_set_standardized_tot_c_new)

# Calls and Put standardization test

In [259]:
import pandas as pd

data_set_train_nonscaled_new = Path.cwd().parent.parent / "Data/tech_data/data_set_train_tech_nonscaled.parquet"
data_set_test_nonscaled_new = Path.cwd().parent.parent / "Data/tech_data/data_set_test_tech_nonscaled.parquet"
data_set_train_nonscaled_new = pd.read_parquet(data_set_train_nonscaled_new)
data_set_test_nonscaled_new = pd.read_parquet(data_set_test_nonscaled_new)
# data_set_train_nonscaled_c_tot = data_set_train_nonscaled_new[(data_set_train_nonscaled_new["cp_flag"] == "C")]

# Assuming merge_data_total_t and merge_data_train_c1 are already defined

# Filter the test set for 'C' and 'P' flags
test_c = data_set_test_nonscaled_new[data_set_test_nonscaled_new["cp_flag"] == "C"]
test_p = data_set_test_nonscaled_new[data_set_test_nonscaled_new["cp_flag"] == "P"]

data_set_train_nonscaled_c = data_set_train_nonscaled_new[data_set_train_nonscaled_new['cp_flag'] == 'C']
data_set_train_nonscaled_p = data_set_train_nonscaled_new[data_set_train_nonscaled_new['cp_flag'] == 'P']

# Custom standardization function
def custom_standardize(data, means=None, stds=None):
    if means is None or stds is None:
        means = data.mean()
        stds = data.std()
    standardized_data = (data - means) / stds
    return standardized_data, means, stds

# Step 1: Extract features from train data and the 'C' test data
train_features = data_set_train_nonscaled_c.drop(columns=['date', 'moneyness_group', 'cp_flag', 'impl_volatility_mean', 'impl_volatility_std'])
test_features_c = test_c.drop(columns=['date', 'moneyness_group', 'cp_flag','impl_volatility_mean', 'impl_volatility_std'])

# Step 2: Standardize the training data and save the means and stds
train_features_standardized, train_means, train_stds = custom_standardize(train_features)

# Step 3: Standardize the 'C' test data using the saved means and stds from the training data
test_features_c_standardized, _, _ = custom_standardize(test_features_c, means=train_means, stds=train_stds)

# Step 4: Combine back the standardized 'C' test features with the non-feature columns
test_data_c_standardized = pd.DataFrame(test_features_c_standardized, columns=test_features_c.columns)
test_data_c_standardized = pd.concat([test_c[['date', 'moneyness_group', 'cp_flag','impl_volatility_mean', 'impl_volatility_std']], test_data_c_standardized], axis=1)

# Output the standardized 'C' test data
test_data_c_standardized



Unnamed: 0,date,moneyness_group,cp_flag,impl_volatility_mean,impl_volatility_std,moneyness_mean,moneyness_std,T_mean,T_std,volume_option_mean,...,prev2_iv_std,FF_rate,gold_price,reces_indi,10Y_RIR,1Y_bond,2Y_bond,OPEN_vix,CLOSE_vix,hi-lo_vix
5040,2021-01-04,-0.50,C,0.479429,0.167744,-1.558174,-0.573249,1.519695,-0.21372,-0.648503,...,-0.475013,-1.162816,1.451941,0.266074,-0.801314,-1.218487,-1.213376,-0.071108,-0.041412,-0.407989
5041,2021-01-04,-0.25,C,0.479940,0.149235,-1.231237,0.025769,1.519695,-0.21372,-0.266864,...,-0.636105,-1.162816,1.451941,0.266074,-0.801314,-1.218487,-1.213376,-0.071108,-0.041412,-0.407989
5042,2021-01-04,0.00,C,0.445119,0.125227,-0.875225,-0.092094,1.519695,-0.21372,0.878495,...,-0.806542,-1.162816,1.451941,0.266074,-0.801314,-1.218487,-1.213376,-0.071108,-0.041412,-0.407989
5043,2021-01-04,0.25,C,0.421343,0.099053,-0.519891,0.324400,1.519695,-0.21372,0.299428,...,-0.835290,-1.162816,1.451941,0.266074,-0.801314,-1.218487,-1.213376,-0.071108,-0.041412,-0.407989
5044,2021-01-04,0.50,C,0.437297,0.099304,-0.168351,-0.533852,1.519695,-0.21372,-0.213639,...,-0.971770,-1.162816,1.451941,0.266074,-0.801314,-1.218487,-1.213376,-0.071108,-0.041412,-0.407989
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7535,2021-12-30,0.75,C,0.397993,0.169214,0.168960,-0.803586,-0.760782,-0.21372,1.305590,...,-0.052023,-1.172726,0.958946,-0.822889,0.047437,-0.812020,-0.455627,-0.568767,-0.630316,-0.381699
7536,2021-12-30,1.00,C,0.458480,0.182552,0.508151,-0.739018,-0.760782,-0.21372,1.292262,...,-0.094125,-1.172726,0.958946,-0.822889,0.047437,-0.812020,-0.455627,-0.568767,-0.630316,-0.381699
7537,2021-12-30,1.25,C,0.438479,0.189529,0.877727,1.205688,-0.760782,-0.21372,-0.290051,...,-0.005873,-1.172726,0.958946,-0.822889,0.047437,-0.812020,-0.455627,-0.568767,-0.630316,-0.381699
7538,2021-12-30,1.50,C,0.497671,0.190034,1.216140,-2.082956,-0.760782,-0.21372,0.998866,...,0.169813,-1.172726,0.958946,-0.822889,0.047437,-0.812020,-0.455627,-0.568767,-0.630316,-0.381699


In [260]:
# Load the datasets
data_set_train_nonscaled_new = Path.cwd().parent.parent / "Data/tech_data/data_set_train_tech_nonscaled.parquet"
data_set_test_nonscaled_new = Path.cwd().parent.parent / "Data/tech_data/data_set_test_tech_nonscaled.parquet"
data_set_train_nonscaled_new = pd.read_parquet(data_set_train_nonscaled_new)
data_set_test_nonscaled_new = pd.read_parquet(data_set_test_nonscaled_new)

# Filter the test set for 'C' and 'P' flags
test_p = data_set_test_nonscaled_new[data_set_test_nonscaled_new["cp_flag"] == "P"]

# Filter the training data for 'P' (put options)
data_set_train_nonscaled_p = data_set_train_nonscaled_new[data_set_train_nonscaled_new['cp_flag'] == 'P']

# Custom standardization function
def custom_standardize(data, means=None, stds=None):
    if means is None or stds is None:
        means = data.mean()
        stds = data.std()
    standardized_data = (data - means) / stds
    return standardized_data, means, stds

# Step 1: Extract features from train data and the 'P' test data
train_features_p = data_set_train_nonscaled_p.drop(columns=['date', 'moneyness_group', 'cp_flag', 'impl_volatility_mean', 'impl_volatility_std'])
test_features_p = test_p.drop(columns=['date', 'moneyness_group', 'cp_flag', 'impl_volatility_mean', 'impl_volatility_std'])

# Step 2: Standardize the training data and save the means and stds for puts
train_features_p_standardized, train_means_p, train_stds_p = custom_standardize(train_features_p)

# Step 3: Standardize the 'P' test data using the saved means and stds from the training data
test_features_p_standardized, _, _ = custom_standardize(test_features_p, means=train_means_p, stds=train_stds_p)

# Step 4: Combine back the standardized 'P' test features with the non-feature columns
test_data_p_standardized = pd.DataFrame(test_features_p_standardized, columns=test_features_p.columns)
test_data_p_standardized = pd.concat([test_p[['date', 'moneyness_group', 'cp_flag', 'impl_volatility_mean', 'impl_volatility_std']], test_data_p_standardized], axis=1)

# Output the standardized 'P' test data
test_data_p_standardized


Unnamed: 0,date,moneyness_group,cp_flag,impl_volatility_mean,impl_volatility_std,moneyness_mean,moneyness_std,T_mean,T_std,volume_option_mean,...,prev2_iv_std,FF_rate,gold_price,reces_indi,10Y_RIR,1Y_bond,2Y_bond,OPEN_vix,CLOSE_vix,hi-lo_vix
5040,2021-01-04,-2.00,P,0.716275,0.269466,-1.583860,-0.487562,1.515971,-0.213674,-0.877439,...,0.371799,-1.162816,1.451941,0.266074,-0.801314,-1.218487,-1.213376,-0.071108,-0.041412,-0.407989
5041,2021-01-04,-1.75,P,0.633162,0.244552,-1.216221,1.280503,1.515971,-0.213674,-0.772302,...,0.752631,-1.162816,1.451941,0.266074,-0.801314,-1.218487,-1.213376,-0.071108,-0.041412,-0.407989
5042,2021-01-04,-1.50,P,0.567879,0.235952,-0.865971,0.199870,1.515971,-0.213674,-0.719237,...,0.256859,-1.162816,1.451941,0.266074,-0.801314,-1.218487,-1.213376,-0.071108,-0.041412,-0.407989
5043,2021-01-04,-1.25,P,0.559753,0.225383,-0.522997,-0.191891,1.515971,-0.213674,-0.630035,...,0.450171,-1.162816,1.451941,0.266074,-0.801314,-1.218487,-1.213376,-0.071108,-0.041412,-0.407989
5044,2021-01-04,-1.00,P,0.545487,0.218441,-0.175572,-0.510639,1.515971,-0.213674,-0.534790,...,-0.106734,-1.162816,1.451941,0.266074,-0.801314,-1.218487,-1.213376,-0.071108,-0.041412,-0.407989
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7535,2021-12-30,-0.75,P,0.311621,0.125146,0.181550,0.237646,-0.757755,-0.213674,4.186096,...,-0.436048,-1.172726,0.958946,-0.822889,0.047437,-0.812020,-0.455627,-0.568767,-0.630316,-0.381699
7536,2021-12-30,-0.50,P,0.305530,0.114938,0.511030,0.265937,-0.757755,-0.213674,3.796540,...,-0.537905,-1.172726,0.958946,-0.822889,0.047437,-0.812020,-0.455627,-0.568767,-0.630316,-0.381699
7537,2021-12-30,-0.25,P,0.364117,0.122689,0.861979,-1.010289,-0.757755,-0.213674,3.046870,...,-0.450505,-1.172726,0.958946,-0.822889,0.047437,-0.812020,-0.455627,-0.568767,-0.630316,-0.381699
7538,2021-12-30,0.00,P,0.315706,0.144618,1.214787,-1.080205,-0.757755,-0.213674,3.186761,...,-0.355298,-1.172726,0.958946,-0.822889,0.047437,-0.812020,-0.455627,-0.568767,-0.630316,-0.381699


In [261]:
# Save the parquet file
data_set_test_c_total_new = Path.cwd().parent.parent / "Data/tech_data/data_set_test_tech_scaled_c_total.parquet"
data_set_test_p_total_new = Path.cwd().parent.parent / "Data/tech_data/data_set_test_tech_scaled_p_total.parquet"

save_parquet = True
if save_parquet or not os.path.exists(data_set_test_c_total_new):
    test_data_c_standardized.to_parquet(data_set_test_c_total_new)

save_parquet = True
if save_parquet or not os.path.exists(data_set_test_p_total_new):
    test_data_p_standardized.to_parquet(data_set_test_p_total_new)

# From here the old code will be represent

In [None]:
import pandas as pd

# Assuming merge_data_total_t and merge_data_train_c1 are already defined
# data_set_train_nonscaled = pd.read_parquet("/Users/sbjpipers/Desktop/FinalThesisQF/FinalThesisQF/Data/updated_standardization/data_set_train_tech_nonscaled.parquet")
# data_set_train_nonscaled_p = data_set_train_nonscaled[(data_set_train_nonscaled["cp_flag"] == "P")]

# Filter the test set for 'C' and 'P' flags
test_c = merge_data_total_t[merge_data_total_t["cp_flag"] == "C"]
test_p = merge_data_total_t[merge_data_total_t["cp_flag"] == "P"]

# Custom standardization function
def custom_standardize(data, means=None, stds=None):
    if means is None or stds is None:
        means = data.mean()
        stds = data.std()
    standardized_data = (data - means) / stds
    return standardized_data, means, stds

# Step 1: Extract features from train data and the 'C' test data
train_features = data_set_train_nonscaled_p.drop(columns=['impl_volatility', 'date', 'Ticker', 'cp_flag'])
test_features_p = test_p.drop(columns=['impl_volatility', 'date', 'Ticker', 'cp_flag'])

# Step 2: Standardize the training data and save the means and stds
train_features_standardized, train_means, train_stds = custom_standardize(train_features)

# Step 3: Standardize the 'C' test data using the saved means and stds from the training data
test_features_p_standardized, _, _ = custom_standardize(test_features_p, means=train_means, stds=train_stds)

# Step 4: Combine back the standardized 'C' test features with the non-feature columns
test_data_p_standardized = pd.DataFrame(test_features_p_standardized, columns=test_features_p.columns)
test_data_p_standardized = pd.concat([test_p[['date', 'Ticker', 'cp_flag', 'impl_volatility']], test_data_p_standardized], axis=1)

# Output the standardized 'C' test data
test_data_p_standardized

# Save the parquet file
data_set_val_test_p = Path.cwd().parent.parent / "Data/updated_standardization/data_set_test_tech_scaled_p.parquet"

save_parquet = True
if save_parquet or not os.path.exists(data_set_val_test_p):
    test_data_p_standardized.to_parquet(data_set_val_test_p)



In [109]:
standardized_data_opt_c

Unnamed: 0,date,moneyness_group,moneyness_mean,moneyness_std,impl_volatility_mean,impl_volatility_std,T_mean,T_std,prev_day_iv_mean,prev_day_iv_std,...,gold_price,reces_indi,10Y_RIR,1Y_bond,2Y_bond,OPEN_vix,HIGH_vix,LOW_vix,CLOSE_vix,spread_vix
10,2019-01-02,-0.50,-1.559693,-0.899796,0.487425,0.099187,-0.000623,-0.21372,-2.103683,-1.555744,...,-1.357150,-0.709812,1.566975,1.374613,1.472822,0.677893,0.477223,0.666199,0.478375,-0.134139
11,2019-01-02,-0.25,-1.238194,0.280259,0.480363,0.110424,-0.000623,-0.21372,-2.103683,-1.555744,...,-1.357150,-0.709812,1.566975,1.374613,1.472822,0.677893,0.477223,0.666199,0.478375,-0.134139
12,2019-01-02,0.00,-0.862361,-0.483196,0.484917,0.098911,-0.000623,-0.21372,-2.103683,-1.555744,...,-1.357150,-0.709812,1.566975,1.374613,1.472822,0.677893,0.477223,0.666199,0.478375,-0.134139
13,2019-01-02,0.25,-0.517492,0.095575,0.495387,0.124589,-0.000623,-0.21372,-2.103683,-1.555744,...,-1.357150,-0.709812,1.566975,1.374613,1.472822,0.677893,0.477223,0.666199,0.478375,-0.134139
14,2019-01-02,0.50,-0.154267,-0.016268,0.485254,0.122659,-0.000623,-0.21372,-2.103683,-1.555744,...,-1.357150,-0.709812,1.566975,1.374613,1.472822,0.677893,0.477223,0.666199,0.478375,-0.134139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5035,2020-12-31,0.75,0.116888,0.118652,0.376182,0.133080,-1.520941,-0.21372,-0.723599,-0.989122,...,1.436988,0.268241,-0.801314,-1.215175,-1.208572,-0.056028,-0.144789,0.017269,-0.033654,-0.552581
5036,2020-12-31,1.00,0.514576,0.449081,0.378704,0.139804,-1.520941,-0.21372,-0.604428,-0.991330,...,1.436988,0.268241,-0.801314,-1.215175,-1.208572,-0.056028,-0.144789,0.017269,-0.033654,-0.552581
5037,2020-12-31,1.25,0.858547,-6.472006,0.389960,0.138477,-1.520941,-0.21372,-0.553610,-0.906571,...,1.436988,0.268241,-0.801314,-1.215175,-1.208572,-0.056028,-0.144789,0.017269,-0.033654,-0.552581
5038,2020-12-31,1.50,1.192630,1.742726,0.378551,0.124766,-1.520941,-0.21372,-0.384698,-0.960639,...,1.436988,0.268241,-0.801314,-1.215175,-1.208572,-0.056028,-0.144789,0.017269,-0.033654,-0.552581


# Standardize macro data

In [18]:
import pandas as pd

def standardize(df):
    """Standardize the numeric columns, excluding the 'date' column."""
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()
    return df

# Assuming your DataFrame is called 'macro_train'
data = macro_train.copy()

# Exclude the 'date' column from the columns to be standardized
columns_to_standardize = data.columns.difference(['date'])

# Standardize the entire DataFrame (except 'date')
standardized_data = standardize(data[columns_to_standardize])

# Combine standardized numeric data with the 'date' column
standardized_data_macro = pd.concat([data[['date']], standardized_data], axis=1)

# Display the standardized DataFrame
standardized_data_macro


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()


Unnamed: 0,date,10Y_RIR,1Y_bond,2Y_bond,CLOSE_vix,FF_rate,HIGH_vix,LOW_vix,OPEN_vix,gold_price,reces_indi,spread_vix
3,2019-01-02,1.562822,1.371916,1.469739,0.476238,1.124610,0.475155,0.662651,0.674867,-1.353864,-0.709827,-0.133918
4,2019-01-03,1.562822,1.371916,1.469739,0.071348,1.124610,0.457626,0.198016,0.656238,-1.349397,-0.709394,1.065358
5,2019-01-04,1.426211,1.310038,1.377262,0.336815,1.124610,0.300487,0.331346,0.252127,-1.344931,-0.708961,0.153294
8,2019-01-07,1.576483,1.355593,1.441962,-0.067370,1.124610,0.014381,0.038020,0.124589,-1.340465,-0.708528,-0.052797
9,2019-01-08,1.617466,1.392399,1.488480,-0.023713,1.124610,-0.045720,0.059838,0.080165,-1.327066,-0.707228,-0.322470
...,...,...,...,...,...,...,...,...,...,...,...,...
725,2020-12-24,-0.759553,-1.214549,-1.201615,-0.183556,-1.160795,-0.265465,-0.221367,-0.171329,1.333629,0.283189,-0.329047
729,2020-12-28,-0.786875,-1.215936,-1.205631,-0.330020,-1.160795,-0.311167,-0.284395,-0.223634,1.348571,0.281024,-0.318085
730,2020-12-29,-0.786875,-1.209428,-1.202285,-0.123703,-1.160795,-0.233536,-0.259346,-0.368369,1.408338,0.272360,-0.114186
731,2020-12-30,-0.786875,-1.206761,-1.201057,-0.023713,-1.160795,-0.092048,-0.049250,-0.162015,1.423280,0.270195,-0.188730


In [19]:
import pandas as pd

def standardize(df):
    """Standardize the numeric columns, excluding the 'date' column."""
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()
    return df

# Assuming your DataFrame is called 'macro_train'
data = firm_train.copy()

# Exclude the 'date' column from the columns to be standardized
columns_to_standardize = data.columns.difference(['Ticker', 'date'])

# Standardize the entire DataFrame (except 'date')
standardized_data = standardize(data[columns_to_standardize])

# Combine standardized numeric data with the 'date' column
standardized_data_firm = pd.concat([data[['Ticker' ,'date']], standardized_data], axis=1)

# Display the standardized DataFrame
standardized_data_firm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()


Unnamed: 0,Ticker,date,5_day_rolling_return_stock,ASK,ASKHI,BID,BIDLO,PRC,PRC_actual,RET,...,spread_stock,std_dolvol,std_turn,stdacc,stdcf,tang,tb,turn,vol_stock,zerotrade
0,AAPL,2019-01-02,-0.286276,-0.568980,-0.569260,-0.569010,-0.567166,-0.568991,-1.119896,-0.080584,...,-0.125902,0.384411,-0.383392,-1.637978,-1.397221,-2.269561,-0.150957,-0.464434,0.737871,-0.186964
1,AAPL,2019-01-03,0.513255,-0.566836,-0.568214,-0.566853,-0.564699,-0.567087,-1.114221,0.234178,...,-0.131505,0.340969,-0.384122,-1.637978,-1.397221,-2.269561,-0.150957,-0.463254,0.462326,-0.179913
2,AAPL,2019-01-04,0.974270,-0.566849,-0.568849,-0.566878,-0.567575,-0.566860,-1.113545,-0.058953,...,-0.125914,0.297528,-0.384853,-1.637978,-1.397221,-2.269561,-0.150957,-0.462073,0.540321,-0.172861
3,AAPL,2019-01-07,-1.739500,-0.586819,-0.585200,-0.586846,-0.583208,-0.586694,-1.172660,-3.523569,...,-0.131510,0.167203,-0.387044,-1.637978,-1.397221,-2.269561,-0.150957,-0.458530,2.593612,-0.151707
4,AAPL,2019-01-08,-1.028140,-0.579040,-0.581676,-0.579063,-0.580907,-0.579040,-1.149848,1.369838,...,-0.131516,0.123762,-0.387774,-1.637978,-1.397221,-2.269561,-0.150957,-0.457350,1.354594,-0.144656
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5799,TSLA,2020-12-24,-0.047648,0.041302,0.042610,0.041255,0.020426,0.041417,1.501621,-0.601955,...,0.030951,2.181198,0.928123,1.677337,0.982961,-0.057908,-0.411748,1.730563,1.095747,-1.103519
5800,TSLA,2020-12-28,0.366524,0.048437,0.044627,0.048381,0.031086,0.048528,1.529882,0.204706,...,0.036559,2.620721,1.011200,1.677337,0.982961,-0.057908,-0.411748,1.765113,0.388826,-1.097611
5801,TSLA,2020-12-29,-0.083321,0.068356,0.062795,0.068501,0.054645,0.068437,1.609004,0.742412,...,-0.047459,2.730602,1.031969,1.677337,0.982961,-0.057908,-0.411748,1.773751,0.001428,-1.096134
5802,TSLA,2020-12-30,-0.941110,0.070865,0.081860,0.070998,0.079954,0.070858,1.618625,0.001571,...,-0.041851,2.840483,1.052738,1.677337,0.982961,-0.057908,-0.411748,1.782388,0.357568,-1.094657


# Calls standardization

In [20]:
option_train_c  = option_train.copy()

# Apply filter of moneyness
option_train_c = option_train_c[option_train_c["cp_flag"] == "C"]
option_train_c = option_train_c[(option_train_c["moneyness"] >= -0.5) & (option_train_c["moneyness"] <= 2)]

import pandas as pd

def standardize(df):
    """Standardize the numeric columns, excluding the 'date' column."""
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()
    return df

# Assuming your DataFrame is called 'macro_train'
data = option_train_c.copy()

# Exclude the 'date' column from the columns to be standardized
columns_to_standardize = data.columns.difference(['Ticker', 'date', 'cp_flag', 'impl_volatility'])

# Standardize the entire DataFrame (except 'date')
standardized_data = standardize(data[columns_to_standardize])

# Combine standardized numeric data with the 'date' column
standardized_data_opt_c = pd.concat([data[['Ticker' ,'date', 'cp_flag', 'impl_volatility']], standardized_data], axis=1)

# Display the standardized DataFrame
standardized_data_opt_c


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()


Unnamed: 0,Ticker,date,cp_flag,impl_volatility,T,best_bid_option,best_offer_option,moneyness,moneyness_squared,moneyness_tau,prev2_day_iv,prev_day_iv,spread_option,tau_squared,trading_days_till_exp,volume_option
64506,AAPL,2019-01-02,C,0.344281,-0.854054,-0.372907,-0.379237,-1.651490,-0.693975,-1.331578,-1.519873,-1.542073,-0.465828,-0.912062,-0.854054,0.582528
64507,AAPL,2019-01-02,C,0.348925,-0.854054,-0.428732,-0.434495,-1.062773,-0.883319,-0.945750,-1.519873,-0.759052,-0.487390,-0.912062,-0.854054,1.099089
64508,AAPL,2019-01-02,C,0.346461,-0.854054,-0.465140,-0.470430,-0.482466,-0.768297,-0.565435,-1.519873,-0.716917,-0.498171,-0.912062,-0.854054,1.904057
64509,AAPL,2019-01-02,C,0.349742,-0.854054,-0.482824,-0.488059,0.088029,-0.363283,-0.191549,-1.519873,-0.693295,-0.508952,-0.912062,-0.854054,0.981316
64510,AAPL,2019-01-02,C,0.355280,-0.854054,-0.489759,-0.494839,0.650114,0.318840,0.176824,-1.519873,-0.674080,-0.508952,-0.912062,-0.854054,0.554578
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235024,TSLA,2020-12-31,C,0.623205,1.331335,-0.383309,-0.387712,1.060814,0.994905,1.793626,-0.140819,0.024429,-0.411923,1.480315,1.331335,-0.288466
235025,TSLA,2020-12-31,C,0.629719,1.331335,-0.393364,-0.398899,1.156130,1.173255,1.918561,-0.140819,0.024429,-0.455047,1.480315,1.331335,2.220414
235026,TSLA,2020-12-31,C,0.639880,1.331335,-0.412782,-0.418562,1.345361,1.551274,2.166593,-0.071801,0.110670,-0.476609,1.480315,1.331335,-0.200060
235027,TSLA,2020-12-31,C,0.648805,1.331335,-0.429426,-0.434156,1.533190,1.957986,2.412788,-0.037014,0.158024,-0.455047,1.480315,1.331335,-0.046135


# Puts standardization

In [21]:
option_train_p  = option_train.copy()
option_train_p = option_train_p[option_train_p["cp_flag"] == "P"]
option_train_p = option_train_p[(option_train_p["moneyness"] >= -2) & (option_train_p["moneyness"] <= 0.5)]

import pandas as pd

def standardize(df):
    """Standardize the numeric columns, excluding the 'date' column."""
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()
    return df

# Assuming your DataFrame is called 'macro_train'
data = option_train_p.copy()

# Exclude the 'date' column from the columns to be standardized
columns_to_standardize = data.columns.difference(['Ticker', 'date', 'cp_flag', 'impl_volatility'])

# Standardize the entire DataFrame (except 'date')
standardized_data = standardize(data[columns_to_standardize])

# Combine standardized numeric data with the 'date' column
standardized_data_opt_p = pd.concat([data[['Ticker' ,'date', 'cp_flag', 'impl_volatility']], standardized_data], axis=1)

# Display the standardized DataFrame
standardized_data_opt_p


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()


Unnamed: 0,Ticker,date,cp_flag,impl_volatility,T,best_bid_option,best_offer_option,moneyness,moneyness_squared,moneyness_tau,prev2_day_iv,prev_day_iv,spread_option,tau_squared,trading_days_till_exp,volume_option
251482,AAPL,2019-01-02,P,0.558181,-0.862489,-0.485564,-0.491857,-1.599060,2.114823,-0.787321,-1.565542,-0.215265,-0.531451,-0.919035,-0.862489,-0.283173
251483,AAPL,2019-01-02,P,0.541292,-0.862489,-0.484420,-0.490739,-1.348787,1.566450,-0.624106,-1.565542,-0.243882,-0.531451,-0.919035,-0.862489,-0.243252
251484,AAPL,2019-01-02,P,0.525111,-0.862489,-0.482894,-0.489247,-1.099920,1.075810,-0.461808,-1.565542,-0.272954,-0.531451,-0.919035,-0.862489,0.089150
251485,AAPL,2019-01-02,P,0.507597,-0.862489,-0.480987,-0.487382,-0.853865,0.644292,-0.301343,-1.565542,-0.301999,-0.531451,-0.919035,-0.862489,1.045216
251486,AAPL,2019-01-02,P,0.468786,-0.862489,-0.472976,-0.479179,-0.242243,-0.197542,0.097526,-1.565542,-0.349271,-0.519904,-0.919035,-0.862489,1.977247
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413599,TSLA,2020-12-31,P,0.573773,1.327470,0.504291,0.486655,1.347974,-0.845811,1.350982,-0.388218,-0.077136,-0.196571,1.475756,1.327470,0.486526
413600,TSLA,2020-12-31,P,0.573565,1.327470,0.557693,0.536997,1.401403,-0.828955,1.420670,-0.388671,-0.113290,-0.254309,1.475756,1.327470,-0.103122
413601,TSLA,2020-12-31,P,0.572960,1.327470,0.611096,0.589205,1.454832,-0.809587,1.490357,-0.388671,-0.113290,-0.254309,1.475756,1.327470,0.668411
413602,TSLA,2020-12-31,P,0.573072,1.327470,0.723623,0.701077,1.561690,-0.763315,1.629732,-0.385860,-0.145900,-0.196571,1.475756,1.327470,0.419720


# Merge all the testdata

In [22]:
# Apply filter of moneyness
option_train_f = option_test.copy()

option_train_data_c = option_test[option_train_f["cp_flag"] == "C"]
option_train_data_c = option_train_data_c[(option_train_data_c["moneyness"] >= -0.5) & (option_train_data_c["moneyness"] <= 2)]

option_train_data_p = option_train_f[option_train_f["cp_flag"] == "P"]
option_train_data_p = option_train_data_p[(option_train_data_p["moneyness"] <= 0.5) & (option_train_data_p["moneyness"] >= -2)]

options_total = pd.concat([option_train_data_c, option_train_data_p], axis=0)
# Merge with the option data & macro data
merge_data_t = pd.merge(options_total, firm_test, on=['date', 'Ticker'], how='left')

# Merge the data but now with the macro data
merge_data_total_t = pd.merge(merge_data_t, macro_test, on='date', how='left')

merge_data_total_t

Unnamed: 0,cp_flag,Ticker,date,trading_days_till_exp,moneyness,impl_volatility,T,prev_day_iv,prev2_day_iv,best_bid_option,...,gold_price,reces_indi,10Y_RIR,1Y_bond,2Y_bond,OPEN_vix,HIGH_vix,LOW_vix,CLOSE_vix,spread_vix
0,C,AAPL,2021-01-04,4,-0.475,0.433412,0.019841,0.337114,0.325650,2.54,...,672.012903,3.036452,0.93,0.1074,0.1400,21.29,22.49,20.57,21.20,1.92
1,C,AAPL,2021-01-04,4,-0.345,0.428775,0.019841,0.332640,0.312515,2.03,...,672.012903,3.036452,0.93,0.1074,0.1400,21.29,22.49,20.57,21.20,1.92
2,C,AAPL,2021-01-04,4,-0.216,0.426358,0.019841,0.330864,0.308907,1.60,...,672.012903,3.036452,0.93,0.1074,0.1400,21.29,22.49,20.57,21.20,1.92
3,C,AAPL,2021-01-04,4,-0.088,0.422649,0.019841,0.329431,0.306933,1.23,...,672.012903,3.036452,0.93,0.1074,0.1400,21.29,22.49,20.57,21.20,1.92
4,C,AAPL,2021-01-04,4,0.039,0.421479,0.019841,0.328318,0.305940,0.94,...,672.012903,3.036452,0.93,0.1074,0.1400,21.29,22.49,20.57,21.20,1.92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71690,P,TSLA,2021-12-30,1,-0.027,0.484270,0.007937,0.520039,0.592762,19.50,...,632.874194,-0.206774,1.55,0.4878,0.8182,14.36,14.60,12.56,12.85,2.04
71691,P,TSLA,2021-12-30,1,0.081,0.503165,0.007937,0.520221,0.600908,23.55,...,632.874194,-0.206774,1.55,0.4878,0.8182,14.36,14.60,12.56,12.85,2.04
71692,P,TSLA,2021-12-30,1,0.190,0.536047,0.007937,0.525230,0.609799,27.55,...,632.874194,-0.206774,1.55,0.4878,0.8182,14.36,14.60,12.56,12.85,2.04
71693,P,TSLA,2021-12-30,1,0.298,0.550971,0.007937,0.529985,0.620276,32.05,...,632.874194,-0.206774,1.55,0.4878,0.8182,14.36,14.60,12.56,12.85,2.04


In [23]:
# Check for NaN values column-wise
nan_columns = merge_data_total_t.isnull().sum()
print("Number of NaN values in each column:")
print(nan_columns)

# Add columns with NaN values to a list
nan_columns_list = nan_columns[nan_columns > 0].index.tolist()
print("Columns with NaN values:")
print(nan_columns_list)

Number of NaN values in each column:
cp_flag                  0
Ticker                   0
date                     0
trading_days_till_exp    0
moneyness                0
                        ..
OPEN_vix                 0
HIGH_vix                 0
LOW_vix                  0
CLOSE_vix                0
spread_vix               0
Length: 133, dtype: int64
Columns with NaN values:
[]


In [24]:
# Testset save to parquet

data_set_standardized_test = Path.cwd().parent.parent / "Data/updated_standardization/data_set_test_tech_nonscaled.parquet"

save_parquet = True
if save_parquet or not os.path.exists(data_set_standardized_test):
    merge_data_total_t.to_parquet(data_set_standardized_test)

# Merge all the standardized data with each other

In [25]:
# # Merge the option data

# # Concatenate the two DataFrames vertically
# combined_data = pd.concat([standardized_data_opt_p, standardized_data_opt_c], axis=0)

# # Reset the index if necessary
# combined_data.reset_index(drop=True, inplace=True)

# # Display the combined DataFrame
# (combined_data)

In [26]:
# Merge with the firm data & macro data
merge_data_p = pd.merge(standardized_data_opt_p, standardized_data_firm, on=['date', 'Ticker'], how='left')
merge_data_c = pd.merge(standardized_data_opt_c, standardized_data_firm, on=['date', 'Ticker'], how='left')

# Merge the data but now with the macro data
merge_data_total_p = pd.merge(merge_data_p, standardized_data_macro, on='date', how='left')
merge_data_total_c = pd.merge(merge_data_c, standardized_data_macro, on='date', how='left')

merge_data_total_p

Unnamed: 0,Ticker,date,cp_flag,impl_volatility,T,best_bid_option,best_offer_option,moneyness,moneyness_squared,moneyness_tau,...,1Y_bond,2Y_bond,CLOSE_vix,FF_rate,HIGH_vix,LOW_vix,OPEN_vix,gold_price,reces_indi,spread_vix
0,AAPL,2019-01-02,P,0.558181,-0.862489,-0.485564,-0.491857,-1.599060,2.114823,-0.787321,...,1.371916,1.469739,0.476238,1.124610,0.475155,0.662651,0.674867,-1.353864,-0.709827,-0.133918
1,AAPL,2019-01-02,P,0.541292,-0.862489,-0.484420,-0.490739,-1.348787,1.566450,-0.624106,...,1.371916,1.469739,0.476238,1.124610,0.475155,0.662651,0.674867,-1.353864,-0.709827,-0.133918
2,AAPL,2019-01-02,P,0.525111,-0.862489,-0.482894,-0.489247,-1.099920,1.075810,-0.461808,...,1.371916,1.469739,0.476238,1.124610,0.475155,0.662651,0.674867,-1.353864,-0.709827,-0.133918
3,AAPL,2019-01-02,P,0.507597,-0.862489,-0.480987,-0.487382,-0.853865,0.644292,-0.301343,...,1.371916,1.469739,0.476238,1.124610,0.475155,0.662651,0.674867,-1.353864,-0.709827,-0.133918
4,AAPL,2019-01-02,P,0.468786,-0.862489,-0.472976,-0.479179,-0.242243,-0.197542,0.097526,...,1.371916,1.469739,0.476238,1.124610,0.475155,0.662651,0.674867,-1.353864,-0.709827,-0.133918
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72959,TSLA,2020-12-31,P,0.573773,1.327470,0.504291,0.486655,1.347974,-0.845811,1.350982,...,-1.213802,-1.207416,-0.034979,-1.160795,-0.145889,0.015395,-0.057404,1.438222,0.268029,-0.552678
72960,TSLA,2020-12-31,P,0.573565,1.327470,0.557693,0.536997,1.401403,-0.828955,1.420670,...,-1.213802,-1.207416,-0.034979,-1.160795,-0.145889,0.015395,-0.057404,1.438222,0.268029,-0.552678
72961,TSLA,2020-12-31,P,0.572960,1.327470,0.611096,0.589205,1.454832,-0.809587,1.490357,...,-1.213802,-1.207416,-0.034979,-1.160795,-0.145889,0.015395,-0.057404,1.438222,0.268029,-0.552678
72962,TSLA,2020-12-31,P,0.573072,1.327470,0.723623,0.701077,1.561690,-0.763315,1.629732,...,-1.213802,-1.207416,-0.034979,-1.160795,-0.145889,0.015395,-0.057404,1.438222,0.268029,-0.552678


In [27]:
# Check for NaN values column-wise
nan_columns = merge_data_total_p.isnull().sum()
print("Number of NaN values in each column:")
print(nan_columns)

# Add columns with NaN values to a list
nan_columns_list = nan_columns[nan_columns > 0].index.tolist()
print("Columns with NaN values:")
print(nan_columns_list)

# Drop columns with NaN values
merge_data_total_p = merge_data_total_p.drop(columns=nan_columns_list)

Number of NaN values in each column:
Ticker             0
date               0
cp_flag            0
impl_volatility    0
T                  0
                  ..
LOW_vix            0
OPEN_vix           0
gold_price         0
reces_indi         0
spread_vix         0
Length: 133, dtype: int64
Columns with NaN values:
[]


In [28]:
# Save the parquet file

data_set_standardized_tot_p = Path.cwd().parent.parent / "Data/updated_standardization/data_set_train_val_tech_scaled_p.parquet"
data_set_standardized_tot_c = Path.cwd().parent.parent / "Data/updated_standardization/data_set_train_val_tech_scaled_c.parquet"

save_parquet = True
if save_parquet or not os.path.exists(data_set_standardized_tot_p):
    merge_data_total_p.to_parquet(data_set_standardized_tot_p)

save_parquet = True
if save_parquet or not os.path.exists(data_set_standardized_tot_c):
    merge_data_total_c.to_parquet(data_set_standardized_tot_c)

# Merge nonscaled traingsset tech

In [29]:
# Merge with the option data & macro data
option_train_f  = option_train.copy()

# Apply filter of moneyness
option_train_data_c = option_train_f[option_train_f["cp_flag"] == "C"]
option_train_data_c = option_train_data_c[(option_train_data_c["moneyness"] >= -0.5) & (option_train_data_c["moneyness"] <= 2)]

option_train_data_p = option_train_f[option_train_f["cp_flag"] == "P"]
option_train_data_p = option_train_data_p[(option_train_data_p["moneyness"] <= 0.5) & (option_train_data_p["moneyness"] >= -2)]

# Concatenate the two DataFrames vertically
optiondata_train = pd.concat([option_train_data_p, option_train_data_c], axis=0)

# Reset the index if necessary
optiondata_train.reset_index(drop=True, inplace=True)

optiondata_train

Unnamed: 0,cp_flag,Ticker,date,trading_days_till_exp,moneyness,impl_volatility,T,prev_day_iv,prev2_day_iv,best_bid_option,best_offer_option,volume_option,spread_option,moneyness_squared,tau_squared,moneyness_tau
0,P,AAPL,2019-01-02,2,-1.860,0.558181,0.011905,0.505761,0.000000,0.10,0.11,786,0.01,3.459600,0.000142,-0.022143
1,P,AAPL,2019-01-02,2,-1.682,0.541292,0.011905,0.495051,0.000000,0.13,0.14,982,0.01,2.829124,0.000142,-0.020024
2,P,AAPL,2019-01-02,2,-1.505,0.525111,0.011905,0.484170,0.000000,0.17,0.18,2614,0.01,2.265025,0.000142,-0.017917
3,P,AAPL,2019-01-02,2,-1.330,0.507597,0.011905,0.473299,0.000000,0.22,0.23,7308,0.01,1.768900,0.000142,-0.015833
4,P,AAPL,2019-01-02,2,-0.895,0.468786,0.011905,0.455607,0.000000,0.43,0.45,11884,0.02,0.801025,0.000142,-0.010655
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145306,C,TSLA,2020-12-31,5,1.467,0.623205,0.023810,0.554273,0.486818,3.15,3.25,1054,0.10,2.152089,0.000567,0.034929
145307,C,TSLA,2020-12-31,5,1.535,0.629719,0.023810,0.554273,0.486818,2.86,2.92,25829,0.06,2.356225,0.000567,0.036548
145308,C,TSLA,2020-12-31,5,1.670,0.639880,0.023810,0.584788,0.511182,2.30,2.34,1927,0.04,2.788900,0.000567,0.039762
145309,C,TSLA,2020-12-31,5,1.804,0.648805,0.023810,0.601543,0.523462,1.82,1.88,3447,0.06,3.254416,0.000567,0.042952


In [30]:
option_train_data_p

Unnamed: 0,cp_flag,Ticker,date,trading_days_till_exp,moneyness,impl_volatility,T,prev_day_iv,prev2_day_iv,best_bid_option,best_offer_option,volume_option,spread_option,moneyness_squared,tau_squared,moneyness_tau
251482,P,AAPL,2019-01-02,2,-1.860,0.558181,0.011905,0.505761,0.000000,0.10,0.11,786,0.01,3.459600,0.000142,-0.022143
251483,P,AAPL,2019-01-02,2,-1.682,0.541292,0.011905,0.495051,0.000000,0.13,0.14,982,0.01,2.829124,0.000142,-0.020024
251484,P,AAPL,2019-01-02,2,-1.505,0.525111,0.011905,0.484170,0.000000,0.17,0.18,2614,0.01,2.265025,0.000142,-0.017917
251485,P,AAPL,2019-01-02,2,-1.330,0.507597,0.011905,0.473299,0.000000,0.22,0.23,7308,0.01,1.768900,0.000142,-0.015833
251486,P,AAPL,2019-01-02,2,-0.895,0.468786,0.011905,0.455607,0.000000,0.43,0.45,11884,0.02,0.801025,0.000142,-0.010655
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413599,P,TSLA,2020-12-31,5,0.236,0.573773,0.023810,0.557458,0.432122,26.05,26.35,4565,0.30,0.055696,0.000567,0.005619
413600,P,TSLA,2020-12-31,5,0.274,0.573565,0.023810,0.543927,0.431956,27.45,27.70,1670,0.25,0.075076,0.000567,0.006524
413601,P,TSLA,2020-12-31,5,0.312,0.572960,0.023810,0.543927,0.431956,28.85,29.10,5458,0.25,0.097344,0.000567,0.007429
413602,P,TSLA,2020-12-31,5,0.388,0.573072,0.023810,0.531722,0.432987,31.80,32.10,4237,0.30,0.150544,0.000567,0.009238


In [31]:
merge_data_train = pd.merge(optiondata_train, firm_train, on=['date', 'Ticker'], how='left')

# Merge the data but now with the macro data
merge_data_total_train = pd.merge(merge_data_train, macro_train, on='date', how='left')

In [32]:
# Check for NaN values column-wise
nan_columns = merge_data_total_train.isnull().sum()
print("Number of NaN values in each column:")
print(nan_columns)

# Add columns with NaN values to a list
nan_columns_list = nan_columns[nan_columns > 0].index.tolist()
print("Columns with NaN values:")
print(nan_columns_list)

# Drop columns with NaN values
merge_data_total_train = merge_data_total_train.drop(columns=nan_columns_list)

Number of NaN values in each column:
cp_flag                  0
Ticker                   0
date                     0
trading_days_till_exp    0
moneyness                0
                        ..
OPEN_vix                 0
HIGH_vix                 0
LOW_vix                  0
CLOSE_vix                0
spread_vix               0
Length: 133, dtype: int64
Columns with NaN values:
[]


In [33]:
merge_data_total_train

Unnamed: 0,cp_flag,Ticker,date,trading_days_till_exp,moneyness,impl_volatility,T,prev_day_iv,prev2_day_iv,best_bid_option,...,gold_price,reces_indi,10Y_RIR,1Y_bond,2Y_bond,OPEN_vix,HIGH_vix,LOW_vix,CLOSE_vix,spread_vix
0,P,AAPL,2019-01-02,2,-1.860,0.558181,0.011905,0.505761,0.000000,0.10,...,449.000000,0.130000,2.66,2.5342,2.5442,31.72,31.74,28.57,28.57,3.17
1,P,AAPL,2019-01-02,2,-1.682,0.541292,0.011905,0.495051,0.000000,0.13,...,449.000000,0.130000,2.66,2.5342,2.5442,31.72,31.74,28.57,28.57,3.17
2,P,AAPL,2019-01-02,2,-1.505,0.525111,0.011905,0.484170,0.000000,0.17,...,449.000000,0.130000,2.66,2.5342,2.5442,31.72,31.74,28.57,28.57,3.17
3,P,AAPL,2019-01-02,2,-1.330,0.507597,0.011905,0.473299,0.000000,0.22,...,449.000000,0.130000,2.66,2.5342,2.5442,31.72,31.74,28.57,28.57,3.17
4,P,AAPL,2019-01-02,2,-0.895,0.468786,0.011905,0.455607,0.000000,0.43,...,449.000000,0.130000,2.66,2.5342,2.5442,31.72,31.74,28.57,28.57,3.17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145306,C,TSLA,2020-12-31,5,1.467,0.623205,0.023810,0.554273,0.486818,3.15,...,670.825806,3.042903,0.93,0.1105,0.1443,21.50,21.82,20.56,21.31,1.26
145307,C,TSLA,2020-12-31,5,1.535,0.629719,0.023810,0.554273,0.486818,2.86,...,670.825806,3.042903,0.93,0.1105,0.1443,21.50,21.82,20.56,21.31,1.26
145308,C,TSLA,2020-12-31,5,1.670,0.639880,0.023810,0.584788,0.511182,2.30,...,670.825806,3.042903,0.93,0.1105,0.1443,21.50,21.82,20.56,21.31,1.26
145309,C,TSLA,2020-12-31,5,1.804,0.648805,0.023810,0.601543,0.523462,1.82,...,670.825806,3.042903,0.93,0.1105,0.1443,21.50,21.82,20.56,21.31,1.26


In [34]:
# Save the parquet file

data_set_train_nonscaled = Path.cwd().parent.parent / "Data/updated_standardization/data_set_train_tech_nonscaled.parquet"

save_parquet = True
if save_parquet or not os.path.exists(data_set_train_nonscaled):
    merge_data_total_train.to_parquet(data_set_train_nonscaled)

# Merge scaled dataset test

In [35]:
# # Standardize micro data

# import pandas as pd

# def standardize(df):
#     """Standardize the numeric columns, excluding the 'date' column."""
#     numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
#     df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()
#     return df

# # Assuming your DataFrame is called 'macro_train'
# data = macro_test.copy()

# # Exclude the 'date' column from the columns to be standardized
# columns_to_standardize = data.columns.difference(['date'])

# # Standardize the entire DataFrame (except 'date')
# standardized_data = standardize(data[columns_to_standardize])

# # Combine standardized numeric data with the 'date' column
# standardized_data_macro_t = pd.concat([data[['date']], standardized_data], axis=1)

# # Display the standardized DataFrame
# standardized_data_macro_t


In [36]:
# import pandas as pd

# def standardize(df):
#     """Standardize the numeric columns, excluding the 'date' column."""
#     numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
#     df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()
#     return df

# # Assuming your DataFrame is called 'macro_train'
# data = firm_test.copy()

# # Exclude the 'date' column from the columns to be standardized
# columns_to_standardize = data.columns.difference(['Ticker', 'date'])

# # Standardize the entire DataFrame (except 'date')
# standardized_data = standardize(data[columns_to_standardize])

# # Combine standardized numeric data with the 'date' column
# standardized_data_firm_t = pd.concat([data[['Ticker' ,'date']], standardized_data], axis=1)

# # Display the standardized DataFrame
# standardized_data_firm_t



In [37]:
# option_test_c  = option_test.copy()

# # Apply filter of moneyness
# option_test_c = option_test_c[option_test_c["cp_flag"] == "C"]
# option_test_c = option_test_c[(option_test_c["moneyness"] >= -0.5) & (option_test_c["moneyness"] <= 2)]

# import pandas as pd

# def standardize(df):
#     """Standardize the numeric columns, excluding the 'date' column."""
#     numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
#     df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()
#     return df

# # Assuming your DataFrame is called 'macro_train'
# data = option_test_c.copy()

# # Exclude the 'date' column from the columns to be standardized
# columns_to_standardize = data.columns.difference(['Ticker', 'date', 'cp_flag'])

# # Standardize the entire DataFrame (except 'date')
# standardized_data = standardize(data[columns_to_standardize])

# # Combine standardized numeric data with the 'date' column
# standardized_data_opt_c_t = pd.concat([data[['Ticker' ,'date', 'cp_flag']], standardized_data], axis=1)

# # Display the standardized DataFrame
# standardized_data_opt_c_t


In [38]:
# option_test_p  = option_test.copy()
# option_test_p = option_test_p[option_test_p["cp_flag"] == "P"]
# option_test_p = option_test_p[(option_test_p["moneyness"] >= -2) & (option_test_p["moneyness"] <= 0.5)]

# import pandas as pd

# def standardize(df):
#     """Standardize the numeric columns, excluding the 'date' column."""
#     numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
#     df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()
#     return df

# # Assuming your DataFrame is called 'macro_train'
# data = option_test_p.copy()

# # Exclude the 'date' column from the columns to be standardized
# columns_to_standardize = data.columns.difference(['Ticker', 'date', 'cp_flag'])

# # Standardize the entire DataFrame (except 'date')
# standardized_data = standardize(data[columns_to_standardize])

# # Combine standardized numeric data with the 'date' column
# standardized_data_opt_p_t = pd.concat([data[['Ticker' ,'date', 'cp_flag']], standardized_data], axis=1)

# # Display the standardized DataFrame
# standardized_data_opt_p_t


In [39]:
# # Concatenate the two DataFrames vertically
# optiondata_test_scaled = pd.concat([standardized_data_opt_c_t, standardized_data_opt_p_t], axis=0)

# # Reset the index if necessary
# optiondata_test_scaled.reset_index(drop=True, inplace=True)

# optiondata_test_scaled

# merge_data_test_standardized = pd.merge(optiondata_test_scaled, standardized_data_firm_t, on=['date', 'Ticker'], how='left')

# # Merge the data but now with the macro data
# merge_data_test_scaled = pd.merge(merge_data_test_standardized, standardized_data_macro_t, on='date', how='left')

# merge_data_test_scaled

# # Check for NaN values column-wise
# nan_columns = merge_data_test_scaled.isnull().sum()
# print("Number of NaN values in each column:")
# print(nan_columns)

# # Add columns with NaN values to a list
# nan_columns_list = nan_columns[nan_columns > 0].index.tolist()
# print("Columns with NaN values:")
# print(nan_columns_list)

# # # Drop columns with NaN values
# merge_data_test_scaled = merge_data_test_scaled.drop(columns=nan_columns_list)

# nan_columns = merge_data_test_scaled.isnull().sum()
# print("Number of NaN values in each column:")
# print(nan_columns)


In [40]:
# # Save the parquet file

# data_set_test_scaled = Path.cwd().parent.parent / "Data/updated_standardization/data_set_test_tech_scaled.parquet"

# save_parquet = True
# if save_parquet or not os.path.exists(data_set_test_scaled):
#     merge_data_test_scaled.to_parquet(data_set_test_scaled)

# Create validation and testset

In [41]:
option_train_c = option_train2[option_train2["cp_flag"] == "C"]
option_train_c2 = option_train_c[(option_train_c["moneyness"] >= -0.5) & (option_train_c["moneyness"] <= 2)]

option_train_p = option_train2[option_train2["cp_flag"] == "P"]
option_train_p2 = option_train_p[(option_train_p["moneyness"] >= -2) & (option_train_p["moneyness"] <= 0.5)]

# Concatenate the two DataFrames vertically
optiondata_train2 = pd.concat([option_train_c2, option_train_p2], axis=0)

# Reset the index if necessary
optiondata_train2.reset_index(drop=True, inplace=True)

merge_data_train2 = pd.merge(optiondata_train2, firm_train2, on=['date', 'Ticker'], how='left')

# Merge the data but now with the macro data
validation_train2 = pd.merge(merge_data_train2, macro_train2, on='date', how='left')

# Check for NaN values column-wise
nan_columns = validation_train2.isnull().sum()
print("Number of NaN values in each column:")
print(nan_columns)

# Add columns with NaN values to a list
nan_columns_list = nan_columns[nan_columns > 0].index.tolist()
print("Columns with NaN values:")
print(nan_columns_list)

# # Drop columns with NaN values
merge_data_train_c1 = validation_train2.drop(columns=nan_columns_list)

nan_columns = merge_data_train_c1.isnull().sum()
print("Number of NaN values in each column:")
print(nan_columns)

validation_train2


Number of NaN values in each column:
cp_flag                  0
Ticker                   0
date                     0
trading_days_till_exp    0
moneyness                0
                        ..
OPEN_vix                 0
HIGH_vix                 0
LOW_vix                  0
CLOSE_vix                0
spread_vix               0
Length: 133, dtype: int64
Columns with NaN values:
[]
Number of NaN values in each column:
cp_flag                  0
Ticker                   0
date                     0
trading_days_till_exp    0
moneyness                0
                        ..
OPEN_vix                 0
HIGH_vix                 0
LOW_vix                  0
CLOSE_vix                0
spread_vix               0
Length: 133, dtype: int64


Unnamed: 0,cp_flag,Ticker,date,trading_days_till_exp,moneyness,impl_volatility,T,prev_day_iv,prev2_day_iv,best_bid_option,...,gold_price,reces_indi,10Y_RIR,1Y_bond,2Y_bond,OPEN_vix,HIGH_vix,LOW_vix,CLOSE_vix,spread_vix
0,C,AAPL,2020-01-02,1,-0.415,0.395077,0.007937,0.219748,0.257932,10.35,...,535.741935,0.000000,1.92,1.6348,1.6539,14.47,14.69,13.16,13.26,1.53
1,C,AAPL,2020-01-02,1,-0.133,0.342038,0.007937,0.212392,0.253059,7.95,...,535.741935,0.000000,1.92,1.6348,1.6539,14.47,14.69,13.16,13.26,1.53
2,C,AAPL,2020-01-02,1,0.146,0.320892,0.007937,0.209724,0.251866,5.65,...,535.741935,0.000000,1.92,1.6348,1.6539,14.47,14.69,13.16,13.26,1.53
3,C,AAPL,2020-01-02,1,0.423,0.306323,0.007937,0.209129,0.253682,3.65,...,535.741935,0.000000,1.92,1.6348,1.6539,14.47,14.69,13.16,13.26,1.53
4,C,AAPL,2020-01-02,1,0.698,0.294776,0.007937,0.211117,0.259173,2.02,...,535.741935,0.000000,1.92,1.6348,1.6539,14.47,14.69,13.16,13.26,1.53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87496,P,TSLA,2020-12-31,5,0.236,0.573773,0.023810,0.557458,0.432122,26.05,...,670.825806,3.042903,0.93,0.1105,0.1443,21.50,21.82,20.56,21.31,1.26
87497,P,TSLA,2020-12-31,5,0.274,0.573565,0.023810,0.543927,0.431956,27.45,...,670.825806,3.042903,0.93,0.1105,0.1443,21.50,21.82,20.56,21.31,1.26
87498,P,TSLA,2020-12-31,5,0.312,0.572960,0.023810,0.543927,0.431956,28.85,...,670.825806,3.042903,0.93,0.1105,0.1443,21.50,21.82,20.56,21.31,1.26
87499,P,TSLA,2020-12-31,5,0.388,0.573072,0.023810,0.531722,0.432987,31.80,...,670.825806,3.042903,0.93,0.1105,0.1443,21.50,21.82,20.56,21.31,1.26


In [42]:
# Save the parquet file

data_set_val_nonscaled = Path.cwd().parent.parent / "Data/updated_standardization/data_set_val_tech_nonscaled.parquet"

save_parquet = True
if save_parquet or not os.path.exists(data_set_val_nonscaled):
    validation_train2.to_parquet(data_set_val_nonscaled)

# Create a scaled dataset by starting with the train period calls

In [43]:
firm_train1

Unnamed: 0,Ticker,date,BIDLO,ASKHI,PRC,vol_stock,RET,BID,ASK,RETX,...,stdcf,ms,baspread,ill,maxret,retvol,std_dolvol,std_turn,zerotrade,sic2
0,AAPL,2019-01-02,154.55000,158.52000,156.23000,42291347.0,0.000512,156.22000,156.24001,0.000512,...,0.056201,6.0,0.027847,2.548018e-12,0.040515,0.027893,0.344719,3.369334,1.040351e-08,36.0
1,AAPL,2019-01-03,156.48000,159.36000,157.74001,35003466.0,0.009665,157.92999,157.94000,0.009665,...,0.056201,6.0,0.028037,2.557111e-12,0.041546,0.027900,0.341555,3.361944,1.046380e-08,36.0
2,AAPL,2019-01-04,154.23000,158.85001,157.92000,37066356.0,0.001141,157.91000,157.92999,0.001141,...,0.056201,6.0,0.028228,2.566203e-12,0.042578,0.027907,0.338390,3.354553,1.052410e-08,36.0
3,AAPL,2019-01-07,142.00000,145.72000,142.19000,91373695.0,-0.099607,142.08000,142.09000,-0.099607,...,0.056201,6.0,0.028801,2.593480e-12,0.045671,0.027927,0.328897,3.332380,1.070499e-08,36.0
4,AAPL,2019-01-08,143.80000,148.54990,148.25999,58603001.0,0.042689,148.25000,148.25999,0.042689,...,0.056201,6.0,0.028991,2.602572e-12,0.046703,0.027934,0.325733,3.324990,1.076528e-08,36.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5546,TSLA,2019-12-24,400.18500,413.00000,405.59000,14750787.0,0.003836,405.48999,405.54999,0.003836,...,0.174166,4.0,0.027660,5.803530e-12,0.061703,0.026152,0.460671,23.352328,2.269810e-09,37.0
5547,TSLA,2019-12-26,410.00000,422.01001,419.22000,13298127.0,0.033605,419.10999,419.17999,0.033605,...,0.174166,4.0,0.027109,5.693842e-12,0.052504,0.024693,0.449543,22.061294,2.334807e-09,37.0
5548,TSLA,2019-12-27,412.68750,425.47000,425.25000,8186207.0,0.014384,425.10001,425.19000,0.014384,...,0.174166,4.0,0.026834,5.638999e-12,0.047905,0.023964,0.443979,21.415777,2.367305e-09,37.0
5549,TSLA,2019-12-30,426.35001,433.48001,430.94000,10617605.0,0.013380,430.82001,430.92001,0.013380,...,0.174166,4.0,0.026008,5.474468e-12,0.034106,0.021775,0.427287,19.479225,2.464799e-09,37.0


In [44]:
# Apply filter of moneyness
option_train_c = option_train1[option_train1["cp_flag"] == "C"]
option_train_c = option_train_c[(option_train_c["moneyness"] >= -0.5) & (option_train_c["moneyness"] <= 2)]

def standardize(df):
    """Standardize the numeric columns, excluding the 'date' column."""
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()
    return df

# Assuming your DataFrame is called 'macro_train'
data = option_train_c.copy()

# Exclude the 'date' column from the columns to be standardized
columns_to_standardize = data.columns.difference(['Ticker', 'date', 'cp_flag', 'impl_volatility'])

# Standardize the entire DataFrame (except 'date')
standardized_data = standardize(data[columns_to_standardize])

# Combine standardized numeric data with the 'date' column
standardized_data_opt_c_t1 = pd.concat([data[['Ticker' ,'date', 'cp_flag','impl_volatility']], standardized_data], axis=1)

# Display the standardized DataFrame
standardized_data_opt_c_t1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()


Unnamed: 0,Ticker,date,cp_flag,impl_volatility,T,best_bid_option,best_offer_option,moneyness,moneyness_squared,moneyness_tau,prev2_day_iv,prev_day_iv,spread_option,tau_squared,trading_days_till_exp,volume_option
64506,AAPL,2019-01-02,C,0.344281,-0.847372,-0.266359,-0.273217,-1.694497,-0.729467,-1.363855,-1.715833,-1.684428,-0.368341,-0.904907,-0.847372,1.116993
64507,AAPL,2019-01-02,C,0.348925,-0.847372,-0.377284,-0.382634,-1.107359,-0.916247,-0.981762,-1.715833,-0.487047,-0.404428,-0.904907,-0.847372,1.954066
64508,AAPL,2019-01-02,C,0.346461,-0.847372,-0.449627,-0.453789,-0.528609,-0.802783,-0.605128,-1.715833,-0.422614,-0.422471,-0.904907,-0.847372,3.258493
64509,AAPL,2019-01-02,C,0.349742,-0.847372,-0.484765,-0.488695,0.040355,-0.403252,-0.234861,-1.715833,-0.386493,-0.440515,-0.904907,-0.847372,1.763218
64510,AAPL,2019-01-02,C,0.355280,-0.847372,-0.498545,-0.502121,0.600932,0.269635,0.129947,-1.715833,-0.357108,-0.440515,-0.904907,-0.847372,1.071702
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214922,TSLA,2019-12-31,C,0.515649,-0.847372,-0.459273,-0.460502,1.045479,1.000302,0.419246,0.644425,0.977332,-0.350297,-0.904907,-0.847372,-0.248644
214923,TSLA,2019-12-31,C,0.527214,-0.847372,-0.464785,-0.469228,1.199253,1.293624,0.519318,0.664717,1.029089,-0.440515,-0.904907,-0.847372,0.069217
214924,TSLA,2019-12-31,C,0.532132,-0.847372,-0.473741,-0.475941,1.350231,1.601904,0.617570,0.691227,1.078811,-0.386384,-0.904907,-0.847372,-0.359739
214925,TSLA,2019-12-31,C,0.546039,-0.847372,-0.477186,-0.480640,1.502607,1.933425,0.716733,0.720056,1.120815,-0.422471,-0.904907,-0.847372,-0.215660


In [45]:
# Apply filter of moneyness
option_train_p = option_train1[option_train1["cp_flag"] == "P"]
option_train_p = option_train_p[(option_train_p["moneyness"] >= -2) & (option_train_p["moneyness"] <= 0.5)]

import pandas as pd

def standardize(df):
    """Standardize the numeric columns, excluding the 'date' column."""
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()
    return df

# Assuming your DataFrame is called 'macro_train'
data = option_train_p.copy()

# Exclude the 'date' column from the columns to be standardized
columns_to_standardize = data.columns.difference(['Ticker', 'date', 'cp_flag', 'impl_volatility'])

# Standardize the entire DataFrame (except 'date')
standardized_data = standardize(data[columns_to_standardize])

# Combine standardized numeric data with the 'date' column
standardized_data_opt_p_t1 = pd.concat([data[['Ticker' ,'date', 'cp_flag', 'impl_volatility']], standardized_data], axis=1)

# Display the standardized DataFrame
standardized_data_opt_p_t1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()


Unnamed: 0,Ticker,date,cp_flag,impl_volatility,T,best_bid_option,best_offer_option,moneyness,moneyness_squared,moneyness_tau,prev2_day_iv,prev_day_iv,spread_option,tau_squared,trading_days_till_exp,volume_option
251482,AAPL,2019-01-02,P,0.558181,-0.846775,-0.513686,-0.517308,-1.568502,2.068950,-0.763267,-1.733221,0.323252,-0.452154,-0.90539,-0.846775,-0.283004
251483,AAPL,2019-01-02,P,0.541292,-0.846775,-0.511541,-0.515217,-1.319855,1.526289,-0.601311,-1.733221,0.280146,-0.452154,-0.90539,-0.846775,-0.229258
251484,AAPL,2019-01-02,P,0.525111,-0.846775,-0.508682,-0.512431,-1.072606,1.040760,-0.440264,-1.733221,0.236354,-0.452154,-0.90539,-0.846775,0.218257
251485,AAPL,2019-01-02,P,0.507597,-0.846775,-0.505107,-0.508947,-0.828150,0.613737,-0.281038,-1.733221,0.192603,-0.452154,-0.90539,-0.846775,1.505414
251486,AAPL,2019-01-02,P,0.468786,-0.846775,-0.490093,-0.493619,-0.220504,-0.219329,0.114754,-1.733221,0.121395,-0.433291,-0.90539,-0.846775,2.760213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393306,TSLA,2019-12-31,P,0.510903,-0.846775,-0.088297,-0.092995,1.043681,-0.908698,0.938182,0.258539,0.363153,-0.188070,-0.90539,-0.846775,1.047478
393307,TSLA,2019-12-31,P,0.516629,-0.846775,-0.002504,-0.002419,1.208514,-0.894682,1.045546,0.259157,0.386832,0.000561,-0.90539,-0.846775,0.127767
393308,TSLA,2019-12-31,P,0.518828,-0.846775,0.094013,0.088156,1.373347,-0.856697,1.152911,0.263376,0.412048,-0.093755,-0.90539,-0.846775,0.729665
393309,TSLA,2019-12-31,P,0.524371,-0.846775,0.197679,0.196150,1.535386,-0.795993,1.258455,0.280704,0.464358,0.094877,-0.90539,-0.846775,-0.232001


# Standardize firm characteristics

In [46]:
def standardize(df):
    """Standardize the numeric columns, excluding the 'date' column."""
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()
    return df

# Assuming your DataFrame is called 'macro_train'
data = firm_train1.copy()

# Exclude the 'date' column from the columns to be standardized
columns_to_standardize = data.columns.difference(['Ticker', 'date'])

# Standardize the entire DataFrame (except 'date')
standardized_data = standardize(data[columns_to_standardize])

# Combine standardized numeric data with the 'date' column
standardized_data_firm_t1 = pd.concat([data[['Ticker' ,'date']], standardized_data], axis=1)

# Display the standardized DataFrame
standardized_data_firm_t1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()


Unnamed: 0,Ticker,date,5_day_rolling_return_stock,ASK,ASKHI,BID,BIDLO,PRC,PRC_actual,RET,...,spread_stock,std_dolvol,std_turn,stdacc,stdcf,tang,tb,turn,vol_stock,zerotrade
0,AAPL,2019-01-02,-0.307275,-0.505712,-0.505068,-0.505698,-0.504837,-0.505707,-0.822966,-0.071612,...,-0.426741,0.545217,-0.380475,-1.535547,-1.515170,-1.989614,-0.303392,-0.506294,2.383640,-0.311142
1,AAPL,2019-01-03,0.807128,-0.502869,-0.503673,-0.502837,-0.501580,-0.503181,-0.816294,0.371267,...,-0.479031,0.495464,-0.381395,-1.535547,-1.515170,-1.989614,-0.303392,-0.504821,1.731332,-0.304651
2,AAPL,2019-01-04,1.449699,-0.502885,-0.504520,-0.502871,-0.505377,-0.502880,-0.815499,-0.041177,...,-0.426845,0.445712,-0.382316,-1.535547,-1.515170,-1.989614,-0.303392,-0.503348,1.915973,-0.298160
3,AAPL,2019-01-07,-2.332808,-0.529379,-0.526331,-0.529354,-0.526012,-0.529194,-0.884997,-4.915996,...,-0.479083,0.296454,-0.385077,-1.535547,-1.515170,-1.989614,-0.303392,-0.498929,6.776798,-0.278687
4,AAPL,2019-01-08,-1.341300,-0.519059,-0.521630,-0.519032,-0.522975,-0.519040,-0.858179,1.969175,...,-0.479136,0.246701,-0.385998,-1.535547,-1.515170,-1.989614,-0.303392,-0.497456,3.843629,-0.272196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5546,TSLA,2019-12-24,2.693347,-0.088722,-0.082340,-0.088674,-0.090389,-0.088564,-1.035361,0.089224,...,-0.217633,2.368262,2.108093,1.553935,0.835866,-0.191921,0.382637,1.749715,-0.081401,-1.186759
5547,TSLA,2019-12-26,1.966520,-0.065924,-0.067373,-0.065888,-0.073828,-0.065763,-1.019302,1.529634,...,-0.165343,2.193302,1.947315,1.553935,0.835866,-0.191921,0.382637,1.756744,-0.211423,-1.179762
5548,TSLA,2019-12-27,2.479921,-0.055872,-0.061625,-0.055866,-0.069294,-0.055676,-1.012197,0.599602,...,-0.060815,2.105821,1.866926,1.553935,0.835866,-0.191921,0.382637,1.760259,-0.668970,-1.176263
5549,TSLA,2019-12-30,1.905592,-0.046288,-0.048319,-0.046297,-0.046242,-0.046157,-1.005494,0.551022,...,-0.008472,1.843381,1.625759,1.553935,0.835866,-0.191921,0.382637,1.770802,-0.451345,-1.165767


# Standardized macro data

In [47]:
# Standardize micro data
import pandas as pd

def standardize(df):
    """Standardize the numeric columns, excluding the 'date' column."""
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()
    return df

# Assuming your DataFrame is called 'macro_train'
data = macro_train1.copy()

# Exclude the 'date' column from the columns to be standardized
columns_to_standardize = data.columns.difference(['date'])

# Standardize the entire DataFrame (except 'date')
standardized_data = standardize(data[columns_to_standardize])

# Combine standardized numeric data with the 'date' column
standardized_data_macro_t1 = pd.concat([data[['date']], standardized_data], axis=1)

# Display the standardized DataFrame
standardized_data_macro_t1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()


Unnamed: 0,date,10Y_RIR,1Y_bond,2Y_bond,CLOSE_vix,FF_rate,HIGH_vix,LOW_vix,OPEN_vix,gold_price,reces_indi,spread_vix
3,2019-01-02,1.286663,1.307506,1.421618,3.846509,0.733069,3.622996,4.431212,4.206734,-1.217741,1.555024,0.578051
4,2019-01-03,1.286663,1.307506,1.421618,2.267485,0.733069,3.558028,2.698761,4.140757,-1.207092,1.573545,4.063041
5,2019-01-04,1.038280,1.140601,1.198933,3.302776,0.733069,2.975638,3.195899,2.709562,-1.196444,1.592066,1.412664
8,2019-01-07,1.311501,1.263478,1.354732,1.726498,0.733069,1.915271,2.102196,2.257873,-1.185795,1.610586,0.813781
9,2019-01-08,1.386016,1.362758,1.466746,1.896758,0.733069,1.692524,2.183546,2.100543,-1.153850,1.666149,0.030136
...,...,...,...,...,...,...,...,...,...,...,...,...
359,2019-12-24,-0.526532,-1.168736,-0.815977,-1.299736,-1.880068,-1.309917,-1.287382,-1.211000,1.227346,-0.310944,-0.874559
361,2019-12-26,-0.601047,-1.199239,-0.865402,-1.373882,-1.880068,-1.402728,-1.317511,-1.345492,1.247094,-0.310944,-1.065692
362,2019-12-27,-0.601047,-1.197800,-0.871581,-1.269529,-1.880068,-1.384166,-1.251226,-1.276977,1.286590,-0.310944,-1.154888
365,2019-12-30,-0.650724,-1.256505,-0.947868,-0.978439,-1.880068,-1.054687,-1.073462,-1.223688,1.306338,-0.310944,-0.626087


# Combine all the training data

In [48]:
merge_train_c1 = pd.merge(standardized_data_opt_c_t1, standardized_data_firm_t1, on=['date', 'Ticker'], how='left')

# Merge the data but now with the macro data
merge_data_train_c1 = pd.merge(merge_train_c1, standardized_data_macro_t1, on='date', how='left')

merge_data_train_c1

# Check for NaN values column-wise
nan_columns = merge_data_train_c1.isnull().sum()
print("Number of NaN values in each column:")
print(nan_columns)

# Add columns with NaN values to a list
nan_columns_list = nan_columns[nan_columns > 0].index.tolist()
print("Columns with NaN values:")
print(nan_columns_list)

# # Drop columns with NaN values
merge_data_train_c1 = merge_data_train_c1.drop(columns=nan_columns_list)

nan_columns = merge_data_train_c1.isnull().sum()
print("Number of NaN values in each column:")
print(nan_columns)


Number of NaN values in each column:
Ticker             0
date               0
cp_flag            0
impl_volatility    0
T                  0
                  ..
LOW_vix            0
OPEN_vix           0
gold_price         0
reces_indi         0
spread_vix         0
Length: 133, dtype: int64
Columns with NaN values:
[]
Number of NaN values in each column:
Ticker             0
date               0
cp_flag            0
impl_volatility    0
T                  0
                  ..
LOW_vix            0
OPEN_vix           0
gold_price         0
reces_indi         0
spread_vix         0
Length: 133, dtype: int64


In [49]:
merge_train_p1 = pd.merge(standardized_data_opt_p_t1, standardized_data_firm_t1, on=['date', 'Ticker'], how='left')

# Merge the data but now with the macro data
merge_data_train_p1 = pd.merge(merge_train_p1, standardized_data_macro_t1, on='date', how='left')

merge_data_train_p1

# Check for NaN values column-wise
nan_columns = merge_data_train_p1.isnull().sum()
print("Number of NaN values in each column:")
print(nan_columns)

# Add columns with NaN values to a list
nan_columns_list = nan_columns[nan_columns > 0].index.tolist()
print("Columns with NaN values:")
print(nan_columns_list)

# # Drop columns with NaN values
merge_data_train_p1 = merge_data_train_p1.drop(columns=nan_columns_list)

nan_columns = merge_data_train_p1.isnull().sum()
print("Number of NaN values in each column:")
print(nan_columns)
merge_data_train_p1

Number of NaN values in each column:
Ticker             0
date               0
cp_flag            0
impl_volatility    0
T                  0
                  ..
LOW_vix            0
OPEN_vix           0
gold_price         0
reces_indi         0
spread_vix         0
Length: 133, dtype: int64
Columns with NaN values:
[]
Number of NaN values in each column:
Ticker             0
date               0
cp_flag            0
impl_volatility    0
T                  0
                  ..
LOW_vix            0
OPEN_vix           0
gold_price         0
reces_indi         0
spread_vix         0
Length: 133, dtype: int64


Unnamed: 0,Ticker,date,cp_flag,impl_volatility,T,best_bid_option,best_offer_option,moneyness,moneyness_squared,moneyness_tau,...,1Y_bond,2Y_bond,CLOSE_vix,FF_rate,HIGH_vix,LOW_vix,OPEN_vix,gold_price,reces_indi,spread_vix
0,AAPL,2019-01-02,P,0.558181,-0.846775,-0.513686,-0.517308,-1.568502,2.068950,-0.763267,...,1.307506,1.421618,3.846509,0.733069,3.622996,4.431212,4.206734,-1.217741,1.555024,0.578051
1,AAPL,2019-01-02,P,0.541292,-0.846775,-0.511541,-0.515217,-1.319855,1.526289,-0.601311,...,1.307506,1.421618,3.846509,0.733069,3.622996,4.431212,4.206734,-1.217741,1.555024,0.578051
2,AAPL,2019-01-02,P,0.525111,-0.846775,-0.508682,-0.512431,-1.072606,1.040760,-0.440264,...,1.307506,1.421618,3.846509,0.733069,3.622996,4.431212,4.206734,-1.217741,1.555024,0.578051
3,AAPL,2019-01-02,P,0.507597,-0.846775,-0.505107,-0.508947,-0.828150,0.613737,-0.281038,...,1.307506,1.421618,3.846509,0.733069,3.622996,4.431212,4.206734,-1.217741,1.555024,0.578051
4,AAPL,2019-01-02,P,0.468786,-0.846775,-0.490093,-0.493619,-0.220504,-0.219329,0.114754,...,1.307506,1.421618,3.846509,0.733069,3.622996,4.431212,4.206734,-1.217741,1.555024,0.578051
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28636,TSLA,2019-12-31,P,0.510903,-0.846775,-0.088297,-0.092995,1.043681,-0.908698,0.938182,...,-1.270893,-0.968821,-0.217761,-1.880068,-0.386446,-0.865568,-1.053670,1.365583,-0.310944,0.769184
28637,TSLA,2019-12-31,P,0.516629,-0.846775,-0.002504,-0.002419,1.208514,-0.894682,1.045546,...,-1.270893,-0.968821,-0.217761,-1.880068,-0.386446,-0.865568,-1.053670,1.365583,-0.310944,0.769184
28638,TSLA,2019-12-31,P,0.518828,-0.846775,0.094013,0.088156,1.373347,-0.856697,1.152911,...,-1.270893,-0.968821,-0.217761,-1.880068,-0.386446,-0.865568,-1.053670,1.365583,-0.310944,0.769184
28639,TSLA,2019-12-31,P,0.524371,-0.846775,0.197679,0.196150,1.535386,-0.795993,1.258455,...,-1.270893,-0.968821,-0.217761,-1.880068,-0.386446,-0.865568,-1.053670,1.365583,-0.310944,0.769184


In [50]:
merge_data_train_c1

Unnamed: 0,Ticker,date,cp_flag,impl_volatility,T,best_bid_option,best_offer_option,moneyness,moneyness_squared,moneyness_tau,...,1Y_bond,2Y_bond,CLOSE_vix,FF_rate,HIGH_vix,LOW_vix,OPEN_vix,gold_price,reces_indi,spread_vix
0,AAPL,2019-01-02,C,0.344281,-0.847372,-0.266359,-0.273217,-1.694497,-0.729467,-1.363855,...,1.307506,1.421618,3.846509,0.733069,3.622996,4.431212,4.206734,-1.217741,1.555024,0.578051
1,AAPL,2019-01-02,C,0.348925,-0.847372,-0.377284,-0.382634,-1.107359,-0.916247,-0.981762,...,1.307506,1.421618,3.846509,0.733069,3.622996,4.431212,4.206734,-1.217741,1.555024,0.578051
2,AAPL,2019-01-02,C,0.346461,-0.847372,-0.449627,-0.453789,-0.528609,-0.802783,-0.605128,...,1.307506,1.421618,3.846509,0.733069,3.622996,4.431212,4.206734,-1.217741,1.555024,0.578051
3,AAPL,2019-01-02,C,0.349742,-0.847372,-0.484765,-0.488695,0.040355,-0.403252,-0.234861,...,1.307506,1.421618,3.846509,0.733069,3.622996,4.431212,4.206734,-1.217741,1.555024,0.578051
4,AAPL,2019-01-02,C,0.355280,-0.847372,-0.498545,-0.502121,0.600932,0.269635,0.129947,...,1.307506,1.421618,3.846509,0.733069,3.622996,4.431212,4.206734,-1.217741,1.555024,0.578051
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29164,TSLA,2019-12-31,C,0.515649,-0.847372,-0.459273,-0.460502,1.045479,1.000302,0.419246,...,-1.270893,-0.968821,-0.217761,-1.880068,-0.386446,-0.865568,-1.053670,1.365583,-0.310944,0.769184
29165,TSLA,2019-12-31,C,0.527214,-0.847372,-0.464785,-0.469228,1.199253,1.293624,0.519318,...,-1.270893,-0.968821,-0.217761,-1.880068,-0.386446,-0.865568,-1.053670,1.365583,-0.310944,0.769184
29166,TSLA,2019-12-31,C,0.532132,-0.847372,-0.473741,-0.475941,1.350231,1.601904,0.617570,...,-1.270893,-0.968821,-0.217761,-1.880068,-0.386446,-0.865568,-1.053670,1.365583,-0.310944,0.769184
29167,TSLA,2019-12-31,C,0.546039,-0.847372,-0.477186,-0.480640,1.502607,1.933425,0.716733,...,-1.270893,-0.968821,-0.217761,-1.880068,-0.386446,-0.865568,-1.053670,1.365583,-0.310944,0.769184


In [51]:
# Save the parquet file

data_set_train_p = Path.cwd().parent.parent / "Data/updated_standardization/data_set_train_tech_scaled_p.parquet"
data_set_train_c = Path.cwd().parent.parent / "Data/updated_standardization/data_set_train_tech_scaled_c.parquet"

save_parquet = True
if save_parquet or not os.path.exists(data_set_train_p):
    merge_data_train_p1.to_parquet(data_set_train_p)

save_parquet = True
if save_parquet or not os.path.exists(data_set_train_c):
    merge_data_train_c1.to_parquet(data_set_train_c)

In [52]:
data_set_train_nonscaled = pd.read_parquet("/Users/sbjpipers/Desktop/FinalThesisQF/FinalThesisQF/Data/updated_standardization/data_set_train_tech_nonscaled.parquet")
data_set_train_nonscaled_c = data_set_train_nonscaled[(data_set_train_nonscaled["cp_flag"] == "C") & (data_set_train_nonscaled["date"] < '2020-01-01')]
# data_set_train_nonscaled_c


validation_c = validation_train2[validation_train2["cp_flag"] == "C"]
validation_p = validation_train2[validation_train2["cp_flag"] == "P"]
test_c = merge_data_total_t[merge_data_total_t["cp_flag"] == "C"]
test_p = merge_data_total_t[merge_data_total_t["cp_flag"] == "P"]

# Example custom standardization function (as you might have implemented)
def custom_standardize(data, means=None, stds=None):
    if means is None or stds is None:
        means = data.mean()
        stds = data.std()
    standardized_data = (data - means) / stds
    return standardized_data, means, stds

# Step 1: Extract features and target from train and test data
train_features = data_set_train_nonscaled_c.drop(columns=['impl_volatility', 'date', 'Ticker', 'cp_flag'])
validate_features_c = validation_c.drop(columns=['impl_volatility', 'date', 'Ticker', 'cp_flag'])

# Step 2: Standardize the training data and save the means and stds
train_features_standardized, train_means, train_stds = custom_standardize(train_features)

# # Step 3: Standardize the test data using the saved means and stds from the training data
validatefeatures_c_standardized, _, _ = custom_standardize(validate_features_c, means=train_means, stds=train_stds)

# Step 4: Combine back the standardized test features with the non-feature columns if needed
val_data_c_standardized = pd.DataFrame(validatefeatures_c_standardized, columns=validate_features_c.columns)
val_data_c_standardized = pd.concat([validation_c[['date', 'Ticker', 'cp_flag', 'impl_volatility']], val_data_c_standardized], axis=1)

val_data_c_standardized



Unnamed: 0,date,Ticker,cp_flag,impl_volatility,trading_days_till_exp,moneyness,T,prev_day_iv,prev2_day_iv,best_bid_option,...,gold_price,reces_indi,10Y_RIR,1Y_bond,2Y_bond,OPEN_vix,HIGH_vix,LOW_vix,CLOSE_vix,spread_vix
0,2020-01-02,AAPL,C,0.395077,-1.579945,-1.620406,-1.579945,-0.734721,-0.543397,0.209036,...,1.470267,-0.368611,-0.631195,-1.364557,-1.039766,-0.281185,-0.430369,-0.323006,-0.453943,-0.498339
1,2020-01-02,AAPL,C,0.342038,-1.579945,-1.226185,-1.579945,-0.766512,-0.565548,0.043681,...,1.470267,-0.368611,-0.631195,-1.364557,-1.039766,-0.281185,-0.430369,-0.323006,-0.453943,-0.498339
2,2020-01-02,AAPL,C,0.320892,-1.579945,-0.836158,-1.579945,-0.778042,-0.570973,-0.114784,...,1.470267,-0.368611,-0.631195,-1.364557,-1.039766,-0.281185,-0.430369,-0.323006,-0.453943,-0.498339
3,2020-01-02,AAPL,C,0.306323,-1.579945,-0.448926,-1.579945,-0.780610,-0.562717,-0.252579,...,1.470267,-0.368611,-0.631195,-1.364557,-1.039766,-0.281185,-0.430369,-0.323006,-0.453943,-0.498339
4,2020-01-02,AAPL,C,0.294776,-1.579945,-0.064491,-1.579945,-0.772022,-0.537755,-0.364883,...,1.470267,-0.368611,-0.631195,-1.364557,-1.039766,-0.281185,-0.430369,-0.323006,-0.453943,-0.498339
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43173,2020-12-31,TSLA,C,0.623205,1.350346,1.010530,1.350346,0.711038,0.497010,-0.287028,...,5.539061,40.289602,-3.046020,-5.717770,-5.000053,1.434729,1.168070,1.806636,1.673194,-0.667091
43174,2020-12-31,TSLA,C,0.629719,1.350346,1.105590,1.350346,0.711038,0.497010,-0.307009,...,5.539061,40.289602,-3.046020,-5.717770,-5.000053,1.434729,1.168070,1.806636,1.673194,-0.667091
43175,2020-12-31,TSLA,C,0.639880,1.350346,1.294313,1.350346,0.842916,0.607756,-0.345591,...,5.539061,40.289602,-3.046020,-5.717770,-5.000053,1.434729,1.168070,1.806636,1.673194,-0.667091
43176,2020-12-31,TSLA,C,0.648805,1.350346,1.481638,1.350346,0.915329,0.663576,-0.378662,...,5.539061,40.289602,-3.046020,-5.717770,-5.000053,1.434729,1.168070,1.806636,1.673194,-0.667091


In [53]:
# Save the parquet file
data_set_val_scaled_c = Path.cwd().parent.parent / "Data/updated_standardization/data_set_val_tech_scaled_c.parquet"

save_parquet = True
if save_parquet or not os.path.exists(data_set_val_scaled_c):
    val_data_c_standardized.to_parquet(data_set_val_scaled_c)

In [54]:
data_set_train_nonscaled = pd.read_parquet("/Users/sbjpipers/Desktop/FinalThesisQF/FinalThesisQF/Data/updated_standardization/data_set_train_tech_nonscaled.parquet")
data_set_train_nonscaled_p = data_set_train_nonscaled[(data_set_train_nonscaled["cp_flag"] == "P") & (data_set_train_nonscaled["date"] < '2020-01-01')]

# Filter the validation set for 'C' and 'P' flags
validation_c = validation_train2[validation_train2["cp_flag"] == "C"]
validation_p = validation_train2[validation_train2["cp_flag"] == "P"]

# Custom standardization function
def custom_standardize(data, means=None, stds=None):
    if means is None or stds is None:
        means = data.mean()
        stds = data.std()
    standardized_data = (data - means) / stds
    return standardized_data, means, stds

# Step 1: Extract features from train data and the 'P' validation data
train_features = data_set_train_nonscaled_p.drop(columns=['impl_volatility', 'date', 'Ticker', 'cp_flag'])
validate_features_p = validation_p.drop(columns=['impl_volatility', 'date', 'Ticker', 'cp_flag'])

# Step 2: Standardize the training data and save the means and stds
train_features_standardized, train_means, train_stds = custom_standardize(train_features)

# Step 3: Standardize the 'P' validation data using the saved means and stds from the training data
validate_features_p_standardized, _, _ = custom_standardize(validate_features_p, means=train_means, stds=train_stds)

# Step 4: Combine back the standardized 'P' validation features with the non-feature columns
val_data_p_standardized = pd.DataFrame(validate_features_p_standardized, columns=validate_features_p.columns)
val_data_p_standardized = pd.concat([validation_p[['date', 'Ticker', 'cp_flag', 'impl_volatility']], val_data_p_standardized], axis=1)

# Output the standardized 'P' validation data
val_data_p_standardized


Unnamed: 0,date,Ticker,cp_flag,impl_volatility,trading_days_till_exp,moneyness,T,prev_day_iv,prev2_day_iv,best_bid_option,...,gold_price,reces_indi,10Y_RIR,1Y_bond,2Y_bond,OPEN_vix,HIGH_vix,LOW_vix,CLOSE_vix,spread_vix
43178,2020-01-02,AAPL,P,0.627576,-1.577887,-1.569898,-1.577887,-0.439591,-0.488398,-0.519406,...,1.484330,-0.387092,-0.643075,-1.374615,-1.048546,-0.289238,-0.436901,-0.330642,-0.459463,-0.501101
43179,2020-01-02,AAPL,P,0.589063,-1.577887,-1.157816,-1.577887,-0.479564,-0.364901,-0.518691,...,1.484330,-0.387092,-0.643075,-1.374615,-1.048546,-0.289238,-0.436901,-0.330642,-0.459463,-0.501101
43180,2020-01-02,AAPL,P,0.512918,-1.577887,-0.751321,-1.577887,-0.565772,-0.443045,-0.519406,...,1.484330,-0.387092,-0.643075,-1.374615,-1.048546,-0.289238,-0.436901,-0.330642,-0.459463,-0.501101
43181,2020-01-02,AAPL,P,0.488522,-1.577887,-0.346224,-1.577887,-0.642376,-0.512526,-0.517976,...,1.484330,-0.387092,-0.643075,-1.374615,-1.048546,-0.289238,-0.436901,-0.330642,-0.459463,-0.501101
43182,2020-01-02,AAPL,P,0.429982,-1.577887,0.053287,-1.577887,-0.709124,-0.563057,-0.516546,...,1.484330,-0.387092,-0.643075,-1.374615,-1.048546,-0.289238,-0.436901,-0.330642,-0.459463,-0.501101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87496,2020-12-31,TSLA,P,0.573773,1.346564,1.359378,1.346564,0.531319,0.077475,1.341584,...,5.566764,39.978584,-3.054084,-5.724110,-4.997703,1.428902,1.162449,1.798830,1.667278,-0.670489
87497,2020-12-31,TSLA,P,0.573565,1.346564,1.412460,1.346564,0.476860,0.076780,1.441676,...,5.566764,39.978584,-3.054084,-5.724110,-4.997703,1.428902,1.162449,1.798830,1.667278,-0.670489
87498,2020-12-31,TSLA,P,0.572960,1.346564,1.465541,1.346564,0.476860,0.076780,1.541767,...,5.566764,39.978584,-3.054084,-5.724110,-4.997703,1.428902,1.162449,1.798830,1.667278,-0.670489
87499,2020-12-31,TSLA,P,0.573072,1.346564,1.571705,1.346564,0.427739,0.081102,1.752675,...,5.566764,39.978584,-3.054084,-5.724110,-4.997703,1.428902,1.162449,1.798830,1.667278,-0.670489


In [55]:
# Save the parquet file
data_set_val_scaled_p = Path.cwd().parent.parent / "Data/updated_standardization/data_set_val_tech_scaled_p.parquet"

save_parquet = True
if save_parquet or not os.path.exists(data_set_val_scaled_p):
    val_data_p_standardized.to_parquet(data_set_val_scaled_p)

In [56]:
import pandas as pd

# data_set_train_nonscaled = pd.read_parquet("/Users/sbjpipers/Desktop/FinalThesisQF/FinalThesisQF/Data/updated_standardization/data_set_train_tech_nonscaled.parquet")
# data_set_train_nonscaled_c_tot = data_set_train_nonscaled[(data_set_train_nonscaled["cp_flag"] == "C")]

# Assuming merge_data_total_t and merge_data_train_c1 are already defined

# Filter the test set for 'C' and 'P' flags
test_c = merge_data_total_t[merge_data_total_t["cp_flag"] == "C"]
test_p = merge_data_total_t[merge_data_total_t["cp_flag"] == "P"]

# Custom standardization function
def custom_standardize(data, means=None, stds=None):
    if means is None or stds is None:
        means = data.mean()
        stds = data.std()
    standardized_data = (data - means) / stds
    return standardized_data, means, stds

# Step 1: Extract features from train data and the 'C' test data
train_features = data_set_train_nonscaled_c.drop(columns=['impl_volatility', 'date', 'Ticker', 'cp_flag'])
test_features_c = test_c.drop(columns=['impl_volatility', 'date', 'Ticker', 'cp_flag'])

# Step 2: Standardize the training data and save the means and stds
train_features_standardized, train_means, train_stds = custom_standardize(train_features)

# Step 3: Standardize the 'C' test data using the saved means and stds from the training data
test_features_c_standardized, _, _ = custom_standardize(test_features_c, means=train_means, stds=train_stds)

# Step 4: Combine back the standardized 'C' test features with the non-feature columns
test_data_c_standardized = pd.DataFrame(test_features_c_standardized, columns=test_features_c.columns)
test_data_c_standardized = pd.concat([test_c[['date', 'Ticker', 'cp_flag', 'impl_volatility']], test_data_c_standardized], axis=1)

# Output the standardized 'C' test data
test_data_c_standardized



Unnamed: 0,date,Ticker,cp_flag,impl_volatility,trading_days_till_exp,moneyness,T,prev_day_iv,prev2_day_iv,best_bid_option,...,gold_price,reces_indi,10Y_RIR,1Y_bond,2Y_bond,OPEN_vix,HIGH_vix,LOW_vix,CLOSE_vix,spread_vix
0,2021-01-04,AAPL,C,0.433412,0.617773,-1.704283,0.617773,-0.227485,-0.235586,-0.329056,...,5.574817,40.203398,-3.046020,-5.726623,-5.011333,1.383471,1.318274,1.809513,1.644128,-0.254587
1,2021-01-04,AAPL,C,0.428775,0.617773,-1.522549,0.617773,-0.246819,-0.295287,-0.364194,...,5.574817,40.203398,-3.046020,-5.726623,-5.011333,1.383471,1.318274,1.809513,1.644128,-0.254587
2,2021-01-04,AAPL,C,0.426358,0.617773,-1.342214,0.617773,-0.254497,-0.311691,-0.393820,...,5.574817,40.203398,-3.046020,-5.726623,-5.011333,1.383471,1.318274,1.809513,1.644128,-0.254587
3,2021-01-04,AAPL,C,0.422649,0.617773,-1.163277,0.617773,-0.260690,-0.320663,-0.419312,...,5.574817,40.203398,-3.046020,-5.726623,-5.011333,1.383471,1.318274,1.809513,1.644128,-0.254587
4,2021-01-04,AAPL,C,0.421479,0.617773,-0.985738,0.617773,-0.265499,-0.325176,-0.439293,...,5.574817,40.203398,-3.046020,-5.726623,-5.011333,1.383471,1.318274,1.809513,1.644128,-0.254587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36347,2021-12-30,TSLA,C,0.742669,-1.579945,1.134947,-1.579945,1.418885,1.798524,-0.483387,...,4.395939,-3.131456,-1.533705,-4.640248,-3.232142,-0.308035,-0.450546,-0.495679,-0.562282,-0.179586
36348,2021-12-30,TSLA,C,0.764381,-1.579945,1.277538,-1.579945,1.496363,1.872225,-0.486832,...,4.395939,-3.131456,-1.533705,-4.640248,-3.232142,-0.308035,-0.450546,-0.495679,-0.562282,-0.179586
36349,2021-12-30,TSLA,C,0.786317,-1.579945,1.420128,-1.579945,1.574945,1.955351,-0.488210,...,4.395939,-3.131456,-1.533705,-4.640248,-3.232142,-0.308035,-0.450546,-0.495679,-0.562282,-0.179586
36350,2021-12-30,TSLA,C,0.798978,-1.579945,1.561321,-1.579945,1.657239,2.019160,-0.490966,...,4.395939,-3.131456,-1.533705,-4.640248,-3.232142,-0.308035,-0.450546,-0.495679,-0.562282,-0.179586


In [57]:
# Save the parquet file
data_set_val_test_c = Path.cwd().parent.parent / "Data/updated_standardization/data_set_test_tech_scaled_c.parquet"

save_parquet = True
if save_parquet or not os.path.exists(data_set_val_test_c):
    test_data_c_standardized.to_parquet(data_set_val_test_c)

In [58]:
merge_data_train_p1

Unnamed: 0,Ticker,date,cp_flag,impl_volatility,T,best_bid_option,best_offer_option,moneyness,moneyness_squared,moneyness_tau,...,1Y_bond,2Y_bond,CLOSE_vix,FF_rate,HIGH_vix,LOW_vix,OPEN_vix,gold_price,reces_indi,spread_vix
0,AAPL,2019-01-02,P,0.558181,-0.846775,-0.513686,-0.517308,-1.568502,2.068950,-0.763267,...,1.307506,1.421618,3.846509,0.733069,3.622996,4.431212,4.206734,-1.217741,1.555024,0.578051
1,AAPL,2019-01-02,P,0.541292,-0.846775,-0.511541,-0.515217,-1.319855,1.526289,-0.601311,...,1.307506,1.421618,3.846509,0.733069,3.622996,4.431212,4.206734,-1.217741,1.555024,0.578051
2,AAPL,2019-01-02,P,0.525111,-0.846775,-0.508682,-0.512431,-1.072606,1.040760,-0.440264,...,1.307506,1.421618,3.846509,0.733069,3.622996,4.431212,4.206734,-1.217741,1.555024,0.578051
3,AAPL,2019-01-02,P,0.507597,-0.846775,-0.505107,-0.508947,-0.828150,0.613737,-0.281038,...,1.307506,1.421618,3.846509,0.733069,3.622996,4.431212,4.206734,-1.217741,1.555024,0.578051
4,AAPL,2019-01-02,P,0.468786,-0.846775,-0.490093,-0.493619,-0.220504,-0.219329,0.114754,...,1.307506,1.421618,3.846509,0.733069,3.622996,4.431212,4.206734,-1.217741,1.555024,0.578051
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28636,TSLA,2019-12-31,P,0.510903,-0.846775,-0.088297,-0.092995,1.043681,-0.908698,0.938182,...,-1.270893,-0.968821,-0.217761,-1.880068,-0.386446,-0.865568,-1.053670,1.365583,-0.310944,0.769184
28637,TSLA,2019-12-31,P,0.516629,-0.846775,-0.002504,-0.002419,1.208514,-0.894682,1.045546,...,-1.270893,-0.968821,-0.217761,-1.880068,-0.386446,-0.865568,-1.053670,1.365583,-0.310944,0.769184
28638,TSLA,2019-12-31,P,0.518828,-0.846775,0.094013,0.088156,1.373347,-0.856697,1.152911,...,-1.270893,-0.968821,-0.217761,-1.880068,-0.386446,-0.865568,-1.053670,1.365583,-0.310944,0.769184
28639,TSLA,2019-12-31,P,0.524371,-0.846775,0.197679,0.196150,1.535386,-0.795993,1.258455,...,-1.270893,-0.968821,-0.217761,-1.880068,-0.386446,-0.865568,-1.053670,1.365583,-0.310944,0.769184


In [59]:
import pandas as pd

# Assuming merge_data_total_t and merge_data_train_c1 are already defined
# data_set_train_nonscaled = pd.read_parquet("/Users/sbjpipers/Desktop/FinalThesisQF/FinalThesisQF/Data/updated_standardization/data_set_train_tech_nonscaled.parquet")
# data_set_train_nonscaled_p = data_set_train_nonscaled[(data_set_train_nonscaled["cp_flag"] == "P")]

# Filter the test set for 'C' and 'P' flags
test_c = merge_data_total_t[merge_data_total_t["cp_flag"] == "C"]
test_p = merge_data_total_t[merge_data_total_t["cp_flag"] == "P"]

# Custom standardization function
def custom_standardize(data, means=None, stds=None):
    if means is None or stds is None:
        means = data.mean()
        stds = data.std()
    standardized_data = (data - means) / stds
    return standardized_data, means, stds

# Step 1: Extract features from train data and the 'C' test data
train_features = data_set_train_nonscaled_p.drop(columns=['impl_volatility', 'date', 'Ticker', 'cp_flag'])
test_features_p = test_p.drop(columns=['impl_volatility', 'date', 'Ticker', 'cp_flag'])

# Step 2: Standardize the training data and save the means and stds
train_features_standardized, train_means, train_stds = custom_standardize(train_features)

# Step 3: Standardize the 'C' test data using the saved means and stds from the training data
test_features_p_standardized, _, _ = custom_standardize(test_features_p, means=train_means, stds=train_stds)

# Step 4: Combine back the standardized 'C' test features with the non-feature columns
test_data_p_standardized = pd.DataFrame(test_features_p_standardized, columns=test_features_p.columns)
test_data_p_standardized = pd.concat([test_p[['date', 'Ticker', 'cp_flag', 'impl_volatility']], test_data_p_standardized], axis=1)

# Output the standardized 'C' test data
test_data_p_standardized




Unnamed: 0,date,Ticker,cp_flag,impl_volatility,trading_days_till_exp,moneyness,T,prev_day_iv,prev2_day_iv,best_bid_option,...,gold_price,reces_indi,10Y_RIR,1Y_bond,2Y_bond,OPEN_vix,HIGH_vix,LOW_vix,CLOSE_vix,spread_vix
36352,2021-01-04,AAPL,P,0.622737,0.615451,-1.731938,0.615451,-0.084494,0.161081,-0.500817,...,5.60264,39.893000,-3.054084,-5.732956,-5.008952,1.377578,1.312738,1.801708,1.638217,-0.256428
36353,2021-01-04,AAPL,P,0.598160,0.615451,-1.533579,0.615451,-0.167092,0.022254,-0.497958,...,5.60264,39.893000,-3.054084,-5.732956,-5.008952,1.377578,1.312738,1.801708,1.638217,-0.256428
36354,2021-01-04,AAPL,P,0.570362,0.615451,-1.336618,0.615451,-0.205637,-0.043746,-0.495098,...,5.60264,39.893000,-3.054084,-5.732956,-5.008952,1.377578,1.312738,1.801708,1.638217,-0.256428
36355,2021-01-04,AAPL,P,0.549241,0.615451,-1.141054,0.615451,-0.241736,-0.107206,-0.490808,...,5.60264,39.893000,-3.054084,-5.732956,-5.008952,1.377578,1.312738,1.801708,1.638217,-0.256428
36356,2021-01-04,AAPL,P,0.526871,0.615451,-0.946886,0.615451,-0.307405,-0.223133,-0.485089,...,5.60264,39.893000,-3.054084,-5.732956,-5.008952,1.377578,1.312738,1.801708,1.638217,-0.256428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71690,2021-12-30,TSLA,P,0.484270,-1.577887,0.991996,-1.577887,0.380717,0.750597,0.873298,...,4.41981,-3.130058,-1.544159,-4.647508,-3.234762,-0.316122,-0.457089,-0.503302,-0.567782,-0.181144
71691,2021-12-30,TSLA,P,0.503165,-1.577887,1.142860,-1.577887,0.381451,0.784731,1.162849,...,4.41981,-3.130058,-1.544159,-4.647508,-3.234762,-0.316122,-0.457089,-0.503302,-0.567782,-0.181144
71692,2021-12-30,TSLA,P,0.536047,-1.577887,1.295121,-1.577887,0.401610,0.821988,1.448825,...,4.41981,-3.130058,-1.544159,-4.647508,-3.234762,-0.316122,-0.457089,-0.503302,-0.567782,-0.181144
71693,2021-12-30,TSLA,P,0.550971,-1.577887,1.445985,-1.577887,0.420746,0.865889,1.770548,...,4.41981,-3.130058,-1.544159,-4.647508,-3.234762,-0.316122,-0.457089,-0.503302,-0.567782,-0.181144


In [60]:
# Save the parquet file
data_set_val_test_p = Path.cwd().parent.parent / "Data/updated_standardization/data_set_test_tech_scaled_p.parquet"

save_parquet = True
if save_parquet or not os.path.exists(data_set_val_test_p):
    test_data_p_standardized.to_parquet(data_set_val_test_p)

In [61]:
test_c

Unnamed: 0,cp_flag,Ticker,date,trading_days_till_exp,moneyness,impl_volatility,T,prev_day_iv,prev2_day_iv,best_bid_option,...,gold_price,reces_indi,10Y_RIR,1Y_bond,2Y_bond,OPEN_vix,HIGH_vix,LOW_vix,CLOSE_vix,spread_vix
0,C,AAPL,2021-01-04,4,-0.475,0.433412,0.019841,0.337114,0.325650,2.54,...,672.012903,3.036452,0.93,0.1074,0.1400,21.29,22.49,20.57,21.20,1.92
1,C,AAPL,2021-01-04,4,-0.345,0.428775,0.019841,0.332640,0.312515,2.03,...,672.012903,3.036452,0.93,0.1074,0.1400,21.29,22.49,20.57,21.20,1.92
2,C,AAPL,2021-01-04,4,-0.216,0.426358,0.019841,0.330864,0.308907,1.60,...,672.012903,3.036452,0.93,0.1074,0.1400,21.29,22.49,20.57,21.20,1.92
3,C,AAPL,2021-01-04,4,-0.088,0.422649,0.019841,0.329431,0.306933,1.23,...,672.012903,3.036452,0.93,0.1074,0.1400,21.29,22.49,20.57,21.20,1.92
4,C,AAPL,2021-01-04,4,0.039,0.421479,0.019841,0.328318,0.305940,0.94,...,672.012903,3.036452,0.93,0.1074,0.1400,21.29,22.49,20.57,21.20,1.92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36347,C,TSLA,2021-12-30,1,1.556,0.742669,0.007937,0.718058,0.773147,0.30,...,632.874194,-0.206774,1.55,0.4878,0.8182,14.36,14.60,12.56,12.85,2.04
36348,C,TSLA,2021-12-30,1,1.658,0.764381,0.007937,0.735985,0.789361,0.25,...,632.874194,-0.206774,1.55,0.4878,0.8182,14.36,14.60,12.56,12.85,2.04
36349,C,TSLA,2021-12-30,1,1.760,0.786317,0.007937,0.754168,0.807648,0.23,...,632.874194,-0.206774,1.55,0.4878,0.8182,14.36,14.60,12.56,12.85,2.04
36350,C,TSLA,2021-12-30,1,1.861,0.798978,0.007937,0.773210,0.821686,0.19,...,632.874194,-0.206774,1.55,0.4878,0.8182,14.36,14.60,12.56,12.85,2.04


In [62]:
# data_train_tot_c1 = pd.read_parquet('/Users/sbjpipers/Desktop/FinalThesisQF/FinalThesisQF/Data/updated_standardization/data_set_train_val_tech_scaled_c.parquet')
# data_train_tot_p1 = pd.read_parquet('/Users/sbjpipers/Desktop/FinalThesisQF/FinalThesisQF/Data/updated_standardization/data_set_train_val_tech_scaled_p.parquet')

data_train = pd.read_parquet('/Users/sbjpipers/Desktop/FinalThesisQF/FinalThesisQF/Data/updated_standardization/data_set_train_tech_nonscaled.parquet')
data_train_tot_c1 = data_train[data_train["cp_flag"] == "C"]

import pandas as pd

# Assuming merge_data_total_t and merge_data_train_c1 are already defined

# Filter the test set for 'C' and 'P' flags
test_c = merge_data_total_t[merge_data_total_t["cp_flag"] == "C"]
test_p = merge_data_total_t[merge_data_total_t["cp_flag"] == "P"]

# Custom standardization function
def custom_standardize(data, means=None, stds=None):
    if means is None or stds is None:
        means = data.mean()
        stds = data.std()
    standardized_data = (data - means) / stds
    return standardized_data, means, stds

# Step 1: Extract features from train data and the 'C' test data
train_features = data_train_tot_c1.drop(columns=['impl_volatility', 'date', 'Ticker', 'cp_flag'])
test_features_c = test_c.drop(columns=['impl_volatility', 'date', 'Ticker', 'cp_flag'])

# Step 2: Standardize the training data and save the means and stds
train_features_standardized, train_means, train_stds = custom_standardize(train_features)

# Step 3: Standardize the 'C' test data using the saved means and stds from the training data
test_features_c_standardized, _, _ = custom_standardize(test_features_c, means=train_means, stds=train_stds)

# Step 4: Combine back the standardized 'C' test features with the non-feature columns
test_data_c_standardized_total = pd.DataFrame(test_features_c_standardized, columns=test_features_c.columns)
test_data_c_standardized_total = pd.concat([test_c[['date', 'Ticker', 'cp_flag', 'impl_volatility']], test_data_c_standardized_total], axis=1)

# Output the standardized 'C' test data
test_data_c_standardized_total



Unnamed: 0,date,Ticker,cp_flag,impl_volatility,trading_days_till_exp,moneyness,T,prev_day_iv,prev2_day_iv,best_bid_option,...,gold_price,reces_indi,10Y_RIR,1Y_bond,2Y_bond,OPEN_vix,HIGH_vix,LOW_vix,CLOSE_vix,spread_vix
0,2021-01-04,AAPL,C,0.433412,0.602872,-1.661302,0.602872,-0.589313,-0.597376,-0.404460,...,1.256206,0.169346,-0.641329,-1.052185,-1.050440,-0.242053,-0.269391,-0.167645,-0.215840,-0.481824
1,2021-01-04,AAPL,C,0.428775,0.602872,-1.479080,0.602872,-0.601956,-0.634582,-0.422144,...,1.256206,0.169346,-0.641329,-1.052185,-1.050440,-0.242053,-0.269391,-0.167645,-0.215840,-0.481824
2,2021-01-04,AAPL,C,0.426358,0.602872,-1.298260,0.602872,-0.606978,-0.644805,-0.437054,...,1.256206,0.169346,-0.641329,-1.052185,-1.050440,-0.242053,-0.269391,-0.167645,-0.215840,-0.481824
3,2021-01-04,AAPL,C,0.422649,0.602872,-1.118841,0.602872,-0.611027,-0.650397,-0.449884,...,1.256206,0.169346,-0.641329,-1.052185,-1.050440,-0.242053,-0.269391,-0.167645,-0.215840,-0.481824
4,2021-01-04,AAPL,C,0.421479,0.602872,-0.940824,0.602872,-0.614172,-0.653209,-0.459939,...,1.256206,0.169346,-0.641329,-1.052185,-1.050440,-0.242053,-0.269391,-0.167645,-0.215840,-0.481824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36347,2021-12-30,TSLA,C,0.742669,-1.582517,1.185566,-1.582517,0.487321,0.670290,-0.482131,...,0.776101,-0.945233,0.199799,-0.648240,-0.297669,-0.658102,-0.681385,-0.715059,-0.709054,-0.460457
36348,2021-12-30,TSLA,C,0.764381,-1.582517,1.328541,-1.582517,0.537987,0.716221,-0.483865,...,0.776101,-0.945233,0.199799,-0.648240,-0.297669,-0.658102,-0.681385,-0.715059,-0.709054,-0.460457
36349,2021-12-30,TSLA,C,0.786317,-1.582517,1.471515,-1.582517,0.589376,0.768025,-0.484558,...,0.776101,-0.945233,0.199799,-0.648240,-0.297669,-0.658102,-0.681385,-0.715059,-0.709054,-0.460457
36350,2021-12-30,TSLA,C,0.798978,-1.582517,1.613087,-1.582517,0.643192,0.807792,-0.485945,...,0.776101,-0.945233,0.199799,-0.648240,-0.297669,-0.658102,-0.681385,-0.715059,-0.709054,-0.460457


In [63]:
# from sklearn.preprocessing import StandardScaler


# data_train = pd.read_parquet('/Users/sbjpipers/Desktop/FinalThesisQF/FinalThesisQF/Data/updated_standardization/data_set_train_tech_nonscaled.parquet')
# data_train = data_train[data_train["cp_flag"] == "C"]
# data_train = data_train.drop(columns=['impl_volatility', 'date', 'Ticker', 'cp_flag'])


# test_c = test_c[data_train.columns]

# # Fit scaler on training data
# scaler = StandardScaler()
# combined_x_c_scaled = scaler.fit_transform(data_train)
# test_x_c_scaled = scaler.transform(test_c)  # Make sure to transform test data with the same scaler

# # add the column names back
# test_x_c_scaled = pd.DataFrame(test_x_c_scaled, columns=train_features.columns)
# combined_x_c_scaled = pd.DataFrame(combined_x_c_scaled, columns=train_features.columns)

# combined_x_c_scaled

In [64]:
# Assuming merge_data_total_t and merge_data_train_c1 are already defined
data_train = pd.read_parquet('/Users/sbjpipers/Desktop/FinalThesisQF/FinalThesisQF/Data/updated_standardization/data_set_train_tech_nonscaled.parquet')
data_train_tot_p1 = data_train[data_train["cp_flag"] == "P"]

# Filter the test set for 'C' and 'P' flags
test_c = merge_data_total_t[merge_data_total_t["cp_flag"] == "C"]
test_p = merge_data_total_t[merge_data_total_t["cp_flag"] == "P"]

# Custom standardization function
def custom_standardize(data, means=None, stds=None):
    if means is None or stds is None:
        means = data.mean()
        stds = data.std()
    standardized_data = (data - means) / stds
    return standardized_data, means, stds

# Step 1: Extract features from train data and the 'C' test data
train_features = data_train_tot_p1.drop(columns=['impl_volatility', 'date', 'Ticker', 'cp_flag'])
test_features_p = test_p.drop(columns=['impl_volatility', 'date', 'Ticker', 'cp_flag'])

# Step 2: Standardize the training data and save the means and stds
train_features_standardized, train_means, train_stds = custom_standardize(train_features)

# Step 3: Standardize the 'C' test data using the saved means and stds from the training data
test_features_p_standardized, _, _ = custom_standardize(test_features_p, means=train_means, stds=train_stds)

# Step 4: Combine back the standardized 'C' test features with the non-feature columns
test_data_p_standardized_total = pd.DataFrame(test_features_p_standardized, columns=test_features_c.columns)
test_data_p_standardized_total = pd.concat([test_p[['date', 'Ticker', 'cp_flag', 'impl_volatility']], test_data_p_standardized_total], axis=1)

# Output the standardized 'C' test data
test_data_p_standardized_total

Unnamed: 0,date,Ticker,cp_flag,impl_volatility,trading_days_till_exp,moneyness,T,prev_day_iv,prev2_day_iv,best_bid_option,...,gold_price,reces_indi,10Y_RIR,1Y_bond,2Y_bond,OPEN_vix,HIGH_vix,LOW_vix,CLOSE_vix,spread_vix
36352,2021-01-04,AAPL,P,0.622737,0.597483,-1.763565,0.597483,-0.485955,-0.333858,-0.478698,...,1.199927,0.106074,-0.610372,-1.016573,-1.012818,-0.213685,-0.240905,-0.133977,-0.183435,-0.466217
36353,2021-01-04,AAPL,P,0.598160,0.597483,-1.563909,0.597483,-0.540789,-0.424123,-0.477172,...,1.199927,0.106074,-0.610372,-1.016573,-1.012818,-0.213685,-0.240905,-0.133977,-0.183435,-0.466217
36354,2021-01-04,AAPL,P,0.570362,0.597483,-1.365659,0.597483,-0.566378,-0.467037,-0.475646,...,1.199927,0.106074,-0.610372,-1.016573,-1.012818,-0.213685,-0.240905,-0.133977,-0.183435,-0.466217
36355,2021-01-04,AAPL,P,0.549241,0.597483,-1.168815,0.597483,-0.590342,-0.508299,-0.473358,...,1.199927,0.106074,-0.610372,-1.016573,-1.012818,-0.213685,-0.240905,-0.133977,-0.183435,-0.466217
36356,2021-01-04,AAPL,P,0.526871,0.597483,-0.973377,0.597483,-0.633938,-0.583676,-0.470306,...,1.199927,0.106074,-0.610372,-1.016573,-1.012818,-0.213685,-0.240905,-0.133977,-0.183435,-0.466217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71690,2021-12-30,TSLA,P,0.484270,-1.592475,0.978189,-1.592475,-0.177116,0.049449,0.254443,...,0.726109,-0.979749,0.225847,-0.614664,-0.264902,-0.663955,-0.685845,-0.725714,-0.716017,-0.443460
71691,2021-12-30,TSLA,P,0.503165,-1.592475,1.130040,-1.592475,-0.176629,0.071643,0.408929,...,0.726109,-0.979749,0.225847,-0.614664,-0.264902,-0.663955,-0.685845,-0.725714,-0.716017,-0.443460
71692,2021-12-30,TSLA,P,0.536047,-1.592475,1.283297,-1.592475,-0.163246,0.095868,0.561508,...,0.726109,-0.979749,0.225847,-0.614664,-0.264902,-0.663955,-0.685845,-0.725714,-0.716017,-0.443460
71693,2021-12-30,TSLA,P,0.550971,-1.592475,1.435148,-1.592475,-0.150542,0.124412,0.733159,...,0.726109,-0.979749,0.225847,-0.614664,-0.264902,-0.663955,-0.685845,-0.725714,-0.716017,-0.443460


In [65]:
# Save the parquet file
data_set_test_c_total = Path.cwd().parent.parent / "Data/updated_standardization/data_set_test_tech_scaled_c_total.parquet"
data_set_test_p_total = Path.cwd().parent.parent / "Data/updated_standardization/data_set_test_tech_scaled_p_total.parquet"

save_parquet = True
if save_parquet or not os.path.exists(data_set_test_c_total):
    test_data_c_standardized_total.to_parquet(data_set_test_c_total)

save_parquet = True
if save_parquet or not os.path.exists(data_set_test_p_total):
    test_data_p_standardized_total.to_parquet(data_set_test_p_total)