In [44]:
import polars as pl
import os
from pathlib import Path
import pandas as pd
import hvplot.polars
import datetime
import matplotlib.pyplot as plt
import numpy as np
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay

data_path = Path.cwd().parent.parent / "Data"

#Restrictions
timeperiod = [datetime.datetime(2018, 12, 30), datetime.datetime(2021, 12, 31)]
train_period = [datetime.datetime(2019, 1, 1), datetime.datetime(2020, 12, 31)]
volume_r = 0
open_interest_r = 0
delta_r = 0
vega_r = 0
theta_r = 0
gamma_r = 0
midprice_r = 0.3
bid_price_r = 0.0
days_till_exp_r1 = 7
days_till_exp_r2 = 252
moneyness_min = -2.5
moneyness_max = 2.5



In [58]:
firm_stock_data = Path.cwd().parent.parent / "Data/updated_standardization/firm_stock_dataset_tech.parquet"
option_data = Path.cwd().parent.parent / "Data/updated_standardization/data_set_option_tech.parquet"
macro_data = Path.cwd().parent.parent / "Data/updated_standardization/data_set_macro.parquet"

firm_stock_data = pd.read_parquet(firm_stock_data)
option_data = pd.read_parquet(option_data)
macro_data = pd.read_parquet(macro_data)

firm_train =  firm_stock_data[(firm_stock_data["date"] >= train_period[0]) & (firm_stock_data["date"] <= train_period[1])]
option_train = option_data[(option_data["date"] >= train_period[0]) & (option_data["date"] <= train_period[1])]
macro_train = macro_data[(macro_data["date"] >= train_period[0]) & (macro_data["date"] <= train_period[1])]
firm_test = firm_stock_data[(firm_stock_data["date"] > train_period[1])]
option_test = option_data[(option_data["date"] > train_period[1])]
macro_test = macro_data[(macro_data["date"] > train_period[1])]

In [61]:
option_test

Unnamed: 0,cp_flag,Ticker,date,trading_days_till_exp,moneyness,impl_volatility,T,prev_day_iv,prev2_day_iv
75102,C,AAPL,2021-01-04,4,-2.411,0.599475,0.019841,0.605595,0.327228
75103,C,AAPL,2021-01-04,4,-2.265,0.610254,0.019841,0.605595,0.327228
75104,C,AAPL,2021-01-04,4,-2.120,0.570884,0.019841,0.605595,0.327228
75105,C,AAPL,2021-01-04,4,-1.977,0.566679,0.019841,0.605595,0.327228
75106,C,AAPL,2021-01-04,4,-1.835,0.525325,0.019841,0.448299,0.559822
...,...,...,...,...,...,...,...,...,...
428240,P,TSLA,2021-12-30,1,2.062,1.012930,0.007937,0.753005,0.932566
428241,P,TSLA,2021-12-30,1,2.161,1.047433,0.007937,0.753005,0.932566
428242,P,TSLA,2021-12-30,1,2.261,1.088575,0.007937,0.753005,0.932566
428243,P,TSLA,2021-12-30,1,2.360,1.129544,0.007937,0.753005,0.932566


# Standardize macro data

In [46]:
import pandas as pd

def standardize(df):
    """Standardize the numeric columns, excluding the 'date' column."""
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()
    return df

# Assuming your DataFrame is called 'macro_train'
data = macro_train.copy()

# Exclude the 'date' column from the columns to be standardized
columns_to_standardize = data.columns.difference(['date'])

# Standardize the entire DataFrame (except 'date')
standardized_data = standardize(data[columns_to_standardize])

# Combine standardized numeric data with the 'date' column
standardized_data_macro = pd.concat([data[['date']], standardized_data], axis=1)

# Display the standardized DataFrame
standardized_data_macro


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()


Unnamed: 0,date,10Y_RIR,1Y_bond,2Y_bond,CLOSE_vix,FF_rate,HIGH_vix,LOW_vix,OPEN_vix,gold_price,reces_indi,spread_vix
3,2019-01-02,1.562822,1.371916,1.469739,0.476238,1.124610,0.475155,0.662651,0.674867,-1.353864,-0.709827,-0.133918
4,2019-01-03,1.562822,1.371916,1.469739,0.071348,1.124610,0.457626,0.198016,0.656238,-1.349397,-0.709394,1.065358
5,2019-01-04,1.426211,1.310038,1.377262,0.336815,1.124610,0.300487,0.331346,0.252127,-1.344931,-0.708961,0.153294
8,2019-01-07,1.576483,1.355593,1.441962,-0.067370,1.124610,0.014381,0.038020,0.124589,-1.340465,-0.708528,-0.052797
9,2019-01-08,1.617466,1.392399,1.488480,-0.023713,1.124610,-0.045720,0.059838,0.080165,-1.327066,-0.707228,-0.322470
...,...,...,...,...,...,...,...,...,...,...,...,...
725,2020-12-24,-0.759553,-1.214549,-1.201615,-0.183556,-1.160795,-0.265465,-0.221367,-0.171329,1.333629,0.283189,-0.329047
729,2020-12-28,-0.786875,-1.215936,-1.205631,-0.330020,-1.160795,-0.311167,-0.284395,-0.223634,1.348571,0.281024,-0.318085
730,2020-12-29,-0.786875,-1.209428,-1.202285,-0.123703,-1.160795,-0.233536,-0.259346,-0.368369,1.408338,0.272360,-0.114186
731,2020-12-30,-0.786875,-1.206761,-1.201057,-0.023713,-1.160795,-0.092048,-0.049250,-0.162015,1.423280,0.270195,-0.188730


In [47]:
import pandas as pd

def standardize(df):
    """Standardize the numeric columns, excluding the 'date' column."""
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()
    return df

# Assuming your DataFrame is called 'macro_train'
data = firm_train.copy()

# Exclude the 'date' column from the columns to be standardized
columns_to_standardize = data.columns.difference(['Ticker', 'date'])

# Standardize the entire DataFrame (except 'date')
standardized_data = standardize(data[columns_to_standardize])

# Combine standardized numeric data with the 'date' column
standardized_data_firm = pd.concat([data[['Ticker' ,'date']], standardized_data], axis=1)

# Display the standardized DataFrame
standardized_data_firm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()


Unnamed: 0,Ticker,date,5_day_rolling_return_stock,ASK,ASKHI,BID,BIDLO,PRC,PRC_actual,RET,...,spread_stock,std_dolvol,std_turn,stdacc,stdcf,tang,tb,turn,vol_stock,zerotrade
0,AAPL,2019-01-02,-0.286276,-0.568980,-0.569260,-0.569010,-0.567166,-0.568991,-1.119896,-0.080584,...,-0.125902,0.384411,-0.383392,-1.637978,-1.397221,-2.269561,-0.150957,-0.464434,0.737871,-0.186964
1,AAPL,2019-01-03,0.513255,-0.566836,-0.568214,-0.566853,-0.564699,-0.567087,-1.114221,0.234178,...,-0.131505,0.340969,-0.384122,-1.637978,-1.397221,-2.269561,-0.150957,-0.463254,0.462326,-0.179913
2,AAPL,2019-01-04,0.974270,-0.566849,-0.568849,-0.566878,-0.567575,-0.566860,-1.113545,-0.058953,...,-0.125914,0.297528,-0.384853,-1.637978,-1.397221,-2.269561,-0.150957,-0.462073,0.540321,-0.172861
3,AAPL,2019-01-07,-1.739500,-0.586819,-0.585200,-0.586846,-0.583208,-0.586694,-1.172660,-3.523569,...,-0.131510,0.167203,-0.387044,-1.637978,-1.397221,-2.269561,-0.150957,-0.458530,2.593612,-0.151707
4,AAPL,2019-01-08,-1.028140,-0.579040,-0.581676,-0.579063,-0.580907,-0.579040,-1.149848,1.369838,...,-0.131516,0.123762,-0.387774,-1.637978,-1.397221,-2.269561,-0.150957,-0.457350,1.354594,-0.144656
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5799,TSLA,2020-12-24,-0.047648,0.041302,0.042610,0.041255,0.020426,0.041417,1.501621,-0.601955,...,0.030951,2.181198,0.928123,1.677337,0.982961,-0.057908,-0.411748,1.730563,1.095747,-1.103519
5800,TSLA,2020-12-28,0.366524,0.048437,0.044627,0.048381,0.031086,0.048528,1.529882,0.204706,...,0.036559,2.620721,1.011200,1.677337,0.982961,-0.057908,-0.411748,1.765113,0.388826,-1.097611
5801,TSLA,2020-12-29,-0.083321,0.068356,0.062795,0.068501,0.054645,0.068437,1.609004,0.742412,...,-0.047459,2.730602,1.031969,1.677337,0.982961,-0.057908,-0.411748,1.773751,0.001428,-1.096134
5802,TSLA,2020-12-30,-0.941110,0.070865,0.081860,0.070998,0.079954,0.070858,1.618625,0.001571,...,-0.041851,2.840483,1.052738,1.677337,0.982961,-0.057908,-0.411748,1.782388,0.357568,-1.094657


# Calls standardization

In [48]:
option_train_c  = option_train.copy()

# Apply filter of moneyness
option_train_c = option_train_c[option_train_c["cp_flag"] == "C"]
option_train_c = option_train_c[(option_train_c["moneyness"] >= -0.5) & (option_train_c["moneyness"] <= 2)]

import pandas as pd

def standardize(df):
    """Standardize the numeric columns, excluding the 'date' column."""
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()
    return df

# Assuming your DataFrame is called 'macro_train'
data = option_train_c.copy()

# Exclude the 'date' column from the columns to be standardized
columns_to_standardize = data.columns.difference(['Ticker', 'date', 'cp_flag'])

# Standardize the entire DataFrame (except 'date')
standardized_data = standardize(data[columns_to_standardize])

# Combine standardized numeric data with the 'date' column
standardized_data_opt_c = pd.concat([data[['Ticker' ,'date', 'cp_flag']], standardized_data], axis=1)

# Display the standardized DataFrame
standardized_data_opt_c


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()


Unnamed: 0,Ticker,date,cp_flag,T,impl_volatility,moneyness,prev2_day_iv,prev_day_iv,trading_days_till_exp
64436,AAPL,2019-01-02,C,-0.854054,-0.592091,-1.651490,-1.519873,-1.542073,-0.854054
64437,AAPL,2019-01-02,C,-0.854054,-0.579443,-1.062773,-1.519873,-0.759052,-0.854054
64438,AAPL,2019-01-02,C,-0.854054,-0.586154,-0.482466,-1.519873,-0.716917,-0.854054
64439,AAPL,2019-01-02,C,-0.854054,-0.577218,0.088029,-1.519873,-0.693295,-0.854054
64440,AAPL,2019-01-02,C,-0.854054,-0.562136,0.650114,-1.519873,-0.674080,-0.854054
...,...,...,...,...,...,...,...,...,...
234752,TSLA,2020-12-31,C,1.331335,0.167539,1.060814,-0.140819,0.024429,1.331335
234753,TSLA,2020-12-31,C,1.331335,0.185279,1.156130,-0.140819,0.024429,1.331335
234754,TSLA,2020-12-31,C,1.331335,0.212952,1.345361,-0.071801,0.110670,1.331335
234755,TSLA,2020-12-31,C,1.331335,0.237258,1.533190,-0.037014,0.158024,1.331335


# Puts standardization

In [49]:
option_train_p  = option_train.copy()
option_train_p = option_train_p[option_train_p["cp_flag"] == "P"]
option_train_p = option_train_p[(option_train_p["moneyness"] >= -2) & (option_train_p["moneyness"] <= 0.5)]

import pandas as pd

def standardize(df):
    """Standardize the numeric columns, excluding the 'date' column."""
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()
    return df

# Assuming your DataFrame is called 'macro_train'
data = option_train_p.copy()

# Exclude the 'date' column from the columns to be standardized
columns_to_standardize = data.columns.difference(['Ticker', 'date', 'cp_flag'])

# Standardize the entire DataFrame (except 'date')
standardized_data = standardize(data[columns_to_standardize])

# Combine standardized numeric data with the 'date' column
standardized_data_opt_p = pd.concat([data[['Ticker' ,'date', 'cp_flag']], standardized_data], axis=1)

# Display the standardized DataFrame
standardized_data_opt_p


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()


Unnamed: 0,Ticker,date,cp_flag,T,impl_volatility,moneyness,prev2_day_iv,prev_day_iv,trading_days_till_exp
251194,AAPL,2019-01-02,P,-0.862489,-0.118447,-1.599060,-1.565542,-0.215265,-0.862489
251195,AAPL,2019-01-02,P,-0.862489,-0.161328,-1.348787,-1.565542,-0.243882,-0.862489
251196,AAPL,2019-01-02,P,-0.862489,-0.202412,-1.099920,-1.565542,-0.272954,-0.862489
251197,AAPL,2019-01-02,P,-0.862489,-0.246881,-0.853865,-1.565542,-0.301999,-0.862489
251198,AAPL,2019-01-02,P,-0.862489,-0.345423,-0.242243,-1.565542,-0.349271,-0.862489
...,...,...,...,...,...,...,...,...,...
413055,TSLA,2020-12-31,P,1.327470,-0.078858,1.347974,-0.388218,-0.077136,1.327470
413056,TSLA,2020-12-31,P,1.327470,-0.079386,1.401403,-0.388671,-0.113290,1.327470
413057,TSLA,2020-12-31,P,1.327470,-0.080922,1.454832,-0.388671,-0.113290,1.327470
413058,TSLA,2020-12-31,P,1.327470,-0.080638,1.561690,-0.385860,-0.145900,1.327470


# Merge all the testdata

In [90]:
# Apply filter of moneyness
option_train_f = option_test.copy()

option_train_data_c = option_test[option_train_f["cp_flag"] == "C"]
option_train_data_c = option_train_data_c[(option_train_data_c["moneyness"] >= -0.5) & (option_train_data_c["moneyness"] <= 2)]

option_train_data_p = option_train_f[option_train_f["cp_flag"] == "P"]
option_train_data_p = option_train_data_p[(option_train_data_p["moneyness"] <= 0.5) & (option_train_data_p["moneyness"] >= -2)]

options_total = pd.concat([option_train_data_c, option_train_data_p], axis=0)
# Merge with the option data & macro data
merge_data_t = pd.merge(options_total, firm_test, on=['date', 'Ticker'], how='left')

# Merge the data but now with the macro data
merge_data_total_t = pd.merge(merge_data_t, macro_test, on='date', how='left')

merge_data_total_t

Unnamed: 0,cp_flag,Ticker,date,trading_days_till_exp,moneyness,impl_volatility,T,prev_day_iv,prev2_day_iv,BIDLO,...,gold_price,reces_indi,10Y_RIR,1Y_bond,2Y_bond,OPEN_vix,HIGH_vix,LOW_vix,CLOSE_vix,spread_vix
0,C,AAPL,2021-01-04,4,-0.475,0.433412,0.019841,0.337114,0.325650,133.39999,...,672.012903,3.036452,0.93,0.1074,0.1400,21.29,22.49,20.57,21.20,1.92
1,C,AAPL,2021-01-04,4,-0.345,0.428775,0.019841,0.332640,0.312515,133.39999,...,672.012903,3.036452,0.93,0.1074,0.1400,21.29,22.49,20.57,21.20,1.92
2,C,AAPL,2021-01-04,4,-0.216,0.426358,0.019841,0.330864,0.308907,133.39999,...,672.012903,3.036452,0.93,0.1074,0.1400,21.29,22.49,20.57,21.20,1.92
3,C,AAPL,2021-01-04,4,-0.088,0.422649,0.019841,0.329431,0.306933,133.39999,...,672.012903,3.036452,0.93,0.1074,0.1400,21.29,22.49,20.57,21.20,1.92
4,C,AAPL,2021-01-04,4,0.039,0.421479,0.019841,0.328318,0.305940,133.39999,...,672.012903,3.036452,0.93,0.1074,0.1400,21.29,22.49,20.57,21.20,1.92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71690,P,TSLA,2021-12-30,1,-0.027,0.484270,0.007937,0.520039,0.592762,1078.42004,...,632.874194,-0.206774,1.55,0.4878,0.8182,14.36,14.60,12.56,12.85,2.04
71691,P,TSLA,2021-12-30,1,0.081,0.503165,0.007937,0.520221,0.600908,1078.42004,...,632.874194,-0.206774,1.55,0.4878,0.8182,14.36,14.60,12.56,12.85,2.04
71692,P,TSLA,2021-12-30,1,0.190,0.536047,0.007937,0.525230,0.609799,1078.42004,...,632.874194,-0.206774,1.55,0.4878,0.8182,14.36,14.60,12.56,12.85,2.04
71693,P,TSLA,2021-12-30,1,0.298,0.550971,0.007937,0.529985,0.620276,1078.42004,...,632.874194,-0.206774,1.55,0.4878,0.8182,14.36,14.60,12.56,12.85,2.04


In [91]:
# Check for NaN values column-wise
nan_columns = merge_data_total_t.isnull().sum()
print("Number of NaN values in each column:")
print(nan_columns)

# Add columns with NaN values to a list
nan_columns_list = nan_columns[nan_columns > 0].index.tolist()
print("Columns with NaN values:")
print(nan_columns_list)

Number of NaN values in each column:
cp_flag                  0
Ticker                   0
date                     0
trading_days_till_exp    0
moneyness                0
                        ..
OPEN_vix                 0
HIGH_vix                 0
LOW_vix                  0
CLOSE_vix                0
spread_vix               0
Length: 129, dtype: int64
Columns with NaN values:
[]


In [92]:
# Testset save to parquet

data_set_standardized_test = Path.cwd().parent.parent / "Data/updated_standardization/data_set_test_tech_nonscaled.parquet"

save_parquet = True
if save_parquet or not os.path.exists(data_set_standardized_test):
    merge_data_total_t.to_parquet(data_set_standardized_test)

# Merge all the standardized data with each other

In [50]:
# Merge the option data

# Concatenate the two DataFrames vertically
combined_data = pd.concat([standardized_data_opt_p, standardized_data_opt_c], axis=0)

# Reset the index if necessary
combined_data.reset_index(drop=True, inplace=True)

# Display the combined DataFrame
(combined_data)

Unnamed: 0,Ticker,date,cp_flag,T,impl_volatility,moneyness,prev2_day_iv,prev_day_iv,trading_days_till_exp
0,AAPL,2019-01-02,P,-0.862489,-0.118447,-1.599060,-1.565542,-0.215265,-0.862489
1,AAPL,2019-01-02,P,-0.862489,-0.161328,-1.348787,-1.565542,-0.243882,-0.862489
2,AAPL,2019-01-02,P,-0.862489,-0.202412,-1.099920,-1.565542,-0.272954,-0.862489
3,AAPL,2019-01-02,P,-0.862489,-0.246881,-0.853865,-1.565542,-0.301999,-0.862489
4,AAPL,2019-01-02,P,-0.862489,-0.345423,-0.242243,-1.565542,-0.349271,-0.862489
...,...,...,...,...,...,...,...,...,...
145306,TSLA,2020-12-31,C,1.331335,0.167539,1.060814,-0.140819,0.024429,1.331335
145307,TSLA,2020-12-31,C,1.331335,0.185279,1.156130,-0.140819,0.024429,1.331335
145308,TSLA,2020-12-31,C,1.331335,0.212952,1.345361,-0.071801,0.110670,1.331335
145309,TSLA,2020-12-31,C,1.331335,0.237258,1.533190,-0.037014,0.158024,1.331335


In [51]:
# Merge with the firm data & macro data
merge_data = pd.merge(combined_data, standardized_data_firm, on=['date', 'Ticker'], how='left')

# Merge the data but now with the macro data
merge_data_total = pd.merge(merge_data, standardized_data_macro, on='date', how='left')

merge_data_total

Unnamed: 0,Ticker,date,cp_flag,T,impl_volatility,moneyness,prev2_day_iv,prev_day_iv,trading_days_till_exp,5_day_rolling_return_stock,...,1Y_bond,2Y_bond,CLOSE_vix,FF_rate,HIGH_vix,LOW_vix,OPEN_vix,gold_price,reces_indi,spread_vix
0,AAPL,2019-01-02,P,-0.862489,-0.118447,-1.599060,-1.565542,-0.215265,-0.862489,-0.286276,...,1.371916,1.469739,0.476238,1.124610,0.475155,0.662651,0.674867,-1.353864,-0.709827,-0.133918
1,AAPL,2019-01-02,P,-0.862489,-0.161328,-1.348787,-1.565542,-0.243882,-0.862489,-0.286276,...,1.371916,1.469739,0.476238,1.124610,0.475155,0.662651,0.674867,-1.353864,-0.709827,-0.133918
2,AAPL,2019-01-02,P,-0.862489,-0.202412,-1.099920,-1.565542,-0.272954,-0.862489,-0.286276,...,1.371916,1.469739,0.476238,1.124610,0.475155,0.662651,0.674867,-1.353864,-0.709827,-0.133918
3,AAPL,2019-01-02,P,-0.862489,-0.246881,-0.853865,-1.565542,-0.301999,-0.862489,-0.286276,...,1.371916,1.469739,0.476238,1.124610,0.475155,0.662651,0.674867,-1.353864,-0.709827,-0.133918
4,AAPL,2019-01-02,P,-0.862489,-0.345423,-0.242243,-1.565542,-0.349271,-0.862489,-0.286276,...,1.371916,1.469739,0.476238,1.124610,0.475155,0.662651,0.674867,-1.353864,-0.709827,-0.133918
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145306,TSLA,2020-12-31,C,1.331335,0.167539,1.060814,-0.140819,0.024429,1.331335,0.168781,...,-1.213802,-1.207416,-0.034979,-1.160795,-0.145889,0.015395,-0.057404,1.438222,0.268029,-0.552678
145307,TSLA,2020-12-31,C,1.331335,0.185279,1.156130,-0.140819,0.024429,1.331335,0.168781,...,-1.213802,-1.207416,-0.034979,-1.160795,-0.145889,0.015395,-0.057404,1.438222,0.268029,-0.552678
145308,TSLA,2020-12-31,C,1.331335,0.212952,1.345361,-0.071801,0.110670,1.331335,0.168781,...,-1.213802,-1.207416,-0.034979,-1.160795,-0.145889,0.015395,-0.057404,1.438222,0.268029,-0.552678
145309,TSLA,2020-12-31,C,1.331335,0.237258,1.533190,-0.037014,0.158024,1.331335,0.168781,...,-1.213802,-1.207416,-0.034979,-1.160795,-0.145889,0.015395,-0.057404,1.438222,0.268029,-0.552678


In [55]:
# Check for NaN values column-wise
nan_columns = merge_data_total.isnull().sum()
print("Number of NaN values in each column:")
print(nan_columns)

# Add columns with NaN values to a list
nan_columns_list = nan_columns[nan_columns > 0].index.tolist()
print("Columns with NaN values:")
print(nan_columns_list)

# Drop columns with NaN values
merge_data_total = merge_data_total.drop(columns=nan_columns_list)

Number of NaN values in each column:
Ticker             0
date               0
cp_flag            0
T                  0
impl_volatility    0
                  ..
LOW_vix            0
OPEN_vix           0
gold_price         0
reces_indi         0
spread_vix         0
Length: 126, dtype: int64
Columns with NaN values:
[]


In [66]:
# Save the parquet file

data_set_standardized_tot = Path.cwd().parent.parent / "Data/updated_standardization/data_set_train_tech_scaled.parquet"

save_parquet = True
if save_parquet or not os.path.exists(data_set_standardized_tot):
    merge_data_total.to_parquet(data_set_standardized_tot)

# Merge nonscaled traingsset tech

In [70]:
# Merge with the option data & macro data

option_train_f  = option_train.copy()

# Apply filter of moneyness
option_train_data_c = option_train_f[option_train_f["cp_flag"] == "C"]
option_train_data_c = option_train_data_c[(option_train_data_c["moneyness"] >= -0.5) & (option_train_data_c["moneyness"] <= 2)]

option_train_data_p = option_train_f[option_train_f["cp_flag"] == "P"]
option_train_data_p = option_train_data_p[(option_train_data_p["moneyness"] <= 0.5) & (option_train_data_p["moneyness"] >= -2)]

# Concatenate the two DataFrames vertically
optiondata_train = pd.concat([option_train_data_p, option_train_data_c], axis=0)

# Reset the index if necessary
optiondata_train.reset_index(drop=True, inplace=True)

optiondata_train

Unnamed: 0,cp_flag,Ticker,date,trading_days_till_exp,moneyness,impl_volatility,T,prev_day_iv,prev2_day_iv
0,P,AAPL,2019-01-02,2,-1.860,0.558181,0.011905,0.505761,0.000000
1,P,AAPL,2019-01-02,2,-1.682,0.541292,0.011905,0.495051,0.000000
2,P,AAPL,2019-01-02,2,-1.505,0.525111,0.011905,0.484170,0.000000
3,P,AAPL,2019-01-02,2,-1.330,0.507597,0.011905,0.473299,0.000000
4,P,AAPL,2019-01-02,2,-0.895,0.468786,0.011905,0.455607,0.000000
...,...,...,...,...,...,...,...,...,...
145306,C,TSLA,2020-12-31,5,1.467,0.623205,0.023810,0.554273,0.486818
145307,C,TSLA,2020-12-31,5,1.535,0.629719,0.023810,0.554273,0.486818
145308,C,TSLA,2020-12-31,5,1.670,0.639880,0.023810,0.584788,0.511182
145309,C,TSLA,2020-12-31,5,1.804,0.648805,0.023810,0.601543,0.523462


In [88]:
option_train_data_p

Unnamed: 0,cp_flag,Ticker,date,trading_days_till_exp,moneyness,impl_volatility,T,prev_day_iv,prev2_day_iv
251194,P,AAPL,2019-01-02,2,-1.860,0.558181,0.011905,0.505761,0.000000
251195,P,AAPL,2019-01-02,2,-1.682,0.541292,0.011905,0.495051,0.000000
251196,P,AAPL,2019-01-02,2,-1.505,0.525111,0.011905,0.484170,0.000000
251197,P,AAPL,2019-01-02,2,-1.330,0.507597,0.011905,0.473299,0.000000
251198,P,AAPL,2019-01-02,2,-0.895,0.468786,0.011905,0.455607,0.000000
...,...,...,...,...,...,...,...,...,...
413055,P,TSLA,2020-12-31,5,0.236,0.573773,0.023810,0.557458,0.432122
413056,P,TSLA,2020-12-31,5,0.274,0.573565,0.023810,0.543927,0.431956
413057,P,TSLA,2020-12-31,5,0.312,0.572960,0.023810,0.543927,0.431956
413058,P,TSLA,2020-12-31,5,0.388,0.573072,0.023810,0.531722,0.432987


In [89]:
merge_data_train = pd.merge(optiondata_train, firm_train, on=['date', 'Ticker'], how='left')

# Merge the data but now with the macro data
merge_data_total_train = pd.merge(merge_data_train, macro_train, on='date', how='left')

In [73]:
# Check for NaN values column-wise
nan_columns = merge_data_total_train.isnull().sum()
print("Number of NaN values in each column:")
print(nan_columns)

# Add columns with NaN values to a list
nan_columns_list = nan_columns[nan_columns > 0].index.tolist()
print("Columns with NaN values:")
print(nan_columns_list)

# Drop columns with NaN values
merge_data_total_train = merge_data_total_train.drop(columns=nan_columns_list)

Number of NaN values in each column:
cp_flag                  0
Ticker                   0
date                     0
trading_days_till_exp    0
moneyness                0
                        ..
OPEN_vix                 0
HIGH_vix                 0
LOW_vix                  0
CLOSE_vix                0
spread_vix               0
Length: 129, dtype: int64
Columns with NaN values:
[]


In [74]:
# Save the parquet file

data_set_train_nonscaled = Path.cwd().parent.parent / "Data/updated_standardization/data_set_train_tech_nonscaled.parquet"

save_parquet = True
if save_parquet or not os.path.exists(data_set_train_nonscaled):
    merge_data_total_train.to_parquet(data_set_train_nonscaled)

# Merge scaled dataset test

In [76]:
# Standardize micro data

import pandas as pd

def standardize(df):
    """Standardize the numeric columns, excluding the 'date' column."""
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()
    return df

# Assuming your DataFrame is called 'macro_train'
data = macro_test.copy()

# Exclude the 'date' column from the columns to be standardized
columns_to_standardize = data.columns.difference(['date'])

# Standardize the entire DataFrame (except 'date')
standardized_data = standardize(data[columns_to_standardize])

# Combine standardized numeric data with the 'date' column
standardized_data_macro_t = pd.concat([data[['date']], standardized_data], axis=1)

# Display the standardized DataFrame
standardized_data_macro_t


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()


Unnamed: 0,date,10Y_RIR,1Y_bond,2Y_bond,CLOSE_vix,FF_rate,HIGH_vix,LOW_vix,OPEN_vix,gold_price,reces_indi,spread_vix
736,2021-01-04,-2.811923,-0.368545,-0.962903,0.781467,0.888095,0.486602,1.252057,0.802324,2.734423,2.332744,-0.597870
737,2021-01-05,-2.811923,-0.407768,-0.996304,2.053320,0.888095,1.947452,1.840347,1.118475,2.594925,2.313505,1.608684
738,2021-01-06,-2.647970,-0.374785,-0.955666,1.416366,0.888095,1.452482,2.159233,2.095081,2.521689,2.308978,0.207036
739,2021-01-07,-2.210762,-0.241072,-0.797566,1.309522,0.888095,1.208434,0.834207,1.445454,2.448452,2.304451,1.386640
740,2021-01-08,-1.992158,-0.220570,-0.736330,0.415732,0.888095,0.699714,0.633528,1.330687,2.375216,2.299924,0.612959
...,...,...,...,...,...,...,...,...,...,...,...,...
1093,2021-12-27,0.303183,2.841456,2.627189,-0.757495,0.088810,-0.762854,-0.795960,-0.512088,-0.008238,-0.523771,-0.535420
1094,2021-12-28,0.193881,2.959123,2.712362,-0.599284,0.088810,-0.554898,-0.499067,-0.312869,0.057151,-0.515849,-0.490318
1095,2021-12-29,0.248532,2.984083,2.733516,-0.615722,0.088810,-0.649423,-0.446835,-0.557562,0.073499,-0.513868,-0.747055
1096,2021-12-30,0.576438,3.022414,2.812566,-0.934199,0.088810,-0.869410,-0.949905,-0.698315,0.089846,-0.511887,-0.556237


In [77]:
import pandas as pd

def standardize(df):
    """Standardize the numeric columns, excluding the 'date' column."""
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()
    return df

# Assuming your DataFrame is called 'macro_train'
data = firm_test.copy()

# Exclude the 'date' column from the columns to be standardized
columns_to_standardize = data.columns.difference(['Ticker', 'date'])

# Standardize the entire DataFrame (except 'date')
standardized_data = standardize(data[columns_to_standardize])

# Combine standardized numeric data with the 'date' column
standardized_data_firm_t = pd.concat([data[['Ticker' ,'date']], standardized_data], axis=1)

# Display the standardized DataFrame
standardized_data_firm_t



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()


Unnamed: 0,Ticker,date,5_day_rolling_return_stock,ASK,ASKHI,BID,BIDLO,PRC,PRC_actual,RET,...,spread_stock,std_dolvol,std_turn,stdacc,stdcf,tang,tb,turn,vol_stock,zerotrade
505,AAPL,2021-01-04,0.102605,-0.666405,-0.667331,-0.666404,-0.663240,-0.666414,-1.239797,-0.460213,...,-0.509756,0.135654,-0.438053,-2.026340,-1.384202,-1.567174,0.479367,-0.280651,2.017141,0.282292
506,AAPL,2021-01-05,0.087515,-0.667391,-0.668431,-0.667409,-0.664748,-0.667330,-1.253625,-0.423351,...,-0.474757,0.131205,-0.437376,-2.026947,-1.386351,-1.567174,0.479367,-0.285386,2.128704,0.270094
507,AAPL,2021-01-06,-0.575368,-0.670235,-0.669424,-0.670236,-0.669200,-0.670246,-1.297660,-1.184571,...,-0.509721,0.126757,-0.436699,-2.027554,-1.388499,-1.567174,0.479367,-0.290122,3.514675,0.257895
508,AAPL,2021-01-07,-1.025767,-0.668858,-0.671071,-0.668858,-0.667701,-0.668824,-1.276180,0.474356,...,-0.509721,0.122309,-0.436022,-2.028160,-1.390648,-1.567174,0.479367,-0.294857,2.082597,0.245697
509,AAPL,2021-01-08,-1.427561,-0.672662,-0.671678,-0.672744,-0.669539,-0.672744,-1.335386,-1.584640,...,-0.352322,0.117861,-0.435345,-2.028767,-1.392796,-1.567174,0.479367,-0.299592,3.884823,0.233498
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6051,TSLA,2021-12-27,0.503850,0.111457,0.106766,0.111439,0.076022,0.111633,1.479796,3.274038,...,0.119930,1.371178,1.466071,1.036588,1.069222,0.843940,0.772739,1.220441,-0.005885,-1.339938
6052,TSLA,2021-12-28,2.891178,0.163649,0.157204,0.163865,0.112381,0.163314,1.739936,2.498862,...,-0.299832,1.297688,1.451704,1.036588,1.069222,0.843940,0.772739,1.233139,-0.015663,-1.342722
6053,TSLA,2021-12-29,3.336664,0.186981,0.195944,0.187304,0.178041,0.187264,1.860497,1.050728,...,-0.492213,1.224199,1.437337,1.036588,1.069222,0.843940,0.772739,1.245837,-0.236675,-1.345506
6054,TSLA,2021-12-30,4.077728,0.182857,0.197704,0.182734,0.184957,0.182401,1.836018,-0.302431,...,0.382117,1.150709,1.422970,1.036588,1.069222,0.843940,0.772739,1.258535,-0.351884,-1.348290


In [79]:
option_test_c  = option_test.copy()

# Apply filter of moneyness
option_test_c = option_test_c[option_test_c["cp_flag"] == "C"]
option_test_c = option_test_c[(option_test_c["moneyness"] >= -0.5) & (option_test_c["moneyness"] <= 2)]

import pandas as pd

def standardize(df):
    """Standardize the numeric columns, excluding the 'date' column."""
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()
    return df

# Assuming your DataFrame is called 'macro_train'
data = option_test_c.copy()

# Exclude the 'date' column from the columns to be standardized
columns_to_standardize = data.columns.difference(['Ticker', 'date', 'cp_flag'])

# Standardize the entire DataFrame (except 'date')
standardized_data = standardize(data[columns_to_standardize])

# Combine standardized numeric data with the 'date' column
standardized_data_opt_c_t = pd.concat([data[['Ticker' ,'date', 'cp_flag']], standardized_data], axis=1)

# Display the standardized DataFrame
standardized_data_opt_c_t


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()


Unnamed: 0,Ticker,date,cp_flag,T,impl_volatility,moneyness,prev2_day_iv,prev_day_iv,trading_days_till_exp
75116,AAPL,2021-01-04,C,0.592963,-0.102728,-1.663425,-0.503962,-0.467856,0.592963
75117,AAPL,2021-01-04,C,0.592963,-0.122166,-1.481090,-0.561526,-0.487036,0.592963
75118,AAPL,2021-01-04,C,0.592963,-0.132297,-1.300157,-0.577343,-0.494653,0.592963
75119,AAPL,2021-01-04,C,0.592963,-0.147844,-1.120627,-0.585994,-0.500796,0.592963
75120,AAPL,2021-01-04,C,0.592963,-0.152749,-0.942499,-0.590346,-0.505567,0.592963
...,...,...,...,...,...,...,...,...,...
251181,TSLA,2021-12-30,C,-1.583324,1.193604,1.185216,1.457326,1.165364,-1.583324
251182,TSLA,2021-12-30,C,-1.583324,1.284615,1.328279,1.528389,1.242223,-1.583324
251183,TSLA,2021-12-30,C,-1.583324,1.376566,1.471342,1.608539,1.320177,-1.583324
251184,TSLA,2021-12-30,C,-1.583324,1.429638,1.613003,1.670064,1.401815,-1.583324


In [78]:
option_test_p  = option_test.copy()
option_test_p = option_test_p[option_test_p["cp_flag"] == "P"]
option_test_p = option_test_p[(option_test_p["moneyness"] >= -2) & (option_test_p["moneyness"] <= 0.5)]

import pandas as pd

def standardize(df):
    """Standardize the numeric columns, excluding the 'date' column."""
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()
    return df

# Assuming your DataFrame is called 'macro_train'
data = option_test_p.copy()

# Exclude the 'date' column from the columns to be standardized
columns_to_standardize = data.columns.difference(['Ticker', 'date', 'cp_flag'])

# Standardize the entire DataFrame (except 'date')
standardized_data = standardize(data[columns_to_standardize])

# Combine standardized numeric data with the 'date' column
standardized_data_opt_p_t = pd.concat([data[['Ticker' ,'date', 'cp_flag']], standardized_data], axis=1)

# Display the standardized DataFrame
standardized_data_opt_p_t


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()


Unnamed: 0,Ticker,date,cp_flag,T,impl_volatility,moneyness,prev2_day_iv,prev_day_iv,trading_days_till_exp
260822,AAPL,2021-01-04,P,0.591558,0.595483,-1.803995,-0.020857,-0.246230,0.591558
260823,AAPL,2021-01-04,P,0.591558,0.495874,-1.603934,-0.163475,-0.333845,0.591558
260824,AAPL,2021-01-04,P,0.591558,0.383210,-1.405282,-0.231277,-0.374732,0.591558
260825,AAPL,2021-01-04,P,0.591558,0.297607,-1.208039,-0.296470,-0.413023,0.591558
260826,AAPL,2021-01-04,P,0.591558,0.206942,-1.012205,-0.415563,-0.482681,0.591558
...,...,...,...,...,...,...,...,...,...
428220,TSLA,2021-12-30,P,-1.591151,0.034282,0.943321,0.584757,0.247237,-1.591151
428221,TSLA,2021-12-30,P,-1.591151,0.110863,1.095480,0.619823,0.248015,-1.591151
428222,TSLA,2021-12-30,P,-1.591151,0.244132,1.249048,0.658098,0.269399,-1.591151
428223,TSLA,2021-12-30,P,-1.591151,0.304619,1.401207,0.703197,0.289697,-1.591151


In [84]:
# Concatenate the two DataFrames vertically
optiondata_test_scaled = pd.concat([standardized_data_opt_c_t, standardized_data_opt_p_t], axis=0)

# Reset the index if necessary
optiondata_test_scaled.reset_index(drop=True, inplace=True)

optiondata_test_scaled

merge_data_test_standardized = pd.merge(optiondata_test_scaled, standardized_data_firm_t, on=['date', 'Ticker'], how='left')

# Merge the data but now with the macro data
merge_data_test_scaled = pd.merge(merge_data_test_standardized, standardized_data_macro_t, on='date', how='left')

merge_data_test_scaled

# Check for NaN values column-wise
nan_columns = merge_data_test_scaled.isnull().sum()
print("Number of NaN values in each column:")
print(nan_columns)

# Add columns with NaN values to a list
nan_columns_list = nan_columns[nan_columns > 0].index.tolist()
print("Columns with NaN values:")
print(nan_columns_list)

# # Drop columns with NaN values
merge_data_test_scaled = merge_data_test_scaled.drop(columns=nan_columns_list)

nan_columns = merge_data_test_scaled.isnull().sum()
print("Number of NaN values in each column:")
print(nan_columns)


Number of NaN values in each column:
Ticker             0
date               0
cp_flag            0
T                  0
impl_volatility    0
                  ..
LOW_vix            0
OPEN_vix           0
gold_price         0
reces_indi         0
spread_vix         0
Length: 129, dtype: int64
Columns with NaN values:
['divi', 'divo', 'sin']
Number of NaN values in each column:
Ticker             0
date               0
cp_flag            0
T                  0
impl_volatility    0
                  ..
LOW_vix            0
OPEN_vix           0
gold_price         0
reces_indi         0
spread_vix         0
Length: 126, dtype: int64


In [87]:
# Save the parquet file

data_set_test_scaled = Path.cwd().parent.parent / "Data/updated_standardization/data_set_test_tech_scaled.parquet"

save_parquet = True
if save_parquet or not os.path.exists(data_set_test_scaled):
    merge_data_test_scaled.to_parquet(data_set_test_scaled)