In [75]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Statistical modeling and diagnostics
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.diagnostic import het_breuschpagan, het_white
from statsmodels.stats.stattools import jarque_bera

# Machine learning and preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data source
from ucimlrepo import fetch_ucirepo

# Plot defaults for the whole notebook: reset matplotlib to its stock style,
# switch seaborn to the "husl" palette, then set figure size and font size.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 11,
})

# Reaching this line means every import above resolved.
print("All libraries imported successfully")

All libraries imported successfully


In [76]:
# Sample window as ISO 'YYYY-MM-DD' strings; used below to slice the
# factor table by label (both endpoints are included by .loc slicing).
start_date, end_date = '2003-01-02', '2023-08-31'

### Importing Apple options data

In [77]:
# Load the raw options table and key it by the 'date' column
# (dates stay as strings; several rows share the same date).
df = pd.read_csv('raw_data/apple_options.csv').set_index('date')

In [78]:
# Moneyness = forward price / strike price.
# A zero strike makes the ratio +/-inf; convert infinities to NaN so those
# rows are counted as missing. `np.nan` is used because the `np.NaN` alias
# was removed in NumPy 2.0.
df['moneyness'] = df['forward_price'] / df['strike_price']
df = df.replace([np.inf, -np.inf], np.nan)
df

Unnamed: 0_level_0,secid,days,forward_price,strike_price,premium,impl_volatility,delta,gamma,theta,vega,cp_flag,cusip,ticker,sic,index_flag,exchange_d,class,issue_type,industry_group,moneyness
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2003-01-02,101594,10,14.805549,0.000000,0.000000,,,,,,C,3783310,AAPL,3571,0,4,,0,,
2003-01-02,101594,30,14.813368,14.813368,0.841441,0.497594,0.528298,0.188362,-5.182076,1.688160,C,3783310,AAPL,3571,0,4,,0,,1.0
2003-01-02,101594,60,14.819440,14.819440,1.175942,0.492491,0.539221,0.134143,-3.610675,2.379834,C,3783310,AAPL,3571,0,4,,0,,1.0
2003-01-02,101594,91,14.843963,14.843963,1.462859,0.497691,0.549146,0.107542,-2.993675,2.924233,C,3783310,AAPL,3571,0,4,,0,,1.0
2003-01-02,101594,122,14.866787,14.866787,1.696889,0.498870,0.557222,0.092443,-2.607087,3.378008,C,3783310,AAPL,3571,0,4,,0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-31,101594,182,192.666617,192.666617,12.291289,0.221722,-0.502078,0.015307,-7.351980,51.660217,P,3783310,AAPL,3571,0,4,,0,,1.0
2023-08-31,101594,273,195.319096,195.319096,15.962474,0.230937,-0.507065,0.012613,-5.532284,62.376901,P,3783310,AAPL,3571,0,4,,0,,1.0
2023-08-31,101594,365,198.008318,198.008318,19.469372,0.239976,-0.510967,0.010967,-4.504729,71.749699,P,3783310,AAPL,3571,0,4,,0,,1.0
2023-08-31,101594,547,202.941339,202.941339,25.079122,0.245732,-0.520578,0.009445,-3.134944,84.670341,P,3783310,AAPL,3571,0,4,,0,,1.0


In [79]:
# Per-column share of missing values.
# `isna().mean()` is the vectorised equivalent of count-of-NaN / row-count and
# yields NaN (instead of raising ZeroDivisionError) when the frame is empty.
# Note: the printed number is a 0-1 fraction even though the label says '%'.
for col in df.columns:
    percent = df[col].isna().mean()
    print(f'col {col} has undefined % of {percent:.2f}')


col secid has undefined % of 0.00
col days has undefined % of 0.00
col forward_price has undefined % of 0.00
col strike_price has undefined % of 0.00
col premium has undefined % of 0.00
col impl_volatility has undefined % of 0.04
col delta has undefined % of 0.04
col gamma has undefined % of 0.04
col theta has undefined % of 0.04
col vega has undefined % of 0.04
col cp_flag has undefined % of 0.00
col cusip has undefined % of 0.00
col ticker has undefined % of 0.00
col sic has undefined % of 0.00
col index_flag has undefined % of 0.00
col exchange_d has undefined % of 0.00
col class has undefined % of 1.00
col issue_type has undefined % of 0.00
col industry_group has undefined % of 1.00
col moneyness has undefined % of 0.04


In [80]:
# 'class' and 'industry_group' are reported as 100% missing above, so they
# carry no information. errors='ignore' keeps this cell re-runnable: on a
# second execution the columns are already gone and drop() would otherwise
# raise KeyError.
df = df.drop(columns=['class', 'industry_group'], errors='ignore')

In [81]:
# Display the options frame as it stands after the column drop.
df

Unnamed: 0_level_0,secid,days,forward_price,strike_price,premium,impl_volatility,delta,gamma,theta,vega,cp_flag,cusip,ticker,sic,index_flag,exchange_d,issue_type,moneyness
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2003-01-02,101594,10,14.805549,0.000000,0.000000,,,,,,C,3783310,AAPL,3571,0,4,0,
2003-01-02,101594,30,14.813368,14.813368,0.841441,0.497594,0.528298,0.188362,-5.182076,1.688160,C,3783310,AAPL,3571,0,4,0,1.0
2003-01-02,101594,60,14.819440,14.819440,1.175942,0.492491,0.539221,0.134143,-3.610675,2.379834,C,3783310,AAPL,3571,0,4,0,1.0
2003-01-02,101594,91,14.843963,14.843963,1.462859,0.497691,0.549146,0.107542,-2.993675,2.924233,C,3783310,AAPL,3571,0,4,0,1.0
2003-01-02,101594,122,14.866787,14.866787,1.696889,0.498870,0.557222,0.092443,-2.607087,3.378008,C,3783310,AAPL,3571,0,4,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-31,101594,182,192.666617,192.666617,12.291289,0.221722,-0.502078,0.015307,-7.351980,51.660217,P,3783310,AAPL,3571,0,4,0,1.0
2023-08-31,101594,273,195.319096,195.319096,15.962474,0.230937,-0.507065,0.012613,-5.532284,62.376901,P,3783310,AAPL,3571,0,4,0,1.0
2023-08-31,101594,365,198.008318,198.008318,19.469372,0.239976,-0.510967,0.010967,-4.504729,71.749699,P,3783310,AAPL,3571,0,4,0,1.0
2023-08-31,101594,547,202.941339,202.941339,25.079122,0.245732,-0.520578,0.009445,-3.134944,84.670341,P,3783310,AAPL,3571,0,4,0,1.0


In [82]:
# Discard every row that still has at least one missing field.
df = df.dropna(axis=0, how='any')

In [83]:
# Display the options frame after rows containing missing values were removed.
df

Unnamed: 0_level_0,secid,days,forward_price,strike_price,premium,impl_volatility,delta,gamma,theta,vega,cp_flag,cusip,ticker,sic,index_flag,exchange_d,issue_type,moneyness
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2003-01-02,101594,30,14.813368,14.813368,0.841441,0.497594,0.528298,0.188362,-5.182076,1.688160,C,3783310,AAPL,3571,0,4,0,1.0
2003-01-02,101594,60,14.819440,14.819440,1.175942,0.492491,0.539221,0.134143,-3.610675,2.379834,C,3783310,AAPL,3571,0,4,0,1.0
2003-01-02,101594,91,14.843963,14.843963,1.462859,0.497691,0.549146,0.107542,-2.993675,2.924233,C,3783310,AAPL,3571,0,4,0,1.0
2003-01-02,101594,122,14.866787,14.866787,1.696889,0.498870,0.557222,0.092443,-2.607087,3.378008,C,3783310,AAPL,3571,0,4,0,1.0
2003-01-02,101594,152,14.877803,14.877803,1.904648,0.502332,0.564019,0.081988,-2.344203,3.758602,C,3783310,AAPL,3571,0,4,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-31,101594,182,192.666617,192.666617,12.291289,0.221722,-0.502078,0.015307,-7.351980,51.660217,P,3783310,AAPL,3571,0,4,0,1.0
2023-08-31,101594,273,195.319096,195.319096,15.962474,0.230937,-0.507065,0.012613,-5.532284,62.376901,P,3783310,AAPL,3571,0,4,0,1.0
2023-08-31,101594,365,198.008318,198.008318,19.469372,0.239976,-0.510967,0.010967,-4.504729,71.749699,P,3783310,AAPL,3571,0,4,0,1.0
2023-08-31,101594,547,202.941339,202.941339,25.079122,0.245732,-0.520578,0.009445,-3.134944,84.670341,P,3783310,AAPL,3571,0,4,0,1.0


In [84]:
# Average forward/strike ratio over the remaining rows.
# NOTE(review): a mean of exactly 1.0 suggests every remaining row has
# strike == forward (i.e. at-the-money only) - confirm against the data source.
df['moneyness'].mean()

1.0

In [85]:
# Returns data: read the daily returns file and key it by the 'date' column
# so it lines up with the date-indexed options frame.
ret_df = pd.read_csv('./raw_data/apple_daily_returns.csv').set_index('date')
ret_df


Unnamed: 0_level_0,PERMNO,RET
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2003-01-02,14593,0.032798
2003-01-03,14593,0.006757
2003-01-06,14593,0.000000
2003-01-07,14593,-0.003356
2003-01-08,14593,-0.020202
...,...,...
2023-08-25,14593,0.012643
2023-08-28,14593,0.008846
2023-08-29,14593,0.021810
2023-08-30,14593,0.019172


In [87]:
# Daily factor table. 'Date' arrives as a compact YYYYMMDD value; rewrite it
# as 'YYYY-MM-DD' text, index by it, and keep only the sample window.
# NOTE(review): the factor columns look like percent values (e.g. 3.14) while
# RET in the returns file looks like a decimal (e.g. 0.032798) - confirm units
# before combining them.
ff3 = (
    pd.read_csv('./raw_data/F-F_Research_Data_Factors_daily.csv')
    .assign(
        Date=lambda raw: raw['Date'].map(
            lambda stamp: str(stamp)[0:4] + "-" + str(stamp)[4:6] + "-" + str(stamp)[6:8]
        )
    )
    .set_index('Date')
    .loc[start_date:end_date]
)
ff3

Unnamed: 0_level_0,Mkt-RF,SMB,HML,RF
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2003-01-02,3.14,-0.78,-0.38,0.00
2003-01-03,-0.11,-0.46,0.16,0.00
2003-01-06,2.13,-0.53,-0.41,0.00
2003-01-07,-0.63,0.22,-0.12,0.00
2003-01-08,-1.35,0.07,0.51,0.00
...,...,...,...,...
2023-08-25,0.65,-0.07,-0.57,0.02
2023-08-28,0.63,-0.02,0.41,0.02
2023-08-29,1.50,0.01,-0.12,0.02
2023-08-30,0.41,0.24,-0.46,0.02


In [92]:
# Attach the stock's daily return to every option row that shares its date.
# how="left" keeps all option rows even when no return exists for that date.
# validate="many_to_one" makes pandas raise MergeError if the returns table
# has a repeated date, which would otherwise silently duplicate option rows.
cdf = df.merge(
    ret_df,
    how="left",
    left_index=True,
    right_index=True,
    validate="many_to_one",
)

cdf


Unnamed: 0_level_0,secid,days,forward_price,strike_price,premium,impl_volatility,delta,gamma,theta,vega,cp_flag,cusip,ticker,sic,index_flag,exchange_d,issue_type,moneyness,PERMNO,RET
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2003-01-02,101594,30,14.813368,14.813368,0.841441,0.497594,0.528298,0.188362,-5.182076,1.688160,C,3783310,AAPL,3571,0,4,0,1.0,14593,0.032798
2003-01-02,101594,60,14.819440,14.819440,1.175942,0.492491,0.539221,0.134143,-3.610675,2.379834,C,3783310,AAPL,3571,0,4,0,1.0,14593,0.032798
2003-01-02,101594,91,14.843963,14.843963,1.462859,0.497691,0.549146,0.107542,-2.993675,2.924233,C,3783310,AAPL,3571,0,4,0,1.0,14593,0.032798
2003-01-02,101594,122,14.866787,14.866787,1.696889,0.498870,0.557222,0.092443,-2.607087,3.378008,C,3783310,AAPL,3571,0,4,0,1.0,14593,0.032798
2003-01-02,101594,152,14.877803,14.877803,1.904648,0.502332,0.564019,0.081988,-2.344203,3.758602,C,3783310,AAPL,3571,0,4,0,1.0,14593,0.032798
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-31,101594,182,192.666617,192.666617,12.291289,0.221722,-0.502078,0.015307,-7.351980,51.660217,P,3783310,AAPL,3571,0,4,0,1.0,14593,0.001172
2023-08-31,101594,273,195.319096,195.319096,15.962474,0.230937,-0.507065,0.012613,-5.532284,62.376901,P,3783310,AAPL,3571,0,4,0,1.0,14593,0.001172
2023-08-31,101594,365,198.008318,198.008318,19.469372,0.239976,-0.510967,0.010967,-4.504729,71.749699,P,3783310,AAPL,3571,0,4,0,1.0,14593,0.001172
2023-08-31,101594,547,202.941339,202.941339,25.079122,0.245732,-0.520578,0.009445,-3.134944,84.670341,P,3783310,AAPL,3571,0,4,0,1.0,14593,0.001172
