# Data Processing

## import packages

In [1]:
import numpy as np
import pandas as pd
from arbitragerepair import constraints, repair

In [2]:
raw_data = pd.read_csv('fqjo3s8eacwzxkcw.csv')

In [20]:
# Choose Amazon as the underlying stock
df = raw_data.loc[raw_data.ticker=='AMZN', 
                  ['date','exdate','cp_flag','strike_price','best_bid','best_offer','volume','impl_volatility']].copy()
df['date'] = pd.to_datetime(df['date'])
df['exdate'] = pd.to_datetime(df['exdate'])
df['strike_price'] = df['strike_price']/1000 # scaling strike
df['option_price'] = df[['best_bid','best_offer']].mean(axis=1)
# df = df.drop(columns = ['best_bid','best_offer'])
df.head()

Unnamed: 0,date,exdate,cp_flag,strike_price,best_bid,best_offer,volume,impl_volatility,option_price
0,2018-01-02,2018-01-05,C,1000.0,189.35,190.6,0,0.962943,189.975
1,2018-01-02,2018-01-05,C,1002.5,186.85,188.1,0,0.951017,187.475
2,2018-01-02,2018-01-05,C,1005.0,184.35,185.6,0,0.939104,184.975
3,2018-01-02,2018-01-05,C,1007.5,181.85,183.1,0,0.927205,182.475
4,2018-01-02,2018-01-05,C,1010.0,179.35,180.6,0,0.915319,179.975


In [4]:
# Assume t0 = 2018-01-02
df.loc[df.date=='2018-01-02','exdate'].unique()

<DatetimeArray>
['2018-01-05 00:00:00', '2018-01-12 00:00:00', '2018-01-19 00:00:00',
 '2018-01-26 00:00:00', '2018-02-02 00:00:00', '2018-02-09 00:00:00',
 '2018-02-16 00:00:00', '2018-03-16 00:00:00', '2018-04-20 00:00:00',
 '2018-06-15 00:00:00', '2018-07-20 00:00:00', '2018-09-21 00:00:00',
 '2019-01-18 00:00:00', '2019-06-21 00:00:00', '2020-01-17 00:00:00']
Length: 15, dtype: datetime64[ns]

## Fixed $T_1$, $T_2$, and drag back different $t_0$

In [40]:
# Choose t1 = '2018-01-19', t2 = '2018-04-20' (arbitrarily,temporarily)

t1 = '2018-01-19'
t2 = '2018-04-20'

# call options data
df_t1_C = df.loc[(df.exdate==t1)&(df.cp_flag=='C'), ['date','exdate','strike_price','volume','option_price','impl_volatility','best_bid']]
df_t2_C = df.loc[(df.exdate==t2)&(df.cp_flag=='C'), ['date','exdate','strike_price','volume','option_price','impl_volatility','best_bid']]

# put options data
df_t1_P = df.loc[(df.exdate==t1)&(df.cp_flag=='P'), ['date','exdate','strike_price','volume','option_price','impl_volatility','best_bid']]
df_t2_P = df.loc[(df.exdate==t2)&(df.cp_flag=='P'), ['date','exdate','strike_price','volume','option_price','impl_volatility','best_bid']]

In [37]:
# t0 to proceed with
t0List = list(df_t1_C.date.unique())

print(len(t0List))
print(t0List) 

13
[Timestamp('2018-01-02 00:00:00'), Timestamp('2018-01-03 00:00:00'), Timestamp('2018-01-04 00:00:00'), Timestamp('2018-01-05 00:00:00'), Timestamp('2018-01-08 00:00:00'), Timestamp('2018-01-09 00:00:00'), Timestamp('2018-01-10 00:00:00'), Timestamp('2018-01-11 00:00:00'), Timestamp('2018-01-12 00:00:00'), Timestamp('2018-01-16 00:00:00'), Timestamp('2018-01-17 00:00:00'), Timestamp('2018-01-18 00:00:00'), Timestamp('2018-01-19 00:00:00')]


## Revise stock prices

In [7]:
df_amzn = pd.read_csv('./amzn_stock_daily.csv',index_col=0)

In [8]:
df_amzn.reset_index(inplace=True)
df_amzn['date'] = pd.to_datetime(df_amzn['Date'], format="%Y-%m-%d")
stock_AMZN = df_amzn[['date','Adj Close']].copy()
stock_AMZN.rename(columns = {'Adj Close' : 'stock_price'}, inplace = True)
stock_AMZN

Unnamed: 0,date,stock_price
0,2018-01-02,59.450500
1,2018-01-03,60.209999
2,2018-01-04,60.479500
3,2018-01-05,61.457001
4,2018-01-08,62.343498
...,...,...
1560,2024-03-15,174.419998
1561,2024-03-18,174.479996
1562,2024-03-19,175.899994
1563,2024-03-20,178.149994


### Revise


In [10]:
split_data = {
    'date': ['2022-06-06', '1999-09-02', '1999-01-05', '1998-06-02'],
    'split': ['20:1', '2:1', '3:1', '2:1'],
    'multiple': [20, 2, 3, 2]
}
df_splits = pd.DataFrame(split_data)
df_splits['date'] = pd.to_datetime(df_splits['date'])
df_splits = df_splits.sort_values(by='date', ascending=False)

# Define a function to adjust stock prices based on splits.
def adjust_prices_for_splits(stock_prices_df, splits_df):
    adjusted_prices = stock_prices_df.copy()
    adjusted_prices['adjusted_price'] = adjusted_prices['stock_price']
    
    # Apply splits in reverse chronological order
    for _, split in splits_df.iterrows():
        split_date = split['date']
        split_multiple = split['multiple']
        mask = adjusted_prices['date'] < split_date
        adjusted_prices.loc[mask, 'adjusted_price'] *= split_multiple
        
    return adjusted_prices

# Adjust the AMZN stock prices for splits
stock_AMZN = adjust_prices_for_splits(stock_AMZN, df_splits)
stock_AMZN

Unnamed: 0,date,stock_price,adjusted_price
0,2018-01-02,59.450500,1189.010010
1,2018-01-03,60.209999,1204.199982
2,2018-01-04,60.479500,1209.589996
3,2018-01-05,61.457001,1229.140015
4,2018-01-08,62.343498,1246.869965
...,...,...,...
1560,2024-03-15,174.419998,174.419998
1561,2024-03-18,174.479996,174.479996
1562,2024-03-19,175.899994,175.899994
1563,2024-03-20,178.149994,178.149994


It seems great! :)

## select data with VIX method

In [57]:
import warnings
warnings.filterwarnings('ignore')
def CalcExpiry(data):
    t0 = data['date'].unique()[0]
    t1 = data['exdate'].unique()[0]
    amzn_t = mcal.get_calendar('NASDAQ')
    trading_days_1 = amzn_t.valid_days(start_date=t0, end_date=t1)
    data['T'] = len(trading_days_1)/252
    data['S'] = stock_AMZN.set_index('date').loc[t0,'adjusted_price']
    return data
def Calc_R(x):
    return -1/x['T'] * np.log((x['S']-x['C']+x['P'])/x['strike_price'])
def CalcForward(data):
    import math
    data = data.reset_index()
    f = data['strike_price']+math.exp(data['r']*data['T'])*(data['C']-data['P'])
    return f
def selectOption(data,Fwd):
    data = data.reset_index()
    Fwd = Fwd.loc[data['date'].unique()[0],'F']
    data = data[data['strike_price'] > Fwd].sort_values(by='strike_price') ## out of money option
    data['indc'] = (data['bid']==0).rolling(2).sum() ##delete the data after two zero bids
    data['indc'] = (data['indc']==2).cumsum()
    return data[data['indc']<2].drop(columns = ['bid','diff','indc'])

def VIXFilter(data):
    import pandas_market_calendars as mcal

    ## Combine call and put price
    df_VIX = data.groupby('date',group_keys=False).apply(lambda x: CalcExpiry(x))
    df_VIX = df_VIX.set_index(['date','strike_price'])[['option_price','T','S','best_bid','volume']]
    df_VIX.columns = ['C','T','S','bid','volume']
    df_VIX['P'] = df_t1_P .set_index(['date','strike_price'])['option_price']

    ## Calculate r from put-call parity
    df_VIX.loc[:,'r'] = np.array(df_VIX.reset_index().apply(Calc_R,axis=1))

    ## find the strike price with smallest call-put difference and calculate forward
    df_VIX['diff'] = np.abs(df_VIX['C']-df_VIX['P'])
    df_fwd = df_VIX.groupby(level=0,group_keys=False).apply(lambda x: CalcForward(x.nsmallest(n=1,columns='diff')))
    df_fwd.columns = ['F']

    ## select out of money call

    df_VIX = df_VIX.groupby(level=0,group_keys=False).apply(lambda x: selectOption(x,df_fwd))
    df_VIX.rename(columns={'C':'option_price_C','T':'expiry','P':'option_price_P','S':'adjusted_price'},inplace=True)
    df_VIX['exdate'] = data['exdate'].unique()[0]
    return df_VIX
df_t1 = VIXFilter(df_t1_C)
df_t2 = VIXFilter(df_t2_C)

## Arbitrage Repair

In [38]:
# Arbitrage repair function
def arbitrageRepair(columnT, columnK, columnC, columnF):
    # normalise strikes and call prices
    normaliser = constraints.Normalise()
    T = np.array(columnT)
    K = np.array(columnK)
    C = np.array(columnC)
    F = np.array(columnF)
    normaliser.fit(T, K, C, F)
    T1, K1, C1 = normaliser.transform(T, K, C)
    
    # construct arbitrage constraints and detect violation
    mat_A, vec_b, _, _ = constraints.detect(T1, K1, C1, verbose=False)
    
    # repair arbitrage - l1-norm objective
    epsilon = repair.l1(mat_A, vec_b, C1)
    
    # de-normalise
    K0, C0 = normaliser.inverse_transform(K1, C1 + epsilon)
    
    return K0, C0

In [81]:
result = pd.DataFrame()

for t0 in t0List:
    
    ## T1:
    df_t0t1 = df_t1[df_t1.date==t0]
    df_t0t1 = df_t0t1.nlargest(columns='volume',n=19)
    # calculate the forward price:
    df_t0t1['forward_price'] = df_t0t1['adjusted_price']/((df_t0t1['adjusted_price']-(df_t0t1['option_price_C']-df_t0t1['option_price_P']))/df_t0t1['strike_price'])
    
    # arbitrage repair:
    K0_t1, C0_t1 = arbitrageRepair(df_t0t1['expiry'], df_t0t1['strike_price'], df_t0t1['option_price_C'], df_t0t1['forward_price'])
       
    # temporary result at t0:
    result_t1 = df_t0t1[['date','exdate']].copy()
    result_t1['strike_price'] = K0_t1
    result_t1['call_option_price'] = C0_t1
        
    
    ## T2:
    # t2: 20 largest trading volume
    df_t0t2 = df_t2[df_t2.date==t0]
    df_t0t2 = df_t0t2.nlargest(columns='volume',n=19)
    # calculate the forward price:
    
    df_t0t2['forward_price'] = df_t0t2['adjusted_price']/((df_t0t2['adjusted_price']-(df_t0t2['option_price_C']-df_t0t2['option_price_P']))/df_t0t2['strike_price'])
    
    # arbitrage repair:
    K0_t2, C0_t2 = arbitrageRepair(df_t0t2['expiry'], df_t0t2['strike_price'], df_t0t2['option_price_C'], df_t0t2['forward_price'])
    
    # temporary result at t0:
    result_t2 = df_t0t2[['date','exdate']].copy()
    result_t2['strike_price'] = K0_t2
    result_t2['call_option_price'] = C0_t2
    
    if len(result_t1) >= 19 and len(result_t2) >= 19:  ##keep the shape of data
        ## Concat horizontally  
        result_tmp = pd.concat([result_t1.reset_index(drop=True), result_t2.drop(columns='date').reset_index(drop=True)], axis=1)

        ## Concat vertically
        result = pd.concat([result,result_tmp], axis=0)
    
result.columns = ['date', 'T1', 'K1', 'C1',  'T2', 'K2', 'C2']
result = result.reset_index(drop=True)
result = pd.merge(result, stock_AMZN, on = 'date')
result.columns = ['t0', 'T1', 'K1', 'C1',  'T2', 'K2', 'C2',  'S0', 'Adj_S0']

In [14]:
# pip install pandas_market_calendars
import pandas_market_calendars as mcal

In [84]:
result.to_csv('data_20180119_20180420_VIX.csv', header=True)

In [90]:
result[result['t0']=='2018-01-02']

Unnamed: 0,t0,T1,K1,C1,T2,K2,C2,S0,Adj_S0
0,2018-01-02,2018-01-19,1192.5,17.1,2018-04-20,1240.0,43.875,59.4505,1189.01001
1,2018-01-02,2018-01-19,1195.0,15.75599,2018-04-20,1260.0,32.45721,59.4505,1189.01001
2,2018-01-02,2018-01-19,1197.5,14.675,2018-04-20,1280.0,30.6,59.4505,1189.01001
3,2018-01-02,2018-01-19,1200.0,13.676187,2018-04-20,1300.0,25.325,59.4505,1189.01001
4,2018-01-02,2018-01-19,1210.0,9.725,2018-04-20,1320.0,25.552384,59.4505,1189.01001
5,2018-01-02,2018-01-19,1220.0,7.025,2018-04-20,1340.0,16.468994,59.4505,1189.01001
6,2018-01-02,2018-01-19,1230.0,5.0,2018-04-20,1360.0,14.025,59.4505,1189.01001
7,2018-01-02,2018-01-19,1240.0,3.375,2018-04-20,1380.0,14.166773,59.4505,1189.01001
8,2018-01-02,2018-01-19,1250.0,2.35,2018-04-20,1400.0,9.375,59.4505,1189.01001
9,2018-01-02,2018-01-19,1260.0,1.8,2018-04-20,1420.0,7.662105,59.4505,1189.01001


In [91]:
df_vol = pd.read_csv('data_20180119_20180420.csv',index_col=0)
df_vol[df_vol['t0']=='2018-01-02']

Unnamed: 0,t0,T1,K1,C1,T2,K2,C2,S0,Adj_S0
0,2018-01-02,2018-01-19,1000.0,191.525,2018-04-20,800.0,396.725,59.4505,1189.01001
1,2018-01-02,2018-01-19,1100.0,92.525,2018-04-20,1015.0,191.675,59.4505,1189.01001
2,2018-01-02,2018-01-19,1150.0,46.075,2018-04-20,1025.0,183.025,59.4505,1189.01001
3,2018-01-02,2018-01-19,1160.0,38.0,2018-04-20,1050.0,157.776341,59.4505,1189.01001
4,2018-01-02,2018-01-19,1170.0,30.45,2018-04-20,1080.0,137.85,59.4505,1189.01001
5,2018-01-02,2018-01-19,1175.0,27.125,2018-04-20,1100.0,122.75,59.4505,1189.01001
6,2018-01-02,2018-01-19,1180.0,23.875,2018-04-20,1120.0,108.4,59.4505,1189.01001
7,2018-01-02,2018-01-19,1185.0,20.95,2018-04-20,1140.0,95.0,59.4505,1189.01001
8,2018-01-02,2018-01-19,1187.5,19.480856,2018-04-20,1160.0,82.5,59.4505,1189.01001
9,2018-01-02,2018-01-19,1190.0,18.174364,2018-04-20,1180.0,71.177643,59.4505,1189.01001


In [18]:
# K = [[np.array(result.loc[result.t0 == t0,'K1']),np.array(result.loc[result.t0==t0,'K2'])] for t0 in t0List]
# Pi = [[np.array(result.loc[result.t0 == t0,'C1']),np.array(result.loc[result.t0==t0,'C2'])] for t0 in t0List]