# Data Processing

## import packages

In [2]:
import numpy as np
import pandas as pd
from arbitragerepair import constraints, repair

In [2]:
raw_data = pd.read_csv('./Call Option Data/fqjo3s8eacwzxkcw.csv')

In [3]:
# Choose Amazon as the underlying stock
df = raw_data.loc[raw_data.ticker=='AMZN', 
                  ['date','exdate','cp_flag','strike_price','best_bid','best_offer','volume']].copy()
df['date'] = pd.to_datetime(df['date'])
df['exdate'] = pd.to_datetime(df['exdate'])
df['strike_price'] = df['strike_price']/10000 # scaling strike
df['option_price'] = df[['best_bid','best_offer']].mean(axis=1)
df = df.drop(columns = ['best_bid','best_offer'])
df.head()

Unnamed: 0,date,exdate,cp_flag,strike_price,volume,option_price
0,2018-01-02,2018-01-05,C,100.0,0,189.975
1,2018-01-02,2018-01-05,C,100.25,0,187.475
2,2018-01-02,2018-01-05,C,100.5,0,184.975
3,2018-01-02,2018-01-05,C,100.75,0,182.475
4,2018-01-02,2018-01-05,C,101.0,0,179.975


In [4]:
# Assume t0 = 2018-01-02
df.loc[df.date=='2018-01-02','exdate'].unique()

array(['2018-01-05T00:00:00.000000000', '2018-01-12T00:00:00.000000000',
       '2018-01-19T00:00:00.000000000', '2018-01-26T00:00:00.000000000',
       '2018-02-02T00:00:00.000000000', '2018-02-09T00:00:00.000000000',
       '2018-02-16T00:00:00.000000000', '2018-03-16T00:00:00.000000000',
       '2018-04-20T00:00:00.000000000', '2018-06-15T00:00:00.000000000',
       '2018-07-20T00:00:00.000000000', '2018-09-21T00:00:00.000000000',
       '2019-01-18T00:00:00.000000000', '2019-06-21T00:00:00.000000000',
       '2020-01-17T00:00:00.000000000'], dtype='datetime64[ns]')

## Fixed $T_1$, $T_2$, and drag back different $t_0$

In [4]:
# Choose t1 = '2018-01-19', t2 = '2018-04-20' (arbitrarily,temporarily)

t1 = '2018-01-19'
t2 = '2018-04-20'

# call options data
df_t1 = df.loc[(df.exdate==t1)&(df.cp_flag=='C'), ['date','exdate','strike_price','volume','option_price']]
df_t2 = df.loc[(df.exdate==t2)&(df.cp_flag=='C'), ['date','exdate','strike_price','volume','option_price']]

# put options data
df_t1_P = df.loc[(df.exdate==t1)&(df.cp_flag=='P'), ['date','exdate','strike_price','option_price']]
df_t2_P = df.loc[(df.exdate==t2)&(df.cp_flag=='P'), ['date','exdate','strike_price','option_price']]

In [5]:
# t0 to proceed with
t0List = list(df_t1.date.unique())

print(len(t0List))
print(t0List) 

13
[numpy.datetime64('2018-01-02T00:00:00.000000000'), numpy.datetime64('2018-01-03T00:00:00.000000000'), numpy.datetime64('2018-01-04T00:00:00.000000000'), numpy.datetime64('2018-01-05T00:00:00.000000000'), numpy.datetime64('2018-01-08T00:00:00.000000000'), numpy.datetime64('2018-01-09T00:00:00.000000000'), numpy.datetime64('2018-01-10T00:00:00.000000000'), numpy.datetime64('2018-01-11T00:00:00.000000000'), numpy.datetime64('2018-01-12T00:00:00.000000000'), numpy.datetime64('2018-01-16T00:00:00.000000000'), numpy.datetime64('2018-01-17T00:00:00.000000000'), numpy.datetime64('2018-01-18T00:00:00.000000000'), numpy.datetime64('2018-01-19T00:00:00.000000000')]


## Use stock prices to Arbitrage Repair

In [6]:
import datetime as dt
import yfinance as yf
amzn = yf.Ticker("AMZN")
#amzn.info
#amzn.institutional_holders

start_date = dt.datetime(2018, 1, 1)

tickers_list = ['AMZN']

df_amzn = yf.download(tickers_list, start=start_date, progress=True)
print(df_amzn.head())
fn = './amzn_stock_%s_%s_daily.csv'
df_amzn.to_csv(fn, sep=',', encoding='utf-8')
print('saved to %s'%(fn))

[*********************100%%**********************]  1 of 1 completed

                 Open       High        Low      Close  Adj Close    Volume
Date                                                                       
2018-01-02  58.599998  59.500000  58.525501  59.450500  59.450500  53890000
2018-01-03  59.415001  60.274502  59.415001  60.209999  60.209999  62176000
2018-01-04  60.250000  60.793499  60.233002  60.479500  60.479500  60442000
2018-01-05  60.875500  61.457001  60.500000  61.457001  61.457001  70894000
2018-01-08  61.799999  62.653999  61.601501  62.343498  62.343498  85590000
saved to ./amzn_stock_%s_%s_daily.csv





In [7]:
df_amzn.reset_index(inplace=True)
df_amzn['date'] = pd.to_datetime(df_amzn['Date'], format="%d/%m/%y")
stock_AMZN = df_amzn[['date','Adj Close']].copy()
stock_AMZN.rename(columns = {'Adj Close' : 'stock_price'}, inplace = True)
stock_AMZN

Unnamed: 0,date,stock_price
0,2018-01-02,59.450500
1,2018-01-03,60.209999
2,2018-01-04,60.479500
3,2018-01-05,61.457001
4,2018-01-08,62.343498
...,...,...
1556,2024-03-11,171.960007
1557,2024-03-12,175.389999
1558,2024-03-13,176.559998
1559,2024-03-14,178.750000


In [7]:
# Arbitrage repair function
def arbitrageRepair(columnT, columnK, columnC, columnF):
    # normalise strikes and call prices
    normaliser = constraints.Normalise()
    T = np.array(columnT)
    K = np.array(columnK)
    C = np.array(columnC)
    F = np.array(columnF)
    normaliser.fit(T, K, C, F)
    T1, K1, C1 = normaliser.transform(T, K, C)
    
    # construct arbitrage constraints and detect violation
    mat_A, vec_b, _, _ = constraints.detect(T1, K1, C1, verbose=False)
    
    # repair arbitrage - l1-norm objective
    epsilon = repair.l1(mat_A, vec_b, C1)
    
    # de-normalise
    K0, C0 = normaliser.inverse_transform(K1, C1 + epsilon)
    
    return K0, C0

In [8]:
result = pd.DataFrame()

for t0 in t0List:
    ## T1:
    # t1: 20 largest trading volume
    df_t0t1 = df_t1[df_t1.date==t0].sort_values('volume',ascending=False).iloc[0:20].drop(columns='volume').reset_index(drop=True)
    
    # merge with put options data and stock price data
    df_t0t1 = df_t0t1.merge(df_t1_P, on = ['date','exdate','strike_price'], suffixes = ('_C','_P'))
    df_t0t1 = pd.merge(df_t0t1, stock_AMZN, on = 'date')
    
    # calculate the forward price:
    # df_t0t1['forward_price'] = df_t0t1['stock_price']/((df_t0t1['stock_price']-(df_t0t1['option_price_C']-df_t0t1['option_price_P']))/df_t0t1['strike_price'])
    df_t0t1['forward_price'] = df_t0t1['stock_price']
    
    # calculate the expiry:
    df_t0t1['expiry'] = (df_t0t1['exdate']-df_t0t1['date']).apply(lambda x: x.days/365)
    
    # arbitrage repair:
    K0_t1, C0_t1 = arbitrageRepair(df_t0t1['expiry'], df_t0t1['strike_price'], df_t0t1['option_price_C'], df_t0t1['forward_price'])
    
    # temporary result at t0:
    result_t1 = df_t0t1[['date','exdate']].copy()
    result_t1['strike_price'] = K0_t1
    result_t1['option_price'] = C0_t1
    
    
    ## T2:
    # t2: 20 largest trading volume
    df_t0t2 = df_t2[df_t2.date==t0].sort_values('volume',ascending=False).iloc[0:20].drop(columns='volume').reset_index(drop=True)
    
    # merge with put options data and stock price data
    df_t0t2 = pd.merge(df_t0t2, df_t2_P, on = ['date','exdate','strike_price'], suffixes = ('_C','_P'))
    df_t0t2 = pd.merge(df_t0t2, stock_AMZN, on = 'date')
    
    # calculate the forward price:
    # df_t0t2['forward_price'] = df_t0t2['stock_price']/((df_t0t2['stock_price']-(df_t0t2['option_price_C']-df_t0t2['option_price_P']))/df_t0t2['strike_price'])
    df_t0t2['forward_price'] = df_t0t2['stock_price']
    
    # calculate the expiry:
    df_t0t2['expiry'] = (df_t0t2['exdate']-df_t0t2['date']).apply(lambda x: x.days/365)
    
    # arbitrage repair:
    K0_t2, C0_t2 = arbitrageRepair(df_t0t2['expiry'], df_t0t2['strike_price'], df_t0t2['option_price_C'], df_t0t2['forward_price'])
    
    # temporary result at t0:
    result_t2 = df_t0t2[['date','exdate']].copy()
    result_t2['strike_price'] = K0_t2
    result_t2['option_price'] = C0_t2
    
    
    ## Concat horizontally
    result_tmp = pd.concat([result_t1, result_t2.drop(columns='date')], axis=1)
    
    ## Concat vertically
    result = pd.concat([result,result_tmp], axis=0)
    
result.columns = ['t0','t1','K1','pi1','t2','K2','pi2']
result = result.reset_index(drop=True)

In [9]:
print(result.shape)
result.head()

(260, 7)


Unnamed: 0,t0,t1,K1,pi1,t2,K2,pi2
0,2018-01-02,2018-01-19,100.0,23.936454,2018-04-20,80.0,49.40179
1,2018-01-02,2018-01-19,110.0,20.385049,2018-04-20,101.5,46.7012
2,2018-01-02,2018-01-19,115.0,18.609347,2018-04-20,102.5,46.575591
3,2018-01-02,2018-01-19,116.0,18.254206,2018-04-20,105.0,46.261569
4,2018-01-02,2018-01-19,117.0,17.899066,2018-04-20,108.0,45.884742


In [12]:
result.to_csv('data_20180119_20180420.csv', header=True)

In [10]:
# K = [[np.array(result.loc[result.t0 == t0,'K1']),np.array(result.loc[result.t0==t0,'K2'])] for t0 in t0List]
# Pi = [[np.array(result.loc[result.t0 == t0,'pi1']),np.array(result.loc[result.t0==t0,'pi2'])] for t0 in t0List]