In [23]:
import numpy as np
import pandas as pd
import os
from datetime import timedelta
# %pip install arbitrageRepair
from arbitragerepair import constraints, repair
# %pip install pandas_market_calendars
import pandas_market_calendars as mcal

In [2]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas deprecation / SettingWithCopy warnings
# raised by the cells below — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [5]:
# Load the raw option quotes from CSV (path is relative to the notebook's
# working directory).
raw_data = pd.read_csv('Call Option Data/fqjo3s8eacwzxkcw.csv')
# Distinct values of the 'ticker' column; iterated by the processing loop at
# the end of the notebook.
tickers = raw_data['ticker'].unique()

In [6]:
# Display the distinct tickers found in the raw data.
tickers

array(['AMZN', 'JPM', 'JNJ', 'MSFT', 'PG', 'WMT', 'GOOGL', 'V', 'TSLA'],
      dtype=object)

In [7]:
# Preview the first rows of the raw option quotes.
raw_data.head()

Unnamed: 0,date,symbol,exdate,last_date,cp_flag,strike_price,best_bid,best_offer,volume,impl_volatility,optionid,forward_price,ticker,index_flag,issuer,exercise_style
0,2018-01-02,AMZN 180105C1000000,2018-01-05,2017-12-29,C,1000000,189.35,190.6,0,0.962943,118189007,,AMZN,0,AMAZON.COM INC.,A
1,2018-01-02,AMZN 180105C1002500,2018-01-05,,C,1002500,186.85,188.1,0,0.951017,118541021,,AMZN,0,AMAZON.COM INC.,A
2,2018-01-02,AMZN 180105C1005000,2018-01-05,,C,1005000,184.35,185.6,0,0.939104,118541022,,AMZN,0,AMAZON.COM INC.,A
3,2018-01-02,AMZN 180105C1007500,2018-01-05,,C,1007500,181.85,183.1,0,0.927205,118541023,,AMZN,0,AMAZON.COM INC.,A
4,2018-01-02,AMZN 180105C1010000,2018-01-05,2017-12-29,C,1010000,179.35,180.6,0,0.915319,118189008,,AMZN,0,AMAZON.COM INC.,A


In [37]:
# Arbitrage repair

def arbitrageRepair(columnT, columnK, columnC_ask, columnC_bid, columnF):
    """Shift bid and ask call quotes by the l1 repair perturbation of their mid.

    The inputs are converted to NumPy arrays, the mid price is taken as the
    average of ask and bid, and a ``constraints.Normalise`` instance is fitted
    on (expiry, strike, mid, forward). Arbitrage constraints are built from the
    normalised mid prices with ``constraints.detect`` and ``repair.l1`` returns
    a perturbation for them. That same perturbation is added to the normalised
    bid and to the normalised ask before both are mapped back with
    ``inverse_transform``.

    Parameters
    ----------
    columnT : array-like
        Expiries, one entry per quote.
    columnK : array-like
        Strikes.
    columnC_ask : array-like
        Ask prices of the calls.
    columnC_bid : array-like
        Bid prices of the calls.
    columnF : array-like
        Forward prices.

    Returns
    -------
    tuple
        ``(K0, C0_ask, C0_bid)``: the de-normalised strikes, the shifted ask
        prices and the shifted bid prices, as returned by
        ``inverse_transform``.
    """
    expiries = np.array(columnT)
    strikes = np.array(columnK)
    bids = np.array(columnC_bid)
    asks = np.array(columnC_ask)
    mids = (asks + bids) / 2
    forwards = np.array(columnF)

    # normalise strikes and call prices; the scaler is fitted on the mid
    # prices and then reused unchanged for the bid and ask quotes
    scaler = constraints.Normalise()
    scaler.fit(expiries, strikes, mids, forwards)
    norm_T, norm_K, norm_mid = scaler.transform(expiries, strikes, mids)
    _, _, norm_bid = scaler.transform(expiries, strikes, bids)
    _, _, norm_ask = scaler.transform(expiries, strikes, asks)

    # construct arbitrage constraints on the mid prices and compute the
    # l1 repair perturbation for them
    constraint_matrix, constraint_bounds, _, _ = constraints.detect(
        norm_T, norm_K, norm_mid, verbose=False
    )
    shift = repair.l1(constraint_matrix, constraint_bounds, norm_mid)

    # de-normalise: ask and bid both receive the mid-price perturbation
    K0, C0_ask = scaler.inverse_transform(norm_K, norm_ask + shift)
    _, C0_bid = scaler.inverse_transform(norm_K, norm_bid + shift)

    return K0, C0_ask, C0_bid

In [39]:
# Data processing
## call and put options, ask and bid prices

def _repairedCallQuotes(df_C, df_P, t0, expiry_date, stock_daily, calendar, volumeThreshold):
    """Return the arbitrage-repaired call quotes seen on ``t0`` for one expiry.

    Parameters
    ----------
    df_C, df_P : pandas.DataFrame
        Call and put quotes of a single expiry (all quote dates).
    t0 : datetime-like
        Quote date to process.
    expiry_date : datetime-like
        Expiry date of the options in ``df_C`` / ``df_P``.
    stock_daily : pandas.DataFrame
        Frame with columns ``date`` and ``adjusted_price``.
    calendar : market calendar
        Object exposing ``valid_days(start_date=..., end_date=...)``.
    volumeThreshold : int or float
        Minimum trading volume a call must have to be kept.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``date``, ``exdate``, ``strike_price``, ``ask_price``,
        ``bid_price`` with a fresh 0..n-1 index, or ``None`` when no call
        survives the volume filter or the joins with puts / stock prices.
    """
    # call options with 20 largest trading volumes above the threshold
    top_calls = df_C[df_C.date == t0].sort_values('volume', ascending=False).iloc[0:20]
    quotes = top_calls[top_calls.volume >= volumeThreshold].reset_index(drop=True)
    if len(quotes) == 0:
        return None

    # merge with put options data and stock price data (both inner joins)
    quotes = quotes.merge(df_P, on=['date', 'exdate', 'strike_price'], suffixes=('_C', '_P'))
    quotes = pd.merge(quotes, stock_daily, on='date')
    # No put at a matching strike, or no stock price for t0: nothing to repair.
    # Without this guard the repair routine would be called on empty arrays.
    if len(quotes) == 0:
        return None

    # calculate the forward price from call/put mid prices and the stock price
    quotes['option_price_C'] = quotes[['best_offer_C', 'best_bid_C']].mean(axis=1)
    quotes['option_price_P'] = quotes[['best_offer_P', 'best_bid_P']].mean(axis=1)
    spot = quotes['adjusted_price']
    # NOTE(review): this ratio reads as the discount factor implied by
    # put-call parity (S - (C - P)) / K — confirm with the author.
    ratio = (spot - (quotes['option_price_C'] - quotes['option_price_P'])) / quotes['strike_price']
    quotes['forward_price'] = spot / ratio

    # calculate the expiry: number of valid trading days returned by the
    # calendar for [t0, expiry_date], divided by 252
    trading_days = calendar.valid_days(start_date=t0, end_date=expiry_date)
    quotes['expiry'] = len(trading_days) / 252

    # arbitrage repair
    K0, C0_ask, C0_bid = arbitrageRepair(
        quotes['expiry'],
        quotes['strike_price'],
        quotes['best_offer_C'],
        quotes['best_bid_C'],
        quotes['forward_price'],
    )

    repaired = quotes[['date', 'exdate']].copy()
    repaired['strike_price'] = K0
    repaired['ask_price'] = C0_ask
    repaired['bid_price'] = C0_bid
    return repaired.reset_index(drop=True)


def dataProcess2Di(df,t1,t2,ticker,stock_daily):
    """Build and save the repaired call quotes for the expiry pair (t1, t2).

    For every quote date on which calls of both expiries exist, the calls of
    each expiry are filtered by volume, matched with puts and stock prices,
    arbitrage-repaired, and the two resulting tables are placed side by side.
    If anything was produced, the stacked table is merged with ``stock_daily``
    and written to ``data/test/{ticker}_{t1:%Y%m%d}_{t2:%Y%m%d}.csv`` with the
    columns ``t0, T1, K1, C1_ask, C1_bid, T2, K2, C2_ask, C2_bid, Adj_S0``.

    When the two expiries have a different number of quotes on a date, the
    shorter side is padded with missing values; the quote date is filled on
    every row so that no row is lost in the final merge on ``date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Option quotes of one ticker with columns ``date``, ``exdate``,
        ``cp_flag``, ``strike_price``, ``volume``, ``best_offer``,
        ``best_bid`` and ``impl_volatility``.
    t1, t2 : datetime-like
        The two expiry dates.
    ticker : str
        Ticker symbol, used only in the output file name.
    stock_daily : pandas.DataFrame
        Frame with columns ``date`` and ``adjusted_price``.

    Returns
    -------
    None
        The result is written to disk; nothing is written when no quote date
        yields data for both expiries.
    """
    columns = ['date','exdate','cp_flag','strike_price','volume','best_offer','best_bid','impl_volatility']

    # call options data
    df_t1_C = df.loc[(df.exdate==t1)&(df.cp_flag=='C'), columns]
    df_t2_C = df.loc[(df.exdate==t2)&(df.cp_flag=='C'), columns]

    # put options data
    df_t1_P = df.loc[(df.exdate==t1)&(df.cp_flag=='P'), columns]
    df_t2_P = df.loc[(df.exdate==t2)&(df.cp_flag=='P'), columns]

    t0List = list(df_t1_C.date.unique())

    # Threshold for trading volume
    volumeThreshold = 5

    # The calendar does not depend on t0, so it is created once per call.
    # The NASDAQ calendar is used for every ticker.
    calendar = mcal.get_calendar('NASDAQ')

    frames = []
    for t0 in t0List:
        ## Check data availability at both T1 and T2:
        if not (df_t1_C.date==t0).any() or not (df_t2_C.date==t0).any():
            continue

        ## T1:
        result_t1_C = _repairedCallQuotes(df_t1_C, df_t1_P, t0, t1, stock_daily, calendar, volumeThreshold)
        if result_t1_C is None:
            continue

        ## T2:
        result_t2_C = _repairedCallQuotes(df_t2_C, df_t2_P, t0, t2, stock_daily, calendar, volumeThreshold)
        if result_t2_C is None:
            continue

        ## Concat horizontally
        # Take the date column from the longer table: every row of both tables
        # carries the same quote date, and using the longer one keeps the date
        # populated on the padded rows so the merge below does not drop them.
        longer = result_t1_C if len(result_t1_C) >= len(result_t2_C) else result_t2_C
        result_tmp = pd.concat(
            [longer[['date']], result_t1_C.drop(columns='date'), result_t2_C.drop(columns='date')],
            axis=1,
        )
        frames.append(result_tmp)

    if len(frames) != 0:
        ## Concat vertically (once, instead of growing a frame inside the loop)
        result = pd.concat(frames, axis=0, ignore_index=True)
        result = pd.merge(result, stock_daily, on = 'date')
        result.columns = ['t0', 'T1', 'K1',  'C1_ask', 'C1_bid', 'T2', 'K2', 'C2_ask', 'C2_bid', 'Adj_S0']
        file_name = 'data/test/{}_{}_{}.csv'.format(ticker,pd.to_datetime(t1).strftime('%Y%m%d'),pd.to_datetime(t2).strftime('%Y%m%d'))
        result.to_csv(file_name, header=True)

In [10]:
# Adjusted stock prices: one row per date, one column per ticker.
# The 'Date' column is renamed to 'date' to match the option data.
stocks = pd.read_csv('adjusted_stocks.csv').rename(columns={'Date': 'date'})
stocks.head()

Unnamed: 0,date,AMZN,JPM,JNJ,MSFT,PG,WMT,GOOGL,V,TSLA
0,2018-01-02,1189.01001,90.125488,117.50267,80.080925,76.66967,88.322428,1073.209991,109.733635,320.530014
1,2018-01-03,1204.199982,90.217331,118.625122,80.453613,76.576607,89.092855,1091.520004,110.826103,317.249994
2,2018-01-04,1209.589996,91.509758,118.61673,81.161705,77.117928,89.173491,1095.759964,111.238144,314.619999
3,2018-01-05,1229.140015,90.92231,119.59568,82.167976,77.168678,89.702036,1110.289993,113.902199,316.58
4,2018-01-08,1246.869965,91.056587,119.747589,82.251801,77.574646,91.027891,1114.209976,114.362175,336.410007


In [13]:
# Output folder for the per-ticker CSV files written by dataProcess2Di.
# exist_ok=True keeps this cell re-runnable: without it os.makedirs raises
# FileExistsError as soon as the folder exists from a previous run.
os.makedirs('data/test/', exist_ok=True)

In [None]:
# For every ticker: prepare its stock prices and option quotes, pair each
# expiry T1 with the expiry T2 exactly 28 days later (if one exists), and
# write one CSV per (T1, T2) pair via dataProcess2Di.
for ticker in tickers:
    # Built in one chain so no column is assigned on a frame derived from a
    # slice of `stocks`.
    stock_Daily = (
        stocks[['date', ticker]]
        .rename(columns = {ticker:'adjusted_price'})
        .assign(date=lambda frame: pd.to_datetime(frame['date']))
    )

    df = raw_data.loc[raw_data.ticker==ticker,
                      ['date','exdate','cp_flag','strike_price','best_bid','best_offer','volume','impl_volatility']].copy()
    df['date'] = pd.to_datetime(df['date'])
    df['exdate'] = pd.to_datetime(df['exdate'])
    df['strike_price'] = df['strike_price']/1000 # scaling strike

    ## Select T1 and T2 that are exactly 28 days apart
    T1_List = pd.to_datetime(df['exdate'].unique())
    known_expiries = set(T1_List)  # constant-time membership test
    one_month = timedelta(days=28)
    # Build the pair table in a single call rather than concatenating one row
    # at a time inside a loop.
    T1_T2_List = pd.DataFrame(
        [(expiry, expiry + one_month) for expiry in T1_List if expiry + one_month in known_expiries],
        columns=['t1', 't2'],
    )

    ## Data processing
    for t1, t2 in T1_T2_List.itertuples(index=False, name=None):
        dataProcess2Di(df,t1,t2,ticker=ticker,stock_daily=stock_Daily)