# In [1]:
#!/user/xt2276/.conda/envs/optionEnv/bin/python
import pandas as pd
import pickle
import os
import time
from itertools import product, permutations
from tqdm import tqdm
import pandas as pd
from collections import OrderedDict
import pickle
import numpy as np
from scipy.interpolate import interp1d
from scipy.optimize import root

def get_forward_price(date, exdate, am_settlement):
    """Return the forward price stored for one (date, exdate, am_settlement) key.

    The lookup table is unpickled from ``./2020_2023_forward_price_dict.pkl``
    (resolved against the current working directory) on every call.

    Args:
        date: First component of the lookup key.
        exdate: Second component of the lookup key.
        am_settlement: Third component of the lookup key.

    Returns:
        Whatever object the unpickled table holds under
        ``(date, exdate, am_settlement)``.

    Raises:
        FileNotFoundError: If the pickle file is not present.
        KeyError: If the key is absent (assuming the table is a plain dict).
    """
    lookup_key = (date, exdate, am_settlement)

    # NOTE: pickle.load can execute arbitrary code; only use a trusted file.
    with open("./2020_2023_forward_price_dict.pkl", 'rb') as table_file:
        forward_prices = pickle.load(table_file)

    return forward_prices[lookup_key]


def preprocessing(df, am_settlement=1):
    """Group the row labels of ``df`` by strike price for one settlement type.

    Only rows whose ``am_settlement`` column equals ``am_settlement`` are
    considered. The caller's DataFrame is never modified.

    Args:
        df: DataFrame with at least the columns ``am_settlement``,
            ``strike_price`` and ``exdate``.
        am_settlement: Value of the ``am_settlement`` column to keep.

    Returns:
        An ``OrderedDict`` mapping each strike price (as ``float``, in
        ascending order) to the list of index labels of the rows that have
        that strike. Empty when no row matches the settlement filter.
    """
    # Work on an explicit copy: assigning columns on a boolean-filtered slice
    # raises SettingWithCopyWarning and makes ownership of the data ambiguous.
    subset = df[df['am_settlement'] == am_settlement].copy()

    # Normalise the dtypes so equal strikes given as int/float share one group.
    subset['strike_price'] = subset['strike_price'].astype(float)
    # The converted expiry is not used below; the conversion is kept so that
    # unparseable ``exdate`` values still fail here, as they did before.
    subset['exdate'] = pd.to_datetime(subset['exdate'])

    # Iterate the groups directly instead of ``groupby.apply(...).to_dict()``:
    # on an empty frame the latter returns a column-keyed dict, and applying
    # over the grouping column is deprecated in recent pandas releases.
    strike_price_dict = {
        float(strike): rows.index.tolist()
        for strike, rows in subset.groupby('strike_price')
    }

    # Sort by key (i.e. strike price) so iteration order is ascending.
    return OrderedDict(sorted(strike_price_dict.items()))


# ------------------------------- constructing price 4 tuple set------------------------
#  this version has no constraint that K2 needs to be greater than K1

def get_price_set(strike_price_dict):
    """Build every candidate 4-tuple of row labels from the strike groups.

    For every ordered pair of strike groups (a group is also paired with
    itself) and every ordered pair of distinct rows ``(r1, r4)`` from the
    first group and ``(r2, r3)`` from the second, the tuple
    ``(r1, r2, r3, r4)`` is produced. Hence r1/r4 share one strike and
    r2/r3 share one strike; no ordering between the two strikes is imposed.

    Args:
        strike_price_dict: Mapping of strike price to a list of row labels.

    Returns:
        A set of 4-tuples of row labels.
    """
    # Ordered pairs of distinct rows within each strike group, computed once.
    pairs_by_strike = {
        strike: list(permutations(rows, 2))
        for strike, rows in strike_price_dict.items()
    }

    quadruples = set()
    for outer_pairs, inner_pairs in product(pairs_by_strike.values(), repeat=2):
        for (r1, r4), (r2, r3) in product(outer_pairs, inner_pairs):
            quadruples.add((r1, r2, r3, r4))
    return quadruples

def index_to_symbol(df, four_tuples_set, cp_flag = 'C'):
    """Describe each 4-tuple of row labels and flag mid-price violations.

    With ``m1..m4`` the bid/offer midpoints of the four rows, a tuple is a
    violation when ``m1 * m2 < m4 * m3`` (``cp_flag == 'C'``) or when
    ``m1 * m2 > m4 * m3`` (any other ``cp_flag``). A violation additionally
    fails the tolerance check when it still holds after replacing the
    midpoints by the quotes that are least favourable to it (offers on the
    smaller side, bids on the larger side).

    Args:
        df: DataFrame indexed by the labels used in the tuples, with columns
            ``symbol``, ``strike_price``, ``forward_price``, ``exdate``,
            ``best_bid`` and ``best_offer``.
        four_tuples_set: Iterable of 4-tuples of row labels.
        cp_flag: ``'C'`` selects the first inequality, anything else the
            second.

    Returns:
        DataFrame indexed by the 4-tuple of symbols (index name ``Index``)
        with columns ``validate``, ``validate_tol``, ``Strike Prices``,
        ``Maturity Dates``, ``Mid Prices``, ``Violation Magnitude`` and
        ``Forward Prices``.
    """
    def quote_mid(label):
        # Midpoint of the best bid and best offer of one row.
        return (df.loc[label, 'best_bid'] + df.loc[label, 'best_offer']) / 2

    is_call = cp_flag == 'C'
    records = []

    for r1, r2, r3, r4 in four_tuples_set:
        legs = [r1, r2, r3, r4]

        symbols = tuple(df.loc[leg, 'symbol'] for leg in legs)
        strikes = [df.loc[leg, 'strike_price'] for leg in legs]
        forwards = [df.loc[leg, 'forward_price'] for leg in legs]
        expiries = [df.loc[leg, 'exdate'] for leg in legs]
        mids = [quote_mid(leg) for leg in legs]
        m1, m2, m3, m4 = mids

        lhs = m1 * m2
        rhs = m4 * m3

        magnitude = 0
        passes_mid = True
        passes_quotes = True

        breached = lhs < rhs if is_call else lhs > rhs
        if breached:
            magnitude = rhs - lhs if is_call else lhs - rhs
            passes_mid = False
            if is_call:
                low_side = df.loc[r1, 'best_offer'] * df.loc[r2, 'best_offer']
                high_side = df.loc[r4, 'best_bid'] * df.loc[r3, 'best_bid']
                if low_side < high_side:
                    passes_quotes = False
            else:
                high_side = df.loc[r1, 'best_bid'] * df.loc[r2, 'best_bid']
                low_side = df.loc[r4, 'best_offer'] * df.loc[r3, 'best_offer']
                if high_side > low_side:
                    passes_quotes = False

        records.append(
            (symbols, passes_mid, passes_quotes, strikes, expiries, mids, magnitude, forwards)
        )

    # One row per 4-tuple, keyed by the tuple of option symbols.
    column_names = [
        "Index",
        "validate",
        "validate_tol",
        "Strike Prices",
        "Maturity Dates",
        "Mid Prices",
        "Violation Magnitude",
        "Forward Prices",
    ]
    summary = pd.DataFrame(records, columns=column_names)
    return summary.set_index("Index")

def generate_s3_indexed(dataset_name, start_date, end_date, vol=50, cp_flag = 'C', am_flag = 1):
    """Build and pickle the per-date table of candidate option 4-tuples.

    The pickled DataFrame ``./{dataset_name}`` is filtered to rows with
    ``volume > vol``, the requested ``cp_flag`` and ``am_settlement``, and
    ``start_date <= date < end_date``. For every remaining trading date the
    4-tuples ``(a, b, c, d)`` from ``get_price_set`` are kept when

    * ``K_a / F_a < K_b / F_b`` (strike over forward price), and
    * rows ``a`` and ``c`` share one expiry, rows ``b`` and ``d`` share one
      expiry, and the expiry of ``a`` is strictly earlier than that of ``b``.

    The surviving tuples are described with ``index_to_symbol`` (using the
    unfiltered data) and written to
    ``./2022_Filtering_condition_3/<YYYY-MM-DD>FC3_indexed_<cp_flag>_am=<am_flag>_vol<vol>_<dataset_name>.pkl``.
    Note that ``dataset_name`` is embedded verbatim, including its extension.

    Args:
        dataset_name: File name of the pickled source DataFrame, relative to
            the current working directory.
        start_date: Inclusive lower bound for the ``date`` column.
        end_date: Exclusive upper bound for the ``date`` column.
        vol: Rows must have ``volume`` strictly greater than this value.
        cp_flag: Value of the ``cp_flag`` column to keep; also forwarded to
            ``index_to_symbol``.
        am_flag: Value of the ``am_settlement`` column to keep.

    Returns:
        None. One pickle file is written per trading date and a
        ``"<date> finished"`` line is printed after each file is saved.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    # The unfiltered frame is loaded once and reused for every date instead
    # of being re-read from disk inside the loop.
    with open(f'./{dataset_name}', 'rb') as f:
        original_df = pickle.load(f)

    df = original_df[original_df['volume'] > vol]
    df = df[df['cp_flag'] == cp_flag]
    df = df[df['date'] >= start_date]
    df = df[df['date'] < end_date]
    # Explicit copy so the column assignments below neither touch
    # ``original_df`` nor raise SettingWithCopyWarning.
    df = df[df['am_settlement'] == am_flag].copy()

    # Ensure that the date columns are of type datetime.
    df['date'] = pd.to_datetime(df['date'])
    df['exdate'] = pd.to_datetime(df['exdate'])

    # Make sure the destination exists before any expensive work is done.
    output_dir = './2022_Filtering_condition_3'
    os.makedirs(output_dir, exist_ok=True)

    # Process the data for each date.
    for date in df['date'].unique():
        df_date = df[df['date'] == date]

        strike_price_dict = preprocessing(df_date, am_settlement=am_flag)
        candidates = get_price_set(strike_price_dict)

        # Compute the per-row quantities once rather than per tuple.
        moneyness = df_date['strike_price'] / df_date['forward_price']
        expiry = df_date['exdate']

        s3 = set()
        for s in candidates:
            a, b, c, d = s
            # Enforce K_a / F_a < K_b / F_b.
            if not (moneyness.loc[a] < moneyness.loc[b]):
                continue
            # a/c share the near expiry, b/d share the far expiry.
            if (
                expiry.loc[a] == expiry.loc[c]
                and expiry.loc[b] == expiry.loc[d]
                and expiry.loc[a] < expiry.loc[b]
            ):
                s3.add(s)

        indexed_df = index_to_symbol(original_df, s3, cp_flag)
        with open(f'{output_dir}/{str(date)[:10]}FC3_indexed_{cp_flag}_am={str(am_flag)}_vol{str(vol)}_{dataset_name}.pkl', 'wb') as f:
            pickle.dump(indexed_df, f)

        # Report completion only once the result is actually on disk.
        print(f"{date} finished")
            
def main():
    """Run the 2022-02-28 extraction for calls and then puts (PM-settled)."""
    # Silence pandas' chained-assignment warning for the whole process.
    pd.options.mode.chained_assignment = None

    shared_args = dict(
        dataset_name='2022_with_forward_price.pkl',
        start_date='2022-02-28',
        end_date='2022-03-01',
        vol=50,
        am_flag=0,
    )
    for option_type in ('C', 'P'):
        generate_s3_indexed(cp_flag=option_type, **shared_args)


if __name__ == "__main__":
    main()


# 2022-02-28T00:00:00.000000000 finished
# 2022-02-28T00:00:00.000000000 finished


# In [2]:
# Same extraction for 2022-10-31: calls first, then puts.
for option_type in ('C', 'P'):
    generate_s3_indexed(
        dataset_name='2022_with_forward_price.pkl',
        start_date='2022-10-31',
        end_date='2022-11-01',
        vol=50,
        cp_flag=option_type,
        am_flag=0,
    )


# 2022-10-31T00:00:00.000000000 finished
# 2022-10-31T00:00:00.000000000 finished
