In [None]:
import pandas as pd
import numpy as np

In [None]:
def remove_tic(data, tickers, num_of_ts, factors):
    '''
    Drop tickers that lack price data or have too few observations.

    Input: data, tickers, num_of_ts, factors
    (the incoming tickers and factors arguments are not read)
    1. Forward- then backward-fills 'cshoc' and 'cshtrd' within each ticker;
       the filled columns are written back into the frame that was passed in
    2. Removes every ticker with a missing value in any of the price columns
    3. Removes every ticker with fewer than num_of_ts non-null 'datadate' rows
    4. Sorts the remaining rows by ticker and date and resets the index
    Output: data, tickers (unique remaining tickers, in order of appearance
    before the final sort)
    '''
    # Patch gaps in shares outstanding / trading volume ticker by ticker
    for fill_col in ('cshoc', 'cshtrd'):
        data[fill_col] = data.groupby('tic')[fill_col].ffill()
        data[fill_col] = data.groupby('tic')[fill_col].bfill()

    # A ticker is dropped entirely as soon as one of its prices is missing.
    # Tickers dropped for one column cannot be counted again for the next,
    # because they are filtered out before the next column is inspected.
    removed_total = 0
    for price_col in ('prccd', 'prchd', 'prcld', 'prcod'):
        has_gap = data[price_col].isna()
        gap_tics = list(data.loc[has_gap, 'tic'].unique())
        removed_total += len(gap_tics)
        print(f'Removed {gap_tics} for missing {price_col}')
        data = data[~data['tic'].isin(gap_tics)]
    print(f'There are {removed_total} unique tickers to remove due to missing values')

    # Drop tickers whose history is shorter than the required number of time stamps
    rows_per_tic = data.groupby('tic')['datadate'].count()
    short_tics = rows_per_tic[rows_per_tic < num_of_ts].index.tolist()
    print(f'Removed {short_tics}')
    data = data[~data['tic'].isin(short_tics)]
    tickers = data['tic'].unique()
    print(f'There are {len(short_tics)} unique tickers to remove due to insufficient timestamps')
    print(f'There are {len(tickers)} tickers')

    null_counts = data.isnull().sum()
    print(f'Confirm there are no more columns with missing data {data.columns[null_counts != 0]}')

    # Final ordering: by ticker, then chronologically, with a fresh 0..n-1 index
    data = data.sort_values(by=['tic', 'datadate']).reset_index(drop=True)

    return data, tickers

In [None]:
def remove_low_dollar_vol(data, tickers, dol_vol_thres = 10000000):
    '''
    Input: data, tickers (not read), dol_vol_thres
    1. Add a 'dol_vol' column (trading volume times open price) to the
       frame that was passed in
    2. Drop every ticker whose mean dollar volume is strictly below
       dol_vol_thres (default 10M)
    Output: data, tickers (unique remaining tickers)
    '''
    # Per-row dollar volume, then its average over each ticker's history
    data['dol_vol'] = data['prcod'] * data['cshtrd']
    avg_dol_vol = data.groupby('tic')['dol_vol'].mean()

    illiquid_tics = avg_dol_vol.index[avg_dol_vol < dol_vol_thres]
    print(f'Removed {len(illiquid_tics)} tickers due to low dollar volume')

    is_illiquid = data['tic'].isin(illiquid_tics)
    data = data[~is_illiquid]
    tickers = data['tic'].unique()
    print(f'There are {len(tickers)} tickers')
    return data, tickers

In [None]:
def compute_ret(data, TBill_file_path):
    '''
    Input: data, TBill_file_path (CSV with a 'Date' column in %m/%d/%y format
    and an ' Open' column holding the annualised rate in percent)
    1. Create the ret_d column: the open-to-open percentage return shifted
       back two rows per ticker, e.g. 2024-2-2's ret_d is the return of buying
       at the 2024-2-3 market open and selling at the 2024-2-4 market open
    2. Replace every missing value in the frame with 0 (this covers the last
       two ret_d rows of each ticker, but is applied to all columns)
    3. Add TBill3m (daily risk-free rate, 0 where the date is not in the file)
       and excess_ret_d = ret_d - TBill3m
    4. Add rel_ret_d = ret_d minus the equal-weighted mean ret_d of that date
    Output: data
    '''
    def annual_pct_to_daily(annual_pct):
        # Compound an annual percentage rate down to one of 252 trading days
        return np.power(1 + annual_pct/100, 1/252) - 1

    # Forward-looking return: row t holds the change from open(t+1) to open(t+2)
    open_change = data.groupby('tic')['prcod'].pct_change()
    data['ret_d'] = open_change.groupby(data['tic']).shift(-2)

    # The trailing rows of each ticker have no future prices yet; zero them
    # (note: fillna runs over the whole frame, not only ret_d)
    data = data.fillna(0).reset_index(drop=True)

    # Excess return over the 3-month T-Bill
    tbill = pd.read_csv(TBill_file_path)
    tbill['Date'] = pd.to_datetime(tbill['Date'], format="%m/%d/%y")
    open_rate_by_date = tbill.set_index('Date')[' Open'].to_dict()
    annual_rate = data['datadate'].map(open_rate_by_date).fillna(0)
    data['TBill3m'] = annual_rate.apply(annual_pct_to_daily)
    data['excess_ret_d'] = data['ret_d'] - data['TBill3m']

    # Return relative to the equal-weighted market return of the same date
    market_ret_by_date = data.groupby('datadate')['ret_d'].mean()
    data['rel_ret_d'] = data['ret_d'] - data['datadate'].map(market_ret_by_date)

    return data

In [None]:
def remove_dead_stocks(data, price_thres = 0.1):
    '''
    This goes after compute_ret to change the returns and rank of dead stocks to 0.

    A ticker counts as dead from the first row (in frame order) where its open
    price 'prcod' is at or below price_thres. From that date onwards its
    ret_d, excess_ret_d, rel_ret_d and rank are overwritten with 0, in place.
    Output: data (the same frame that was passed in)
    '''
    zeroed_cols = ['ret_d', 'excess_ret_d', 'rel_ret_d', 'rank']
    low_rows = data.loc[data['prcod'] <= price_thres]

    for tic in low_rows['tic'].unique():
        # First low-price row of this ticker marks the start of the dead period
        death_date = low_rows.loc[low_rows['tic'] == tic, 'datadate'].values[0]
        is_dead_row = (data['tic'] == tic) & (data['datadate'] >= death_date)
        data.loc[is_dead_row, zeroed_cols] = 0
        print(f'{tic} is dead after {death_date}')

    return data

In [None]:
# def assign_label(x):
#     if x >= 0.05:
#         return 2
#     elif 0.01 <= x < 0.05:
#         return 1
#     elif -0.01 <= x < 0.01:
#         return 0
#     elif -0.05 <= x < -0.01:
#         return -1
#     else:
#         return -2

In [None]:
# def assign_movement_label(x):
#     if abs(x) <= 0.01:
#         return -2
#     elif abs(x) <= 0.03:
#         return -1
#     elif abs(x) <= 0.05:
#         return 0
#     elif abs(x) <= 0.07:
#         return 1
#     else:
#         return 2

In [None]:
# def assign_direction_label(x):
#     if x < -0.01:
#         return -1
#     elif abs(x) < 0.01:
#         return 0
#     else:
#         return 1

In [None]:
def assign_class_label(x):
    '''
    Map a return x to one of five classes:
      x <= -0.03          -> -2
      -0.03 < x <= -0.01  -> -1
      -0.01 < x <= 0.01   ->  0
      0.01 < x < 0.03     ->  1
      anything else       ->  2  (x >= 0.03, and also NaN since every
                                  comparison with NaN is False)
    '''
    if x <= -0.03:
        return -2
    if x <= -0.01:
        return -1
    if x <= 0.01:
        return 0
    return 1 if x < 0.03 else 2

In [None]:
def portfolio(data, quantiles = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]):
    '''
    Input: data, quantiles (only used by the disabled quantile-binning variant)
    Label every row with a return class derived from its ret_d value and store
    it in the 'rank' column of the frame that was passed in.
    Output: data
    '''
    # Labelling schemes that are currently disabled:
    #
    # 1. Evenly bin everything into 5 labels per date
    # data['DistinctRank'] = data.groupby('datadate')['ret_d'].rank('first') # Lowest first
    # data['rank'] = data.groupby('datadate')['DistinctRank'].transform(lambda x: pd.qcut(x, quantiles, labels=[-2, -1, 0, 1, 2]))
    #
    # 2. Manually set the bin thresholds to +/- 0.01 and +/- 0.05
    # data['rank'] = data['ret_d'].apply(assign_label)
    #
    # 3. Predict the size of the move, to be combined later with a separate
    #    expert on the direction (up/down/hold)
    # data['rank'] = data['ret_d'].apply(assign_movement_label)
    # data['direction'] = data['ret_d'].apply(assign_direction_label)

    # Active scheme: fixed-threshold classes from assign_class_label
    class_labels = data['ret_d'].apply(assign_class_label)
    data['rank'] = class_labels
    return data

In [None]:
def standardize_data(data, cs_factors):
    '''
    Cross-sectionally standardize factor columns (z-score within each date).

    Input: data, cs_factors
      data       -- frame with a 'datadate' column and the factor columns
      cs_factors -- names of the columns to standardize. 'datadate' may be
                    listed (it is only the grouping key and is never
                    standardized) or left out. The list is NOT modified, so
                    the same list can be passed to repeated calls.
    1. For every date, replace each factor by (x - mean) / std over that date.
       The standardized columns are written into the frame that was passed in.
    2. Replace all remaining NaNs in the frame with 0 (e.g. dates with a single
       row or zero dispersion, where the z-score is undefined).
    Clipping of outliers to plus-minus 3 is currently disabled.
    Output: data (a new frame returned by fillna)
    '''
    # Work on a private list instead of removing 'datadate' from the caller's
    factor_cols = [col for col in cs_factors if col != 'datadate']

    if factor_cols:
        # Cross-sectional standardization
        data_cs = data.groupby('datadate')[factor_cols].transform(
            lambda x: (x - x.mean()) / x.std()
        )
        # winsorize further
        # data_cs[data_cs > 3] = 3
        # data_cs[data_cs < -3] = -3

        # Replace the columns outright (rather than writing through .loc) so
        # integer-typed factor columns can take the float z-scores.
        data[factor_cols] = data_cs

    data = data.fillna(0)

    print(f'Confirm that standardization did not create NaNs: {data.columns[data.isnull().sum() != 0]}')
    return data

In [None]:
def adjust_split(data, tickers, factors):
    '''
    Adjust numeric factors for stock splits using the adjustment factor.

    Every column listed in factors is divided row by row by 'ajexdi' (written
    into the frame that was passed in); afterwards the 'ajexdi' and 'exchg'
    columns are dropped. tickers is returned unchanged.
    Output: data, tickers
    '''
    # Positional (not index-aligned) division by the per-row adjustment factor
    split_factor = data['ajexdi'].values
    data[factors] = data[factors].divide(split_factor, axis=0)

    # Once the split adjustment (and any exchange-based selection) is done,
    # these helper columns are no longer needed
    adjusted = data.drop(columns=['ajexdi', 'exchg'])

    return adjusted, tickers

In [None]:
def make_sic_column(data, filename):
    '''
    Attach an industry token column 'sic' to data.

    Input: data, filename (CSV with at least the columns 'tic' and 'sic')
    1. Left-merge the SIC codes onto data by ticker
    2. Drop rows whose ticker has no SIC code (those were leveraged ETFs)
    3. Renumber the SIC codes to integers 0 .. num_of_tokens-1 in order of
       first appearance
    Output: data, num_of_tokens (number of distinct SIC codes kept)
    '''
    sic_data = pd.read_csv(filename)
    sic_data = sic_data[sic_data['sic'].notnull()]
    data = data.merge(sic_data, on='tic', how='left')

    # Remove data that have no SIC code. The explicit copy makes the column
    # assignment below operate on an independent frame, not a filtered view.
    data = data[data['sic'].notnull()].copy()

    # factorize numbers the distinct codes 0..k-1 by first appearance in one
    # pass and always yields integer codes, unlike replace() with a dict.
    codes, distinct_sics = pd.factorize(data['sic'])
    data['sic'] = codes
    num_of_tokens = len(distinct_sics)

    return data, num_of_tokens

In [None]:
def num_tic_dicts(data):
    '''
    Build lookup tables between tickers and integer ids.

    The unique values of data['tic'] are sorted and numbered from 0.
    Output: num_to_tic_dict (id -> ticker), tic_to_num_dict (ticker -> id)
    '''
    ordered_tickers = sorted(data['tic'].unique())

    num_to_tic_dict = dict(enumerate(ordered_tickers))
    tic_to_num_dict = {tic: num for num, tic in num_to_tic_dict.items()}

    return num_to_tic_dict, tic_to_num_dict