In [None]:
import pandas as pd
import numpy as np

In [None]:
def remove_tic(data, tickers, num_of_ts, factors):
    '''
    Input: data, tickers, num_of_ts, factors
    Drops every ticker that has fewer than num_of_ts non-null 'datadate'
    entries, then sorts the remaining rows by ('tic', 'datadate').
    The incoming tickers value is not read (it is recomputed from the
    filtered data) and factors is not used in this function.
    Output: data (sorted, index reset), tickers (unique 'tic' values that remain)
    '''

    # Number of non-null dates available for each ticker
    dates_per_tic = data.groupby('tic')['datadate'].count()
    too_short = dates_per_tic < num_of_ts
    incomplete_tics = dates_per_tic[too_short].index.tolist()
    print(f'Removed {incomplete_tics}')

    # Keep only the rows belonging to tickers with enough dates
    keep_rows = ~data['tic'].isin(incomplete_tics)
    data = data[keep_rows]
    tickers = data['tic'].unique()
    print(f'There are {len(incomplete_tics)} unique tickers to remove due to insufficient dates')
    print(f'There are {len(tickers)} tickers')

    # Prints True when at least one NaN is still present in the frame
    has_missing = data.isna().any().any()
    print(f'Confirm there are no more columns with missing data {has_missing}')

    # Order rows by ticker, then chronologically, with a fresh 0..n-1 index
    data = data.sort_values(by=['tic', 'datadate']).reset_index(drop=True)

    return data, tickers

In [None]:
def remove_low_dollar_vol(data, tickers, dol_vol_thres = 10000000):
    '''
    Input: data, tickers, dol_vol_thres
    1. Add a dol_vol column (cshtrd * prcod) to data (the passed-in frame
       receives this column as well)
    2. Remove tickers whose mean dol_vol is below dol_vol_thres (default 10M)
    The incoming tickers value is not read; it is recomputed after filtering.
    Output: data, tickers
    '''
    # Per-row dollar volume
    data['dol_vol'] = data['cshtrd'].mul(data['prcod'])

    # Tickers whose average dollar volume falls short of the threshold
    mean_dol_vol = data.groupby('tic')['dol_vol'].mean()
    illiquid_tics = mean_dol_vol.index[mean_dol_vol < dol_vol_thres]
    print(f'Removed {len(illiquid_tics)} tickers due to low dollar volume')

    is_liquid = ~data['tic'].isin(illiquid_tics)
    data = data[is_liquid]
    tickers = data['tic'].unique()
    print(f'There are {len(tickers)} tickers')
    return data, tickers

In [None]:
def compute_ret(data, TBill_file_path):
    '''
    Input: data, TBill_file_path (anything pd.read_csv accepts; the CSV must
           have 'date' and 'TBill_rate' columns, with '.' marking a missing rate)
    1. Create ret_d column, e.g. 2024-2-2's ret_d is the percentage return of buying at 2024-2-3 market open and
    selling at 2024-2-4 market open (per-ticker pct_change of prcod shifted back by two rows)
    2. Fill missing values with 0 (this covers the last two rows of every ticker, which have no ret_d)
    3. Create the TBill1y column: the 1y TBill rate (in percent) converted to a daily
    rate, for SR calculation later. Dates absent from the TBill file get NaN.
    4. Create rel_ret_d column: ret_d minus the equal-weighted mean ret_d of the same datadate
    Output: data
    '''
    # Daily return within each ticker, then aligned so that row t holds the
    # return realised between rows t+1 and t+2.
    # The ret_d column is also written onto the frame that was passed in.
    data['ret_d'] = data.groupby('tic')['prcod'].pct_change()
    data['ret_d'] = data.groupby('tic')['ret_d'].shift(-2)

    # Currently the last two days have no ret_d since it's the test date. Fill it with 0
    # NOTE: fillna(0) zero-fills NaNs in every column, not only ret_d
    data = data.fillna(0)
    data = data.reset_index(drop=True)

    # Risk-free rate
    TBill1y = pd.read_csv(TBill_file_path)
    # '.' is the file's placeholder for "no quote". np.nan marks those cells
    # as missing explicitly (passing None as the replacement value has meant
    # different things across pandas releases), then the last known value is
    # carried forward.
    TBill1y = TBill1y.replace('.', np.nan).ffill()
    # A leading '.' has nothing to carry forward and stays NaN instead of
    # crashing the float conversion
    TBill1y['TBill_rate'] = pd.to_numeric(TBill1y['TBill_rate'])
    TBill1y['date'] = pd.to_datetime(TBill1y['date'])
    date_TBill_dict = TBill1y.set_index('date')['TBill_rate'].to_dict()
    data['TBill1y'] = data['datadate'].map(date_TBill_dict)
    # Convert the percent rate to a daily rate by compounding over 252 periods
    data['TBill1y'] = np.power(1 + data['TBill1y'] / 100, 1 / 252) - 1

    # Relative return to equal-weighted market returns
    market_ret = data.groupby('datadate')['ret_d'].transform('mean')
    data['rel_ret_d'] = data['ret_d'] - market_ret

    return data

In [None]:
def remove_dead_stocks(data, price_thres = 0.1, lookback_days = 5):
    '''
    Input: data, price_thres, lookback_days
    Changes dead stock's return and their rank to 0 starting lookback_days
    (default five) trading dates before their price first reaches the threshold.
    A stock is treated as dead once its prcod is <= price_thres. From the
    cutoff date onwards its ret_d, rel_ret_d and rank are set to 0. data is
    modified in place and also returned.
    Output: data
    '''
    df_low_price = data[data['prcod'] <= price_thres]
    # Sorted calendar of every date in the data, so that "N positions earlier"
    # really means N dates earlier regardless of the row order of data
    all_days = sorted(data['datadate'].unique())
    for tic in df_low_price['tic'].unique():
        # Earliest date on which this ticker traded at or below the threshold
        date_low_price = df_low_price.loc[df_low_price['tic'] == tic, 'datadate'].values.min()
        date_index = all_days.index(date_low_price)
        # Clamp at the first date: a negative index would wrap around to the
        # end of the calendar and leave most of the dead stock's rows untouched
        cutoff_date = all_days[max(date_index - lookback_days, 0)]
        dead_rows = (data['tic'] == tic) & (data['datadate'] >= cutoff_date)
        data.loc[dead_rows, ['ret_d', 'rel_ret_d', 'rank']] = 0
        print(f'{tic} is dead after {date_low_price}')
    return data

In [None]:
def fixed_thres_classes(x):
    '''
    Input: x (a single return value)
    Map x to a class label using fixed thresholds:
    x <= -0.03 -> -2, x <= -0.01 -> -1, x <= 0.01 -> 0, x < 0.03 -> 1, otherwise 2
    Output: label in (-2, -1, 0, 1, 2)
    '''
    # Inclusive upper bounds of the three lowest classes, checked in ascending order
    inclusive_bounds = ((-0.03, -2), (-0.01, -1), (0.01, 0))
    for upper, label in inclusive_bounds:
        if x <= upper:
            return label
    # The boundary between the top two classes is exclusive: exactly 0.03 is class 2
    return 1 if x < 0.03 else 2

In [None]:
def assign_class_labels(data, label_method):
    '''
    Input: data, label_method (fixed_thres or fixed_size)
    Compute the daily rank (-2, -1, 0, 1, 2) of every stock from its ret_d
    - fixed_size: within each datadate, stocks are ordered by ret_d (lowest
      first, ties broken by row order) and cut into 5 equally sized bins.
      A helper DistinctRank column is also added to data.
    - fixed_thres: each ret_d is labelled by fixed_thres_classes
    Raises ValueError for any other label_method
    Output: data with additional rank column
    '''
    # 1. Evenly bin everything into 5 labels
    if label_method == 'fixed_size':
        quantiles = [0, 0.2, 0.4, 0.6, 0.8, 1]
        # method='first' gives every stock a distinct rank, so the quantile
        # edges computed by qcut are never duplicated by tied returns
        data['DistinctRank'] = data.groupby('datadate')['ret_d'].rank(method='first') # Lowest first
        data['rank'] = data.groupby('datadate')['DistinctRank'].transform(
            lambda x: pd.qcut(x, quantiles, labels=[-2, -1, 0, 1, 2])
        )
    # 2. Manually set the bin threshold to be \pm 0.01 and \pm 0.03
    elif label_method == 'fixed_thres':
        data['rank'] = data['ret_d'].apply(fixed_thres_classes)
    else:
        # ValueError is the conventional type for a bad argument value and is
        # still caught by callers that handle Exception
        raise ValueError('Nonexistent label method!')
    return data

In [None]:
def make_sector_column(data):
    '''
    Input: data
    Create the categorical sector column (0, 1, ...): every distinct value of
    data['sector'] gets an id in order of first appearance, and the column
    is overwritten with those ids (data is modified in place).
    Output: data with modified (numerical) sector column,
            num_of_tokens (number of ids handed out),
            num_to_sector_dict (id -> original sector value)
    '''
    # Renumber the sectors from 0 to num_of_tokens-1
    all_sectors = list(data['sector'].unique())
    renumber_sectors = {sector: i for i, sector in enumerate(all_sectors)}
    num_to_sector_dict = dict(enumerate(all_sectors))

    # Count the ids actually handed out. unique() reports a missing sector as
    # a value of its own while nunique() skips it, so counting with nunique()
    # would make the largest id equal to num_of_tokens.
    num_of_tokens = len(all_sectors)

    # Direct per-value lookup; every value in the column is a key of the dict
    data['sector'] = data['sector'].map(renumber_sectors)

    return data, num_of_tokens, num_to_sector_dict

In [None]:
def num_tic_dicts(data):
    '''
    Input: data
    Number the distinct tickers in data['tic'] from 0 upwards in sorted order
    Output: num_to_tic_dict (number -> ticker), tic_to_num_dict (ticker -> number)
    '''
    ordered_tics = sorted(data['tic'].unique())

    # Forward lookup: position in the sorted list -> ticker
    num_to_tic_dict = dict(enumerate(ordered_tics))
    # Reverse lookup built from the forward one
    tic_to_num_dict = {tic: num for num, tic in num_to_tic_dict.items()}

    return num_to_tic_dict, tic_to_num_dict