In [1]:
# TODO: Use this to profile: https://stackoverflow.com/questions/45893768/how-do-i-find-out-what-parts-of-my-code-are-inefficient-in-python
%load_ext line_profiler

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import holidays
import datetime
import pprint

from datetime import timedelta
from enum import Enum, auto
from typing import List
from pandas.tseries.offsets import BDay

pp = pprint.PrettyPrinter(indent=4)

In [3]:
API_KEY = 'cfg2wsKZrVNuYBJpETAs'
DEVELOPMENT = False

if DEVELOPMENT:
    daily_metrics = pd.read_csv('SHARADAR-DAILY.csv')
    daily_prices = pd.read_csv('SHARADAR-SEP.csv')
else:
    daily_metrics = pd.read_csv('SHARADAR_DAILY_3_9ffd00fad4f19bbdec75c6e670d3df83.csv')
    daily_prices = pd.read_csv('SHARADAR_SEP_2_0bd2000858d1d8d1f48d4cdea5f8c9e2.csv')

In [4]:
d1 = daily_metrics.copy()

d2 = daily_prices[['ticker', 'date','closeadj']]
d2.rename(columns={'closeadj': 'price'}, inplace=True)

daily_data = d1.merge(d2, on=['date', 'ticker'], how='inner')
daily_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,ticker,date,lastupdated,ev,evebit,evebitda,marketcap,pb,pe,ps,price
0,A,2020-12-18,2020-12-18,37525.1,40.8,30.6,36607.1,7.5,50.9,6.9,118.928
1,AA,2020-12-18,2020-12-18,4896.3,-43.0,8.9,4092.3,1.2,-8.7,0.4,22.010
2,AACQ,2020-12-18,2020-12-18,943.3,,,944.6,188.9,,,10.430
3,AAIC,2020-12-18,2020-12-18,817.6,-15.8,-15.8,125.0,0.5,-2.3,5.3,3.740
4,AAIIQ,2020-12-18,2020-12-18,-2.8,0.5,0.8,0.4,0.0,-0.1,0.0,0.100
...,...,...,...,...,...,...,...,...,...,...,...
14551055,GSTCQ,2011-01-04,2021-05-30,235.6,3.4,3.0,218.6,1.4,-575.3,5.0,4.340
14551056,MHRCQ,2011-01-04,2021-05-30,471.6,-29.7,-64.3,419.2,6.0,-19.3,13.6,7.230
14551057,EPRSQ,2011-01-03,2021-05-30,153.1,187.2,8.6,155.8,2.1,74.4,2.9,105.000
14551058,GSTCQ,2011-01-03,2021-05-30,242.7,3.5,3.1,225.7,1.4,-593.9,5.1,4.480


In [37]:
class StockUniverse(Enum):
    SMALL = auto()  # < $1B
    MID = auto()  # $1B - $10B
    LARGE = auto()  # > $100B
    
class EvaluationMetric(Enum):
    EV_EBIT = auto()
    P_E = auto()
    P_B = auto()
    DIV_YIELD = auto()

    def __str__(self):
        if self.value == EvaluationMetric.EV_EBIT.value:
            return 'EV/EBIT'
        elif self.value == EvaluationMetric.P_E.value:
            return 'P/E'
        elif self.value == EvaluationMetric.P_B.value:
            return 'P/B'
        elif self.value == EvaluationMetric.DIV_YIELD.value:
            return '% Div Yield'
        else:
            raise Exception(f'Unsupported evaluation metric {metric}')
    
def get_closest_previous_work_day(
    check_day: datetime.datetime,
    holidays=holidays.US()
) -> datetime.datetime:
    if check_day.weekday() <= 4 and check_day not in holidays:
        return check_day
    offset = max(1, (check_day.weekday() + 6) % 7 - 3)
    most_recent = check_day - datetime.timedelta(offset)
    if most_recent not in holidays:
        return most_recent
    else:
        return get_closest_previous_work_day(most_recent, holidays)

def get_rebalance_dates(
    start_date: datetime.datetime,
    end_date: datetime.datetime,
    period_length: datetime.timedelta
) -> List[datetime.datetime]:
    curr_date = start_date
    dates = []
    while curr_date < end_date:
        dates.append(get_closest_previous_work_day(curr_date))
        curr_date += period_length
    return dates

# Assumes data is already filtered by date
def filter_stocks_by_universe(
    df: pd.DataFrame,
    stocks_universe: StockUniverse
) -> pd.DataFrame:
    if stocks_universe.value == StockUniverse.SMALL.value:
        return df[df['marketcap'] < 1]
    elif stocks_universe.value == StockUniverse.MID.value:
        return df[(df['marketcap'] >= 1) & (df['marketcap'] <= 10)]
    elif stocks_universe.value == StockUniverse.LARGE.value:
        return df[(df['marketcap'] >= 10)]
    else:
        raise Exception(f'Unsupported stock universe {stocks_universe}')

def sort_df_by_metric(
    df: pd.DataFrame,
    metric: EvaluationMetric
) -> pd.DataFrame:
    if metric.value == EvaluationMetric.EV_EBIT.value:
        return df.sort_values(by='evebit')
    elif metric.value == EvaluationMetric.P_E.value:
        return df.sort_values(by='pe')
    elif metric.value == EvaluationMetric.P_B.value:
        return df.sort_values(by='pb')
    elif metric.value == EvaluationMetric.DIV_YIELD.value:
        raise Exception('EvaluationMetric.DIV_YIELD not yet supported.')
    else:
        raise Exception(f'Unsupported evaluation metric {metric}')

# Assumes df is sorted by date
def get_last_available_price(
    df: pd.DataFrame,
    ticker: str
) -> int:
    return df[df.ticker == ticker].iloc[-1]['price']

# Assumes df is sorted appropriately ahead of time.
def get_top_n_stocks_by_metric(
    df: pd.DataFrame,
    n: int,
    metric: EvaluationMetric    
) -> List[str]:
    df_res = None
    if metric.value == EvaluationMetric.EV_EBIT.value:
        df_res = df[(df['evebit'] > 0) & (df['ev'] > 0)]
    elif metric.value == EvaluationMetric.P_E.value:
        df_res = df[df['pe'] > 0]
    elif metric.value == EvaluationMetric.P_B.value:
        df_res = df[df['pb'] > 0]
    elif metric.value == EvaluationMetric.DIV_YIELD.value:
        raise Exception('EvaluationMetric.DIV_YIELD not yet supported.')
    else:
        raise Exception(f'Unsupported evaluation metric {metric}')
    
    return list(df_res[:n]['ticker'])

def filter_df_by_date(
    df: pd.DataFrame,
    date: datetime.datetime
) -> pd.DataFrame:
    return df[df.date == date.strftime('%Y-%m-%d')]

def get_stock_basket_price(
    df: pd.DataFrame,
    df_full: pd.DataFrame,
    tickers: List[str]
) -> int:
    stocks_of_interest = df.loc[df['ticker'].isin(tickers)]
    assert len(tickers) >= len(stocks_of_interest)
    basket_price = stocks_of_interest['price'].sum()
    if len(stocks_of_interest) != len(tickers):
        missing_stocks = set(tickers) - set(stocks_of_interest['ticker'])
        print(date, ' Stocks closed: ', missing_stocks)
        for ticker in missing_stocks:
            basket_price += get_last_available_price(date_sorted_daily_data, ticker)
    return round(basket_price, 2)        

In [7]:
# Prepare Inputs for Base + Test

INITIAL_PORTFOLIO_VALUE = 10000
PORTFOLIO_SIZE = 30

base_metric = EvaluationMetric.EV_EBIT
test_metric = EvaluationMetric.P_B
stocks_universe = StockUniverse.LARGE

start_date = datetime.datetime.strptime(min(daily_data['date']), '%Y-%m-%d')
end_date = datetime.datetime.strptime(max(daily_data['date']), '%Y-%m-%d')

# Optimization: Sorting every time would take too long
date_sorted_daily_data = daily_data.sort_values(by='date')
base_sorted_daily_data = sort_df_by_metric(daily_data, base_metric)
test_sorted_daily_data = sort_df_by_metric(daily_data, test_metric)

In [39]:
# Compute Results for Base + Test

rebalance_days = 365
rebalance_dates = get_rebalance_dates(start_date, end_date, timedelta(days=rebalance_days))

base_portfolio_value = INITIAL_PORTFOLIO_VALUE
test_portfolio_value = INITIAL_PORTFOLIO_VALUE
portfolio_size = PORTFOLIO_SIZE

start_date = rebalance_dates[0]

base_sorted_df = filter_df_by_date(base_sorted_daily_data, start_date)
test_sorted_df = filter_df_by_date(test_sorted_daily_data, start_date)

base_sorted_df = filter_stocks_by_universe(base_sorted_df, stocks_universe)
test_sorted_df = filter_stocks_by_universe(test_sorted_df, stocks_universe)

base_portfolio = get_top_n_stocks_by_metric(base_sorted_df, portfolio_size, base_metric)
test_portfolio = get_top_n_stocks_by_metric(test_sorted_df, portfolio_size, test_metric)

base_price = get_stock_basket_price(base_sorted_df, date_sorted_daily_data, base_portfolio)
test_price = get_stock_basket_price(test_sorted_df, date_sorted_daily_data, test_portfolio)    

res = {}
res[start_date] = {
    'base_basket_price': base_price,
    'base_portfolio_value': base_portfolio_value,
    'test_basket_price': test_price,
    'test_portfolio_value': test_portfolio_value,        
}

for date in rebalance_dates:
    print(date.strftime('%Y-%m-%d'))

    prev_base_price = base_price
    prev_test_price = test_price    
    
    base_sorted_df = filter_df_by_date(base_sorted_daily_data, date)
    test_sorted_df = filter_df_by_date(test_sorted_daily_data, date)
    
    base_price = get_stock_basket_price(base_sorted_df, date_sorted_daily_data, base_portfolio)
    test_price = get_stock_basket_price(test_sorted_df, date_sorted_daily_data, test_portfolio)
    
    base_change = base_price / prev_base_price
    test_change = test_price / prev_test_price
    
    base_portfolio_value = round(base_portfolio_value * base_change, 2)
    test_portfolio_value = round(test_portfolio_value * test_change, 2)    
    
    res[date] = {
        'base_basket_price': base_price,
        'base_portfolio_value': base_portfolio_value,
        'test_basket_price': test_price,
        'test_portfolio_value': test_portfolio_value,        
    }

    base_sorted_df = filter_stocks_by_universe(base_sorted_df, stocks_universe)
    test_sorted_df = filter_stocks_by_universe(test_sorted_df, stocks_universe)    
    
    base_portfolio = get_top_n_stocks_by_metric(base_sorted_df, portfolio_size, base_metric)
    test_portfolio = get_top_n_stocks_by_metric(test_sorted_df, portfolio_size, test_metric)

    base_price = get_stock_basket_price(base_sorted_df, date_sorted_daily_data, base_portfolio)
    test_price = get_stock_basket_price(test_sorted_df, date_sorted_daily_data, test_portfolio)

    
df_res = pd.DataFrame.from_dict(res, orient='index')
df_res

2011-01-03
2012-01-03
2012-01-03 00:00:00  Stocks closed:  {'ADGI'}
2012-01-03 00:00:00  Stocks closed:  {'ANPIQ', 'TRXBQ'}
2013-01-02
2013-01-02 00:00:00  Stocks closed:  {'CDCSY'}
2013-01-02 00:00:00  Stocks closed:  {'CDCAQ', 'XINGF', 'WATG', 'QXMCF', 'GU1'}
2014-01-02
2014-01-02 00:00:00  Stocks closed:  {'VRNM', 'ART', 'MAXY'}
2014-01-02 00:00:00  Stocks closed:  {'GBGLF', 'FUQI', 'CLNH1', 'UTRA', 'CBPI'}
2015-01-02
2015-01-02 00:00:00  Stocks closed:  {'SOQDF', 'HOLL', 'TGS'}
2015-01-02 00:00:00  Stocks closed:  {'CBEH', 'GNKOQ', 'MMAC', 'FEED'}
2015-12-31
2015-12-31 00:00:00  Stocks closed:  {'GNI', 'ALTV', 'CVVT', 'BTH', 'INSV'}
2015-12-31 00:00:00  Stocks closed:  {'LASLY', 'CTCLY', 'MEA', 'ANVGQ', 'CVVT', 'PALDF', 'BOPH'}
2016-12-30
2016-12-30 00:00:00  Stocks closed:  {'ENUM'}
2016-12-30 00:00:00  Stocks closed:  {'TCPTF', 'GTATQ'}
2017-12-29
2017-12-29 00:00:00  Stocks closed:  {'ADPTQ', 'SRYB', 'CACQ'}
2017-12-29 00:00:00  Stocks closed:  {'BAA', 'MEP', 'LTONY', 'GLFMQ', '

Unnamed: 0,base_basket_price,base_portfolio_value,test_basket_price,test_portfolio_value
2011-01-03,388.84,10000.0,51452837.63,10000.0
2012-01-03,343.78,8841.17,22072266.79,4289.81
2013-01-02,263.86,8914.48,53657.47,1961.8
2014-01-02,380.47,13215.76,16916.05,4263.51
2015-01-02,290.61,13883.64,11601414.13,1114.86
2015-12-31,606.85,7810.01,561.02,625.16
2016-12-30,565.3,4152.64,16054.26,652.25
2017-12-29,311.47,2637.16,195.52,7.94
2018-12-31,286.37,2039.71,91.94,3.81
2019-12-31,257.71,1460.55,75.68,3.32


In [40]:
df_to_plot = df_res[['base_portfolio_value', 'test_portfolio_value']]
df_to_plot.plot(title=f'{str(base_metric)} (base) VS {str(test_metric)} (test); Rebalanced every {} days; Portfolio size: {portfolio_size}')

SyntaxError: f-string: empty expression not allowed (<ipython-input-40-2d14f860f9bf>, line 2)

In [12]:
# daily_data[daily_data.ticker == 'AAPL'].set_index('date').sort_values(by=['date'])

# daily_data[daily_data.ticker == 'AAPL'][['date', 'price']].set_index('date').sort_values(by=['date']).plot()

# d1 = datetime.datetime.strptime('2011-07-01', '%Y-%m-%d')
# d2 = datetime.datetime.strptime('2011-09-30', '%Y-%m-%d')

# get_stock_basket_price(base_sorted_df, d1, tickers)
# get_stock_basket_price(base_sorted_df, d2, tickers)

# date_data = base_sorted_df[base_sorted_df.date == d1.strftime('%Y-%m-%d')]
# stocks_of_interest = date_data.loc[date_data['ticker'].isin(tickers)]
# stocks_of_interest

# date_data = base_sorted_df[base_sorted_df.date == d2.strftime('%Y-%m-%d')]
# stocks_of_interest = date_data.loc[date_data['ticker'].isin(tickers)]
# stocks_of_interest
df_res

Unnamed: 0,base_basket_price,base_portfolio_value,test_basket_price,test_portfolio_value
2011-01-03,388.84,10000.0,51452837.63,10000.0
2011-04-01,389.59,10019.29,47383050.42,9209.03
2011-07-01,273.01,9329.99,6969.98,5789.63
2011-09-30,1107.08,4965.62,4863.76,3206.19
2011-12-29,254.03,5277.67,102049.51,1845.16
2012-03-28,268.98,6059.36,2237542.97,2274.86
2012-06-26,126.08,5069.44,466.03,918.05
2012-09-24,201.41,5290.07,106723.18,1146.64
2012-12-21,336.88,5229.53,390.29,656.06
2013-03-22,301.27,5827.42,10261.41,883.68


In [26]:
date_sorted_daily_data = daily_data.sort_values(by='date')