In [5]:
import boto3
import awswrangler as wr
import pandas as pd
from tqdm import tqdm

# The AWS key pair is read from the first row of a CSV referenced by a relative
# path, so the secret values themselves never appear in the notebook source.
accessKeys = pd.read_csv("../../quant-bears_accessKeys.csv")
# Every S3 call below passes this session explicitly via `boto3_session=`.
session = boto3.Session(
	aws_access_key_id=accessKeys.loc[0, "Access key ID"],
	aws_secret_access_key=accessKeys.loc[0, "Secret access key"]
)

# S3 prefixes: per-source parquet files read by load_data, and the per-date
# price files read by load_pred_price.
s3_collection_path = "s3://quant-bears-data-collection/raw-data/"
s3_price_collection_path = "s3://quant-bears-data-collection/raw-resolved-price/"

def load_data():
	"""Read every parquet object of each data source and join the sources column-wise.

	For each source, all objects under `s3_collection_path + source + "/"` are
	read, tagged with a "date" column taken from the object's file name (the
	part before the first dot), and stacked row-wise. The per-source frames are
	then indexed by ("date", "ticker") and concatenated side by side.

	Returns:
		A DataFrame indexed by ("date", "ticker") holding the columns of all sources.
	"""
	data_sources = ["seekingAlpha.seekingAlphaBulkMetrics", "gurufocus"]
	# All prefixes are listed up front, before any object is downloaded.
	sources_dict = {
		source: wr.s3.list_objects(s3_collection_path + source + "/", boto3_session=session)
		for source in data_sources
	}

	df_dict = {}
	for source, object_paths in sources_dict.items():
		print(source)
		frames = []
		for object_path in tqdm(object_paths):
			frame = wr.s3.read_parquet(object_path, boto3_session=session)
			# presumably the file name is the snapshot date (e.g. "<date>.parquet") — TODO confirm
			file_name = object_path.split("/")[-1]
			frame["date"] = file_name.split(".")[0]
			frames.append(frame)
		df_dict[source] = pd.concat(frames, axis = 0)

	indexed_frames = [frame.set_index(["date", "ticker"]) for frame in df_dict.values()]
	return pd.concat(indexed_frames, axis = 1)

In [6]:
# Download and join both sources into one frame indexed by (date, ticker).
df = load_data()

seekingAlpha.seekingAlphaBulkMetrics


  new_df["date"] = path.split("/")[-1].split(".")[0]
  new_df["date"] = path.split("/")[-1].split(".")[0]
  new_df["date"] = path.split("/")[-1].split(".")[0]
  new_df["date"] = path.split("/")[-1].split(".")[0]
  new_df["date"] = path.split("/")[-1].split(".")[0]
  new_df["date"] = path.split("/")[-1].split(".")[0]
  new_df["date"] = path.split("/")[-1].split(".")[0]
  new_df["date"] = path.split("/")[-1].split(".")[0]
  new_df["date"] = path.split("/")[-1].split(".")[0]
  new_df["date"] = path.split("/")[-1].split(".")[0]
  new_df["date"] = path.split("/")[-1].split(".")[0]
  new_df["date"] = path.split("/")[-1].split(".")[0]
  new_df["date"] = path.split("/")[-1].split(".")[0]
  new_df["date"] = path.split("/")[-1].split(".")[0]
  new_df["date"] = path.split("/")[-1].split(".")[0]
  new_df["date"] = path.split("/")[-1].split(".")[0]
  new_df["date"] = path.split("/")[-1].split(".")[0]
  new_df["date"] = path.split("/")[-1].split(".")[0]
  new_df["date"] = path.split("/")[-1].split("

gurufocus


100%|██████████| 33/33 [00:20<00:00,  1.57it/s]


In [85]:
import numpy as np

def load_pred_price(df, days_ahead=5):
	"""Pair each date in `df` with the price stored `days_ahead` dates later.

	The distinct values of the first index level are sorted, and every date
	except the last `days_ahead` becomes a "base" date. For each base date the
	parquet file named after the date `days_ahead` positions later is read from
	`s3_price_collection_path`. Only tickers that have a non-null
	"primary_price" in `df` on the base date are kept, and the rows are
	labelled with the base date.

	Args:
		df: Frame indexed by (date, ticker) with a "primary_price" column.
			Date labels are concatenated into file names, so they are assumed
			to be strings that sort chronologically — TODO confirm.
		days_ahead: Non-negative number of positions in the sorted date list
			between a base date and the date whose price is loaded.

	Returns:
		A tuple `(features, prices)`. `prices` is indexed by ("date", "ticker")
		with "primary_price" renamed to "pred_price"; `features` is `df`
		reindexed to the index of `prices`.

	Raises:
		ValueError: If `days_ahead` is negative or leaves no date to pair.
	"""
	if days_ahead < 0:
		raise ValueError("days_ahead must be non-negative")

	# Sort so that "days_ahead positions later" follows date order rather than
	# the order in which dates happen to appear in the index.
	all_dates = df.index.get_level_values(0).unique().sort_values()

	# Computed explicitly: `all_dates[:-days_ahead]` would be empty for days_ahead == 0.
	num_pairs = max(len(all_dates) - days_ahead, 0)
	if num_pairs == 0:
		raise ValueError(
			f"Need more than {days_ahead} distinct dates to look {days_ahead} dates ahead, got {len(all_dates)}"
		)
	adjusted_dates = all_dates[:num_pairs]
	pred_dates = all_dates[days_ahead:]
	adjusted_df = df.loc[adjusted_dates]

	dfs = []
	for base_date, pred_date in tqdm(zip(adjusted_dates, pred_dates), total=num_pairs):
		path = s3_price_collection_path + pred_date + ".parquet"
		new_df = wr.s3.read_parquet(path, boto3_session=session)

		# Keep only tickers that already had a price on the base date.
		base_prices = df.loc[base_date, "primary_price"]
		priced_tickers = base_prices[base_prices.notna()].index
		keep = new_df["ticker"].isin(priced_tickers)
		# `assign` returns a new frame, so nothing is written into a filtered slice.
		dfs.append(new_df[keep].assign(date=base_date))

	price_df = pd.concat(dfs, axis = 0).set_index(["date", "ticker"]).rename({"primary_price": "pred_price"}, axis = 1)
	return adjusted_df.reindex(price_df.index), price_df
		

In [86]:
# Pair each date's rows with the price file of the date five positions later in the date index.
adj_df, price_df = load_pred_price(df, days_ahead=5)

100%|██████████| 28/28 [00:12<00:00,  2.19it/s]


In [88]:
# Total number of missing cells across every column of adj_df.
adj_df.isna().sum().sum()

3451770

In [89]:
# Attach the target column; adj_df was reindexed to price_df's index, so the rows line up.
# Note this mutates adj_df in place, so re-running earlier cells resets it.
adj_df["pred_price"] = price_df["pred_price"]

In [99]:
# Distinct labels of the first index level (the dates) that remain in adj_df.
adj_df.index.get_level_values(0).unique()

Index(['2023-09-28', '2023-09-29', '2023-10-02', '2023-10-03', '2023-10-04',
       '2023-10-05', '2023-10-06', '2023-10-09', '2023-10-10', '2023-10-11',
       '2023-10-12', '2023-10-13', '2023-10-16', '2023-10-17', '2023-10-18',
       '2023-10-19', '2023-10-20', '2023-10-24', '2023-10-25', '2023-10-26',
       '2023-10-27', '2023-10-30', '2023-10-31', '2023-11-01', '2023-11-02',
       '2023-11-03', '2023-11-06', '2023-11-07'],
      dtype='object', name='date')

In [100]:
import pandas as pd, warnings
# Silence pandas' SettingWithCopyWarning for all subsequent cells.
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

# Column names passed as `short_term_features` to create_autoreg_features,
# i.e. the columns that receive lagged `_prev_N` copies.
short_term_columns = ['analysts_down', 'analysts_down_percent', 'analysts_up_percent',
       'authors_count', 'capm_alpha_60m', 'coefficient_of_variation_90d',
       'debt_eq', 'div_growth_category', 'div_safety_category',
       'dividends_estimate_fy1_analyst_down',
       'dividends_estimate_fy1_analyst_up',
       'dividends_estimate_fy2_analyst_down',
       'dividends_estimate_fy2_analyst_up',
       'dps_consensus_mean_percent_revisions_down_1_annual_period_fwd',
       'eps_ltg', 'last_price_vs_sma_100d', 'last_price_vs_sma_10d',
       'last_price_vs_sma_200d', 'last_price_vs_sma_50d', 'momentum_12m',
       'momentum_3m', 'momentum_6m', 'momentum_9m', 'pb_ratio',
       'price_return_1y', 'price_return_3m', 'price_return_6m',
       'price_return_9m', 'return_on_net_tangible_assets',
       'return_on_total_capital', 'Future_3-5Y_EPS_without_NRI_Growth_Rate',
       'moment_rank', '5-Day_RSI', '9-Day_RSI', '14-Day_RSI',
       '6-1_Month_Momentum_%', '12-1_Month_Momentum_%', 'ratios_rank',
       'Forward_PE_Ratio', 'EV-to-Forward-EBITDA',
       'Earnings_Yield__Greenblatt__%', 'Volume']


def create_autoreg_features(df, short_term_features, num_days = 2):
	"""Append lagged copies of selected features taken from the preceding dates.

	The distinct values of the first index level are sorted. For every date
	that has at least `num_days` earlier dates, the rows of that date are
	extended with the `short_term_features` columns of each of the `num_days`
	preceding dates, suffixed `_prev_1` (the immediately preceding date)
	through `_prev_{num_days}`. Tickers absent on a previous date get NaN in
	the corresponding lagged columns.

	Args:
		df: Frame indexed by (date, ticker).
		short_term_features: Column names of `df` to replicate as lagged features.
		num_days: Number of preceding dates to pull features from.

	Returns:
		A DataFrame indexed by (ticker, "date") — note the level order —
		containing the original columns followed by the lagged columns. The
		first `num_days` dates of `df` are dropped because they lack history.

	Raises:
		AssertionError: If the number of dropped rows differs from the number
			of rows belonging to the first `num_days` dates.
	"""
	all_days = df.index.get_level_values(0).unique().sort_values()

	# Collect per-day frames and concatenate once, rather than growing a frame in the loop.
	day_frames = []
	for dayIndex in range(num_days, all_days.shape[0]):
		day = all_days[dayIndex]

		# Copy so that adding the "date" column never writes into a slice of `df`.
		newDayDf = df.loc[day].copy()
		newDayDf["date"] = day
		currTickers = newDayDf.index.values

		pieces = [newDayDf]
		# Lags start at 1: a lag of 0 would just duplicate the current day's values.
		for lag in range(1, num_days + 1):
			prevDay = all_days[dayIndex - lag]
			prevDayFeatures = df.loc[prevDay].reindex(currTickers)[short_term_features].add_suffix(f"_prev_{lag}")
			pieces.append(prevDayFeatures)

		day_frames.append(pd.concat(pieces, axis = 1).set_index("date", append = True))

	# With no day having enough history, fall back to an empty frame.
	new_df = pd.concat(day_frames, axis = 0) if day_frames else pd.DataFrame()

	autoreg_removed_rows = df.loc[all_days[:num_days]].shape[0]

	assert df.shape[0] - new_df.shape[0] == autoreg_removed_rows, "Autoregression removed different amount of rows than expected"

	return new_df

In [102]:
# Extend adj_df with lagged copies of the short-term columns, using two lag steps.
autoreg_df = create_autoreg_features(adj_df, short_term_columns, num_days=2)

In [103]:
# Rows/columns after adding lagged features (the first num_days dates are dropped).
autoreg_df.shape

(40727, 424)

In [104]:
# Rows/columns before adding lagged features, for comparison with autoreg_df.shape.
adj_df.shape

(43859, 340)

In [1]:
# Literal list of date strings used to try out split_date without loading any data.
some_dates = ['2023-09-28', '2023-09-29', '2023-10-02', '2023-10-03', '2023-10-04',
       '2023-10-05', '2023-10-06', '2023-10-09', '2023-10-10', '2023-10-11',
       '2023-10-12', '2023-10-13', '2023-10-16', '2023-10-17', '2023-10-18',
       '2023-10-19', '2023-10-20', '2023-10-24', '2023-10-25', '2023-10-26',
       '2023-10-27', '2023-10-30', '2023-10-31', '2023-11-01', '2023-11-02',
       '2023-11-03', '2023-11-06', '2023-11-07']

In [2]:
def split_date(all_dates, train_size = 0.75, days_ahead = 5):
	"""Cut an ordered sequence of dates into consecutive train / val / test slices.

	`days_ahead` dates are set aside first; `train_size` is applied to the
	remaining count (truncated to an integer) to size the training slice. The
	validation slice is the next `days_ahead` dates and the test slice is
	everything after it.

	Args:
		all_dates: Sliceable sequence of dates, in the order they should be split.
		train_size: Fraction of `len(all_dates) - days_ahead` assigned to training.
		days_ahead: Length of the validation slice.

	Returns:
		A tuple `(train_dates, val_dates, test_dates)` of slices of `all_dates`.
	"""
	splittable_count = len(all_dates) - days_ahead
	train_end = int(splittable_count * train_size)
	val_end = train_end + days_ahead

	return all_dates[:train_end], all_dates[train_end:val_end], all_dates[val_end:]

In [4]:
# Split with the default train_size and days_ahead, then show the size of each slice.
train, val, test = split_date(some_dates)
len(train), len(val), len(test)

(17, 5, 6)

In [5]:
# Total number of dates; the three slice sizes above should add up to this.
len(some_dates)

28