In [1]:
import boto3
import awswrangler as wr
import pandas as pd
import numpy as np
from tqdm import tqdm

import pandas as pd, warnings
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

# Column names handed to create_autoreg_features as the `short_term_features`
# whose values from preceding dates get copied in as lagged columns.
short_term_columns = [
    "analysts_down",
    "analysts_down_percent",
    "analysts_up_percent",
    "authors_count",
    "capm_alpha_60m",
    "coefficient_of_variation_90d",
    "debt_eq",
    "div_growth_category",
    "div_safety_category",
    "dividends_estimate_fy1_analyst_down",
    "dividends_estimate_fy1_analyst_up",
    "dividends_estimate_fy2_analyst_down",
    "dividends_estimate_fy2_analyst_up",
    "dps_consensus_mean_percent_revisions_down_1_annual_period_fwd",
    "eps_ltg",
    "last_price_vs_sma_100d",
    "last_price_vs_sma_10d",
    "last_price_vs_sma_200d",
    "last_price_vs_sma_50d",
    "momentum_12m",
    "momentum_3m",
    "momentum_6m",
    "momentum_9m",
    "pb_ratio",
    "price_return_1y",
    "price_return_3m",
    "price_return_6m",
    "price_return_9m",
    "return_on_net_tangible_assets",
    "return_on_total_capital",
    "Future_3-5Y_EPS_without_NRI_Growth_Rate",
    "moment_rank",
    "5-Day_RSI",
    "9-Day_RSI",
    "14-Day_RSI",
    "6-1_Month_Momentum_%",
    "12-1_Month_Momentum_%",
    "ratios_rank",
    "Forward_PE_Ratio",
    "EV-to-Forward-EBITDA",
    "Earnings_Yield__Greenblatt__%",
    "Volume",
]

# AWS credentials are read from the first row of a CSV stored one directory
# above the notebook, so no secret is embedded in the notebook itself.
accessKeys = pd.read_csv("../quant-bears_accessKeys.csv")

# Single boto3 session shared by every awswrangler call in this notebook.
session = boto3.Session(
	aws_access_key_id=accessKeys.at[0, "Access key ID"],
	aws_secret_access_key=accessKeys.at[0, "Secret access key"],
)

# S3 prefixes: per-source raw feature snapshots, and per-date price files.
s3_collection_path = "s3://quant-bears-data-collection/raw-data/"
s3_price_collection_path = "s3://quant-bears-data-collection/raw-resolved-price/"

def load_data():
	"""Download every raw snapshot for each data source and join them.

	For each source, all objects under ``s3_collection_path + source + "/"``
	are read as parquet; each frame gets a ``date`` column taken from the
	object's file name (the part before the first "."). The per-source frames
	are stacked row-wise, indexed by ``(date, ticker)``, and finally joined
	column-wise across sources.

	Returns:
		DataFrame indexed by ``(date, ticker)`` holding the columns of all sources.
	"""
	data_sources = ["seekingAlpha.seekingAlphaBulkMetrics", "gurufocus"]

	# List the objects of every source up front, before any download starts.
	sources_dict = {
		name: wr.s3.list_objects(s3_collection_path + name + "/", boto3_session=session)
		for name in data_sources
	}

	df_dict = {}
	for name in data_sources:
		print(name)
		frames = []
		for object_path in tqdm(sources_dict[name]):
			frame = wr.s3.read_parquet(object_path, boto3_session=session)
			file_name = object_path.split("/")[-1]
			frame["date"] = file_name.split(".")[0]
			frames.append(frame)
		df_dict[name] = pd.concat(frames, axis = 0)

	indexed_frames = [frame.set_index(["date", "ticker"]) for frame in df_dict.values()]
	return pd.concat(indexed_frames, axis = 1)

def load_pred_price(df, days_ahead=5, diff = True):
	"""Attach the price observed ``days_ahead`` dates later to each feature row.

	The distinct dates in level 0 of ``df``'s index are sorted, and the i-th
	date is paired with the date ``days_ahead`` positions after it. For every
	pair, the price file of the later date is read from
	``s3_price_collection_path`` and restricted to tickers that have a
	non-null ``primary_price`` in ``df`` on the earlier date.

	Args:
		df: Feature frame indexed by ``(date, ticker)`` with a
			``primary_price`` column.
		days_ahead: Number of dates (positions in the sorted date list, not
			calendar days) between a feature row and its target price.
		diff: If True, return the relative price change instead of the raw
			future price.

	Returns:
		``(features, target)`` where ``features`` is ``df`` restricted to dates
		that have a target and reindexed to the target's ``(date, ticker)``
		index. ``target`` is a Series of
		``(pred_price - primary_price) / primary_price`` when ``diff`` is True,
		otherwise a DataFrame whose price column is renamed to ``pred_price``.

	Raises:
		ValueError: If ``df`` has no more than ``days_ahead`` distinct dates.
	"""
	# unique() keeps first-appearance order, which is not guaranteed to be
	# chronological; the positional shift below is only correct on sorted dates.
	all_dates = df.index.get_level_values(0).unique().sort_values()
	num_pairs = len(all_dates) - days_ahead
	if num_pairs <= 0:
		raise ValueError(
			f"Need more than days_ahead={days_ahead} distinct dates, got {len(all_dates)}"
		)

	# Slice by length rather than [:-days_ahead] so days_ahead=0 keeps every date.
	adjusted_dates = all_dates[:num_pairs]
	adjusted_df = df.loc[adjusted_dates]
	pred_dates = all_dates[days_ahead:]

	dfs = []
	for base_date, pred_date in tqdm(list(zip(adjusted_dates, pred_dates))):
		path = s3_price_collection_path + pred_date + ".parquet"
		new_df = wr.s3.read_parquet(path, boto3_session=session)

		# Keep only tickers that have a known price on the feature date.
		base_prices = df.loc[base_date, "primary_price"]
		known_tickers = base_prices.dropna().index
		new_df = new_df[new_df["ticker"].isin(known_tickers)]

		# Label the future price with the feature date it is the target of;
		# assign() returns a new frame instead of writing into a filtered slice.
		dfs.append(new_df.assign(date=base_date))

	price_df = pd.concat(dfs, axis = 0).set_index(["date", "ticker"]).rename({"primary_price": "pred_price"}, axis = 1)
	a_df = adjusted_df.reindex(price_df.index)

	if diff:
		base_price = a_df["primary_price"]
		return a_df, pd.Series((price_df["pred_price"] - base_price) / base_price, index=price_df.index)
	return a_df, price_df

def split_date(all_dates, train_size = 0.75, days_ahead = 5):
	"""Cut an ordered date sequence into train, validation and test slices.

	``days_ahead`` dates are set aside first; ``train_size`` of the remainder
	(rounded down) forms the training slice. The next ``days_ahead`` dates
	form the validation slice and everything after that is the test slice.

	Args:
		all_dates: Sliceable, ordered sequence of dates.
		train_size: Fraction of ``len(all_dates) - days_ahead`` used for training.
		days_ahead: Length of the validation slice.

	Returns:
		Tuple ``(train_dates, val_dates, test_dates)`` of slices of ``all_dates``.
	"""
	usable_count = len(all_dates) - days_ahead
	train_end = int(usable_count * train_size)
	val_end = train_end + days_ahead

	return all_dates[:train_end], all_dates[train_end:val_end], all_dates[val_end:]

def create_autoreg_features(df, short_term_features, num_days = 2):
	"""Append lagged copies of selected features from the preceding dates.

	For every date that has at least ``num_days`` earlier dates, each ticker
	row gains ``<feature>_prev_<k>`` columns (k = 1..num_days) holding that
	ticker's value of the feature on the date k positions earlier in the
	sorted date list. Tickers absent on an earlier date get NaN for that lag.
	Rows belonging to the first ``num_days`` dates are dropped because they
	lack a full lag history.

	Args:
		df: Frame indexed by ``(date, ticker)``.
		short_term_features: Column names of ``df`` to lag.
		num_days: Number of preceding dates to add as lags.

	Returns:
		Frame indexed by ``(date, ticker)``, sorted by index, containing the
		original columns followed by the lag columns.

	Raises:
		ValueError: If ``df`` has no more than ``num_days`` distinct dates.
		AssertionError: If the number of dropped rows differs from the number
			of rows on the first ``num_days`` dates.
	"""
	all_days = df.index.get_level_values(0).unique().sort_values()

	# Collect per-day frames and concatenate once; growing a DataFrame inside
	# the loop re-copies all accumulated rows on every iteration.
	day_frames = []
	for dayIndex in range(num_days, all_days.shape[0]):
		day = all_days[dayIndex]

		currDayDf = df.loc[day]
		currTickers = currDayDf.index.values

		pieces = [currDayDf]
		for lag in range(1, num_days + 1):
			# Lag k is the date k positions BEFORE the current one; starting at
			# offset 0 would make "_prev_1" a duplicate of the current day.
			prevDay = all_days[dayIndex - lag]
			prevDayFeatures = df.loc[prevDay].reindex(currTickers)[short_term_features].add_suffix(f"_prev_{lag}")
			pieces.append(prevDayFeatures)

		# concat builds a fresh frame, so adding the date column below does not
		# write into a slice of df.
		newDayDf = pd.concat(pieces, axis = 1)
		newDayDf["date"] = day
		day_frames.append(newDayDf.set_index("date", append = True))

	if not day_frames:
		raise ValueError(
			f"Need more than num_days={num_days} distinct dates, got {all_days.shape[0]}"
		)

	new_df = pd.concat(day_frames, axis = 0)

	autoreg_removed_rows = df.loc[all_days[:num_days]].shape[0]

	assert df.shape[0] - new_df.shape[0] == autoreg_removed_rows, "Autoregression removed different amount of rows than expected"

	# Per-day frames are indexed (ticker, date); restore (date, ticker).
	return new_df.reorder_levels([1, 0]).sort_index()

In [2]:
# Load

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# Prediction horizon, counted in dates. It is passed to both load_pred_price
# (how far ahead the target price is taken) and split_date (length of the
# validation slice between train and test) so the two cannot drift apart.
DAYS_AHEAD = 5

df = load_data()
print(f"Loaded data shape: {df.shape}")
autoreg_df = df
# autoreg_df = create_autoreg_features(df, short_term_columns, num_days=2)
# print(f"Post autoregressive: {autoreg_df.shape}")
X_df, y_series = load_pred_price(autoreg_df, days_ahead = DAYS_AHEAD, diff = True)
print(f"Post loaded pred price: {X_df.shape, y_series.shape}")

# split_date slices by position, so hand it the dates in sorted order.
train_dates, val_dates, test_dates = split_date(
	X_df.index.get_level_values(0).unique().sort_values(),
	days_ahead = DAYS_AHEAD,
)
print(f"# of train dates ({len(train_dates)}), val dates ({len(val_dates)}), test dates ({len(test_dates)})")

# Fill missing values in the "sector" and "area" columns with the literal "none".
X_df = X_df.fillna({"sector": "none", "area": "none"})

X_train, y_train = X_df.loc[train_dates], y_series.loc[train_dates]
X_val, y_val = X_df.loc[val_dates], y_series.loc[val_dates]
X_test, y_test = X_df.loc[test_dates], y_series.loc[test_dates]

print(f"Train shapes: {X_train.shape, y_train.shape}")
print(f"Val shapes: {X_val.shape, y_val.shape}")
print(f"Test shapes: {X_test.shape, y_test.shape}")


seekingAlpha.seekingAlphaBulkMetrics


100%|██████████| 38/38 [00:39<00:00,  1.04s/it]


gurufocus


100%|██████████| 38/38 [00:24<00:00,  1.56it/s]


Loaded data shape: (60290, 339)


100%|██████████| 33/33 [00:15<00:00,  2.18it/s]

Post loaded pred price: ((51918, 339), (51918,))
# of train dates (21), val dates (5), test dates (7)
Train shapes: ((32877, 339), (32877,))
Val shapes: ((7778, 339), (7778,))
Test shapes: ((11263, 339), (11263,))





In [3]:
# Inspect the (date, ticker) MultiIndex of the joined raw frame returned by load_data().
df.index

MultiIndex([('2023-09-28',    'A'),
            ('2023-09-28',   'AA'),
            ('2023-09-28',  'AAL'),
            ('2023-09-28',  'AAP'),
            ('2023-09-28', 'AAPL'),
            ('2023-09-28', 'ABBV'),
            ('2023-09-28', 'ABCL'),
            ('2023-09-28', 'ABCM'),
            ('2023-09-28', 'ABEV'),
            ('2023-09-28',  'ABM'),
            ...
            ('2023-11-21',   'ZI'),
            ('2023-11-21', 'ZION'),
            ('2023-11-21',  'ZIP'),
            ('2023-11-21',   'ZM'),
            ('2023-11-21',   'ZS'),
            ('2023-11-21',  'ZTO'),
            ('2023-11-21',  'ZTS'),
            ('2023-11-21',  'ZUO'),
            ('2023-11-21',  'ZWS'),
            ('2023-10-17',  'NCR')],
           names=['date', 'ticker'], length=60290)

In [4]:
# Persist the joined raw frame; index=True writes the (date, ticker) index levels as columns.
df.to_csv("../data/raw_features.csv", index = True)

In [5]:
# Persist the feature frame that load_pred_price aligned to the target index
# (with "sector"/"area" gaps filled by the preceding cell).
# NOTE(review): "targe_adjusted_features.csv" looks like a typo for "target_adjusted_features.csv";
# confirm nothing downstream reads the current name before renaming.
X_df.to_csv("../data/targe_adjusted_features.csv", index = True)

In [9]:
# Persist the target series; rename() gives it the name "target", which becomes the CSV column header.
y_series.rename("target").to_csv("../data/target.csv", index = True)