In [1]:
import boto3
import awswrangler as wr
import pandas as pd
# Build an explicit boto3 session from the first row of the access-key CSV,
# which is read from a path relative to the working directory.
accessKeys = pd.read_csv("../../quant-bears_accessKeys.csv")
credentials = {
    "aws_access_key_id": accessKeys.at[0, "Access key ID"],
    "aws_secret_access_key": accessKeys.at[0, "Secret access key"],
}
session = boto3.Session(**credentials)

# Common S3 prefix under which every source keeps its dated parquet files.
s3_collection_path = "s3://quant-bears-data-collection/raw-data/"

In [2]:
# Sub-prefixes under `s3_collection_path`, one per data source.
sources = ["gurufocus/", "seekingAlpha.seekingAlphaBulkMetrics/"]


def read_snapshot(date):
    """Read the `<date>.parquet` file of every entry in `sources`.

    Parameters
    ----------
    date : str
        File stem of the snapshot, e.g. "2023-09-19".

    Returns
    -------
    list
        One object returned by `wr.s3.read_parquet` per source, in the
        same order as `sources`.
    """
    return [
        wr.s3.read_parquet(f"{s3_collection_path}{s}{date}.parquet", boto3_session=session)
        for s in sources
    ]


# Following `sources`, index 0 is the gurufocus frame and index 1 the
# seekingAlpha frame of each snapshot.
dfs_19 = read_snapshot("2023-09-19")
dfs_26 = read_snapshot("2023-09-26")


In [3]:
# Preview the first rows of the first 2023-09-19 frame (the "gurufocus/" source).
gurufocus_19_raw = dfs_19[0]
gurufocus_19_raw.head()

Unnamed: 0,symbol,growth_rank,3-Year_Revenue_Growth_Rate,3-Year_EBITDA_Growth_Rate,3-Year_EPS_without_NRI_Growth_Rate,3-Year_FCF_Growth_Rate,3-Year_Book_Growth_Rate,Future_3-5Y_Total_Revenue_Growth_Rate,profitability_rank,Gross_Margin_%,...,Price-to-DCF__Earnings_Based_,Price-to-Median-PS-Value,Price-to-Peter-Lynch-Fair-Value,Price-to-Graham-Number,Price-to-Net-Current-Asset-Value,Price-to-Net-Cash,Price-to-Owner-Earnings,Price-to-Free-Cash-Flow,Price-to-FFO,Price-to-DCF__FCF_Based_
0,AA,0.4,6.8,54.4,51.8,4.4,9.0,-0.36,0.5,6.3,...,,,,,,,,,,
1,AAP,0.7,10.4,8.0,6.5,-4.2,-4.1,1.54,0.8,43.58,...,0.89,0.27,1.24,1.2,,,,,,
2,ABCL,,215.7,,,,631.1,,0.5,88.57,...,,,,,2.48,3.22,,,,
3,ABCM,0.7,13.9,-22.7,,,22.7,18.15,0.8,75.88,...,,0.74,,39.74,,,,,,
4,ABEV,0.7,15.3,5.3,7.1,2.0,10.1,6.69,0.9,49.95,...,,0.44,,1.94,,,16.7,14.71,,


In [4]:
# Tickers that can be looked up in every frame used downstream.
# The seekingAlpha frames are keyed by "ticker" and the gurufocus frames by
# "symbol"; requiring presence in all four avoids a KeyError when these tickers
# are later passed to `.loc` on a frame that lacks one of them.
# Order and index follow the 2023-09-19 seekingAlpha "ticker" column.
alpha_19_tickers = dfs_19[1]["ticker"]
in_every_frame = (
    alpha_19_tickers.isin(dfs_26[1]["ticker"])
    & alpha_19_tickers.isin(dfs_19[0]["symbol"])
    & alpha_19_tickers.isin(dfs_26[0]["symbol"])
)
overlapped_tickers = alpha_19_tickers[in_every_frame]


In [7]:
# Re-index every frame by its ticker column and keep only the shared tickers,
# so the four frames end up with identical row labels in the same order.
guru_19, guru_26 = (
    snapshot[0].set_index("symbol").loc[overlapped_tickers]
    for snapshot in (dfs_19, dfs_26)
)

alpha_19, alpha_26 = (
    snapshot[1].set_index("ticker").loc[overlapped_tickers]
    for snapshot in (dfs_19, dfs_26)
)

In [19]:
# Place the gurufocus and seekingAlpha columns of the 09-19 snapshot side by
# side, then keep only the float-typed columns.
combined_19 = pd.concat([guru_19, alpha_19], axis="columns")
df_19 = combined_19.select_dtypes(include="float")

In [20]:
# Place the gurufocus and seekingAlpha columns of the 09-26 snapshot side by
# side, then keep only the float-typed columns.
combined_26 = pd.concat([guru_26, alpha_26], axis="columns")
df_26 = combined_26.select_dtypes(include="float")

In [21]:
# Column-by-column correlation between the two snapshots, summarised.
snapshot_corr = df_19.corrwith(df_26)
snapshot_corr.describe()

count    330.000000
mean       0.982506
std        0.076800
min        0.310832
25%        0.997132
50%        0.999887
75%        0.999996
max        1.000000
dtype: float64

In [22]:
# Same per-column correlation, sorted ascending so the columns that differ
# most between the two snapshots come first.
snapshot_corr_sorted = df_19.corrwith(df_26).sort_values(ascending=True)
snapshot_corr_sorted

Earnings_Yield__Greenblatt__%    0.310832
ev_ebit_fy1                      0.378230
last_price_vs_sma_10d            0.394811
5-Day_RSI                        0.473860
peg_nongaap_fy1                  0.634679
                                   ...   
3-Year_Revenue_Growth_Rate       1.000000
eps_gaap_annual_growth_yoy       1.000000
sustainable_growth_rate          1.000000
3-Year_FCF_Growth_Rate           1.000000
peg_gaap_avg_5y                       NaN
Length: 331, dtype: float64

In [23]:
# Distribution of the number of missing values per column in the 09-19 frame.
missing_per_column = df_19.isna().sum()
missing_per_column.describe()

count     331.000000
mean      302.897281
std       276.459326
min         0.000000
25%        53.000000
50%       215.000000
75%       552.000000
max      1057.000000
dtype: float64

In [24]:
# Relative change in `primary_price` from the 09-19 snapshot to the 09-26
# snapshot, expressed as a fraction of the 09-19 price.
price_19 = df_19["primary_price"]
price_26 = df_26["primary_price"]
price_diff = price_26.sub(price_19)
price_diff_pct = price_diff.div(price_19)
price_diff_pct.tail()

ZNTL   -0.077368
ZS     -0.037164
ZTO    -0.028343
ZUO    -0.036824
ZWS    -0.014565
Name: primary_price, dtype: float64

In [25]:
# Feature matrix: the 09-19 float columns plus the relative price change as
# "target". `assign` returns a new frame, so `df_19` is left untouched.
X = df_19.assign(target=price_diff_pct)

In [26]:
# Which features move most closely with the target?
# Take the "target" column of the Pearson correlation matrix, sorted ascending.
pearson_matrix = X.corr(method="pearson")
feature_target_corr = pearson_matrix["target"].sort_values()
feature_target_corr

beta24                            -0.301661
coefficient_of_variation_90d      -0.273251
short_interest_percent_of_float   -0.241841
p_week_vol_shares                 -0.187265
3-Year_Revenue_Growth_Rate        -0.180957
                                     ...   
profitability_rank                 0.237251
return_on_total_capital            0.256354
cf_payout_avg_5y                   0.443166
target                             1.000000
peg_gaap_avg_5y                         NaN
Name: target, Length: 332, dtype: float64

In [27]:
# Last 20 entries of the ascending-sorted Pearson correlations.
feature_target_corr.iloc[-20:]

return_on_total_capital_avg_5y        0.150472
ebitda_margin                         0.154197
diluted_eps                           0.154245
ebit_margin                           0.155126
levered_fcf_margin                    0.157841
Earnings_Yield__Greenblatt__%         0.159189
earn_yield_gaap_fy1                   0.161375
net_lt_debt_tot_assets_avg_5y         0.163731
ROE_%                                 0.177245
3-Year_EBITDA_Growth_Rate             0.178879
eps_estimate                          0.191914
return_on_avg_tot_assets              0.205067
ROA_%                                 0.215810
ROCE_%                                0.222657
3-Year_EPS_without_NRI_Growth_Rate    0.235203
profitability_rank                    0.237251
return_on_total_capital               0.256354
cf_payout_avg_5y                      0.443166
target                                1.000000
peg_gaap_avg_5y                            NaN
Name: target, dtype: float64

In [28]:
# Spearman correlation of every column with "target", sorted ascending.
spearman_matrix = X.corr(method="spearman")
feature_target_corr_spearman = spearman_matrix["target"].sort_values()
feature_target_corr_spearman

beta24                            -0.291399
ps_ratio_avg_5y                   -0.224339
ev_12m_sales_ratio_avg_5y         -0.220358
short_interest_percent_of_float   -0.217066
ev_ebit_fy1_avg_5y                -0.212758
                                     ...   
int_cover                          0.323999
return_on_total_capital            0.331165
cf_payout_avg_5y                   0.545320
target                             1.000000
peg_gaap_avg_5y                         NaN
Name: target, Length: 332, dtype: float64

In [29]:
# Last 20 entries of the ascending-sorted Spearman correlations.
feature_target_corr_spearman.iloc[-20:]

roe                                 0.246532
cash_from_operations_as_reported    0.246552
net_inc_per_employee                0.253559
return_on_total_capital_avg_5y      0.256821
return_on_net_tangible_assets       0.265342
ROE_%                               0.267037
ROA_%                               0.274354
return_on_avg_tot_assets            0.275564
diluted_eps                         0.276394
ebitda                              0.285664
Earnings_Yield__Greenblatt__%       0.286944
ROIC_%                              0.287414
net_income                          0.292214
ROCE_%                              0.292333
eps_estimate                        0.318226
int_cover                           0.323999
return_on_total_capital             0.331165
cf_payout_avg_5y                    0.545320
target                              1.000000
peg_gaap_avg_5y                          NaN
Name: target, dtype: float64