# Data Transformations for Price Data 


    - Basic
    - Stats
    - Signatures
    - Catch22
    

In [1]:
import pandas as pd 

In [2]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 100)

In [3]:
import numpy as np
import datetime, os, glob, gc

from pytrend.compustat import Compustat_CRSP_Data

## Sklearn Transformers
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import FeatureUnion


In [4]:
pd.options.mode.chained_assignment = None

In [5]:
from pytrend.numerai_signals import CompustatSignatureTransformer

In [6]:
CRSP_data = pd.read_parquet('../signals-data/Numerai_raw_price.parquet')

In [11]:
numerai_signals_metadata = pd.read_csv("../signals-data/numerai_signals_metadata_2021.csv")
permnos = numerai_signals_metadata["permno"].unique()

In [27]:
def add_industry_labels(CRSP_single_stock, sample_id, sample_id_type="permno"):
    ## Mapped History
    mapped_history = numerai_signals_metadata[
        numerai_signals_metadata[sample_id_type] == sample_id
    ]
    mapped_history["map_start"] = pd.to_datetime(mapped_history["map_start"])
    mapped_history["map_end"] = pd.to_datetime(mapped_history["map_end"])
    mapped_history["crsp_start"] = pd.to_datetime(mapped_history["crsp_start"])
    mapped_history["crsp_end"] = pd.to_datetime(mapped_history["crsp_end"])
    CRSP_single_stock["bloomberg_ticker"] = None
    CRSP_single_stock["group_subindustry"] = None
    for i, row in mapped_history.iterrows():

        if row["map_start"] == datetime.datetime(year=2007, month=4, day=14):
            valid_start = row["crsp_start"]
            valid_end = min(row["crsp_end"], row["map_end"])
        else:
            valid_start = max(row["crsp_start"], row["map_start"])
            valid_end = min(row["crsp_end"], row["map_end"])
        if valid_end > valid_start:
            CRSP_single_stock.loc[valid_start:valid_end, "group_subindustry"] = row[
                "hgsubind"
            ]
            CRSP_single_stock.loc[valid_start:valid_end, "bloomberg_ticker"] = row[
                "bloomberg_ticker"
            ]
    CRSP_single_stock.dropna(
        subset=["bloomberg_ticker", "group_subindustry"], inplace=True
    )
    if CRSP_single_stock.shape[0] > 0:
        ## Derive Group Labels
        CRSP_single_stock["group_subindustry"] = CRSP_single_stock[
            "group_subindustry"
        ].astype(int)
        CRSP_single_stock["group_industry"] = (
            CRSP_single_stock["group_subindustry"] // 100
        )
        CRSP_single_stock["group_sector"] = (
            CRSP_single_stock["group_subindustry"] // 1000000
        )
        ## Downsample to Friday
        shift = pd.to_datetime(CRSP_single_stock.index).dayofweek[0]
        subsampled = (
            CRSP_single_stock.fillna(method="pad")
            .resample("D")
            .fillna(method="pad", limit=31)[11 - shift :: 7]
        )
        subsampled["friday_date"] = subsampled.index.strftime("%Y%m%d").astype(int)
        subsampled["era"] = subsampled.index
        output = subsampled.set_index(["friday_date", "bloomberg_ticker"])
        return output 
    else:
        return pd.DataFrame()

In [19]:
def transform_era(df, feature_cols, group_labels=None, keep_original=False):
    transformed_features = list()
    if group_labels is not None:
        for group in group_labels:
            group_features = list()
            for i, df_group in df.groupby(group):
                df_group_ranked = df_group[feature_cols].rank(pct=True, axis=0) - 0.5
                df_group_ranked.fillna(0, inplace=True)
                df_group_ranked = df_group_ranked * 5
                df_group_ranked.columns = [
                    "{}_{}_ranked".format(x, group) for x in feature_cols
                ]
                group_features.append(pd.concat([df_group_ranked], axis=1))
            group_features_df = pd.concat(group_features, axis=0)
            transformed_features.append(group_features_df)
    ## On All Data
    df_ranked = df[feature_cols].rank(pct=True, axis=0) - 0.5
    df_ranked.fillna(0, inplace=True)
    df_ranked = df_ranked * 5
    df_ranked.columns = ["{}_ranked".format(x) for x in feature_cols]
    transformed_features.append(df_ranked)
    if keep_original:
        transformed_features.append(df[feature_cols])
    transformed_df = pd.concat(transformed_features, axis=1)
    return transformed_df

In [None]:
"""
Transformations for Price Data from Compustat 
Format: Open, High, Low, Close 
"""


class CompustatStatsTransformer(TransformerMixin, BaseEstimator):
    def __init__(
        self,
    ):
        pass

    def transform(self, X):

        X["average_price"] = (
            (
                X["adjusted_open"]
                + X["adjusted_close"]
                + X["adjusted_high"]
                + X["adjusted_low"]
            )
            / 4
        ).astype(float)

        log_returns = np.log(X["average_price"]) - np.log(X["average_price"].shift(1))

        output_cols = list()

        ## Momentum
        for lookback in [21, 63, 252]:
            output_col = f"momentum_{lookback}"
            X[output_col] = log_returns.rolling(lookback).sum()
            output_cols.append(output_col)

        ## Volatility
        for lookback in [21, 63, 252]:
            output_col = f"volatility_{lookback}"
            X[output_col] = log_returns.rolling(lookback).std() * np.sqrt(
                252 / lookback
            )
            output_cols.append(output_col)

        ## Skewness
        for lookback in [21, 63, 252]:
            output_col = f"skewness_{lookback}"
            X[output_col] = log_returns.rolling(lookback).skew()
            output_cols.append(output_col)

        ## Kurtosis
        for lookback in [21, 63, 252]:
            output_col = f"kurtosis_{lookback}"
            X[output_col] = log_returns.rolling(lookback).kurt()
            output_cols.append(output_col)

        output = X[output_cols]

        return output.dropna().add_prefix("feature_")


class CompustatCatch22Transformer(TransformerMixin, BaseEstimator):
    def __init__(
        self,
    ):
        pass

    def transform(self, X):

        if X.shape[0] > 252:
            X["average_price"] = (
                (
                    X["adjusted_open"]
                    + X["adjusted_close"]
                    + X["adjusted_high"]
                    + X["adjusted_low"]
                )
                / 4
            ).astype(float)
            X["average_price"] = np.log(X["average_price"])

            catch22_short = list()
            catch22_mid = list()
            catch22_long = list()

            for df in X["average_price"].rolling(21):
                if df.shape[0] >= 21:
                    temp = pycatch22.catch22_all(df.values)
                    col_names = temp["names"]
                    catch22_short.append(temp["values"])
                else:
                    catch22_short.append([np.nan for i in range(22)])
            catch22_short_df = pd.DataFrame(catch22_short, columns=col_names)
            catch22_short_df = catch22_short_df.add_suffix("_short")
            catch22_short_df.index = X.index

            for df in X["average_price"].rolling(63):
                if df.shape[0] >= 63:
                    temp = pycatch22.catch22_all(df.values)
                    col_names = temp["names"]
                    catch22_mid.append(temp["values"])
                else:
                    catch22_mid.append([np.nan for i in range(22)])
            catch22_mid_df = pd.DataFrame(catch22_mid, columns=col_names)
            catch22_mid_df = catch22_mid_df.add_suffix("_mid")
            catch22_mid_df.index = X.index

            for df in X["average_price"].rolling(252):
                if df.shape[0] >= 252:
                    temp = pycatch22.catch22_all(df.values)
                    col_names = temp["names"]
                    catch22_long.append(temp["values"])
                else:
                    catch22_long.append([np.nan for i in range(22)])
            catch22_long_df = pd.DataFrame(catch22_long, columns=col_names)
            catch22_long_df = catch22_long_df.add_suffix("_long")
            catch22_long_df.index = X.index

            output = pd.concat(
                [
                    catch22_short_df,
                    catch22_mid_df,
                    catch22_long_df,
                ],
                axis=1,
            )
            return output.dropna().add_prefix("feature_")
        else:
            return pd.DataFrame()


class CompustatSignatureTransformer(TransformerMixin, BaseEstimator):
    def __init__(self, signature_level, lookbacks=[21, 63, 252]):
        self.signature_level = signature_level
        self.lookbacks = lookbacks
        self.signaturetransformers = dict()
        for lookback in lookbacks:
            self.signaturetransformers[lookback] = SignatureTransformer(
                lookback, signature_level
            )

    def transform(self, X):
        X["average_price"] = (
            (
                X["adjusted_open"]
                + X["adjusted_close"]
                + X["adjusted_high"]
                + X["adjusted_low"]
            )
            / 4
        ).astype(float)
        X["average_price"] = np.log(X["average_price"])
        selected_cols = ["average_price"]
        for smooth in [5, 21]:
            X[f"average_price_{smooth}"] = X["average_price"].rolling(smooth).mean()
            selected_cols.append(f"average_price_{smooth}")

        signature_outputs = list()
        for lookback in self.lookbacks:
            signature_outputs.append(
                self.signaturetransformers[lookback].transform(
                    X[selected_cols].astype(float).dropna()
                )
            )

        output = pd.concat(signature_outputs, axis=1)

        return output.dropna().add_prefix("feature_")

In [17]:
for sample_id in permnos:
    try:
        CRSP_single_stock = CRSP_data.xs(str(sample_id), level=1, axis=1).dropna(
            subset=["market_cap"]
        )
        transformer = CompustatSignatureTransformer(4)
        output = transformer.transform(
            CRSP_single_stock[
                [
                    "adjusted_open",
                    "adjusted_close",
                    "adjusted_high",
                    "adjusted_low",
                    "dollar_volume",
                    "market_cap",
                ]
            ]
            .dropna()
            .astype(float)
        )
        ans = add_industry_labels(output.copy(), sample_id)
        if ans.shape[0] > 0:
            ans.to_parquet(f"../signals-data/temp_signature/signature_{sample_id}.parquet")    
    except:
        pass 

In [None]:
os.mkdir('../signals-data/temp_signature')

In [18]:
ans_list = list()
files = glob.glob("../signals-data/temp_signature/*.parquet")
for file in files:
    ans = pd.read_parquet(file)
    ans_list.append(ans)
pd.concat(ans_list, axis=0).to_parquet(
    "../signals-data/numerai_signals_features_signature.parquet"
)

In [19]:
raw_features = pd.read_parquet(
    "../signals-data/numerai_signals_features_signature.parquet"
).sort_values("era")
normalised = list()
for i, df in raw_features.groupby("era"):
    print(i)
    df_new = df[~df.index.duplicated(keep=False)]
    feature_cols = [
        col for col in raw_features.columns if col.startswith("feature_")
    ]
    group_labels = [col for col in raw_features.columns if col.startswith("group_")]
    normalised.append(
        transform_era(
            df_new,
            feature_cols=feature_cols,
            group_cols=None,
        )
        .round()
        .astype(int)
    )
feature_normalised = pd.concat(normalised, axis=0)
feature_normalised.to_parquet(
    "../signals-data/numerai_signals_features_signature_normalised.parquet"
)    

2001-02-09 00:00:00
2001-02-16 00:00:00
2001-02-23 00:00:00
2001-03-02 00:00:00
2001-03-09 00:00:00
2001-03-16 00:00:00
2001-03-23 00:00:00
2001-03-30 00:00:00
2001-04-06 00:00:00
2001-04-13 00:00:00
2001-04-20 00:00:00
2001-04-27 00:00:00
2001-05-04 00:00:00
2001-05-11 00:00:00
2001-05-18 00:00:00
2001-05-25 00:00:00
2001-06-01 00:00:00
2001-06-08 00:00:00
2001-06-15 00:00:00
2001-06-22 00:00:00
2001-06-29 00:00:00
2001-07-06 00:00:00
2001-07-13 00:00:00
2001-07-20 00:00:00
2001-07-27 00:00:00
2001-08-03 00:00:00
2001-08-10 00:00:00
2001-08-17 00:00:00
2001-08-24 00:00:00
2001-08-31 00:00:00
2001-09-07 00:00:00
2001-09-14 00:00:00
2001-09-21 00:00:00
2001-09-28 00:00:00
2001-10-05 00:00:00
2001-10-12 00:00:00
2001-10-19 00:00:00
2001-10-26 00:00:00
2001-11-02 00:00:00
2001-11-09 00:00:00
2001-11-16 00:00:00
2001-11-23 00:00:00
2001-11-30 00:00:00
2001-12-07 00:00:00
2001-12-14 00:00:00
2001-12-21 00:00:00
2001-12-28 00:00:00
2002-01-04 00:00:00
2002-01-11 00:00:00
2002-01-18 00:00:00


2008-12-19 00:00:00
2008-12-26 00:00:00
2009-01-02 00:00:00
2009-01-09 00:00:00
2009-01-16 00:00:00
2009-01-23 00:00:00
2009-01-30 00:00:00
2009-02-06 00:00:00
2009-02-13 00:00:00
2009-02-20 00:00:00
2009-02-27 00:00:00
2009-03-06 00:00:00
2009-03-13 00:00:00
2009-03-20 00:00:00
2009-03-27 00:00:00
2009-04-03 00:00:00
2009-04-10 00:00:00
2009-04-17 00:00:00
2009-04-24 00:00:00
2009-05-01 00:00:00
2009-05-08 00:00:00
2009-05-15 00:00:00
2009-05-22 00:00:00
2009-05-29 00:00:00
2009-06-05 00:00:00
2009-06-12 00:00:00
2009-06-19 00:00:00
2009-06-26 00:00:00
2009-07-03 00:00:00
2009-07-10 00:00:00
2009-07-17 00:00:00
2009-07-24 00:00:00
2009-07-31 00:00:00
2009-08-07 00:00:00
2009-08-14 00:00:00
2009-08-21 00:00:00
2009-08-28 00:00:00
2009-09-04 00:00:00
2009-09-11 00:00:00
2009-09-18 00:00:00
2009-09-25 00:00:00
2009-10-02 00:00:00
2009-10-09 00:00:00
2009-10-16 00:00:00
2009-10-23 00:00:00
2009-10-30 00:00:00
2009-11-06 00:00:00
2009-11-13 00:00:00
2009-11-20 00:00:00
2009-11-27 00:00:00


2016-11-18 00:00:00
2016-11-25 00:00:00
2016-12-02 00:00:00
2016-12-09 00:00:00
2016-12-16 00:00:00
2016-12-23 00:00:00
2016-12-30 00:00:00
2017-01-06 00:00:00
2017-01-13 00:00:00
2017-01-20 00:00:00
2017-01-27 00:00:00
2017-02-03 00:00:00
2017-02-10 00:00:00
2017-02-17 00:00:00
2017-02-24 00:00:00
2017-03-03 00:00:00
2017-03-10 00:00:00
2017-03-17 00:00:00
2017-03-24 00:00:00
2017-03-31 00:00:00
2017-04-07 00:00:00
2017-04-14 00:00:00
2017-04-21 00:00:00
2017-04-28 00:00:00
2017-05-05 00:00:00
2017-05-12 00:00:00
2017-05-19 00:00:00
2017-05-26 00:00:00
2017-06-02 00:00:00
2017-06-09 00:00:00
2017-06-16 00:00:00
2017-06-23 00:00:00
2017-06-30 00:00:00
2017-07-07 00:00:00
2017-07-14 00:00:00
2017-07-21 00:00:00
2017-07-28 00:00:00
2017-08-04 00:00:00
2017-08-11 00:00:00
2017-08-18 00:00:00
2017-08-25 00:00:00
2017-09-01 00:00:00
2017-09-08 00:00:00
2017-09-15 00:00:00
2017-09-22 00:00:00
2017-09-29 00:00:00
2017-10-06 00:00:00
2017-10-13 00:00:00
2017-10-20 00:00:00
2017-10-27 00:00:00


In [None]:
if False:
    import shutil
    shutil.rmtree('../signals-data/temp_signature')

## Financials 

In [None]:
### Process Financial Ratios from Open Source AP  (2000 to 2020)

numerai_signals_metadata = pd.read_csv("data/numerai_signals_metadata_2021.csv")
financial_ratios = pd.read_parquet("data/numerai_financials_2021.parquet")
financial_ratios["rawdatadate"] = pd.to_datetime(
    financial_ratios["yyyymm"], format="%Y%m"
)

for sample_id in numerai_signals_metadata["permno"].unique():
    print(sample_id)
    single_stock = financial_ratios[
        financial_ratios["permno"] == sample_id
    ].sort_values("rawdatadate")
    if single_stock.shape[0] > 0:
        ## Data calculated at the end of month can be used for the following month
        single_stock["datadate"] = single_stock["rawdatadate"].shift(-1)
        single_stock.drop(
            [
                "rawdatadate",
                "permno",
                "yyyymm",
            ],
            axis=1,
            inplace=True,
        )
        single_stock.dropna(subset=["datadate"], inplace=True)
        single_stock_daily = (
            single_stock.set_index("datadate").resample("D").asfreq()
        )
        single_stock_daily = single_stock_daily.add_prefix("feature_")
        ans = add_industry_labels(single_stock_daily.copy(), sample_id)
        if ans.shape[0] > 0:
            output = ans[ans["era"] <= "2021-12-31"]
            output.to_parquet(
                f"data/temp_financials/financials_{sample_id}.parquet"
            )

del financial_ratios
gc.collect()

ans_list = list()
files = glob.glob("data/temp_financials/*.parquet")
for file in files:
    ans = pd.read_parquet(file)
    ans_list.append(ans)
pd.concat(ans_list, axis=0).to_parquet(
    "data/numerai_signals_features_financials.parquet"
)

In [None]:
## Financials
raw_features = pd.read_parquet(
    "data/numerai_signals_features_financials.parquet"
).sort_values("era")
normalised = list()
for i, df in raw_features.groupby("era"):
    df_new = df[~df.index.duplicated(keep=False)]
    print(i)
    feature_cols = [
        col for col in raw_features.columns if col.startswith("feature_")
    ]
    group_labels = [col for col in raw_features.columns if col.startswith("group_")]
    normalised.append(
        transform_era(df_new, feature_cols=feature_cols, group_labels=None)
        .round()
        .astype(int)
    )
feature_normalised = pd.concat(normalised, axis=0)
feature_normalised.to_parquet(
    "features/numerai_signals_features_financials_normalised.parquet"
)

### Sentiment 

In [3]:
import pandas as pd

In [20]:
taxnomy = pd.read_csv('../../../data/Ravenpack_rpa_taxonomy_2021.csv')
taxnomy.head() 

Unnamed: 0,topic,group,type,sub_type,property,fact_level,category,description,scheduled,valid_entity_types
0,business,products-services,accelerated-approval-application,,,fact,accelerated-approval-application,An Entity applies for an accelerated approval ...,False,"COMPANY,ORGANIZATION"
1,business,products-services,accelerated-approval-application,,authority,fact,accelerated-approval-application-authority,The Organization that reviews the accelerated ...,False,ORGANIZATION
2,business,products-services,accelerated-approval-application,,location,fact,accelerated-approval-application-location,The Place where the accelerated approval appli...,False,PLACE
3,business,products-services,accelerated-approval-application,,,opinion,accelerated-approval-application-opinion,A view or opinion is expressed about the Entit...,False,"COMPANY,ORGANIZATION"
4,business,products-services,accelerated-approval-application,,authority,opinion,accelerated-approval-application-opinion-autho...,A view or opinion is expressed about the regul...,False,ORGANIZATION


In [9]:
### Ravenpack
def read_ravenpack_equities(
    ra_folder="../../../data/Ravenpack", startyear=2000, endyear=2021, rp_entity_ids=None
):
    ravenpacks = list()
    for year in range(startyear, endyear + 1):
        for month in range(1, 13):
            filename = f"{ra_folder}/Ravenpack_equities_{year}_{month}.parquet"
            ravenpack = pd.read_parquet(filename)
            ravenpack = ravenpack[ravenpack["rp_entity_id"].isin(rp_entity_ids)]
            print(f"Reading Ravepack Equities {year} {month}")
            drop_cols = [
                "headline",
                "rpa_time_utc",
                "timestamp_utc",
                "rp_story_id",
                "product_key",
                "provider_id",
                "provider_story_id",
                "rp_story_event_index",
                "rp_story_event_count",
                "news_type",
                "rp_source_id",
                "source_name",
                "rp_position_id",
                "position_name",
            ]
            ravenpack_small = ravenpack.drop(drop_cols, axis=1)
            ## Filter important events
            ravenpack_important = ravenpack_small[
                (ravenpack_small["event_relevance"] >= 100)
                & (ravenpack_small["event_similarity_days"] >= 1)
                & (ravenpack_small["event_sentiment_score"] != 0)
            ]
            ravenpack_important.to_parquet(f'../signals-data/raw_ravenpack/ravenpack_filtered_{year}_{month}.parquet')
            ## Summarise data by event similar keys
            ravenpacks.append(ravenpack_important)
    return pd.concat(ravenpacks, axis=0)

In [16]:
from sklearn.base import TransformerMixin, BaseEstimator

class RavenpackSentimentTransformer(TransformerMixin, BaseEstimator):
    def __init__(
        self,
        reference_categories_file='../signals-data/ravenpack_category.csv'
    ):
        self.top_categories=pd.read_csv(reference_categories_file,index_col=0).head(200).index

    def transform(self, X):
        
        output_cols = list()

        X["rpa_date_utc"] = pd.to_datetime(X["rpa_date_utc"])
        daily_newssentiment = X.groupby("rpa_date_utc")[["event_sentiment_score"]].mean()
        dailyX = pd.DataFrame(index=daily_newssentiment.index)

        ## Event Sentiment
        for lookback in [
            5,
            21,
            63,
        ]:
            output_col = f"rp_EventSentiment_{lookback}"
            dailyX[output_col] = daily_newssentiment.rolling(lookback).mean()
            output_cols.append(output_col)
            
        ## Count by Category 
        for category in self.top_categories:
            X_category = X[X['category']==category]
            daily_categorysentiment = X_category.groupby("rpa_date_utc")[["event_sentiment_score"]].mean()    
            ## Event Sentiment
            for lookback in [
                63,
            ]:
                output_col = f"rp_Sentiment_{category}_{lookback}"
                dailyX[output_col] = daily_categorysentiment.rolling(lookback).mean()
                output_cols.append(output_col)          
            
        output = dailyX[output_cols]
        return output.add_prefix("feature_").fillna(0)

In [10]:
### Process Ravenpack

numerai_signals_metadata = pd.read_csv("../signals-data/numerai_signals_metadata_2021.csv")
rp_entity_ids = numerai_signals_metadata["rp_entity_id"].unique()
ravenpack = read_ravenpack_equities(rp_entity_ids=rp_entity_ids)

Reading Ravepack Equities 2000 1
Reading Ravepack Equities 2000 2
Reading Ravepack Equities 2000 3
Reading Ravepack Equities 2000 4
Reading Ravepack Equities 2000 5
Reading Ravepack Equities 2000 6
Reading Ravepack Equities 2000 7
Reading Ravepack Equities 2000 8
Reading Ravepack Equities 2000 9
Reading Ravepack Equities 2000 10
Reading Ravepack Equities 2000 11
Reading Ravepack Equities 2000 12
Reading Ravepack Equities 2001 1
Reading Ravepack Equities 2001 2
Reading Ravepack Equities 2001 3
Reading Ravepack Equities 2001 4
Reading Ravepack Equities 2001 5
Reading Ravepack Equities 2001 6
Reading Ravepack Equities 2001 7
Reading Ravepack Equities 2001 8
Reading Ravepack Equities 2001 9
Reading Ravepack Equities 2001 10
Reading Ravepack Equities 2001 11
Reading Ravepack Equities 2001 12
Reading Ravepack Equities 2002 1
Reading Ravepack Equities 2002 2
Reading Ravepack Equities 2002 3
Reading Ravepack Equities 2002 4
Reading Ravepack Equities 2002 5
Reading Ravepack Equities 2002 6
Read

Reading Ravepack Equities 2020 8
Reading Ravepack Equities 2020 9
Reading Ravepack Equities 2020 10
Reading Ravepack Equities 2020 11
Reading Ravepack Equities 2020 12
Reading Ravepack Equities 2021 1
Reading Ravepack Equities 2021 2
Reading Ravepack Equities 2021 3
Reading Ravepack Equities 2021 4
Reading Ravepack Equities 2021 5
Reading Ravepack Equities 2021 6
Reading Ravepack Equities 2021 7
Reading Ravepack Equities 2021 8
Reading Ravepack Equities 2021 9
Reading Ravepack Equities 2021 10
Reading Ravepack Equities 2021 11
Reading Ravepack Equities 2021 12


### Useful Categories 

Select Top Categories based on data in training period 

In [12]:
import datetime

In [13]:
ravenpack_train = ravenpack[ravenpack['rpa_date_utc']<datetime.datetime(2013,12,31).date()]
ravenpack_train['category'].value_counts().to_csv('../signals-data/ravenpack_category.csv')
ravenpack_train['category'].value_counts().head(200).index
ravenpack_train['entity_name'].value_counts().head(500)

Apple Inc.                 19196
Microsoft Corp.            18921
Ford Motor Co.             15949
Alphabet Inc.              15833
General Electric Co.       12977
                           ...  
Finisar Corp.               2510
C.R. Bard Inc.              2509
Emulex Corp.                2505
Trimble Navigation Ltd.     2505
PerkinElmer Inc.            2501
Name: entity_name, Length: 500, dtype: int64

In [22]:
import pandas as pd
pd.options.mode.chained_assignment = None

In [25]:
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [28]:
for sample_id in rp_entity_ids:
    print(sample_id)
    sample = ravenpack[ravenpack["rp_entity_id"] == sample_id]
    if type(sample) == pd.DataFrame:
        if sample.shape[0] > 0:
            transformer = RavenpackSentimentTransformer()
            output = transformer.transform(sample)
            ans = add_industry_labels(
                output.copy(),
                sample_id,
                sample_id_type="rp_entity_id",
            )
            if ans.shape[0] > 0:
                ans.to_parquet(f"../signals-data/temp_ravenpack/ravenpack_{sample_id}.parquet")

gc.collect()



ED79D9
EC821B
D93A25
2753BA
E48465
C38E3A
1F355F
CCE528
F163A7
8C9666
13C3E0
D518D8
1C4LN9
48F795
A94637
F39E1E
D8442A
DBE2CE
DF18E6
1B4F31
D2AC74
03B9F5
665D7D
7D85A9
8B2C4A
431DEB
490754
A87CD6
24A89A
900A69
492117
19F9BC
964A22
966DBE
9BF177
D65A13
02750A
76BD96
520632
9EED50
D5E3FF
275268
CE4E6D
227D48
6248C7
F751DC
789647
0BD8C4
A02FB1
94C0B5
317EC6
BE1B62
0BC6D8
ACDCFA
C87F45
7532EC
74448B
61BCA7
FC1A6E
A4FBB4
228924
4E5784
EF25A5
CD2078
03B431
03596A
3215CF
F4F354
91B8B1
B51823
0D779C
D9F34F
DF8D70
3A5113
37B120
324285
8B69DC
8786E3
7AA561
D2C287
BD92FD
BEF4F3
F9E164
F44849
E78692
C9881C
EE6473
06F889
ED21B9
E09997
65C4E6
E68C3D
B6320F
1627DE
2B7A40
D07B07
DDB616
D4B9FC
F88FA9
66ECFD
9D0252
7D4104
1B67F5
184988
D4DCFB
EF1AD9
64E346
D49AAF
97644E
9E98F2
7C790D
BE65E7
2EAEAF
B61583
1C8310
372CBE
68A15D
38A0C0
87BDAB
789A7D
3056CF
7E2521
1D943E
698FF9
780ADF
AC3263
2CC60C
D9B1C9
2082EA
9C8BC3
B8C294
F5117C
C69EDE
8EB29B
D1D9AF
213039
CDD5C2
6C0A2F
175363
9D71E7
3F774A
D1F6A4
C7AE14

14391F
F90EE5
328250
58B46F
43281D
A3062B
E05026
8ADC4F
B7385B
245E13
B05118
EEB5F9
0CD2C7
BC9103
1AB023
E12050
190CBE
A1B848
DB0E31
729E87
EC70EC
QOO7LB
B0998E
58E2FD
2FD2F9
0E63F3
C7C451
A38D68
D01BE7
055018
388E00
9465C9
236917
0FC848
5B15E1
717DB4
8B4C82
822E25
B7202A
5D0337
86E7F8
ED9C04
EC6E9C
75EE28
DA5ACF
F43A11
AEA0A9
CD2DA4
35557B
FE89E0
95A92E
2906AD
FE2D58
E07573
8A5630
B381D8
0BBAD0
B8EF97
23EFAD
F18844
2AA861
D04C62
2A4449
762A3E
23DF72
2BC176
543900
937DCC
3696BF
A6CBD9
70C9F1
DDBCF4
484A8D
83F6FE
0CCCB7
7F7FF1
BB17A6
41351A
467C65
8B4A45
1880C5
3B3E51
64E615
BFEA04
0B202E
DC5299
6BEAA0
F1B01F
D2FB67
7E2BC2
55CBA5
75D1A1
DCA984
DF46C5
4BFD72
DDCB34
BEC88D
B3586B
E71AE6
F32526
9A5F6B
EF0DE5
B73395
0555FF
D5B868
6E8349
F2D6C8
4D72C8
B13B68
BB127B
F88147
905356
BED43A
292388
42EFC7
69B3D8
0C355F
8B66CE
D374D9
5A64D5
9A29F4
D731FB
D4C0CB
F8B149
21F9CA
E5609A
36ECA4
EDF7B2
349DA4
A009C9
870DED
2E511F
12DE76
537594
10F29E
ED0402
810FAD
187229
11074E
01AA9C
292011
9548BB
E229EA

EF63B9
CE03DF
1E6021
108214
6DD6BA
CF0218
99990D
D0909F
9D2CE0
9DB9E2
0D1E8C
68E6E9
BEB756
86B99A
281305
CD051C
DF2BF0
0A2CDE
1DEBBE
B00BD3
9368BE
AF9895
BF3D8C
D854DD
CC1A5C
3B9886
766047
A1EAC8
9804F2
4B59D6
A56293
59D492
22B3F4
D29224
8EE5D6
601785
3A9C1F
86AA9C
60A790
724F84
040DBB
DD9E41
9E0811
BBD31D
66E04A
2037DC
B5A12B
FC9809
F61FE8
F8BBC5
1E3FD1
5CDEA8
E2CFD2
F42FAE
49914B
95F59E
6BF593
DF3287
07F43C
881190
90103A
689580
29869A
C356AC
42978B
DC486E
82A2EE
C951A2
1E5786
863350
399BA6
FF6644
108FDA
033B58
AFF43F
42851E
27F154
0F7B72
E4714D
039E10
1DE526
C887A8
DBB28E
5F0350
4AC91D
C80CC5
A94091
ABD425
AED61E
7F9BE2
56EFC7
8020F1
D854F3
545252
3471C0
C4A432
8946B2
CC3EC4
EE6F1C
A8FDE8
02FC1A
035C35
E22153
51BCB6
9F5997
CEF5B9
A7102F
68B03F
09062E
5604F9
A4E5E6
ADF1B0
9F03CF
735388
ED8676
5DA438
12812A
042C91
2EBC0B
5BE42C
F94F23
0D38DA
8BC8DF
BC71FF
7F6F0B
02F9D9
E6E012
A7CCF0
4458AA
55B4D2
1A65AB
AB0D0F
00067A
8FCA78
5CCDAD
A9F866
E60015
F2C9CB
850779
43298A
A0566D
B604DF
E69BA5

D64C6D
649CBD
D47D56
D90441
C1A8BA
03C2E7
89F693
3DB18B
90E378
DB9F70
0B4D10
CFF326
4F9351
8E9CD8
F721A7
EFE1AC
6DD4DB
FC6A50
8BFAA4
2BBE62
2893E8
419285
FC1B7B
792471
E1539B
5D02B7
28AC4E
CC0C78
D14977
0308A3
E4D56A
22DAF6
BB88B6
902255
C1EF2F
2F6782
62058A
389CFB
3390DE
FFC6C3
FD39EB
65AB30
726DDF
2D643C
B65A8A
41A93D
C09F0A
0425A7
63DB0A
F27358
73A213
FC72A1
293B05
3DD5E2
25DEA9
934CC3
525A1C
882426
DC0998
A2CCC7
FB7C4C
5D1329
777914
D0D4EE
59872F
4002BD
CBDB4D
B317AA
3CCC90
215817
3A9ECD
30DFD3
13011D
E5752F
DCAEA5
986AF6
60EA12
88193E
2C3093
D16D10
2C681C
11D2E3
E09E2B
12119D
610D47
0F2BC9
7ABE86
C29715
2E89AE
420CE9
999607
FC4652
B48FC9
A8DE1C
F95726
FA044A
17D3D5
317B34
C3BCD5
A8674F
86F123
6F26FF
DD1BA1
F66B58
C9D3F0
53E575
015515
3197F5
351EC3
9C82E1
D36B23
6ADD69
1AC284
BDE3D5
0E2992
C3F31B
875F41
A0D8E6
CC7DC7
003B70
2974D3
160825
01652B
4D971D
47752F
7FB79B
04DFFC
8F3231
1458C8
8951CA
C0A884
48B9E9
6B67BD
A79F7D
119CB6
B66928
BAFE5A
C624A1
10CAC8
34BE51
FC8733
102231
0C4F5C

65D0AA
32F943
4DF465
1CD4B5
0C6293
9E55EE
6AE66E
D616BC
B8BA75
F6A945
7F11B5
D3C794
F6FB19
D3A769
HK22VT
E0207A
E87C8C
BDE867
A48593
A2A20B
CC176E
67A42B
A9A026
1AB808
56A13C
E86379
4FFA2D
59C539
01CE44
804708
BF23B0
A2422A
9ECF97
1B7ACC
54510A
7E3390
DF1F7A
1D3EEA
CC3224
385DD5
20CE5B
5186C8
5675EB
20BEEA
F399F5
5DBC98
1A5957
F882AF
74D650
AE7CBC
80FB5F
D1588D
5704AC
3BEF73
63BB7B
4135F7
ED1FFF
7F9E16
152FA5
43EBC1
7371F8
8CBA5D
523039
B2C6AF
5BC2F4
ECE19E
789D75
808319
2720AB
6D2BF6
1D1B07
2EA7A5
159739
CB9E13
273BA3
2A9BB9
CEB009
7FA446
B6DE40
8CE4CA
64DE3B
9C0D3B
931E75
249C8E
66C9A7
B67F76
5EF711
50061B
6F73DA
BBB06B
009EF0
8DFDF2
855DF7
6F1835
14D340
CDAA0E
D953B3
28439E
5555B6
11A5F0
0E128A
54F15F
FA840E
73F7F9
52F96B
1FC422
6A8B40
C028D5
2D450D
DFB81B
CE1002
EB5E78
12731D
73529F
9FF741
92B7BD
384139
EB1E1A
F80B8E
83E0AC
0B4D07
62EFEE
39E412
254EC1
83598E
5BFE94
3E0E22
E63A0D
E3A22C
2031CE
35A5CD
6D3D88
3D9999
C6751C
C81E00
13A782
5C5D57
BBF349
B7103B
66A750
642057
65F629
E3D30A

NameError: name 'gc' is not defined

In [None]:
ans_list = list()
files = glob.glob("../signals-data/temp_ravenpack/*.parquet")
for file in files:
    ans = pd.read_parquet(file)
    ans_list.append(ans)
pd.concat(ans_list, axis=0).to_parquet(
    "../signals-data/numerai_signals_features_ravenpack_check.parquet"
)

del ans_list
gc.collect()

In [None]:
raw_features = pd.read_parquet(
    "../signals-data/numerai_signals_features_ravenpack.parquet"
).sort_values("era")
normalised = list()
for i, df in raw_features.groupby("era"):
    print(i)
    df_new = df[~df.index.duplicated(keep=False)]
    feature_cols = [
        col for col in raw_features.columns if col.startswith("feature_")
    ]
    group_labels = [col for col in raw_features.columns if col.startswith("group_")]
    normalised.append(
        transform_era(
            df_new,
            feature_cols=feature_cols,
            group_labels=None,
        )
        .round()
        .astype(int)
    )
feature_normalised = pd.concat(normalised, axis=0)
feature_normalised.to_parquet(
    "../signals-data/numerai_signals_features_ravenpack_normalised.parquet"
)