### Split + Standardize + KNN Imputation + Autoregressive feature generation

In [4]:
# Helper functions
import pandas as pd
import numpy as np
from tqdm import tqdm

import pandas as pd, warnings
# Silence pandas' SettingWithCopyWarning globally.
# NOTE(review): the class is looked up defensively because newer pandas releases
# (Copy-on-Write by default) may no longer expose it — confirm against the installed version.
_setting_with_copy_warning = getattr(pd.errors, "SettingWithCopyWarning", None)
if _setting_with_copy_warning is not None:
	warnings.simplefilter(action='ignore', category=_setting_with_copy_warning)

# Folder holding the CSV inputs and outputs of this notebook (relative path).
path_to_data_dir = "../../data/"

def load_local_data():
	"""
	Read the three project CSV files found under `path_to_data_dir`.

	Each file is indexed by its ("date", "ticker") columns after loading.

	Returns
	-------
	tuple of three DataFrames, in this order:
		raw_features.csv, target_adjusted_features.csv, target.csv
	"""
	index_columns = ["date", "ticker"]
	file_names = ("raw_features.csv", "target_adjusted_features.csv", "target.csv")
	frames = [
		pd.read_csv(path_to_data_dir + file_name).set_index(index_columns)
		for file_name in file_names
	]
	return tuple(frames)

def split_date(all_dates, train_size = 0.75, days_ahead = 5):
	"""
	Cut an ordered, sliceable collection of dates into three consecutive parts.

	The last `days_ahead` entries are left out when sizing the training part:
	the training part holds int((len(all_dates) - days_ahead) * train_size) entries,
	the validation part holds the next `days_ahead` entries, and the test part
	holds everything after that.

	Returns
	-------
	(train_dates, val_dates, test_dates) : slices of `all_dates`
	"""
	usable_count = len(all_dates) - days_ahead
	train_end = int(usable_count * train_size)
	val_end = train_end + days_ahead

	return all_dates[:train_end], all_dates[train_end:val_end], all_dates[val_end:]

In [5]:
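# y to classification (to_multi_classification is used when loading the data below; to_binary_classification is an unused alternative)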

def to_multi_classification(y_, threshold = 0.02):
	"""
	Bucket continuous target values into three integer classes.

	-thresh < value < thresh : class 0
	value <= -thresh : class 1
	thresh <= value : class 2

	NaN values compare False against both bounds and therefore land in class 0.
	With threshold == 0 a value of exactly 0 is assigned class 2.

	Parameters
	----------
	y_ : pandas Series or single-column DataFrame of numeric values
	threshold : non-negative half-width of the class-0 band (default 0.02)

	Returns
	-------
	pandas Series of int32 class labels carrying the index of `y_`

	Raises
	------
	ValueError if `threshold` is negative (the class bands would overlap).
	"""
	if threshold < 0:
		raise ValueError("threshold must be non-negative")

	# Work on a flat float array so Series and (n, 1) DataFrame inputs behave the same.
	values = np.asarray(y_, dtype=float).ravel()

	labels = np.zeros(values.shape[0], dtype=np.int32)
	labels[values <= -threshold] = 1
	labels[values >= threshold] = 2

	return pd.Series(labels, index = y_.index)

def to_binary_classification(y_, threshold = 0.01):
	"""
	Flag target values that lie strictly above a threshold.

	value > thresh : True
	value <= thresh (or NaN) : False

	Parameters
	----------
	y_ : pandas object (or anything supporting `>` against a number)
	threshold : cut-off value (default 0.01)

	Returns
	-------
	Element-wise boolean result of `y_ > threshold`, same shape and index as `y_`
	"""
	return (y_ > threshold)

In [6]:
# Load data

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

df, X_df, y_series = load_local_data()

# The earliest dates are left out of every split; the count is kept as a named
# constant because the reason for skipping them is not recorded in this notebook.
NUM_LEADING_DATES_SKIPPED = 2
all_dates = X_df.index.get_level_values(0).unique().sort_values()

train_dates, val_dates, test_dates = split_date(all_dates[NUM_LEADING_DATES_SKIPPED:])
print(f"# of train dates ({len(train_dates)}), val dates ({len(val_dates)}), test dates ({len(test_dates)})")

X_df = X_df.fillna({"sector": "none", "area": "none"})

y_clf_series = to_multi_classification(y_series)
# y_clf_series = to_binary_classification(y_series)

def _select_split(dates):
	"""
	Return (X, y) for the given dates.

	y is looked up with X's own (date, ticker) index so both share the same rows
	in the same order; a row of X with no label raises KeyError instead of
	silently misaligning features and targets.
	"""
	X_part = X_df.loc[dates]
	y_part = y_clf_series.loc[X_part.index]
	return X_part, y_part

X_train, y_train = _select_split(train_dates)
X_val, y_val = _select_split(val_dates)
X_test, y_test = _select_split(test_dates)

print(f"Train shapes: {X_train.shape, y_train.shape}")
print(f"Val shapes: {X_val.shape, y_val.shape}")
print(f"Test shapes: {X_test.shape, y_test.shape}")


# of train dates (19), val dates (5), test dates (7)
Train shapes: ((29745, 339), (29745,))
Val shapes: ((7778, 339), (7778,))
Test shapes: ((11263, 339), (11263,))


In [9]:
# Flag sparse features: columns whose share of NaN in the training split is >= 60%
# (these are the columns left out of the numeric feature list later on)
nan_share_per_column = X_train.isna().sum() / len(X_train)
nan_gte_60_pct = X_train.columns[nan_share_per_column >= .6]

In [10]:
# Display the training frame's shape next to the number of sparse (>= 60% NaN) columns
X_train.shape, nan_gte_60_pct.shape

((29745, 339), (23,))

In [11]:
# Define preprocessor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

# Categorical columns get one-hot encoded; every remaining column that is not
# flagged as sparse goes through the numeric pipeline.
onehot_features = ["sector", "area"]
_excluded_from_scaling = set(onehot_features).union(nan_gte_60_pct)
standard_scale_features = [col for col in X_train.columns.values if col not in _excluded_from_scaling]

# Numeric pipeline: standardize first, then fill remaining gaps with KNN imputation
# (both steps with their default settings).
numerical_transformer = make_pipeline(StandardScaler(), KNNImputer())

# Columns not listed in either transformer (the sparse ones) are dropped.
preprocessor = ColumnTransformer(
	transformers = [
		("numericTransform", numerical_transformer, standard_scale_features),
		("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), onehot_features),
	],
	remainder="drop",
)

In [14]:
# Fit transform data
# The preprocessor is fitted on the training split only; the validation and test
# splits are transformed with that fitted preprocessor.
prep_X_train = preprocessor.fit_transform(X_train)
prep_X_val, prep_X_test = (preprocessor.transform(split) for split in (X_val, X_test))

In [18]:
# Wrap the transformed arrays in DataFrames: output feature names as columns,
# each split's original index as the row index.
feature_names_out = preprocessor.get_feature_names_out()
prep_X_train_df = pd.DataFrame(prep_X_train, index=X_train.index, columns=feature_names_out)
prep_X_val_df = pd.DataFrame(prep_X_val, index=X_val.index, columns=feature_names_out)
prep_X_test_df = pd.DataFrame(prep_X_test, index=X_test.index, columns=feature_names_out)

In [19]:
# Preview the first rows of the preprocessed training frame
prep_X_train_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,numericTransform__altman_z_score,numericTransform__analysts_down,numericTransform__analysts_down_percent,numericTransform__analysts_down_percent_avg_5y,numericTransform__analysts_up,numericTransform__analysts_up_percent,numericTransform__analysts_up_percent_avg_5y,numericTransform__assets_turnover,numericTransform__assets_turnover_avg_5y,numericTransform__authors_count,...,onehot__area_Software,onehot__area_Steel,onehot__area_Telecommunication Services,onehot__area_Tobacco Products,onehot__area_Transportation,onehot__area_Travel & Leisure,onehot__area_Utilities - Independent Power Producers,onehot__area_Utilities - Regulated,onehot__area_Vehicles & Parts,onehot__area_Waste Management
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2023-10-02,AEO,-0.037471,-0.866712,-1.2148,0.987353,0.406119,1.2148,-0.987353,1.234098,1.213896,-0.185812,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-10-02,AAPL,0.33801,0.622037,-0.642618,-1.331685,3.06139,0.642618,1.331685,0.785715,0.421125,8.289433,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023-10-02,AEE,-0.092572,-0.680618,-0.571095,-0.913375,-0.432387,0.571095,0.913375,-0.726254,-0.702846,-0.684356,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2023-10-02,AEP,-0.185198,-0.308431,0.07261,-0.982961,-0.432387,-0.07261,0.982961,-0.724739,-0.683713,0.06346,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2023-10-02,AES,-0.230088,0.063756,1.360019,-0.17736,-0.851641,-1.360019,0.17736,-0.521993,-0.525313,-0.684356,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [17]:
# Display (rows, columns) of the preprocessed training frame
prep_X_train_df.shape

(29745, 380)

In [None]:
# Write each preprocessed split to a CSV file in the data folder, index included.
for _file_name, _frame in (
	("prep_X_train.csv", prep_X_train_df),
	("prep_X_val.csv", prep_X_val_df),
	("prep_X_test.csv", prep_X_test_df),
):
	_frame.to_csv(path_to_data_dir + _file_name, index = True)