In [10]:
import os

# Project root: the parent of the directory the notebook is executed from.
root_dir = os.path.abspath('..')
# Directory the input CSV is read from and the prepared datasets are written to.
data_dir = os.path.join(root_dir, 'data')
# CSV holding the per-asset table that is loaded into `data` below.
save_path = os.path.join(data_dir, 'assets_info.csv')

In [11]:
import pandas as pd
import numpy as np

## Sliding Window & Target

Interestingly, the market is not open every day: it is closed on weekends and holidays.

On trading days, it is open only from 9:30am to 4pm (Eastern Time).


For this reason, I will treat all the available trading days as one continuous series, ignoring the gaps between them.

Within each day, I'll have 7 hourly data points (from 9:30am to 4pm).

In [12]:
# Load the per-asset table. Judging by the preview further down, each row holds the asset
# metadata plus a `data` column containing a stringified {Timestamp: price} dict.
data = pd.read_csv(save_path)

In [13]:
def create_sliding_windows(data, window_size, future_offset):
    """
    Create sliding windows and corresponding targets from time series data.

    Window ``i`` covers ``data[i : i + window_size]`` and its target is
    ``data[i + window_size + future_offset - 1]``, i.e. the value lying
    ``future_offset`` points after the last point of the window.

    Parameters:
    - data: array-like, time series data (e.g., closing prices), time along axis 0.
    - window_size: int >= 1, the number of points in each window (e.g., 30 days * 7 hourly points = 210).
    - future_offset: int >= 0, how many points after the window end the target lies (e.g., 7 for one trading day of hourly points).

    Returns:
    - X_windows: np.array whose rows are the windows; shape (n_windows, window_size) for 1D input.
    - y_targets: np.array of target values, one per window; shape (n_windows,) for 1D input.

    ``n_windows`` is ``max(0, len(data) - window_size - future_offset)``. When the series is
    too short, empty arrays are returned and X_windows still has ``window_size`` columns.

    NOTE: this count leaves the last admissible start index
    (``len(data) - window_size - future_offset``) unused. It is kept as is because the
    example cell that follows asserts on exactly this number of windows.

    Raises:
    - ValueError: if ``window_size < 1`` or ``future_offset < 0``.
    """
    if window_size < 1:
        raise ValueError(f'window_size must be >= 1, got {window_size}')
    if future_offset < 0:
        raise ValueError(f'future_offset must be >= 0, got {future_offset}')

    data = np.asarray(data)
    n_windows = max(0, len(data) - window_size - future_offset)

    # Start position of every window
    starts = np.arange(n_windows)

    # Row i of `window_idx` is [i, i + 1, ..., i + window_size - 1]
    window_idx = starts[:, None] + np.arange(window_size)[None, :]

    # Target position: `future_offset` points after the last point of the window
    target_idx = starts + (window_size + future_offset - 1)

    # Integer-array indexing copies, so the results do not alias `data`
    X_windows = data[window_idx]
    y_targets = data[target_idx]

    return X_windows, y_targets


In [14]:
# Example time series data: a ramp where data_[k] == k, so windows and targets are easy to verify
data_ = np.arange(1000)  # Replace with your actual time series data (close prices)

# Set the parameters
window_size = 30 * 7  # 30 days * 7 hours = 210 points per window
future_offset = 7  # Target is the price 7 hours later

# Call the function
X_windows, y_targets = create_sliding_windows(data_, window_size, future_offset)

# Check the output
print(f'X_windows shape: {X_windows.shape}')  # Each row is a window of `window_size` (210) points
print(f'y_targets shape: {y_targets.shape}')  # Corresponding targets

# Derive the expected count from the inputs instead of repeating the series length
expected_n_windows = len(data_) - window_size - future_offset

assert X_windows.shape[0] == y_targets.shape[0], 'Number of windows and targets do not match!'
assert X_windows.shape == (expected_n_windows, window_size), 'Incorrect shape for X_windows!'
# On the ramp, the first target must sit `future_offset` points after the end of the first window
assert X_windows[0, 0] == 0 and y_targets[0] == window_size + future_offset - 1, 'Incorrect target position!'

X_windows shape: (783, 210)
y_targets shape: (783,)


In [15]:
data.head(2)

Unnamed: 0,symbol,sector,industry,country,data
0,X,Basic Materials,Steel,United States,"{Timestamp('2024-03-04 09:30:00-0500', tz='Ame..."
1,HRL,Consumer Defensive,Packaged Foods,United States,"{Timestamp('2024-03-04 09:30:00-0500', tz='Ame..."


In [16]:
from pandas import Timestamp


# Metadata columns that are copied from an asset row into every sample built from it.
FEATURE_COLS = ['symbol', 'sector', 'industry', 'country']
def get_features(data_row):
    """Return the metadata fields listed in FEATURE_COLS of one asset row as a dict."""
    metadata = data_row.loc[FEATURE_COLS]
    return metadata.to_dict()

def get_values(data_row):
    """Parse the stringified {Timestamp: price} dict of one asset row into an array of prices.

    Parameters:
    - data_row: row-like object whose 'data' entry is the `str()` of a dict keyed by
      pandas Timestamps.

    Returns:
    - np.array with the dict values, in the dict's insertion order.
    """
    # Imported here so the function does not depend on a separate import cell having run.
    from pandas import Timestamp

    data_str = data_row['data'] # str(dict)

    # SECURITY: eval() executes whatever expression the CSV contains. The namespace is
    # restricted to the names a stringified price dict needs (no builtins), which blocks
    # the obvious abuse but is NOT a sandbox: only load files you trust. Storing the
    # series in a real serialisation format (JSON/parquet) would remove the need for eval.
    allowed_names = {
        '__builtins__': {},
        'Timestamp': Timestamp,
        # str() of a float dict can contain these bare tokens
        'nan': float('nan'),
        'inf': float('inf'),
    }
    data_dict = eval(data_str, allowed_names)
    return np.array(list(data_dict.values()))

data_row_ = data.iloc[0]
features = get_features(data_row_)
values = get_values(data_row_)
features, values[:3]

({'symbol': 'X',
  'sector': 'Basic Materials',
  'industry': 'Steel',
  'country': 'United States'},
 array([47.45000076, 47.45500183, 47.77999878]))

In [17]:
from copy import deepcopy

def row_to_samples(data_row, window_size, future_offset):
    """Lazily turn one asset row into flat samples, one per sliding window.

    Every yielded dict contains the asset's metadata fields, the window points under the
    keys 'value_0' ... 'value_{window_size - 1}', and the matching 'target'.
    """
    static_fields = get_features(data_row)
    prices = get_values(data_row)
    windows, targets = create_sliding_windows(prices, window_size, future_offset)

    for window, target in zip(windows, targets):
        # Each sample gets its own copy of the metadata
        sample = deepcopy(static_fields)
        for j in range(window_size):
            sample[f'value_{j}'] = window[j]
        sample['target'] = target
        yield sample
    

# Set the parameters
# (same values as in the sliding-window example above; `window_size` is read again
# further down when N_values is set)
window_size = 30 * 7  # 30 days * 7 hours = 210 points per window
future_offset = 7  # Target is the price 7 hours later

# Get the first row of data
data_row_ = data.iloc[0]
# NOTE(review): `cnt` is not used anywhere in the visible notebook — presumably a leftover; confirm before removing.
cnt = 10
# Print only the first generated sample, then stop consuming the generator
for sample in row_to_samples(data_row_, window_size, future_offset):
    print(sample)
    break

{'symbol': 'X', 'sector': 'Basic Materials', 'industry': 'Steel', 'country': 'United States', 'value_0': 47.45000076293945, 'value_1': 47.45500183105469, 'value_2': 47.779998779296875, 'value_3': 47.814998626708984, 'value_4': 47.78499984741211, 'value_5': 47.755001068115234, 'value_6': 47.69499969482422, 'value_7': 47.650001525878906, 'value_8': 47.939998626708984, 'value_9': 48.005001068115234, 'value_10': 48.03499984741211, 'value_11': 48.07500076293945, 'value_12': 48.04499816894531, 'value_13': 47.7599983215332, 'value_14': 47.36000061035156, 'value_15': 47.38999938964844, 'value_16': 47.20000076293945, 'value_17': 47.1150016784668, 'value_18': 47.16999816894531, 'value_19': 47.229000091552734, 'value_20': 47.20000076293945, 'value_21': 47.439998626708984, 'value_22': 47.5, 'value_23': 47.32500076293945, 'value_24': 47.39500045776367, 'value_25': 47.119998931884766, 'value_26': 47.349998474121094, 'value_27': 47.56999969482422, 'value_28': 47.53499984741211, 'value_29': 47.6549987

In [18]:
# data --> dataframe

def data_to_samples(data, window_size, future_offset):
    """Stream the samples of every asset row in `data`, one asset after another."""
    for _row_label, asset_row in data.iterrows():
        for sample in row_to_samples(asset_row, window_size, future_offset):
            yield sample

# Materialise the whole sample stream as a single DataFrame (one row per window)
df = pd.DataFrame(data_to_samples(data, window_size, future_offset))

In [19]:
print(f'Number of samples: {len(df)}')

Number of samples: 127410


In [20]:
df.head(5)

Unnamed: 0,symbol,sector,industry,country,value_0,value_1,value_2,value_3,value_4,value_5,...,value_201,value_202,value_203,value_204,value_205,value_206,value_207,value_208,value_209,target
0,X,Basic Materials,Steel,United States,47.450001,47.455002,47.779999,47.814999,47.785,47.755001,...,40.994999,41.34,41.330002,41.240002,41.209999,40.9076,40.93,40.724998,40.830002,40.299999
1,X,Basic Materials,Steel,United States,47.455002,47.779999,47.814999,47.785,47.755001,47.695,...,41.34,41.330002,41.240002,41.209999,40.9076,40.93,40.724998,40.830002,40.450001,40.599998
2,X,Basic Materials,Steel,United States,47.779999,47.814999,47.785,47.755001,47.695,47.650002,...,41.330002,41.240002,41.209999,40.9076,40.93,40.724998,40.830002,40.450001,40.52,40.5
3,X,Basic Materials,Steel,United States,47.814999,47.785,47.755001,47.695,47.650002,47.939999,...,41.240002,41.209999,40.9076,40.93,40.724998,40.830002,40.450001,40.52,40.380001,40.134998
4,X,Basic Materials,Steel,United States,47.785,47.755001,47.695,47.650002,47.939999,48.005001,...,41.209999,40.9076,40.93,40.724998,40.830002,40.450001,40.52,40.380001,40.34,40.209999


## Target

I will classify a change larger in magnitude than `std / denominator` as 1 (up) or -1 (down), and every other change as 0. \
The denominator is chosen so that the combined number of 1s and -1s is approximately equal to the number of 0s.


In [21]:
N_values = window_size

def get_change_over_std(data_row, n_values=None):
    """Return the price change from the last window point to the target, in units of the window's std.

    Parameters:
    - data_row: row-like object with entries 'value_0' ... 'value_{n_values - 1}' and 'target'.
    - n_values: int, number of window points to read; defaults to the notebook-level `N_values`.

    Returns:
    - float, ``(target - last window value) / std(window)`` using the population std.
      For a flat window (std == 0) the ratio is undefined, so 0.0 is returned when the
      target equals the last value and +/-inf (sign of the change) otherwise, instead of
      letting 0/0 produce NaN.
    """
    if n_values is None:
        n_values = N_values

    std = np.std([data_row[f'value_{i}'] for i in range(n_values)])
    pre_last_price = data_row[f'value_{n_values-1}']
    last_price = data_row['target']
    change = last_price - pre_last_price

    if std == 0:
        # np.sign keeps NaN as NaN, so a missing target is not silently turned into a number
        return 0.0 if change == 0 else np.sign(change) * np.inf
    return change / std

# A move counts as "significant" when it exceeds std / DENOMINATOR in magnitude.
DENOMINATOR = 4
def get_target(data_row):
    """Return the class label for a given row of data.

    Returns:
    - 0 when the std-normalised change is smaller than 1 / DENOMINATOR in magnitude,
      or when it is undefined (NaN);
    - 1 for a larger upward move, -1 for a larger downward move.
    """
    change_over_std = get_change_over_std(data_row)

    # Every comparison with NaN is False, so without this guard an undefined ratio
    # would fall through to the final `else` and be labelled -1.
    if np.isnan(change_over_std) or abs(change_over_std) * DENOMINATOR < 1:
        return 0
    return 1 if change_over_std > 0 else -1

# df['target_class'] = df.apply(get_target, axis=1)
df.apply(get_change_over_std, axis=1).apply(abs).describe()

count    127410.000000
mean          0.431984
std           0.489741
min           0.000000
25%           0.129276
50%           0.294078
75%           0.568306
max           9.862311
dtype: float64

In [22]:
# Label every sample with its class (-1, 0 or 1) by applying get_target row by row.
df['target_class'] = df.apply(get_target, axis=1)

In [23]:
df.target_class.value_counts()

target_class
 0    56380
 1    38230
-1    32800
Name: count, dtype: int64

## Additional preprocessing

Before moving to clustering and adding time-series based features, I will preprocess the data a bit:

* One-hot encode (OHE) the `sector`, `industry`, `country` columns
* Split the data into train and test (test = 0.15)
  Motivation: I want the test set to contain samples from assets the model has never seen before.
* Scale the data


#### Splitting the data

The test set should contain tickers that are not in the train set. Since the samples are ordered by ticker, I take the last `test_split` fraction of the samples as the test set, without shuffling: \
`shuffle=False`

In [24]:
from sklearn.model_selection import train_test_split

test_split = 0.15

# A purely positional cut at (1 - test_split) almost always lands inside the block of
# samples of one ticker, putting that ticker in both train and test. Since the goal is a
# test set of unseen tickers, move the cut to the nearest position where the ticker changes.
symbols = df['symbol'].to_numpy()
# Row positions at which a new ticker starts (row 0 is excluded: it would leave train empty)
ticker_starts = np.flatnonzero(symbols[1:] != symbols[:-1]) + 1
if len(ticker_starts) == 0:
    raise ValueError('Need at least two tickers to build a ticker-disjoint train/test split')

# Where train_test_split would cut for a fractional test_size
naive_cut = len(df) - int(np.ceil(len(df) * test_split))
cut = int(ticker_starts[np.abs(ticker_starts - naive_cut).argmin()])

# An integer test_size is the absolute number of test rows; shuffle=False keeps the row order
df_train, df_test = train_test_split(df, test_size=len(df) - cut, shuffle=False)

# Independent copies, so that later column assignments do not act on slices of `df`
df_train, df_test = df_train.copy(), df_test.copy()

assert set(df_train['symbol']).isdisjoint(df_test['symbol']), 'Train and test share tickers!'

#### OHE

In [25]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [26]:
ohe_cols = ['sector', 'industry', 'country']

# Number of distinct categories per column in the training split
for column_name, n_distinct in df_train[ohe_cols].nunique().items():
    print(column_name, n_distinct)

sector 11
industry 79
country 19


In [27]:
df_train['industry'].value_counts()

industry
Asset Management                    9819
Oil & Gas Equipment & Services      4007
REIT - Retail                       3340
Banks - Regional                    3336
Medical Instruments & Supplies      3328
                                    ... 
Utilities - Diversified              660
Telecom Services                     649
Other Industrial Metals & Mining     634
Industrial Distribution              454
Pharmaceutical Retailers             282
Name: count, Length: 79, dtype: int64

In [28]:
# Training samples per industry, largest first
industry_size = df_train.groupby('industry').size().sort_values(ascending=False)

# Industries with fewer than 850 training samples are merged into one catch-all category
is_small = (industry_size < 850).to_numpy()
small_industries = industry_size.index[is_small]
print("Amount of small industries:", len(small_industries))

# The list is derived from train only and then applied to both splits
for split_frame in (df_train, df_test):
    rare_rows = split_frame['industry'].isin(small_industries)
    split_frame.loc[rare_rows, 'industry'] = "default_industry"

Amount of small industries: 40


In [29]:
# Fit the encoder on train only; categories unseen during fit are ignored at transform time
ohe_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe_encoder.fit(df_train[ohe_cols])

# Both encoded frames share the same generated column names
ohe_feature_names = ohe_encoder.get_feature_names_out(ohe_cols)

train_encoded = ohe_encoder.transform(df_train[ohe_cols])
test_encoded = ohe_encoder.transform(df_test[ohe_cols])

df_train_ohe = pd.DataFrame(train_encoded, columns=ohe_feature_names)
df_test_ohe = pd.DataFrame(test_encoded, columns=ohe_feature_names)

In [30]:
df_train_ohe.shape

(108298, 70)

In [31]:
df_train_ohe.head(2)

Unnamed: 0,sector_Basic Materials,sector_Communication Services,sector_Consumer Cyclical,sector_Consumer Defensive,sector_Energy,sector_Financial Services,sector_Healthcare,sector_Industrials,sector_Real Estate,sector_Technology,...,country_India,country_Ireland,country_Israel,country_Japan,country_Luxembourg,country_Netherlands,country_Philippines,country_South Korea,country_United Kingdom,country_United States
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


#### More simple features (before normalization)

In [32]:
# Row-wise summary statistics; each key becomes the name of an output column.
cfg = {
    'mean': np.mean,
    'sum': np.sum,
    'std': np.std,
    'min': np.min,
    'max': np.max,
}

def extract_tsfel_simple(data):
    """Compute every statistic in `cfg` along axis 1 of `data`.

    Returns a DataFrame with one row per input row and one column per statistic,
    in the order of `cfg`.
    """
    stat_names = list(cfg)
    per_stat = [cfg[name](data, axis=1) for name in stat_names]

    # One entry per statistic -> transpose so that statistics become columns
    return pd.DataFrame(np.array(per_stat).T, columns=stat_names)

# Names of the window columns: value_0 ... value_{N_values - 1}
scale_cols = [f'value_{i}' for i in range(N_values)]
# The statistics are computed here, before the scaling cell below overwrites these columns
train_stat_features = extract_tsfel_simple(df_train[scale_cols])
test_stat_features = extract_tsfel_simple(df_test[scale_cols])
train_stat_features.shape

(108298, 5)

#### Scaling

In [33]:
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

# scale along axis=1, i.e. each row (one time series) on its own
# NOTE(review): presumably tslearn normalises every series independently to mean 0 / std 1;
# the sanity-check cell below spot-checks exactly that. Confirm the behaviour for constant windows.
scaler = TimeSeriesScalerMeanVariance(mu=0., std=1.)
# reshape brings the scaler output back to a 2D (n_samples, N_values) layout
X_train_scaled = scaler.fit_transform(df_train[scale_cols]).reshape(-1, N_values)
X_test_scaled = scaler.transform(df_test[scale_cols]).reshape(-1, N_values)

# Built from plain arrays, so both frames get a fresh 0..n-1 index and integer column labels
X_train_scaled = pd.DataFrame(X_train_scaled)
X_test_scaled = pd.DataFrame(X_test_scaled)

Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


In [34]:
# Spot-check that randomly chosen scaled series have (approximately) zero mean and unit std.
# Row counts are taken from the scaled frames themselves: `X_train` is only created in a
# later cell, so referencing it here raises NameError on a fresh kernel.
n_checks = 25
for i in range(n_checks):
    random_row = np.random.randint(0, len(X_train_scaled))
    assert np.isclose(X_train_scaled.iloc[random_row].values.mean(), 0, atol=1e-5), 'Mean of the time series is not close to 0!'
    assert np.isclose(X_train_scaled.iloc[random_row].values.std(), 1, atol=1e-5), 'Standard deviation of the time series is not close to 1!'

    random_row = np.random.randint(0, len(X_test_scaled))
    assert np.isclose(X_test_scaled.iloc[random_row].values.mean(), 0, atol=1e-5), 'Mean of the time series is not close to 0!'
    assert np.isclose(X_test_scaled.iloc[random_row].values.std(), 1, atol=1e-5), 'Standard deviation of the time series is not close to 1!'

In [82]:
# Rebind df_test to an explicit copy before its columns are overwritten below
# (presumably added to avoid pandas' SettingWithCopyWarning — confirm).
df_test = df_test.copy()

In [85]:
df_test.dtypes

symbol        object
sector        object
industry      object
country       object
target       float64
              ...   
value_205    float64
value_206    float64
value_207    float64
value_208    float64
value_209    float64
Length: 216, dtype: object

In [92]:
# Overwrite the raw window columns with their scaled versions.
# Assign plain arrays (`.values`) so rows are matched by position. Assigning a DataFrame
# aligns on the index instead: X_*_scaled carry a fresh 0..n-1 index, so this only works for
# a frame whose own index is 0..n-1 and produces NaN otherwise. That is the "strange bug"
# seen with df_test, whose index starts at the split point.
df_train[scale_cols] = X_train_scaled.values
df_test[scale_cols] = X_test_scaled.values

#### Final dataset

In [94]:
df_train.head(2)

Unnamed: 0,symbol,sector,industry,country,value_0,value_1,value_2,value_3,value_4,value_5,...,value_202,value_203,value_204,value_205,value_206,value_207,value_208,value_209,target,target_class
0,X,Basic Materials,Steel,United States,1.674574,1.676223,1.78336,1.794898,1.785008,1.775119,...,-0.339624,-0.34292,-0.372589,-0.382479,-0.482167,-0.474783,-0.542363,-0.507748,40.299999,0
1,X,Basic Materials,Steel,United States,1.697054,1.804816,1.816421,1.806474,1.796527,1.776632,...,-0.333868,-0.36371,-0.373658,-0.473927,-0.4665,-0.534474,-0.499657,-0.625657,40.599998,0


In [95]:
df_test.head(2)

Unnamed: 0,symbol,sector,industry,country,target,target_class,value_0,value_1,value_2,value_3,...,value_200,value_201,value_202,value_203,value_204,value_205,value_206,value_207,value_208,value_209
108298,BFK,Financial Services,Asset Management,United States,9.9435,0,0.954421,1.044754,1.052608,0.996972,...,-1.696732,-1.664001,-1.631925,-1.598539,-1.696732,-1.826994,-1.762187,-1.794918,-1.925842,-1.991298
108299,BFK,Financial Services,Asset Management,United States,9.96,0,1.052026,1.059836,1.004514,0.99475,...,-1.641429,-1.609534,-1.576337,-1.673975,-1.803501,-1.739061,-1.771607,-1.901791,-1.966877,-1.804153


In [96]:
target_col = 'target_class'
# Raw categorical columns, the ticker and the raw target price are not model inputs
drop_cols = ohe_cols + ['symbol', 'target']
non_feature_cols = drop_cols + [target_col]

# Labels
y_train, y_test = df_train[target_col], df_test[target_col]

# Time-series inputs: only the scaled window columns remain
X_train = df_train.drop(columns=non_feature_cols)
X_test = df_test.drop(columns=non_feature_cols)

# Tabular inputs: one-hot metadata next to the per-window summary statistics
X_simple_features_train = pd.concat([df_train_ohe, train_stat_features], axis=1)
X_simple_features_test = pd.concat([df_test_ohe, test_stat_features], axis=1)

In [97]:
print(X_train.shape, X_test.shape)

(108298, 210) (19112, 210)


In [98]:
print(X_simple_features_train.shape, X_simple_features_test.shape)

(108298, 75) (19112, 75)


In [99]:
# save
import os

# File name -> table; written in this order into `data_dir`, without the index column
outputs = {
    'X_train.csv': X_train,
    'X_test.csv': X_test,
    'X_simple_features_train.csv': X_simple_features_train,
    'X_simple_features_test.csv': X_simple_features_test,
    'y_train.csv': y_train,
    'y_test.csv': y_test,
}

for file_name, table in outputs.items():
    table.to_csv(os.path.join(data_dir, file_name), index=False)