In [14]:
%load_ext autoreload
%autoreload 2
# add the parent directory to sys.path (the working directory itself is not changed)
import sys
sys.path.append('../')

import pandas as pd
from khronos import transform
import jax
import jax.numpy as jnp
import flax.linen as nn


In [39]:
import os

# Project root = parent of the notebook's working directory.
# `__file__` is normally undefined in a notebook kernel, so the root is derived
# from the current working directory. The previous quoted '__file__' literal
# resolved against the cwd as well, so the resulting paths are unchanged.
BASE_DIR = os.path.dirname(os.getcwd())
ARTIFACT_DIR = f'{BASE_DIR}/artifacts'
INPUT_PATH = f'{ARTIFACT_DIR}/data/input.csv'


In [16]:
# Load the raw CSV (first column as index, two header rows as column levels)
# and order the columns by label.
dataset = (
    pd.read_csv(
        os.path.join(BASE_DIR, './artifacts/data/raw.csv'),
        index_col=0,
        header=[0, 1],
    )
    .sort_index(axis=1)
)

In [18]:
# Reshape the raw frame in three steps using the project's `transform` helpers;
# each step consumes the result of the previous one.
# NOTE(review): what each helper does is inferred from its name and from the
# preview below — confirm against khronos.transform.
flat_df = transform.flatten_dataset(dataset)
df_with_index_as_datetime = transform.convert_index_to_datetime(flat_df)
df_with_timestamp = transform.add_timestamp_in_seconds_as_raw_input(df_with_index_as_datetime)
# Preview the first rows of the final frame.
df_with_timestamp.head()

Unnamed: 0_level_0,close,high,low,open,volume,ticker,timestamp_in_seconds
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-09-11 09:30:00,44.01,44.05,44.01,44.01,39049.0,AAL,1505122200
2017-09-11 09:30:00,160.57,160.62,160.5,160.5,407091.0,AAPL,1505122200
2017-09-11 09:30:00,84.53,84.53,84.46,84.46,200.0,ABBV,1505122200
2017-09-11 09:30:00,83.24,83.33,83.24,83.33,13302.0,ABC,1505122200
2017-09-11 09:30:00,134.0,134.62,133.61,134.62,23371.0,ACN,1505122200


In [19]:
# Store the prepared frame as the input artifact (input.csv).
# INPUT_PATH is the shared constant for this file, so the location is not
# spelled out a second time here.
df_with_timestamp.to_csv(INPUT_PATH)

In [32]:
# Read the stored input artifact back, using the first column as the index and
# parsing it as dates. INPUT_PATH is the same constant the artifact is written to.
raw_input_df = pd.read_csv(INPUT_PATH, index_col=0, parse_dates=True)

In [33]:
from khronos.task import PipelineConfig

# Pipeline configuration: artifact location plus the train/val/test fractions.
# NOTE(review): artifact_dir is the relative './artifacts', while the other
# cells use the absolute ARTIFACT_DIR derived from BASE_DIR — confirm both
# refer to the same directory.
p_config = PipelineConfig(
    artifact_dir='./artifacts',
    train_split=0.8,
    val_split=0.1,
    test_split=0.1,
)

In [59]:
# import train_test_split

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [43]:
from khronos.data_loader import RawInputArtifact

# Build the example generator from the input CSV path, the artifact directory
# and the three split fractions held by the pipeline config.
example_gen = RawInputArtifact(
    INPUT_PATH,
    ARTIFACT_DIR,
    p_config.train_split,
    p_config.val_split,
    p_config.test_split,
)

In [45]:
# Run the example generator. The recorded output below lists one name/path
# entry per split (train, val, test).
example_gen.run()

[{'name': 'train__split',
  'path': '/Users/stefruinard/Library/CloudStorage/OneDrive-Microsoft/Documents/work/projects/202302 - Applications and Machine Learning/azure_staff_engineer/src/khronos/artifacts/example_gen/train.csv'},
 {'name': 'val__split',
  'path': '/Users/stefruinard/Library/CloudStorage/OneDrive-Microsoft/Documents/work/projects/202302 - Applications and Machine Learning/azure_staff_engineer/src/khronos/artifacts/example_gen/val.csv'},
 {'name': 'test__split',
  'path': '/Users/stefruinard/Library/CloudStorage/OneDrive-Microsoft/Documents/work/projects/202302 - Applications and Machine Learning/azure_staff_engineer/src/khronos/artifacts/example_gen/test.csv'}]

In [47]:
# Load the splits back, unpacked in train, val, test order.
# NOTE(review): presumably DataFrames read from the CSVs listed by run() — confirm.
train_df, val_df, test_df = example_gen.load_input_artifacts()

In [53]:
# Verify there is no look-ahead bias between the train, val and test frames:
# a split must not start before the previous split ends, judged by the index.
# min()/max() are used instead of index[0]/index[-1] so the check does not
# silently depend on every split being sorted.

has_lookahead_bias = False
split_names = ['train', 'val', 'test']
datasets = [train_df, val_df, test_df]
# Compare each split with its predecessor. The loop variables are deliberately
# not called `dataset`, so the raw `dataset` frame loaded earlier in the
# notebook is not overwritten by this cell.
for position in range(1, len(datasets)):
    earlier_end = datasets[position - 1].index.max()
    later_start = datasets[position].index.min()
    if later_start < earlier_end:
        print(f'{split_names[position]} starts at: {later_start}')
        print(f'{split_names[position - 1]} ends at: {earlier_end}')
        has_lookahead_bias = True
        break  # one violation is enough to fail the check
print(f'has_lookahead_bias: {has_lookahead_bias}')

has_lookahead_bias: False


In [75]:
# Per-column minimum and maximum of the training split only; these are the
# statistics a min-max scaling needs.

columns_to_scale = ['open', 'high', 'low', 'close', 'volume']
min_values, max_values = (
    train_df[columns_to_scale].min(),
    train_df[columns_to_scale].max(),
)





In [80]:
# Convert the per-column training statistics into JAX arrays.
# NOTE(review): this cell ends in an assignment, which displays nothing, so the
# array printed below looks like stale output from an earlier version — confirm.
maxs, mins = jnp.array(max_values), jnp.array(min_values)


Array([1.9614500e+03, 1.9614500e+03, 1.9610900e+03, 1.9610900e+03,
       2.3145858e+07], dtype=float32)

In [66]:


# Toy input laid out as (batch, seq_len, features) = (2, 2, 3), holding the
# integers 1..12 in row-major order:
#   [[[1, 2, 3], [4, 5, 6]],
#    [[7, 8, 9], [10, 11, 12]]]
batch = jnp.arange(1, 13).reshape(2, 2, 3)


In [73]:
def normalize(x, mins, maxs):
    """Min-max scale ``x``: ``(x - mins) / (maxs - mins)``.

    Args:
        x: Values to scale.
        mins: Lower bounds; must broadcast against ``x``.
        maxs: Upper bounds; must broadcast against ``x``.

    Returns:
        The scaled values. Where ``maxs == mins`` the divisor is replaced by 1,
        so a constant feature yields ``x - mins`` instead of NaN/inf.
    """
    span = maxs - mins
    # A zero range would divide by zero; divide by 1 there instead.
    safe_span = jnp.where(span == 0, 1, span)
    return (x - mins) / safe_span

In [74]:
# Sanity-check normalize on the toy batch with hand-picked bounds (1 and 12 for
# every feature). The bounds get their own names so the train-derived
# `mins`/`maxs` are not overwritten when the notebook runs top to bottom.
toy_mins, toy_maxs = jnp.array([1, 1, 1]), jnp.array([12, 12, 12])
normalize(batch, toy_mins, toy_maxs)

Array([[[0.        , 0.09090909, 0.18181819],
        [0.27272728, 0.36363637, 0.45454547]],

       [[0.54545456, 0.6363636 , 0.72727275],
        [0.8181818 , 0.90909094, 1.        ]]], dtype=float32)