In [1]:
# add autoreload magic
import pandas as pd
from loguru import logger
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report 
%load_ext autoreload
%autoreload 2

In [2]:
# Global parameters
feature_view_name = 'ohlc_feature_view'
feature_view_version = 1
ohlc_window_sec = 60
product_id = 'BTC/USD'
last_n_days_to_fetch_from_store = 90
last_n_days_to_test_model = 7
discretization_thresholds = [-0.0001, 0.0001]
prediction_window_sec = 60*5

In [3]:
import os
os.environ['HOPSWORKS_API_KEY'] = <API_KEY>
os.environ['HOPSWORKS_PROJECT_NAME'] = <PROJECT_NAME>

In [4]:
import sys
import os

# Get the path three levels up from the current script/notebook
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir))

# Add this directory to sys.path
sys.path.append(parent_dir)

In [5]:
from tools.tools.ohlc_data_reader import OhlcDataReader


ohlc_data_reader = OhlcDataReader(
    ohlc_window_sec=ohlc_window_sec,
    feature_view_name=feature_view_name,
    feature_view_version=feature_view_version,
)

logger.info('Fetching OHLC data from the feature store')

ohlc_data = ohlc_data_reader.read_from_offline_store(
    product_id=product_id,
    last_n_days=last_n_days_to_fetch_from_store,
)

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/897173



[32m2024-08-09 02:17:56.332[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mFetching OHLC data from the feature store[0m


Connected. Call `.close()` to terminate connection gracefully.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.29s) 


In [6]:
# add a column to ohlc_data with a human-readable data, using
# the ohlc_data['timestamp'] column in milliseconds
ohlc_data['datetime'] = pd.to_datetime(ohlc_data['timestamp'], unit='ms')

In [9]:
# Get the path one level up from the current script/notebook
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Add this directory to sys.path
sys.path.append(parent_dir)
from src.training import split_train_test

# Step 2
# Split the data into training and testing using a cutoff date
logger.info('Splitting the data into training and testing')
ohlc_train, ohlc_test = split_train_test(
    ohlc_data=ohlc_data,
    last_n_days_to_test_model=last_n_days_to_test_model,
)

# print(ohlc_train.head())
# print(ohlc_test.head())

[32m2024-08-09 02:21:40.124[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mSplitting the data into training and testing[0m


In [10]:
from src.training import interpolate_missing_candles

# Step 3
# Preprocess the data for training and for testing
# Interpolate missing candles
logger.info('Interpolating missing candles for training data')
ohlc_train = interpolate_missing_candles(ohlc_train, ohlc_window_sec)
logger.info('Interpolating missing candles for testing data')
ohlc_test = interpolate_missing_candles(ohlc_test, ohlc_window_sec)

[32m2024-08-09 02:22:25.377[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mInterpolating missing candles for training data[0m
[32m2024-08-09 02:22:25.718[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mInterpolating missing candles for testing data[0m


In [13]:
from src.data_preprocessing import create_target_metric

# Step 4
# Create the target metric as a new column in our dataframe for training and testing
logger.info('Creating the target metric')
ohlc_train = create_target_metric(
    ohlc_train,
    ohlc_window_sec,
    #discretization_thresholds,
    prediction_window_sec,
)
ohlc_test = create_target_metric(
    ohlc_test,
    ohlc_window_sec,
    #discretization_thresholds,
    prediction_window_sec,
)

[32m2024-08-09 02:28:30.192[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mCreating the target metric[0m


In [14]:
# Plot distribution of the target
logger.info('Distribution of the target in the training data')
logger.debug(ohlc_train['target'].value_counts())
logger.info('Distribution of the target in the testing data')
logger.debug(ohlc_test['target'].value_counts())

[32m2024-08-09 02:29:12.610[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mDistribution of the target in the training data[0m
[32m2024-08-09 02:29:12.655[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [34m[1mtarget
 0.000000    5079
 0.000001      22
-0.000001      20
 0.000002      19
 0.000001      17
             ... 
-0.001522       1
-0.001722       1
-0.002219       1
-0.000845       1
-0.004413       1
Name: count, Length: 74295, dtype: int64[0m
[32m2024-08-09 02:29:12.687[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mDistribution of the target in the testing data[0m
[32m2024-08-09 02:29:12.692[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [34m[1mtarget
 0.000000    38
 0.001775     5
-0.000327     3
 0.000867     2
-0.000655     2
-0.001224     2
-0.001364     2
-0.000861     2
 0.003162     2
-0.002193     1
-0.001540     1
-0.002369     1


In [17]:
X_test = ohlc_test.drop(columns=['target'])
y_test = ohlc_test['target']
X_train = ohlc_train.drop(columns=['target'])
y_train = ohlc_train['target']

In [18]:
from src.baseline_model import BaselineModel

# create model
model = BaselineModel(
    n_candles_into_future=prediction_window_sec // ohlc_window_sec,
    #discretization_thresholds=discretization_thresholds,
)

# generate predictions
y_test_predictions = model.predict(X_test)

# evalute our dummy model
# Let's evaluate the model. It is a classifier with 3 classes

print('****** TEST DATA ******')
# Compute accuracy using scikit-learn
accuracy = accuracy_score(y_test, y_test_predictions)
print(f'Accuracy of the model on test data: {accuracy}')

print(f'Classification report of the model:')
print(classification_report(y_test, y_test_predictions))

# generate predictions
print('****** TRAINING DATA ******')
y_train_predictions = model.predict(X_train)
accuracy = accuracy_score(y_train, y_train_predictions)
print(f'Accuracy of the model: {accuracy}')

print(f'Classification report of the model:')
print(classification_report(y_train, y_train_predictions))

****** TEST DATA ******


ValueError: Classification metrics can't handle a mix of continuous and binary targets