In [1]:
# add autoreload magic
%load_ext autoreload
%autoreload 2

In [2]:
# Global parameters
feature_view_name = 'ohlc_feature_view'
feature_view_version = 10
ohlc_window_sec = 60
product_id = 'BTC/USD'
last_n_days_to_fetch_from_store = 90
last_n_days_to_test_model = 7
discretization_thresholds = [-0.0001, 0.0001]
prediction_window_sec = 60*5

In [3]:
import os
os.environ['HOPSWORKS_API_KEY'] = '<PLACEHOLDER>'
os.environ['HOPSWORKS_PROJECT_NAME'] = '<PLACEHOLDER>'

In [45]:
import pandas as pd
from loguru import logger
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report   


In [5]:
# Step 1    
# Fetch the data from the feature store
from tools.ohlc_data_reader import OhlcDataReader

ohlc_data_reader = OhlcDataReader(
    ohlc_window_sec=ohlc_window_sec,
    feature_view_name=feature_view_name,
    feature_view_version=feature_view_version,
)

logger.info('Fetching OHLC data from the feature store')

ohlc_data = ohlc_data_reader.read_from_offline_store(
    product_id=product_id,
    last_n_days=last_n_days_to_fetch_from_store,
)

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/3285


[32m2024-06-27 12:11:22.573[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mFetching OHLC data from the feature store[0m


Connected. Call `.close()` to terminate connection gracefully.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.54s) 


In [6]:

# add a column to ohlc_data with a human-readable data, using
# the ohlc_data['timestamp'] column in milliseconds
ohlc_data['datetime'] = pd.to_datetime(ohlc_data['timestamp'], unit='ms')

In [7]:
from src.training import split_train_test

# Step 2
# Split the data into training and testing using a cutoff date
logger.info('Splitting the data into training and testing')
ohlc_train, ohlc_test = split_train_test(
    ohlc_data=ohlc_data,
    last_n_days_to_test_model=last_n_days_to_test_model,
)

# print(ohlc_train.head())
# print(ohlc_test.head())

[32m2024-06-27 12:11:31.989[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mSplitting the data into training and testing[0m


In [8]:
from src.training import interpolate_missing_candles

# Step 3
# Preprocess the data for training and for testing
# Interpolate missing candles
logger.info('Interpolating missing candles for training data')
ohlc_train = interpolate_missing_candles(ohlc_train, ohlc_window_sec)
logger.info('Interpolating missing candles for testing data')
ohlc_test = interpolate_missing_candles(ohlc_test, ohlc_window_sec)

[32m2024-06-27 12:11:33.107[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mInterpolating missing candles for training data[0m
[32m2024-06-27 12:11:33.137[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mInterpolating missing candles for testing data[0m


In [9]:
from src.training import create_target_metric

# Step 4
# Create the target metric as a new column in our dataframe for training and testing
logger.info('Creating the target metric')
ohlc_train = create_target_metric(
    ohlc_train,
    ohlc_window_sec,
    discretization_thresholds,
    prediction_window_sec,
)
ohlc_test = create_target_metric(
    ohlc_test,
    ohlc_window_sec,
    discretization_thresholds,
    prediction_window_sec,
)

[32m2024-06-27 12:11:34.238[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mCreating the target metric[0m


In [10]:
# Plot distribution of the target
logger.info('Distribution of the target in the training data')
logger.debug(ohlc_train['target'].value_counts())
logger.info('Distribution of the target in the testing data')
logger.debug(ohlc_test['target'].value_counts())

[32m2024-06-27 12:11:35.198[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mDistribution of the target in the training data[0m
[32m2024-06-27 12:11:35.202[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [34m[1mtarget
2.0    49698
0.0    47364
1.0    21002
Name: count, dtype: int64[0m
[32m2024-06-27 12:11:35.204[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mDistribution of the target in the testing data[0m
[32m2024-06-27 12:11:35.206[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [34m[1mtarget
1.0    6973
0.0    1561
2.0    1541
Name: count, dtype: int64[0m


In [11]:
# Before training, let's split the features and the target
X_train = ohlc_train.drop(columns=['target'])
y_train = ohlc_train['target']
X_test = ohlc_test.drop(columns=['target'])
y_test = ohlc_test['target']

## Baseline model

In [51]:
from src.baseline_model import BaselineModel

# create model
model = BaselineModel(
    n_candles_into_future=prediction_window_sec // ohlc_window_sec,
    discretization_thresholds=discretization_thresholds,
)

# generate predictions
y_test_predictions = model.predict(X_test)

# evalute our dummy model
# Let's evaluate the model. It is a classifier with 3 classes

print('****** TEST DATA ******')
# Compute accuracy using scikit-learn
accuracy = accuracy_score(y_test, y_test_predictions)
print(f'Accuracy of the model on test data: {accuracy}')

print(f'Classification report of the model:')
print(classification_report(y_test, y_test_predictions))

# generate predictions
print('****** TRAINING DATA ******')
y_train_predictions = model.predict(X_train)
accuracy = accuracy_score(y_train, y_train_predictions)
print(f'Accuracy of the model: {accuracy}')

print(f'Classification report of the model:')
print(classification_report(y_train, y_train_predictions))

****** TEST DATA ******
Accuracy of the model on test data: 0.783424317617866
Classification report of the model:
              precision    recall  f1-score   support

         0.0       0.46      0.46      0.46      1561
         1.0       0.93      0.93      0.93      6973
         2.0       0.44      0.44      0.44      1541

    accuracy                           0.78     10075
   macro avg       0.61      0.61      0.61     10075
weighted avg       0.78      0.78      0.78     10075

****** TRAINING DATA ******
Accuracy of the model: 0.4303174549396937
Classification report of the model:
              precision    recall  f1-score   support

         0.0       0.44      0.44      0.44     47364
         1.0       0.33      0.33      0.33     21002
         2.0       0.46      0.46      0.46     49698

    accuracy                           0.43    118064
   macro avg       0.41      0.41      0.41    118064
weighted avg       0.43      0.43      0.43    118064



In [47]:
from src.feature_engineering import add_features

X_train = add_features(
    X_train,
    n_candles_into_future=prediction_window_sec // ohlc_window_sec,
    discretization_thresholds=discretization_thresholds,
)

X_test = add_features(
    X_test,
    n_candles_into_future=prediction_window_sec // ohlc_window_sec,
    discretization_thresholds=discretization_thresholds,
)

features_to_use = [
    'rsi',
    'momentum',
    'std',
    'last_observed_target',
    'day_of_week',
    'hour_of_day',
    'minute_of_hour',
]

X_train_ = X_train[features_to_use]
X_test_ = X_test[features_to_use]

## Train a boosting tree algorithm -> XGBoost

In [48]:
import xgboost as xgb

In [52]:
# Create the DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train_, label=y_train)
dtest = xgb.DMatrix(X_test_, label=y_test)

# Set parameters for XGBoost
params = {
    'objective': 'multi:softmax',  # Specify the objective for classification
    'num_class': 3                 # Number of classes in the dataset,

    # Add other parameters here
    # These are things you can tune to optimize the model (aka hyperparameters)
    # 'eta': 0.1,                    # Learning rate
    # 'max_depth': 6,                # Maximum depth of a tree
    # 'subsample': 0.8,              # Subsample ratio of the training instances
    # 'colsample_bytree': 0.8,       # Subsample ratio of columns when constructing each tree
    # 'gamma': 1,                    # Minimum loss reduction required to make a further partition
    # 'alpha': 0,                    # L1 regularization term on weights
    # 'lambda': 1,                   # L2 regularization term on weights
    # 'scale_pos_weight': 1          # Balancing of positive and negative weights
}

# Train the model
num_rounds = 100
model = xgb.train(params, dtrain, num_rounds)

# Predict on the test set
y_test_predictions = model.predict(dtest)

print('****** TEST DATA ******')
# Calculate accuracy both on the training and test set
accuracy = accuracy_score(y_test, y_test_predictions)
print(f"Accuracy on test data: {accuracy * 100:.2f}%")

# Classifcation report
print(f'Classification report of the model:')
print(classification_report(y_test, y_test_predictions))

print('****** TRAINING DATA ******')
y_train_predictions = model.predict(dtrain)
accuracy = accuracy_score(y_train, y_train_predictions)
print("Accuracy: %.2f%%" % (accuracy * 100))

# Classifcation report
print(f'Classification report of the model:')
print(classification_report(y_train, y_train_predictions))

****** TEST DATA ******
Accuracy on test data: 42.96%
Classification report of the model:
              precision    recall  f1-score   support

         0.0       0.33      0.45      0.38      1561
         1.0       0.98      0.39      0.56      6973
         2.0       0.18      0.59      0.27      1541

    accuracy                           0.43     10075
   macro avg       0.49      0.48      0.40     10075
weighted avg       0.75      0.43      0.49     10075

****** TRAINING DATA ******
Accuracy: 60.85%
Classification report of the model:
              precision    recall  f1-score   support

         0.0       0.61      0.62      0.62     47364
         1.0       0.68      0.35      0.46     21002
         2.0       0.59      0.70      0.65     49698

    accuracy                           0.61    118064
   macro avg       0.63      0.56      0.57    118064
weighted avg       0.62      0.61      0.60    118064



## Let's try a simpler model, to make sure we are not overfitting

In [43]:
from sklearn.linear_model import LogisticRegression

# Create the model
model = LogisticRegression(max_iter=1000)

# Train the model
model.fit(X_train_, y_train)

# Predict on the test set
preds = model.predict(X_test_)

# Calculate accuracy both on the training and test set
accuracy = accuracy_score(y_test, preds)
print(f'Accuracy on test data: {accuracy * 100:.2f}%')

accuracy = accuracy_score(y_train, model.predict(X_train_))
print(f'Accuracy on training data: {accuracy * 100:.2f}%')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

Accuracy on test data: 31.87%
Accuracy on training data: 44.98%
