In [57]:
# import sys
# !{sys.executable} -m pip install xgboost

In [58]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score, mean_squared_error
from glob import glob
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [59]:
def check_timestamp(ts):
    return len(str(ts)) == 13


In [60]:

# Your provided setup
file_idx = 12
num_epochs = 10
TIME_DIFF_1 = 1
TIME_DIFF_24 = 24  # a day
TIME_DIFF_168 = 24 * 7  # a week
features = [
    'Volume USD', 'SMA', 'EMA', 'RSI', 'MACD',
    'Bollinger_High', 'Bollinger_Low', 'VWAP',
    'Percentage_Returns', 'Log_Returns'
]
targets = [
    f"Target_shifted_{TIME_DIFF_1}",
    f"Target_shifted_{TIME_DIFF_24}",
    f"Target_shifted_{TIME_DIFF_168}"
]
file_paths = glob('modified_data/gemini_data_*')
file_paths.sort()

print(f'Opening file: {file_paths[file_idx]}')
data = pd.read_csv(file_paths[file_idx])
data = data.dropna()


Opening file: modified_data/gemini_data_BOND_mod.csv


In [61]:
data['unix'] = data['unix'].apply(lambda x: x//1000 if check_timestamp(x) else x)
data['Datetime'] = pd.to_datetime(data['unix'], unit='s')
data['Hour'] = data['Datetime'].dt.hour
data['Day_of_Week'] = data['Datetime'].dt.dayofweek  # Monday=0, Sunday=6
data['Day_of_Month'] = data['Datetime'].dt.day
data['Month'] = data['Datetime'].dt.month
data['Year'] = data['Datetime'].dt.year
data['Is_Weekend'] = (data['Day_of_Week'] >= 5).astype(int)  # 1 for weekend, 0 for weekdays
# Updating the features list
features_extended = features + ['Hour', 'Day_of_Week', 'Day_of_Month', 'Month', 'Year', 'Is_Weekend']
#  ('MACD', 0.045680176),
#  ('SMA', 0.039496846),
#  ('Day_of_Week', 0.038991235),
#  ('Day_of_Month', 0.038741197),
#  ('Hour', 0.03847502),
#  ('Log_Returns', 0.0),
#  ('Is_Weekend', 0.0)]
drop_features = ['MACD', 'SMA', 'Day_of_Week', 'Day_of_Month', 'Hour', 'Log_Returns', 'Is_Weekend']
features_extended = [feature for feature in features_extended if feature not in drop_features]

In [62]:
# Selecting the features to be normalized
X_extended = data[features_extended].values

# Applying Z-normalization
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X_extended)

# If you want to update your dataframe with normalized values
data_normalized = data.copy()
data_normalized[features_extended] = X_normalized

In [63]:
# Step 1: Data Preparation
all_columns = features_extended + targets
prices = data_normalized[all_columns].values

# Assume the targets are continuous values. If they are categories, you'll need to adjust the code accordingly.
X = prices[:, :-3]  # Features
y1, y2, y3 = prices[:, -3], prices[:, -2], prices[:, -1]  # Targets
# Splitting the data into training and testing sets for each target
tscv = TimeSeriesSplit(n_splits=4)  # Adjust n_splits as needed

# Take the last split as the training/testing set
for train_index, test_index in tscv.split(X):
    X_train_1, X_test_1, y_train_1, y_test_1 = X[train_index], X[test_index], y1[train_index], y1[test_index]
    X_train_24, X_test_24, y_train_24, y_test_24 = X[train_index], X[test_index], y2[train_index], y2[test_index]
    X_train_168, X_test_168, y_train_168, y_test_168 = X[train_index], X[test_index], y3[train_index], y3[test_index]

In [64]:
# Step 2: Model Training
# Adjust the hyperparameters as needed
param_grid = {
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [50, 100, 150],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
    'gamma': [0, 0.1, 0.2]
}

# Creating the grid search
grid_search = GridSearchCV(estimator=xgb.XGBClassifier(objective='binary:logistic', random_state=42),
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=TimeSeriesSplit(n_splits=4),
                           n_jobs=-1)

# Fitting the grid search (for each target separately)
grid_search.fit(X_train_1, y_train_1)
best_params_1 = grid_search.best_params_

In [65]:
# Fitting the grid search (for each target separately)
grid_search.fit(X_train_24, y_train_24)
best_params_24 = grid_search.best_params_

In [66]:
# Fitting the grid search (for each target separately)
grid_search.fit(X_train_168, y_train_168)
best_params_168 = grid_search.best_params_

In [67]:
# Training the model for each target
model_1 = xgb.XGBClassifier(**best_params_1)
model_1.fit(X_train_1, y_train_1)

model_24 = xgb.XGBClassifier(**best_params_24)
model_24.fit(X_train_24, y_train_24)

model_168 = xgb.XGBClassifier(**best_params_168)
model_168.fit(X_train_168, y_train_168)

In [68]:
# Step 3: Model Evaluation
# Predicting the values for each target
y_pred_1 = model_1.predict(X_test_1)
y_pred_24 = model_24.predict(X_test_24)
y_pred_168 = model_168.predict(X_test_168)

# Calculating the accuracy for each target
accuracy_1 = accuracy_score(y_test_1, y_pred_1)
accuracy_24 = accuracy_score(y_test_24, y_pred_24)
accuracy_168 = accuracy_score(y_test_168, y_pred_168)

print(f'Accuracy for 1-hour ahead: {accuracy_1}')
print(f'Accuracy for 1-day ahead: {accuracy_24}')
print(f'Accuracy for 1-week ahead: {accuracy_168}')

Accuracy for 1-hour ahead: 0.5497026338147833
Accuracy for 1-day ahead: 0.5730671197960917
Accuracy for 1-week ahead: 0.29141886151231944


In [69]:
# Calculating accuracy for always predicting 1 or 0
accuracy_always_1 = np.mean(y_test_1 == 1), np.mean(y_test_24 == 1), np.mean(y_test_168 == 1)
accuracy_always_0 = np.mean(y_test_1 == 0), np.mean(y_test_24 == 0), np.mean(y_test_168 == 0)

print(f'Accuracy for always predicting 1: {accuracy_always_1}')
print(f'Accuracy for always predicting 0: {accuracy_always_0}')

Accuracy for always predicting 1: (0.5148683092608326, 0.5730671197960917, 0.7336448598130841)
Accuracy for always predicting 0: (0.4851316907391674, 0.42693288020390824, 0.26635514018691586)


In [70]:
# After fitting the model, you can get feature importances
feature_importances = model_1.feature_importances_

# Mapping feature importances to feature names and sorting them
feature_importance_dict = dict(zip(features_extended, feature_importances))
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
print('Feature importance for model 1')
sorted_features


Feature importance for model 1


[('Percentage_Returns', 0.24420872),
 ('VWAP', 0.14988111),
 ('Year', 0.14249837),
 ('Volume USD', 0.122677535),
 ('Month', 0.10428923),
 ('RSI', 0.06847395),
 ('Bollinger_High', 0.06281318),
 ('Bollinger_Low', 0.053441912),
 ('EMA', 0.051715955)]

In [71]:
# After fitting the model, you can get feature importances
feature_importances = model_24.feature_importances_

# Mapping feature importances to feature names and sorting them
feature_importance_dict = dict(zip(features_extended, feature_importances))
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
print('Feature importance for model 24')
sorted_features

Feature importance for model 24


[('Percentage_Returns', 0.19741473),
 ('Month', 0.18365867),
 ('VWAP', 0.13552561),
 ('Bollinger_Low', 0.124791294),
 ('EMA', 0.11823178),
 ('RSI', 0.111368366),
 ('Bollinger_High', 0.09301056),
 ('Volume USD', 0.03599898),
 ('Year', 0.0)]

In [72]:
# After fitting the model, you can get feature importances
feature_importances = model_168.feature_importances_

# Mapping feature importances to feature names and sorting them
feature_importance_dict = dict(zip(features_extended, feature_importances))
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
print('Feature importance for model 168')
sorted_features

Feature importance for model 168


[('Month', 0.48278987),
 ('VWAP', 0.20908959),
 ('EMA', 0.10436893),
 ('Bollinger_Low', 0.07665031),
 ('Bollinger_High', 0.07459992),
 ('RSI', 0.036594763),
 ('Volume USD', 0.00905193),
 ('Percentage_Returns', 0.0068547446),
 ('Year', 0.0)]

In [73]:
print('Best params for 1:')
best_params_1

Best params for 1:


{'colsample_bytree': 1,
 'gamma': 0.2,
 'learning_rate': 0.1,
 'max_depth': 3,
 'n_estimators': 50,
 'subsample': 0.8}

In [74]:
print('Best params for 24:')
best_params_24

Best params for 24:


{'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.01,
 'max_depth': 3,
 'n_estimators': 50,
 'subsample': 1}

In [75]:
print('Best params for 168:')
best_params_168

Best params for 168:


{'colsample_bytree': 1,
 'gamma': 0.1,
 'learning_rate': 0.01,
 'max_depth': 9,
 'n_estimators': 100,
 'subsample': 1}