In [42]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
pd.options.display.max_columns = None
from utils import create_time_series_splits
import os
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [3]:
df = pd.read_parquet('../data/datasets_2023-2025_1.parquet')
df.timestamp = pd.to_datetime(df.timestamp, unit = 's')
df['next_return'] = df.groupby('filename')['log_return'].shift(-1)

### One model across all names


In [None]:
train_dfs, test_dfs = create_time_series_splits(df, date_column='timestamp')

all_preds = []

for train_df, test_df in zip(train_dfs, test_dfs):
    X_train = train_df.drop(columns=['next_return', 'timestamp', 'filename'])
    y_train = train_df['next_return']
    X_test = test_df.drop(columns=['next_return', 'timestamp', 'filename'])
    y_test = test_df['next_return']
    
    model = RandomForestRegressor(
            n_estimators=100, 
            random_state=42, 
            n_jobs=-1,
            max_depth=10,
            min_samples_split=10,
            min_samples_leaf=3,
    )

    model.fit(X_train, y_train)
    
    preds = model.predict(X_test)
    test_df['pred'] = preds
    all_preds.append(test_df[['timestamp', 'filename', 'pred', 'next_return']])
all_preds_df = pd.concat(all_preds, ignore_index=True)
all_preds_df.to_parquet('RF_predictions_one_model.parquet', index=False)

In [41]:
# model.feature_importances_ 
pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
}).sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
18,ltrev_28,0.146723
13,mom_2_1,0.138101
17,strev_6,0.086465
21,price_ema_diff_14,0.066261
7,return,0.051194
6,log_return,0.049537
15,mom_12_7,0.04777
20,ema_diff_norm_20_50,0.045598
14,mom_14_2,0.045137
10,close_to_low,0.044915


In [45]:
mae = mean_absolute_error(all_preds_df['next_return'], all_preds_df['pred'])
mse = mean_squared_error(all_preds_df['next_return'], all_preds_df['pred'])
r2 = r2_score(all_preds_df['next_return'], all_preds_df['pred'])
rmse = np.sqrt(mse)
print(f'MAE: {mae:.4f}, MSE: {mse:.4f}, R2: {r2:.4f}, RMSE: {rmse:.4f}')

MAE: 0.0227, MSE: 0.0014, R2: -0.0167, RMSE: 0.0369


### One model for each name

In [100]:
all_preds = []

for file in df.filename.unique():
    cur_df = df[df['filename'] == file]
    if (cur_df.timestamp.max() - cur_df.timestamp.min()).days / 365 < 1.5:
        print(f"Skipping {file} - less than 1.5 years")
        continue
    print(f"Processing {file} -- {round((cur_df.timestamp.max() - cur_df.timestamp.min()).days / 365, 2)} years of data")
    train_dfs, test_dfs = create_time_series_splits(cur_df, 12, 1, date_column='timestamp')

    for train_df, test_df in zip(train_dfs, test_dfs):
        X_train = train_df.drop(columns=['next_return', 'timestamp', 'filename'])
        y_train = train_df['next_return']
        X_test = test_df.drop(columns=['next_return', 'timestamp', 'filename'])
        y_test = test_df['next_return']
        
        model = RandomForestRegressor(
                n_estimators=100, 
                random_state=42, 
                n_jobs=-1,
                max_depth=5,
                min_samples_split=5,
                min_samples_leaf=2,
        )

        model.fit(X_train, y_train)
        
        preds = model.predict(X_test)
        test_df['pred'] = preds
        all_preds.append(test_df[['timestamp', 'filename', 'pred', 'next_return']])

all_preds_df = pd.concat(all_preds, ignore_index=True)
all_preds_df.to_parquet('RF_predictions_ind_models.parquet', index=False)

Processing 1INCHUSD_720.parquet -- 2.0 years of data
Processing AAVEUSD_720.parquet -- 2.0 years of data
Processing ACHUSD_720.parquet -- 2.0 years of data
Processing ADAUSD_720.parquet -- 2.0 years of data
Processing AKTUSD_720.parquet -- 2.0 years of data
Processing ALGOUSD_720.parquet -- 2.0 years of data
Processing ANKRUSD_720.parquet -- 2.0 years of data
Processing APEUSD_720.parquet -- 2.0 years of data
Processing APTUSD_720.parquet -- 2.0 years of data
Processing ASTRUSD_720.parquet -- 2.0 years of data
Processing ATLASUSD_720.parquet -- 2.0 years of data
Processing ATOMUSD_720.parquet -- 2.0 years of data
Processing AUDUSD_720.parquet -- 2.0 years of data
Processing AVAXUSD_720.parquet -- 2.0 years of data
Processing AXSUSD_720.parquet -- 2.0 years of data
Processing BATUSD_720.parquet -- 2.0 years of data
Processing BCHUSD_720.parquet -- 2.0 years of data
Processing BLZUSD_720.parquet -- 2.0 years of data
Processing BSXUSD_720.parquet -- 2.0 years of data
Processing BTTUSD_720

In [101]:
mae = mean_absolute_error(all_preds_df['next_return'], all_preds_df['pred'])
mse = mean_squared_error(all_preds_df['next_return'], all_preds_df['pred'])
r2 = r2_score(all_preds_df['next_return'], all_preds_df['pred'])
rmse = np.sqrt(mse)
print(f'MAE: {mae:.4f}, MSE: {mse:.4f}, R2: {r2:.4f}, RMSE: {rmse:.4f}')

MAE: 0.0251, MSE: 0.0016, R2: -0.1133, RMSE: 0.0404


### Old

In [17]:
in_folder_path = "../data/Kraken_features/"

all_res_df = pd.DataFrame({"timestamp": pd.date_range(start="2020-01-01", end="2026-01-01", freq="360min")})
all_res_df.set_index("timestamp", inplace=True)

ctr = 0
for file in os.listdir(in_folder_path):
    df = pd.read_parquet(in_folder_path + file)
    if len(df) < 4 * 365 * 1.5:
        print(f"Skipping {file} - less than 1.5 years")
        continue
    else:
        print(f"Processing {file} -- {round((df.index.max() - df.index.min()).days / 365, 2)} years of data")
    train_dfs, test_dfs = create_time_series_splits(df, 6, 1)

    all_preds = []
    all_targets = []
    all_times = []
    for train_df, test_df in zip(train_dfs, test_dfs):
        X_train = train_df.drop(columns = ['date'])[:-1]
        y_train = train_df['log_return'][1:]

        X_test = test_df.drop(columns = ['date'])[:-1]
        y_test = test_df['log_return'][1:]

        model = RandomForestRegressor(
            n_estimators=100, 
            random_state=42, 
            n_jobs=-1,
            max_depth=10,
            min_samples_split=5,
            min_samples_leaf=2,
        )
        model.fit(X_train, y_train)
        
        preds = model.predict(X_test)
        
        all_preds.extend(preds)
        all_targets.extend(y_test)
        all_times.extend(test_df.index[1:])

    sym = file.split(".")[0]
    col_pred = f"pred_{sym}"
    col_target = f"target_{sym}"

    if col_pred not in all_res_df.columns:
        all_res_df[col_pred] = np.nan
    if col_target not in all_res_df.columns:
        all_res_df[col_target] = np.nan

    all_res_df.loc[all_times, col_pred] = all_preds
    all_res_df.loc[all_times, col_target] = all_targets
    
    if ctr == 5:
        break
    ctr += 1

Processing SPELLUSD_720.parquet -- 2.92 years of data
Processing AIRUSD_720.parquet -- 3.06 years of data
Processing SUIUSD_720.parquet -- 1.83 years of data
Processing DOTUSD_720.parquet -- 3.22 years of data
Processing CVXUSD_720.parquet -- 2.92 years of data
Processing WOOUSD_720.parquet -- 3.0 years of data
