In [1]:
! pip install statsmodels



You should consider upgrading via the 'c:\users\saidj\onedrive\documentos\projects\forex_ml_bot\forex_ml_bot\mtvenv\scripts\python.exe -m pip install --upgrade pip' command.


In [51]:
import numpy as np
from sklearn.metrics import f1_score, make_scorer, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from backbone.probability_transformer import ProbabilityTransformer 
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from backbone.utils import load_function
from typing import Tuple
import yaml
from sklearn.metrics import classification_report
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
from tqdm import tqdm
from statsmodels.tsa.filters.hp_filter import hpfilter
pd.options.display.max_rows = 999

In [3]:
# Función para calcular la volatilidad diaria
def get_daily_volatility(close_prices, span=100):
    returns = close_prices.pct_change()
    volatility = returns.ewm(span=span).std()
    return volatility

# Función para aplicar el filtro CUSUM
def apply_cusum_filter(raw_price, threshold):
    """
    :param raw_price: (series) of close prices.
    :param threshold: (float) when the abs(change) is larger than the threshold, the
    function captures it as an event.
    :return: (datetime index vector) vector of datetimes when the events occurred. This is used later to sample.
    """
    print('Applying Symmetric CUSUM filter.')

    t_events = []
    s_pos = 0
    s_neg = 0

    # log returns
    diff = np.log(raw_price).diff().dropna()

    # Get event time stamps for the entire series
    for i in tqdm(diff.index[1:]):
        pos = float(s_pos + diff.loc[i])
        neg = float(s_neg + diff.loc[i])
        s_pos = max(0.0, pos)
        s_neg = min(0.0, neg)

        if s_neg < -threshold:
            s_neg = 0
            t_events.append(i)

        elif s_pos > threshold:
            s_pos = 0
            t_events.append(i)

    event_timestamps = pd.DatetimeIndex(t_events)
    return event_timestamps

# Función para aplicar el método de triple barrera con filtro CUSUM
def apply_triple_barrier(
    close_prices, 
    max_prices, 
    min_prices, 
    take_profit_in_pips, 
    stop_loss_in_pips, 
    side,
    max_holding_period=50, 
    pip_size=0.0001
    ):

    barriers = []

    for index in range(len(close_prices)):
        if side[index] == 1:
            # Para una señal de compra
            upper_barrier_level = close_prices[index] * (1 + (take_profit_in_pips * pip_size))
            lower_barrier_level = close_prices[index] * (1 - (stop_loss_in_pips * pip_size))
        elif side[index] == -1:
            # Para una señal de venta
            upper_barrier_level = close_prices[index] * (1 + (stop_loss_in_pips * pip_size))
            lower_barrier_level = close_prices[index] * (1 - (take_profit_in_pips * pip_size))
        else:
            # Si no hay señal, saltar al siguiente índice
            continue
        
        # Evaluar los precios futuros dentro del período máximo de mantenimiento
        for j in range(index + 1, min(index + max_holding_period, len(close_prices))):
            if side[index] == 1:
                # Señal de compra: tomar ganancias si se alcanza la barrera superior
                if close_prices[j] >= upper_barrier_level or max_prices[j] >= upper_barrier_level:
                    barriers.append((index, 1))  # Etiqueta 1 para toma de ganancias
                    break
                elif close_prices[j] <= lower_barrier_level or min_prices[j] <= lower_barrier_level:
                    barriers.append((index, 0))  # Etiqueta 0 para stop-loss
                    break
            elif side[index] == -1:
                # Señal de venta: tomar ganancias si se alcanza la barrera inferior
                if close_prices[j] <= lower_barrier_level or min_prices[j] <= lower_barrier_level:
                    barriers.append((index, 1))  # Etiqueta 1 para toma de ganancias
                    break
                elif close_prices[j] >= upper_barrier_level or max_prices[j] >= upper_barrier_level:
                    barriers.append((index, 0))  # Etiqueta 0 para stop-loss
                    break
        else:
            barriers.append((index, 2))  # Etiqueta 2 si no se alcanza ninguna barrera
    
    # Revisar los eventos etiquetados como 2 para determinar si son ganancias o pérdidas
    for idx, (event_index, label) in enumerate(barriers):
        if label == 2:
            # Determinar si el precio final fue una ganancia o una pérdida
            final_price = close_prices[min(event_index + max_holding_period, len(close_prices) - 1)]
            initial_price = close_prices[event_index]
            
            if side[event_index] == 1:
                # Para una señal de compra
                if final_price >= initial_price:
                    barriers[idx] = (event_index, 1)  # Etiqueta 1 para toma de ganancias
                elif final_price < initial_price:
                    barriers[idx] = (event_index, 0)  # Etiqueta 0 para stop-loss
            elif side[event_index] == -1:
                # Para una señal de venta
                if final_price <= initial_price:
                    barriers[idx] = (event_index, 1)  # Etiqueta 1 para toma de ganancias
                elif final_price > initial_price:
                    barriers[idx] = (event_index, 0)  # Etiqueta 0 para stop-loss

    return barriers

# Función principal que usa el método de triple barrera con filtro CUSUM
def triple_barrier_labeling(
        close_prices, 
        max_prices, 
        min_prices,  
        take_profit_in_pips, 
        stop_loss_in_pips, 
        side,
        max_holding_period=50, 
        pip_size=0.0001,
    ):

    
    labels = apply_triple_barrier(
        close_prices,
        max_prices,
        min_prices,
        take_profit_in_pips, 
        stop_loss_in_pips, 
        side,
        max_holding_period, 
        pip_size
    )
    
    target = [label for _, label in labels]
    return target


In [64]:
symbols_path = './backbone/data/backtest/symbols/EURUSD.csv'
df = pd.read_csv(symbols_path)

print('Creando target')
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(by='Date')
df = df.set_index('Date')

instrument = df.copy()
# compute bband sides

instrument['side'] = np.nan

# long_signals = (instrument['Close'] <= instrument['lower_bband'])
# short_signals = (instrument['Close'] >= instrument['upper_bband'])
# instrument.loc[long_signals, 'side'] = 1
# instrument.loc[short_signals, 'side'] = -1

# compute macd sides
# long_signals = (instrument['macd'] > instrument['macdsignal']) & (instrument['macd'].shift(1) <= instrument['macdsignal'].shift(1))
# instrument.loc[long_signals, 'side'] = 1
# short_signals = (instrument['macd'] < instrument['macdsignal']) & (instrument['macd'].shift(1) >= instrument['macdsignal'].shift(1))
# instrument.loc[short_signals, 'side'] = -1

cycle, trend = hpfilter(instrument['Close'], lamb=1000)
instrument['trend'] = trend
instrument['SMA20'] = instrument['trend'].rolling(window=20).mean()
# instrument['SMA200'] = instrument['trend'].rolling(window=200).mean()
long_signals = (instrument['trend'] > instrument['SMA20']) & (instrument['trend'].shift(1) <= instrument['SMA20'].shift(1))
short_signals = (instrument['trend'] < instrument['SMA20']) & (instrument['trend'].shift(1) >= instrument['SMA20'].shift(1))
instrument.loc[long_signals, 'side'] = 1
instrument.loc[short_signals, 'side'] = -1

# Remove Look ahead biase by lagging the signal
instrument['side'] = instrument['side'].shift(1)

# Drop the NaN values from our data set
# volatility = get_daily_volatility(instrument.Close, span=120)

# cusum_events = apply_cusum_filter(instrument.Close, threshold=volatility.mean()*0.25)

# instrument = instrument.loc[cusum_events]

instrument.dropna(inplace=True)

print(instrument.side.value_counts())

instrument['target'] = triple_barrier_labeling(
    close_prices=instrument['Close'], 
    min_prices=instrument['Low'], 
    max_prices=instrument['High'], 
    take_profit_in_pips=30, 
    stop_loss_in_pips=15, 
    max_holding_period=24, 
    pip_size=0.0001,
    side=instrument['side']
)


Creando target
side
 1.0    181
-1.0    180
Name: count, dtype: int64


In [65]:
instrument['target'].value_counts()

target
1    238
0    123
Name: count, dtype: int64

In [66]:
df.loc[instrument.index, 'side'] = instrument.side
df.loc[instrument.index, 'target'] = instrument.target
df.fillna(0, inplace=True)
df.reset_index(inplace=True)

In [67]:
df['year'] = df.Date.dt.year
df['month'] = df.Date.dt.month

In [68]:
pd.DataFrame(df[df['side'] != 0].groupby(by=['year','month', 'side']).agg({'target':'count'}))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,target
year,month,side,Unnamed: 3_level_1
2019,9,-1.0,2
2019,9,1.0,2
2019,10,-1.0,6
2019,10,1.0,6
2019,11,-1.0,6
2019,11,1.0,7
2019,12,-1.0,6
2019,12,1.0,6
2020,1,-1.0,6
2020,1,1.0,6


# Accuracy

In [69]:
instrument[(instrument.side != 0) & (instrument.target==1)].shape[0] / instrument[(instrument.side!=0)].shape[0]

0.6592797783933518

In [8]:
# import pandas as pd
# import os
# pd.set_option('display.max_columns', None)

# periods_forward = 5

# tickers = ['EURUSD']
# symbols_path = './backbone/data/backtest/symbols'
# instruments = {}
# df = pd.DataFrame()

# for ticker in tickers:
#     instruments[ticker] = pd.read_csv(os.path.join(symbols_path, f'{ticker}.csv'))
  
#     instruments[ticker]['ticker'] = ticker
  
#     print('Creando target')
   
#     instruments[ticker] = instruments[ticker].sort_values(by='Date')

#     instruments[ticker]['target'] = triple_barrier_labeling(instruments[ticker], upper_barrier=0.015, lower_barrier=0.015, max_holding_period=48, span=100)
    
#     df = pd.concat([
#         df,
#         instruments[ticker]
#     ])

#     df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d %H:00:00')

#     df = df.sort_values(by='Date')

In [34]:
date_format = '%Y-%m-%d %H:00:00'
window = 5760

actual_date = datetime(2021,8,1,0,0,0)

date_to = actual_date - timedelta(hours=24+1) 
date_from = date_to - timedelta(hours=window)

date_from_test = actual_date
date_to_test = date_from_test + timedelta(hours=48)

date_from_str = date_from.strftime(date_format)
date_to_str = date_to.strftime(date_format)
date_from_test_str = date_from_test.strftime(date_format)
date_to_test_str = date_to_test.strftime(date_format)


train = df[(df['Date']>date_from_str) & (df['Date']<date_to_str) & (df.side != 0)]
test = df[(df['Date']>date_from_test_str) & (df['Date']<date_to_test_str) & (df.side != 0)]

# Inicio undersampling
# class_0 = train[train['target']==0]
# class_2 = train[train['target']==2]
# avg_examples = (class_0.shape[0] + class_2.shape[0]) / 2
# class_1 = train[train['target']==1].tail(int(avg_examples)).sample(frac=1)

# train = pd.concat([class_0, class_1, class_2])
# fin undersampling

train.target.value_counts()

target
1.0    11
0.0     9
Name: count, dtype: int64

In [35]:
test.target.value_counts()

Series([], Name: count, dtype: int64)

In [37]:
scaler = StandardScaler()
log_reg = LogisticRegression(multi_class='auto', solver='lbfgs', max_iter=1000)
model = XGBClassifier()

pipe = Pipeline([
    ('scaler', scaler),
    ('prob_transf', ProbabilityTransformer(model)),
    ('log_reg', log_reg)
])

with open('configs/model_config.yml', 'r') as file:
    model_configs = yaml.safe_load(file)

param_grid = model_configs['gradient_boosting']['param_grid']

n_splits = 5
stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

search = GridSearchCV(
    pipe,
    param_grid,
    n_jobs=-1,
    cv=stratified_kfold,
    scoring=make_scorer(precision_score, average='weighted')
)

search.fit(train.drop(columns=['target', 'Date']), train.target)

ValueError: Invalid parameter 'stacking' for estimator Pipeline(steps=[('scaler', StandardScaler()),
                ('prob_transf',
                 ProbabilityTransformer(model=XGBClassifier(base_score=None,
                                                            booster=None,
                                                            callbacks=None,
                                                            colsample_bylevel=None,
                                                            colsample_bynode=None,
                                                            colsample_bytree=None,
                                                            device=None,
                                                            early_stopping_rounds=None,
                                                            enable_categorical=False,
                                                            eval_metric=None,
                                                            feature_types=None,
                                                            gamma=None,
                                                            grow_policy=None,
                                                            importanc...
                                                            interaction_constraints=None,
                                                            learning_rate=None,
                                                            max_bin=None,
                                                            max_cat_threshold=None,
                                                            max_cat_to_onehot=None,
                                                            max_delta_step=None,
                                                            max_depth=None,
                                                            max_leaves=None,
                                                            min_child_weight=None,
                                                            missing=nan,
                                                            monotone_constraints=None,
                                                            multi_strategy=None,
                                                            n_estimators=None,
                                                            n_jobs=None,
                                                            num_parallel_tree=None,
                                                            random_state=None, ...))),
                ('log_reg', LogisticRegression(max_iter=1000))]). Valid parameters are: ['memory', 'steps', 'verbose'].

In [None]:
x = train.drop(columns=['target', 'Date', 'ticker'])
y = train.target

In [None]:
y.value_counts()

# Train performance

In [None]:
pipeline = search.best_estimator_

predictions = pipeline.predict_proba(train.drop(columns=['target', 'Date', 'ticker']))
max_probabilities = np.max(predictions, axis=1)
max_indices = np.argmax(predictions, axis=1)

precision = precision_score(train.target, max_indices, average='weighted')
recall = recall_score(train.target, max_indices, average='weighted')
f1 = f1_score(train.target, max_indices, average='weighted')

print(precision)
print(recall)
print(f1) 

target_names = ['class 0', 'class 1', 'class 2']
print(classification_report(train.target, max_indices, target_names=target_names))

# Test performance

In [None]:
predictions = pipeline.predict_proba(test.drop(columns=['target', 'Date', 'ticker']))
max_probabilities = np.max(predictions, axis=1)
max_indices = np.argmax(predictions, axis=1)

precision = precision_score(test.target, max_indices, average='weighted')
recall = recall_score(test.target, max_indices, average='weighted')
f1 = f1_score(test.target, max_indices, average='weighted')

print(precision)
print(recall)
print(f1)

target_names = ['class 0', 'class 1', 'class 2']
print(classification_report(test.target, max_indices, target_names=target_names))

In [None]:
symbols_path = './backbone/data/backtest/symbols/dataset.csv'

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv(symbols_path)
df.head(50)

In [None]:
df[df['target']==2]