# PENDIENTES
- Realización de tests:
    - Validación del funcionamiento de los tests de indicators, performance y
    statistics (carpeta metrics), y de los tests de features y labeling
    (carpeta processing).
    - Realización de tests para cleaning y pipeline (carpeta processing).

- pipeline: Agregar a la función "run" los precios de cierre para poder hacer el backtesting.


# Quantitative Finance Library Examples

## Importing Necessary Libraries

In [None]:
from finml_core.data.loader import MarketDataLoader
from finml_core.processing.features import FeatureGenerator
from finml_core.processing.labeling import TripleBarrierLabeling

## Downloading Yahoo Finance Data

In [15]:
# 1. Configuration: ticker universe and the download window
tickers = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'NVDA', 'META', 'TSLA', 'AVGO']
start = '2015-01-01'
end = '2025-12-11'

# 2. Assign Market Data Loader Instance
market_data_loader = MarketDataLoader('yfinance')

# 3. Get Yahoo Finance Data
# NOTE(review): per the captured output the result is indexed by
# (Date, Ticker) with Close/High/Low/Open/Volume columns, and the loader also
# downloads a ^GSPC reference series alongside the requested tickers.
df = market_data_loader.get_yfinance_data(tickers, start, end)

# NOTE(review): print() dumps the frame as plain text; ending the cell with
# df.head() would give a compact rich display instead.
print(df)

Downloading 8 tickers + ^GSPC reference...


[*********************100%***********************]  9 of 9 completed

Data is clean.
                        Close        High         Low        Open     Volume
Date       Ticker                                                           
2015-01-02 AAPL     24.237549   24.705318   23.798599   24.694233  212818400
           AMZN     15.426000   15.737500   15.348000   15.629000   55664000
           AVGO      7.574806    7.735248    7.502154    7.638378   13500000
           GOOGL    26.278944   26.589101   26.196068   26.430299   26480000
           META     77.969337   78.446400   77.223933   78.098546   18177500
...                       ...         ...         ...         ...        ...
2025-12-10 GOOGL   320.209991  321.309998  314.679993  315.829987   33428900
           META    650.130005  654.510010  643.400024  649.950012   16910900
           MSFT    478.559998  484.250000  475.079987  484.029999   35756200
           NVDA    183.779999  185.479996  182.039993  184.970001  162785400
           TSLA    451.450012  456.880005  443.609985  446.07




## Generating Metrics

In [20]:
# 1. Configuration: indicator name -> keyword parameters for that indicator
config = {
    "log_ret": dict(period=1),
    "rsi": dict(period=14),
    "bollinger": dict(window=20, std=2.0),
    "macd": dict(fast=12, slow=26, signal=9),
    "rvol": dict(window=20)
}

# Lower-case field name -> column name as it appears in `df`
# (presumably the lower-case keys are the names FeatureGenerator expects —
# TODO confirm against the library).
col_map = {
    'open': 'Open',
    'high': 'High',
    'low': 'Low',
    'close': 'Close',
    'volume': 'Volume'
}

# 2. Assign Feature Generator Instance
# 'Ticker' matches the name of the second index level of `df`.
feature_generator = FeatureGenerator(col_map, 'Ticker', config)

# 3. Calculate Metrics
metrics_df = feature_generator.compute_indicators(df)

# NOTE(review): prints the whole frame; metrics_df.head() would be more compact.
print(metrics_df)

                        Close        High         Low        Open     Volume  \
Date       Ticker                                                              
2015-01-02 AAPL     24.237549   24.705318   23.798599   24.694233  212818400   
           AMZN     15.426000   15.737500   15.348000   15.629000   55664000   
           AVGO      7.574806    7.735248    7.502154    7.638378   13500000   
           GOOGL    26.278944   26.589101   26.196068   26.430299   26480000   
           META     77.969337   78.446400   77.223933   78.098546   18177500   
...                       ...         ...         ...         ...        ...   
2025-12-10 GOOGL   320.209991  321.309998  314.679993  315.829987   33428900   
           META    650.130005  654.510010  643.400024  649.950012   16910900   
           MSFT    478.559998  484.250000  475.079987  484.029999   35756200   
           NVDA    183.779999  185.479996  182.039993  184.970001  162785400   
           TSLA    451.450012  456.88000

## Selecting Features and Lags

In [22]:
# 1. Configuration: every selected feature is requested with lags 1, 2 and 3
feature_names = [
    'log_ret',
    'rsi',
    'rsi_diff',
    'bb_pct_b',
    'bb_width',
    'macd_rel_hist',
    'rvol',
]
selection = {name: list(range(1, 4)) for name in feature_names}

# 2. Prepare Features
X = feature_generator.construct_feature_matrix(metrics_df, selection)

print(X)

                    log_ret  log_ret_lag_1  log_ret_lag_2  log_ret_lag_3  \
Date       Ticker                                                          
2015-01-02 AAPL         NaN            NaN            NaN            NaN   
           AMZN         NaN            NaN            NaN            NaN   
           AVGO         NaN            NaN            NaN            NaN   
           GOOGL        NaN            NaN            NaN            NaN   
           META         NaN            NaN            NaN            NaN   
...                     ...            ...            ...            ...   
2025-12-10 GOOGL   0.009823       0.010653      -0.023127       0.011426   
           META   -0.010451      -0.014867      -0.009879       0.017814   
           MSFT   -0.027738       0.002035       0.016137       0.004813   
           NVDA   -0.006454      -0.003131       0.017067      -0.005304   
           TSLA    0.014008       0.012637      -0.034478       0.001034   

           

In [25]:
# Label the raw price data with the triple barrier method, reusing the same
# column map and 'Ticker' grouping level as the feature generator.
triple_barrier_method = TripleBarrierLabeling(col_map, "Ticker")

tbm_data = triple_barrier_method.compute_outcomes(df)

# 'target_side' is the label column used as the prediction target.
y = tbm_data['target_side']

In [26]:
# Attach the labels to the feature matrix as column 'y'.
# NOTE(review): this mutates X in place, so X is no longer a pure feature
# matrix after this cell; re-running earlier cells is needed to restore it.
X['y'] = y

# Full Cycle Example

In [16]:
# Importing our First Pipeline
from finml_core.pipelines.data_factory import DatasetGenerator
# Importing Financial Splitting Methods
from finml_core.model_selection.split import PurgedKFold, purged_train_test_split

# External Libraries
import pandas as pd
import numpy as np

# Machine learning external modules
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

## Config for the Dataset Generation

In [15]:
# Configuration

# Financial / market indicators to compute, each with its keyword parameters.
features_config = {
    "log_ret": dict(period=1),
    "rsi": dict(period=14),
    "bollinger": dict(window=20, std=2.0),
    "macd": dict(fast=12, slow=26, signal=9),
    "rvol": dict(window=20)
}

# Computed metrics that will be used as model features. Each key maps to the
# list of lags to include for that metric (filled in by the loop below).
# These are the metrics recommended for use as features.
feature_selection = {
    'log_ret': [],
    'rsi': [],
    'rsi_diff': [],
    'bb_pct_b': [],
    'bb_width': [],
    'macd_rel_hist': [],
    'rvol': []
}

# Lags 1, 2 and 3 per feature (range(1, 4) excludes 4)
for sel in feature_selection:
    feature_selection[sel] = [i for i in range(1, 4)]

# Parameters for the triple barrier method labeling
label_config = dict(
    stop_loss_multiplier = 1,
    take_profit_multiplier = 1,
    time_limit = 10,
    vol_span = 50
)

# Stocks to download, the start date of the download window, and the ticker
# of the reference index (^GSPC).
tickers = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'NVDA', 'META', 'TSLA', 'AVGO']
start = '2015-01-01'
etf_reference = '^GSPC'

## Generating Dataset

In [None]:
# Create Dataset generator instance
dataset_generator = DatasetGenerator(
    data_source='yfinance',
    feature_config=features_config,
    feature_selection=feature_selection,
    label_config=label_config
)

# Getting X and y
# NOTE(review): the saved output below reports "1 tickers" and 2708 rows,
# which does not match the 8-ticker `tickers` list configured in this
# notebook; the output appears to come from an earlier run — re-run to refresh.
X, y = dataset_generator.run(
    tickers=tickers,
    etf_reference=etf_reference,
    start_date=start
)

# This contains all the different calculations made by the feature and the
# labeling modules, excluding the lags
analysis_data = dataset_generator.analysis_data

--- 1. Starting Ingestion (1 tickers) ---
Downloading 1 tickers + ^GSPC reference...


[*********************100%***********************]  2 of 2 completed

--- 2. Generating Features (X) ---
--- 3. Generating Labels (y) ---
--- 4. Consolidating & Cleaning ---
--- Pipeline Finished. Final Dataset: 2708 rows ---





## Purged K-Fold CV Example with Random Forest

In [None]:
# Purged k-fold cross-validation instance built from the label exit dates of
# the FULL dataset, so it has to be paired with the full X and y below.
purged_cv = PurgedKFold(
    n_splits=5,             # Number of folds
    t1=analysis_data['t1'], # Label exit dates
    date_level='Date',      # Date level index reference
    pct_embargo=0.01        # Pct. embargo
)

# Random forest instance (shallow trees, fixed seed for reproducibility)
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    random_state=42
)

# Run the purged k-fold cross-validation and collect one score per fold.
# X (not X_train) is used here: X_train is only created in a later cell and
# would not have the same rows as `y` and `analysis_data['t1']`.
scores = cross_val_score(
    estimator=rf_model,
    X=X,
    y=y,
    cv=purged_cv,
    scoring='accuracy',
    n_jobs=-1,  # Use every CPU core
    verbose=1   # Emit basic progress logs
)

print("\n--- Resultados de PurgedKFold ---")
print(f"Scores por Fold: {scores}")
print(f"Accuracy Promedio: {np.mean(scores):.2%}")
print(f"Desviación Estándar: {np.std(scores):.4f}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.



--- Resultados de PurgedKFold ---
Scores por Fold: [0.47689464 0.43738447 0.48544362 0.41226852 0.45601852]
Accuracy Promedio: 45.36%
Desviación Estándar: 0.0266


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    5.4s remaining:    8.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.5s finished


# Purged K-Fold CV Analysis

In [4]:
# Split features/labels into train and test sets with the project's purged
# splitter, passing the label exit dates (t1) and the name of the date level.
# NOTE(review): the unpacking order is (X_train, y_train, X_test, y_test),
# which differs from scikit-learn's train_test_split — confirm against
# purged_train_test_split's return order.
X_train, y_train, X_test, y_test = purged_train_test_split(
    X,
    y,
    analysis_data['t1'],
    'Date'
)

# Keep only the analysis rows whose index also appears in the training set,
# then extract their exit dates for the training-only cross-validation.
train_valid_values_mask = analysis_data.index.isin(X_train.index)
train_analysis_data = analysis_data.loc[train_valid_values_mask]
train_t1 = train_analysis_data['t1']

In [5]:
# Purged k-fold cross-validation instance restricted to the training rows
# (uses train_t1 instead of the full analysis_data['t1']).
purged_cv = PurgedKFold(
    n_splits=5,             # Number of folds
    t1=train_t1,            # Label exit dates of the training rows
    date_level='Date',      # Date level index reference
    pct_embargo=0.01        # Pct. embargo
)

In [6]:
# Materialise every (train_idx, test_idx) pair produced by the purged CV.
all_folds = list(purged_cv.split(X_train, y_train))

# Inspect the second fold.
actual_fold = all_folds[1]
train_idx, test_idx = actual_fold

# The fold indices come from splitting X_train, so (assuming split() yields
# positional indices like scikit-learn splitters) they are positions within
# X_train. Align the close prices to X_train's rows before using .iloc;
# indexing the full analysis_data positionally would pick the wrong rows
# whenever X_train is only a subset of it.
close_prices = analysis_data.loc[X_train.index, 'Close']
close_prices_train = close_prices.iloc[train_idx]
close_prices_test = close_prices.iloc[test_idx]

# Restrict both sides of the fold to AAPL for plotting.
aapl_train = close_prices_train[close_prices_train.index.get_level_values('Ticker') == 'AAPL']
aapl_test = close_prices_test[close_prices_test.index.get_level_values('Ticker') == 'AAPL']

In [9]:
# Purging / embargo information recorded by the splitter for fold 1, the same
# fold inspected above.
# NOTE(review): presumably these attributes are populated by split() — confirm.
purging_t1 = purged_cv.purging_t1[1]
embargo_t1 = purged_cv.embargo_t1[1]

In [39]:
def extract_subsets(serie:pd.Series):
    """Split a series into its maximal runs of consecutive non-missing values.

    Args:
        serie: Series that may contain missing values (NaN / NaT).

    Returns:
        A list of sub-series, one per contiguous run of non-missing values,
        in their original order. Runs of missing values are dropped. An empty
        or all-missing input yields an empty list.
    """
    # 1. Flag the valid (non-missing) entries
    is_valid = serie.notna()

    # 2. Build a run id that increases every time the series switches between
    #    missing and valid: the comparison with the shifted flags marks each
    #    change of state and cumsum() turns those marks into consecutive ids.
    group_id = (is_valid != is_valid.shift()).cumsum()

    # 3. Group the original series by run id. The raw values are passed so the
    #    grouping is purely positional and never re-aligned on index labels.
    groups = serie.groupby(group_id.values)

    # 4. Keep only the runs made of valid data. Every element of a run shares
    #    the same validity, so checking the first one is enough; the check is
    #    positional so it also works when index labels are not unique.
    subsets = [group for _, group in groups if group.notna().iloc[0]]

    return subsets

In [None]:
# Left purged area
# Left Interval:
# - min(left_purged)

# Right Interval
# - max()

In [59]:
# AAPL close prices, and the fold's purging series aligned to those rows
# (rows absent from purging_t1 become missing after the reindex).
aapl_close = analysis_data.loc[analysis_data.index.get_level_values('Ticker') == 'AAPL', 'Close']
purging_aligned = purging_t1.reindex(aapl_close.index)

# Contiguous runs of non-missing purging values
purge_in_subsets = extract_subsets(purging_aligned)

# The first run is treated as the purged block on the left of the test window.
left_purged_subset = purge_in_subsets[0]

# Left Purged Linf: earliest entry date in the left purged block
left_linf = left_purged_subset.index.get_level_values('Date').min()

# Left Purged Lsup: first AAPL trading date on or after the largest value
# stored in the left purged block (presumably an exit date — TODO confirm).
aaple_dates = aapl_close.index.get_level_values('Date')
left_lsup = aaple_dates[aaple_dates >= left_purged_subset.max()].min()

In [75]:
# Right Purged Subset
right_purged_subset = purge_in_subsets[1]

aapl_t1 = analysis_data.loc[aapl_close.index, 't1']

aapl_t1[aapl_t1.index.get_level_values('Date') <= right_purged_subset.max()]

Date        Ticker
2015-03-17  AAPL     2015-03-31
2015-03-18  AAPL     2015-04-01
2015-03-19  AAPL     2015-04-02
2015-03-20  AAPL     2015-04-06
2015-03-23  AAPL     2015-04-07
                        ...    
2019-07-25  AAPL     2019-07-31
2019-07-26  AAPL     2019-07-31
2019-07-29  AAPL     2019-07-31
2019-07-30  AAPL     2019-07-31
2019-07-31  AAPL     2019-08-02
Name: t1, Length: 1102, dtype: datetime64[ns]

In [74]:
right_purged_subset.max()

Timestamp('2019-07-31 00:00:00')

In [48]:
analysis_data.loc[aapl_close.index, 't1']

Date        Ticker
2015-03-17  AAPL     2015-03-31
2015-03-18  AAPL     2015-04-01
2015-03-19  AAPL     2015-04-02
2015-03-20  AAPL     2015-04-06
2015-03-23  AAPL     2015-04-07
                        ...    
2025-12-11  AAPL     2025-12-18
2025-12-12  AAPL     2025-12-18
2025-12-15  AAPL     2025-12-30
2025-12-16  AAPL     2025-12-31
2025-12-17  AAPL     2026-01-02
Name: t1, Length: 2707, dtype: datetime64[ns]

In [12]:
# AAPL close prices, and the fold's purging series aligned to those rows
aapl_close = analysis_data.loc[analysis_data.index.get_level_values('Ticker') == 'AAPL', 'Close']
purging_aligned = purging_t1.reindex(aapl_close.index)

# Contiguous runs of non-missing purging values
purge_in_subsets = extract_subsets(purging_aligned)

# Left Purging Area: from the last date of the first purged run to the first
# test date
left_purge_last_date = purge_in_subsets[0].index.get_level_values('Date').max()
first_test_date = aapl_test.index.get_level_values('Date').min()

# Right Purging Area: from the last test date to the first date of the second
# purged run
right_purge_first_date = purge_in_subsets[1].index.get_level_values('Date').min()
last_test_date = aapl_test.index.get_level_values('Date').max()

In [None]:
# Earliest entry date of the first purged run
left_purge_train_min = purge_in_subsets[0].index.get_level_values('Date').min()

# Upper bound of the left purge interval.
# NOTE(review): the original note describes this as the "minimum between
# earliest test entry date & min purged ENTRY date", but the code takes the
# max of the LATEST test entry date and the largest VALUE in the purged run.
# The author suspected an error here (entry date vs. exit date for the purged
# info) — confirm the intended definition before relying on this interval.
left_purge_end = max(
    aapl_test.index.get_level_values('Date').max(),
    purge_in_subsets[0].max()
)

left_purge_int = [left_purge_train_min, left_purge_end]

Timestamp('2019-07-02 00:00:00')

In [48]:
fig = go.Figure()

def plot_purging_and_embargo(close:pd.Series, purging:pd.Series, test:pd.Series):
    """Plot a close-price line with the purged areas around a test window.

    Args:
        close: Close prices whose index has a 'Date' level.
        purging: Purging information for the fold; it is reindexed to
            ``close`` and split into contiguous runs of non-missing values.
        test: Test-set observations whose index has a 'Date' level.

    The figure is created locally (so repeated calls do not keep adding
    traces to a shared figure) and displayed with ``fig.show()``.
    """
    fig = go.Figure()

    # 1. Close price (main line)
    fig.add_trace(go.Scatter(
        x=close.index.get_level_values('Date'),
        y=close,
        mode='lines',
        line=dict(color='black', width=1.5),
        name='Close Price'
    ))

    purging_aligned = purging.reindex(close.index)
    purge_in_subsets = extract_subsets(purging_aligned)

    test_dates = test.index.get_level_values('Date')
    first_test_date = test_dates.min()
    last_test_date = test_dates.max()

    def add_vertical_band(interval: list, name: str, color: str):
        # Shaded rectangle spanning interval[0]..interval[1], labelled `name`
        fig.add_vrect(
            x0=interval[0],
            x1=interval[1],
            fillcolor=color,
            opacity=0.3,          # Transparency
            layer="below",        # Keep the band behind the price line
            line_width=0,         # No border
            annotation_text=name,
            annotation_position="top left"
        )

    # Classify each purged run by its position relative to the test window
    # instead of assuming exactly two runs exist: edge folds may have only a
    # left or only a right purged run.
    for subset in purge_in_subsets:
        subset_dates = subset.index.get_level_values('Date')
        if subset_dates.max() <= first_test_date:
            # Run before the test window: band up to the first test date
            add_vertical_band(
                [subset_dates.max(), first_test_date], "Left Purged Area", "blue"
            )
        elif subset_dates.min() >= last_test_date:
            # Run after the test window: band starting at the last test date
            add_vertical_band(
                [last_test_date, subset_dates.min()], "Right Purged Area", "red"
            )

    fig.show()

In [17]:
fig = go.Figure()

# 1. Close price (main line)
fig.add_trace(go.Scatter(
    x=aapl_close.index.get_level_values('Date'),
    y=aapl_close,
    mode='lines',
    line=dict(color='black', width=1.5),
    name='Close Price'
))

# Fold purging information aligned to the AAPL rows, split into contiguous
# runs of non-missing values.
purging_aligned = purging_t1.reindex(aapl_close.index)

purge_in_subsets = extract_subsets(purging_aligned)

# Left Purging Area
left_purge_last_date = purge_in_subsets[0].index.get_level_values('Date').max()
first_test_date = aapl_test.index.get_level_values('Date').min()

# Right Purging Area
right_purge_first_date = purge_in_subsets[1].index.get_level_values('Date').min()
last_test_date = aapl_test.index.get_level_values('Date').max()

# Embargo Area
embargo_dates = embargo_t1.index.get_level_values('Date')

fig.add_vrect(
    x0=left_purge_last_date,
    x1=first_test_date,
    fillcolor='red',
    opacity=.7,           # Transparency
    layer="below",        # Keep the band behind the price line
    line_width=1,         # Thin border in the band colour
    line_color='red',
    annotation_position="top left",
    annotation_text="Left Purged Area"
)

fig.add_vrect(
    x0=last_test_date,
    x1=right_purge_first_date,
    fillcolor='red',
    opacity=.7,           # Transparency
    layer="below",        # Keep the band behind the price line
    line_width=1,         # Thin border in the band colour
    line_color='red',
    annotation_position="top left",
    annotation_text="Right Purged Area"
)

# The blue band is the embargo window, so it is labelled as such (it was
# previously annotated "Right Purged Area", duplicating the band above).
fig.add_vrect(
    x0=embargo_dates.min(),
    x1=embargo_dates.max(),
    fillcolor='blue',
    opacity=.7,           # Transparency
    layer="below",        # Keep the band behind the price line
    line_width=1,         # Thin border in the band colour
    line_color='blue',
    annotation_position="top left",
    annotation_text="Embargo Area"
)

In [15]:
# NOTE(review): `embargo_aligned` is not defined in any visible cell of this
# notebook, so this cell fails on a fresh kernel; it presumably came from a
# deleted cell (e.g. embargo_t1 reindexed to aapl_close) — confirm or remove.
embargo_aligned.index.get_level_values('Date')

DatetimeIndex(['2015-03-17', '2015-03-18', '2015-03-19', '2015-03-20',
               '2015-03-23', '2015-03-24', '2015-03-25', '2015-03-26',
               '2015-03-27', '2015-03-30',
               ...
               '2025-12-01', '2025-12-02', '2025-12-03', '2025-12-04',
               '2025-12-05', '2025-12-08', '2025-12-09', '2025-12-10',
               '2025-12-11', '2025-12-12'],
              dtype='datetime64[ns]', name='Date', length=2704, freq=None)

In [69]:
embargo_dates.max()

Timestamp('2025-12-11 00:00:00')

In [None]:
from plotly import graph_objects as go
import pandas as pd

def plot_close(
        close: pd.Series,
        train: pd.Series,
        test:pd.Series,
        overlap_data: "pd.Series | pd.DataFrame"
):
    """Plot train, test and overlap observations as separate coloured lines.

    Args:
        close: Reference close-price series; its index (with a 'Date' level)
            defines the x axis every other input is aligned to.
        train: Training-set values; rows missing from it become gaps.
        test: Test-set values; rows missing from it become gaps.
        overlap_data: Overlapping observations, either as a price series or
            as a DataFrame with a 'Close' column.
    """
    # Align every input to the close index so the lines share one x axis
    train_aligned = train.reindex(close.index)
    test_aligned = test.reindex(close.index)
    overlap_aligned = overlap_data.reindex(close.index)

    # Accept both a DataFrame (use its 'Close' column) and a plain Series
    if isinstance(overlap_aligned, pd.DataFrame):
        overlap_close = overlap_aligned['Close']
    else:
        overlap_close = overlap_aligned

    fig = go.Figure()

    # 1. Train layer
    fig.add_trace(go.Scatter(
        x=train_aligned.index.get_level_values('Date'),
        y=train_aligned,
        mode='lines',
        line=dict(color='blue', width=2),
        name='Train Set'
    ))

    # 2. Test layer
    fig.add_trace(go.Scatter(
        x=test_aligned.index.get_level_values('Date'),
        y=test_aligned,
        mode='lines',
        line=dict(color='red', width=2),
        name='Test Set'
    ))

    # 3. Overlap layer (given its own legend name; it used to reuse 'Test Set')
    fig.add_trace(go.Scatter(
        x=overlap_aligned.index.get_level_values('Date'),
        y=overlap_close,
        mode='lines',
        line=dict(color='black', width=2),
        name='Overlap'
    ))

    fig.update_layout(title="Opción 1: Líneas Multicolores", template="plotly_white")

    fig.show()


# AAPL close prices used as the reference series for the plot
aapl_close = analysis_data.loc[analysis_data.index.get_level_values('Ticker') == 'AAPL', 'Close']

# NOTE(review): `left_purge`, `right_purge` and `embargo_end` are not defined
# in any visible cell, so this cell fails on a fresh kernel. `lines_dict` is
# also never used afterwards — confirm whether it can be removed.
lines_dict = {
    'Train Latest Entry': left_purge,
    'Train Earliest Entry': right_purge,
    'Embargo ': embargo_end
}
# The overlap argument is a two-column frame ('t1', 'Close') restricted to
# the rows of `left_purge`.
plot_close(aapl_close, aapl_train, aapl_test, analysis_data.loc[left_purge.index, ['t1', 'Close']])

In [61]:
# Check that the latest exit of the train observations on the left of the
# test window is earlier than the first test entry.
# NOTE(review): the 2017-05-09 cutoff is hard-coded for this particular fold.
left_idx = aapl_train[
    aapl_train.index.get_level_values('Date') <= pd.Timestamp(year=2017, month=5, day=9)
].index

latest_train_exit = analysis_data.loc[left_idx, 't1'].max()

earliest_test_entry = aapl_test.index.get_level_values('Date').min()

# Expected to evaluate to True when there is no leakage on the left side
latest_train_exit < earliest_test_entry

True

In [63]:
# Check that the first entry of the train observations on the right of the
# test window is later than the last test exit.
# NOTE(review): the 2019-07-02 cutoff is hard-coded for this particular fold.
earliest_train_entry = aapl_train[
    aapl_train.index.get_level_values('Date') >= pd.Timestamp(year=2019, month=7, day=2)
].index.get_level_values('Date').min()

latest_test_exit = analysis_data.loc[aapl_test.index, 't1'].max()

# Expected to evaluate to True when there is no leakage on the right side
earliest_train_entry > latest_test_exit

True

In [6]:
# Visualizaciones
import plotly.graph_objects as go
from typing import Sequence, Optional, Union

def create_pie_chart(
    labels: Sequence[str],
    values: Sequence[Union[int, float]],
    title: Optional[str] = None,
    hole: float = 0.0,
    pull: Optional[Sequence[float]] = None,
    textinfo: str = "percent+label",
    hoverinfo: str = "label+percent",
    showlegend: bool = True,
    sort: bool = False
) -> go.Figure:
    """Build a Plotly pie (or donut) chart.

    Args:
        labels: Slice labels.
        values: Slice sizes, one per label.
        title: Optional chart title; when given the height is set to 600.
        hole: Fraction of the radius cut out of the centre (0.0 = full pie).
        pull: Optional per-slice pull-out distances, one per label.
        textinfo: Text shown on the slices.
        hoverinfo: Information shown on hover.
        showlegend: Whether the legend is displayed.
        sort: When True, slices are ordered from largest to smallest value.

    Returns:
        The configured ``go.Figure``.

    Raises:
        ValueError: If ``labels`` and ``values`` (or ``pull``, when given)
            differ in length.
    """
    if len(labels) != len(values):
        raise ValueError("labels y values deben tener la misma longitud.")
    if pull is not None and len(pull) != len(labels):
        raise ValueError("Si se proporciona 'pull', debe tener la misma longitud que labels.")

    # Sorting an empty chart is a no-op; skipping it avoids unpacking an
    # empty zip(), which would raise.
    if sort and len(labels) > 0:
        # Explicit None check: `pull or ...` is ambiguous for array-like pull
        slice_pull = pull if pull is not None else [0] * len(labels)
        pairs = sorted(zip(values, labels, slice_pull), reverse=True)
        values, labels, pull = zip(*pairs)

    fig = go.Figure(
        data=[
            go.Pie(
                labels=list(labels),
                values=list(values),
                hole=hole,
                pull=list(pull) if pull is not None else None,
                textinfo=textinfo,
                hoverinfo=hoverinfo,
                showlegend=showlegend,
            )
        ]
    )
    fig.update_layout(margin=dict(t=40, b=20, l=20, r=20))
    if title:
        fig.update_layout(title=title, height=600)
    return fig

# Triple Barrier Method Labeling Analysis

## Label Proportion Analysis

In [7]:
# Assume 'y' is the pipeline output (pd.Series with values -1, 0, 1)

# 1. Compute label frequencies
conteo = y.value_counts()

# 2. Name map (dictionary)
# Translates the numeric labels into financial descriptions
label_map = {
    1.0: "Long (Take Profit)",
    0.0: "Hold (Time Limit)",
    -1.0: "Short (Stop Loss)"
}

# 3. Prepare the inputs for create_pie_chart
# Translate only the label values that actually occur in the data; unknown
# values fall back to their string form.
chart_labels = [label_map.get(idx, str(idx)) for idx in conteo.index]
chart_values = conteo.values

# 4. Build the chart
fig = create_pie_chart(
    labels=chart_labels,
    values=chart_values,
    title="Distribución de Etiquetas - Triple Barrier Method",
    hole=0.4,          # Donut chart (0.0 would give a full pie)
    textinfo="percent+label",
    sort=True          # Largest slice first
)

fig.show()

### Barrier Outcomes on AAPL (last 100 sessions)

In [8]:
from plotly import graph_objects as go
data_raw = dataset_generator.analysis_data

# Last 100 AAPL rows. Both names point at the same slice of analysis_data:
# apple_X is used for prices, apple_y for labels and barrier levels.
apple_X = data_raw.xs('AAPL', level='Ticker').iloc[-100:]
apple_y = data_raw.xs('AAPL', level='Ticker').iloc[-100:]

# Barrier level on rows labelled -1 / +1, and close price on rows labelled 0
filtro_lower = apple_y.loc[apple_y['target_side'] == -1, 'lower_barrier']
filtro_upper = apple_y.loc[apple_y['target_side'] == 1, 'upper_barrier']
filtro_keep = apple_X.loc[apple_y['target_side'] == 0, 'Close']

fig = go.Figure()
# NOTE(review): this close-price trace is built but never added to the figure
# — confirm whether it should be part of add_traces below.
close = go.Scatter(x=apple_X.index, y=apple_X['Close'], mode='lines', name='Close Price', line=dict(color='blue'))

stop_losses = go.Scatter(x=filtro_lower.index, y=filtro_lower, mode='markers', name='Stop Loss', marker=dict(symbol='arrow', color='red'))
take_profits = go.Scatter(x=filtro_upper.index, y=filtro_upper, mode='markers', name='Take Profit', marker=dict(symbol='arrow', color='green'))
time_limit_close = go.Scatter(x=filtro_keep.index, y=filtro_keep, mode='markers', name='Sell at Market Price', marker=dict(symbol='circle-open', color='blue'))

low_price = go.Scatter(x=apple_X.index, y=apple_X['Low'], mode='lines', name='Low Price', line=dict(color='red'))
high_price = go.Scatter(x=apple_X.index, y=apple_X['High'], mode='lines', name='High Price', line=dict(color='green'))

# Disable hover on the traces BEFORE adding them: the figure stores its own
# copies of the traces, so changing these objects after add_traces() would
# not affect what is rendered.
high_price.hoverinfo = 'skip'
low_price.hoverinfo = 'skip'
take_profits.hoverinfo = 'skip'
stop_losses.hoverinfo = 'skip'
time_limit_close.hoverinfo = 'skip'

# Crosshair-style spike lines that follow the cursor on both axes
fig.update_xaxes(
    showspikes=True,
    spikemode='across',
    spikesnap='cursor',
    spikethickness=1,
    spikecolor='black'
)

fig.update_yaxes(
    showspikes=True,
    spikemode='across',
    spikesnap='cursor',
    spikethickness=1,
    spikecolor='black'
)

fig.add_traces([high_price, low_price, take_profits, stop_losses, time_limit_close]).update_layout(template='simple_white')

fig.show()

# Data Corruption for Data Cleaning Function Validation

In [3]:
import numpy as np
import pandas as pd
import numpy as np
from typing import Tuple

def inject_random_invalid_data(
    df: pd.DataFrame, 
    n_samples: int,
    type_data,
    seed: "int | None" = None
) -> Tuple[pd.DataFrame, np.ndarray]:
    """
    Injects ``type_data`` or ``-type_data`` at random positions of a copy of
    the DataFrame (e.g. ``np.inf`` / ``-np.inf``, or ``np.nan``).

    Args:
        df: The numeric DataFrame to corrupt. It is not modified.
        n_samples: Number of cells to overwrite. Positions are drawn with
            replacement, so fewer than ``n_samples`` distinct cells may end
            up corrupted.
        type_data: Invalid value to inject; its sign is chosen at random for
            every position.
        seed: Optional seed for a private random generator, making the
            corruption reproducible. When None, the global ``np.random``
            state is used.

    Returns:
        Tuple containing the corrupted DataFrame and the coordinate grid, an
        array of shape (n_samples, 2) holding [row, column] integer positions.

    Raises:
        ValueError: If ``n_samples`` is negative, or if it is positive and
            the DataFrame has no rows or no columns.
    """
    if n_samples < 0:
        raise ValueError("n_samples must be non-negative.")

    # Work on a copy so the caller's DataFrame stays intact
    corrupted_df = df.copy()
    n_rows, n_cols = corrupted_df.shape

    # Nothing to inject: return the untouched copy and an empty grid
    if n_samples == 0:
        return corrupted_df, np.empty((0, 2), dtype=int)

    if n_rows == 0 or n_cols == 0:
        raise ValueError("Cannot inject values into an empty DataFrame.")

    # Same API for both branches: the np.random module or a RandomState
    rng = np.random if seed is None else np.random.RandomState(seed)

    # 1. Draw random coordinates (low is inclusive, high is exclusive)
    rand_rows = rng.randint(0, n_rows, size=n_samples)
    rand_cols = rng.randint(0, n_cols, size=n_samples)

    # 2. Build the grid as [[row, col], [row, col], ...]
    grid = np.column_stack((rand_rows, rand_cols))

    # 3. Inject the values
    for row_idx, col_idx in grid:
        # Pick the positive or the negative variant at random
        val = rng.choice([type_data, -type_data])

        # iat gives fast scalar access by integer position
        corrupted_df.iat[row_idx, col_idx] = val

    return corrupted_df, grid

In [4]:
# Corrupt the feature matrix in two passes: first with NaNs, then with ±inf.
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
# `grid` ends up holding only the coordinates of the second (inf) pass.
X_corrupted, grid = inject_random_invalid_data(X, 10, np.nan)
X_corrupted, grid = inject_random_invalid_data(X_corrupted, 10, np.inf)

In [None]:
from finml_core.processing.cleaning import DataCleaner

# Cleaner configured with the 'Ticker' level and the 'ffill' option
# (presumably the grouping level and the fill method — TODO confirm).
data_cleaner = DataCleaner('Ticker', 'ffill')

# Validate the corrupted matrix; per the captured output this prints a
# per-ticker health report when the data contains gaps.
is_clean = data_cleaner.validate_and_report(X_corrupted)

Data contains gaps. Generating report...
--- Data Health Report ---

Infs Report:
        rsi  rsi_diff_lag_2  rsi_diff_lag_3  macd_rel_hist_lag_1  \
Ticker                                                             
AMZN      1               0               0                    0   
AVGO      0               0               0                    0   
GOOGL     0               0               0                    0   
MSFT      1               1               0                    1   
NVDA      0               1               0                    0   
TSLA      0               0               1                    0   

        macd_rel_hist_lag_2  rvol  rvol_lag_2  
Ticker                                         
AMZN                      0     0           0  
AVGO                      1     0           0  
GOOGL                     0     0           1  
MSFT                      0     0           1  
NVDA                      0     0           0  
TSLA                      0     1    