In [1]:
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv


def find_project_root(marker: str = ".env") -> Path:
    """Locate the project root by walking upward until ``marker`` is found.

    The search starts at the resolved current working directory and then
    visits each of its ancestors, nearest first.

    Args:
        marker: Name of the file or directory whose presence identifies the root.

    Returns:
        The first visited directory that contains ``marker``. When none does,
        the resolved current working directory is returned as a fallback.
    """
    start = Path().resolve()
    candidates = (start, *start.parents)
    # `next` with a default mirrors "return the first hit, else the start dir".
    return next((folder for folder in candidates if (folder / marker).exists()), start)


# Resolve the project root once and load environment variables from its .env file.
ROOT_PATH = find_project_root()
load_dotenv(dotenv_path=ROOT_PATH / ".env")


def get_data_path(relative_path: str | Path) -> str:
    """Retorna ruta absoluta como str con fallback a *_sample si existe."""
    path = Path(relative_path)
    if not path.is_absolute():
        path = ROOT_PATH / path

    if path.exists():
        return str(path)

    sample_path = path.with_name(path.stem + "_sample" + path.suffix)
    if sample_path.exists():
        print(f"‚ö†Ô∏è Usando muestra: {sample_path.name}")
        return str(sample_path)

    raise FileNotFoundError(f"No se encontr√≥ el archivo {path} ni su muestra {sample_path}")


# Report which directory was detected as the project root.
print("‚úÖ Ra√≠z detectada: {}".format(ROOT_PATH))

‚úÖ Ra√≠z detectada: /home/els4nchez/Videos/Harmeregildo


## 1️⃣ Importación de Librerías

In [2]:
# Manipulaci√≥n de datos
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime

# Visualizaci√≥n
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Machine Learning para Isolation Forest
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Estad√≠stica
from scipy import stats as scipy_stats

# Configuraci√≥n
import warnings
# Silence all warnings for the rest of the notebook (note: this hides deprecations too).
warnings.filterwarnings(action='ignore')

# Global plotting defaults for matplotlib / seaborn.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette(palette='husl')

print("‚úÖ Librer√≠as importadas correctamente")

‚úÖ Librer√≠as importadas correctamente


---

## 2️⃣ Carga de Datos

In [3]:
# Project directory layout, all derived from the detected root.
BASE_DIR = ROOT_PATH
DATA_DIR = BASE_DIR / 'data'
UNIFICACION_DIR = BASE_DIR / 'unificacion'
DATA_PROCESADO_DIR = UNIFICACION_DIR / 'datos_procesados'
FIGURAS_DIR = UNIFICACION_DIR / 'figuras'

# Make sure the figures directory exists before anything tries to write to it.
FIGURAS_DIR.mkdir(parents=True, exist_ok=True)

# Enriched price data produced by the EDA step (falls back to the *_sample file).
ruta_precios = get_data_path(DATA_PROCESADO_DIR / 'precios_oro_eda.csv')
df = pd.read_csv(ruta_precios, index_col='Date', parse_dates=['Date'])

# One multi-line report instead of four separate print calls.
print(
    "‚úÖ Datos cargados exitosamente",
    f"   Registros: {len(df):,} d√≠as",
    f"   Variables: {len(df.columns)}",
    f"   Per√≠odo: {df.index.min().date()} ‚Üí {df.index.max().date()}",
    sep="\n",
)

‚úÖ Datos cargados exitosamente
   Registros: 3,614 d√≠as
   Variables: 7
   Per√≠odo: 2016-01-03 ‚Üí 2025-11-24


---

## 3️⃣ Método 1: IQR (Interquartile Range)

**Teoría:**
- **IQR = Q3 - Q1** (rango intercuartílico)
- **Outliers:**
  - Valor < Q1 - 1.5 × IQR  (outlier inferior)
  - Valor > Q3 + 1.5 × IQR  (outlier superior)

**Ventajas:** Robusto a valores extremos, no asume distribución normal  
**Desventajas:** Puede no detectar outliers sutiles en series temporales

In [4]:
def detect_outliers_iqr(data, column, multiplier=1.5):
    """
    Flag outliers in one column with the interquartile-range (IQR) rule.

    A value is an outlier when it lies below ``Q1 - multiplier * IQR`` or
    above ``Q3 + multiplier * IQR``. A short statistics report is printed.

    Parameters:
    -----------
    data : pd.DataFrame
        Frame holding the observations.
    column : str
        Name of the column to analyse.
    multiplier : float
        IQR multiplier for the fences (default 1.5; smaller values flag more
        points, larger values flag fewer).

    Returns:
    --------
    pd.Series : Boolean mask, True where the value is an outlier.
    """
    values = data[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1

    fence_low = q1 - multiplier * spread
    fence_high = q3 + multiplier * spread

    outliers = values.lt(fence_low) | values.gt(fence_high)
    n_flagged = outliers.sum()

    print(f"üìä Estad√≠sticas IQR para '{column}':")
    print(f"   Q1 (percentil 25): {q1:.2f}")
    print(f"   Q3 (percentil 75): {q3:.2f}")
    print(f"   IQR: {spread:.2f}")
    print(f"   L√≠mite inferior: {fence_low:.2f}")
    print(f"   L√≠mite superior: {fence_high:.2f}")
    print(f"   Outliers detectados: {n_flagged} ({n_flagged/len(data)*100:.2f}%)")

    return outliers

# Run the IQR detector on the closing price and keep the mask as a flag column.
print("üîç M√âTODO 1: IQR (Interquartile Range)")
print("=" * 70)
df['outlier_iqr'] = detect_outliers_iqr(data=df, column='Close')

üîç M√âTODO 1: IQR (Interquartile Range)
üìä Estad√≠sticas IQR para 'Close':
   Q1 (percentil 25): 1297.35
   Q3 (percentil 75): 1956.94
   IQR: 659.60
   L√≠mite inferior: 307.95
   L√≠mite superior: 2946.34
   Outliers detectados: 258 (7.14%)


In [5]:
# Plot the closing price with the IQR-flagged outliers highlighted.
es_outlier_iqr = df['outlier_iqr']
serie_normal = df[~es_outlier_iqr]
serie_outliers = df[es_outlier_iqr]

fig = go.Figure()

# Regular observations drawn as a line.
fig.add_trace(
    go.Scatter(
        x=serie_normal.index,
        y=serie_normal['Close'],
        mode='lines',
        name='Precio Normal',
        line={'color': 'lightblue', 'width': 1.5},
    )
)

# Flagged observations drawn as red crosses with a custom hover label.
fig.add_trace(
    go.Scatter(
        x=serie_outliers.index,
        y=serie_outliers['Close'],
        mode='markers',
        name='Outliers (IQR)',
        marker={'color': 'red', 'size': 8, 'symbol': 'x'},
        hovertemplate='<b>Fecha</b>: %{x|%Y-%m-%d}<br><b>Precio</b>: $%{y:.2f}<extra></extra>',
    )
)

fig.update_layout(
    title='üîç Detecci√≥n de Anomal√≠as - M√©todo IQR',
    xaxis_title='Fecha',
    yaxis_title='Precio de Cierre (USD/oz)',
    template='plotly_white',
    height=500,
    hovermode='x unified',
)

fig.show()

---

## 4️⃣ Método 2: Z-Score

**Teoría:**
- **Z-Score = (X - μ) / σ** (cuántas desviaciones estándar se aleja de la media)
- **Outliers:** |Z-Score| > umbral (típicamente 2.5 o 3)

**Ventajas:** Simple, interpretable  
**Desventajas:** Asume distribución normal, sensible a valores extremos

In [6]:
def detect_outliers_zscore(data, column, threshold=3):
    """
    Flag outliers in one column using the Z-Score rule ``|z| > threshold``.

    The Z-Score is ``(x - mean) / std`` computed over the whole column. A short
    statistics report is printed.

    Side effect: the *signed* Z-Score of every row is written to
    ``data['z_score']`` (the input frame is modified in place). The sign is
    kept so that values below the mean show up as negative scores; the outlier
    decision itself uses the absolute value.

    Parameters:
    -----------
    data : pd.DataFrame
        Frame holding the observations; receives the ``z_score`` column.
    column : str
        Name of the column to analyse.
    threshold : float
        Absolute Z-Score above which a value is flagged (default 3; lower
        values flag more points, higher values flag fewer).

    Returns:
    --------
    pd.Series : Boolean mask, True where the value is an outlier.
    """
    mean = data[column].mean()
    std = data[column].std()

    # Keep the sign: storing |z| would make a negative threshold meaningless
    # and hide whether a point lies above or below the mean.
    z_scores = (data[column] - mean) / std
    outliers = z_scores.abs() > threshold

    print(f"üìä Estad√≠sticas Z-Score para '{column}':")
    print(f"   Media (Œº): {mean:.2f}")
    print(f"   Desviaci√≥n est√°ndar (œÉ): {std:.2f}")
    print(f"   Umbral de Z-Score: ¬±{threshold}")
    print(f"   Outliers detectados: {outliers.sum()} ({outliers.sum()/len(data)*100:.2f}%)")

    # Persist the signed Z-Scores for later analysis and plotting.
    data['z_score'] = z_scores

    return outliers

# Run the Z-Score detector on the closing price and keep the mask as a flag column.
print("\nüîç M√âTODO 2: Z-Score")
print("=" * 70)
df['outlier_zscore'] = detect_outliers_zscore(data=df, column='Close', threshold=3)


üîç M√âTODO 2: Z-Score
üìä Estad√≠sticas Z-Score para 'Close':
   Media (Œº): 1806.03
   Desviaci√≥n est√°ndar (œÉ): 621.50
   Umbral de Z-Score: ¬±3
   Outliers detectados: 69 (1.91%)


In [7]:
# Two stacked panels: closing price with flagged points (top), Z-Score (bottom).
fig = make_subplots(
    rows=2,
    cols=1,
    shared_xaxes=True,
    vertical_spacing=0.08,
    subplot_titles=('Precio de Cierre', 'Z-Score'),
    row_heights=[0.6, 0.4],
)

serie_outliers_z = df[df['outlier_zscore']]

# Top panel: full price line plus the outliers as red crosses.
fig.add_trace(
    go.Scatter(
        x=df.index,
        y=df['Close'],
        mode='lines',
        name='Precio',
        line={'color': 'lightblue', 'width': 1.5},
    ),
    row=1, col=1,
)
fig.add_trace(
    go.Scatter(
        x=serie_outliers_z.index,
        y=serie_outliers_z['Close'],
        mode='markers',
        name='Outliers',
        marker={'color': 'red', 'size': 8, 'symbol': 'x'},
    ),
    row=1, col=1,
)

# Bottom panel: the stored Z-Score series.
fig.add_trace(
    go.Scatter(
        x=df.index,
        y=df['z_score'],
        mode='lines',
        name='Z-Score',
        line={'color': 'darkblue', 'width': 1.5},
        showlegend=False,
    ),
    row=2, col=1,
)

# Threshold guides at +3 / -3 and a dotted zero reference.
for nivel, etiqueta in ((3, "Umbral +3"), (-3, "Umbral -3")):
    fig.add_hline(y=nivel, line_dash="dash", line_color="red", row=2, col=1, annotation_text=etiqueta)
fig.add_hline(y=0, line_dash="dot", line_color="gray", row=2, col=1)

fig.update_xaxes(title_text="Fecha", row=2, col=1)
fig.update_yaxes(title_text="Precio (USD/oz)", row=1, col=1)
fig.update_yaxes(title_text="Z-Score", row=2, col=1)

fig.update_layout(
    title_text="üîç Detecci√≥n de Anomal√≠as - M√©todo Z-Score",
    height=700,
    template='plotly_white',
    hovermode='x unified',
)

fig.show()


---

## 5️⃣ Método 3: Isolation Forest

**Teoría:**
- Algoritmo de **Machine Learning** basado en árboles de decisión
- Aísla observaciones creando particiones aleatorias
- Outliers requieren **menos particiones** para ser aislados

**Ventajas:** No asume distribución, detecta anomalías multivariadas  
**Desventajas:** Requiere tuning de hiperparámetros, menos interpretable

In [8]:
def detect_outliers_isolation_forest(data, features, contamination=0.05, random_state=42):
    """
    Flag outliers with an Isolation Forest fitted on the selected features.

    Missing feature values are replaced by the column mean and the features
    are standardised before fitting. A short statistics report is printed.

    Side effect: the per-row score returned by ``score_samples`` is written to
    ``data['anomaly_score']`` (the input frame is modified in place).

    Parameters:
    -----------
    data : pd.DataFrame
        Frame holding the observations; receives the ``anomaly_score`` column.
    features : list
        Names of the columns used as model features.
    contamination : float
        Expected proportion of outliers (0.05 = 5%).
    random_state : int
        Seed for reproducibility.

    Returns:
    --------
    pd.Series : Boolean mask indexed like ``data``, True where the row is an outlier.
    """
    # Work on a copy so the imputation below never touches the caller's columns.
    X = data[features].copy()

    # Impute missing values with the column mean.
    X = X.fillna(X.mean())

    # Standardise the features before fitting.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    iso_forest = IsolationForest(
        contamination=contamination,
        random_state=random_state,
        n_estimators=100,
        max_samples='auto'
    )

    predictions = iso_forest.fit_predict(X_scaled)

    # fit_predict labels outliers as -1 and inliers as 1. Wrap the mask in a
    # Series aligned with the input index so the return type matches the
    # docstring and the other detectors in this notebook.
    outliers = pd.Series(predictions == -1, index=data.index)

    # Persist the anomaly scores for later analysis and plotting.
    data['anomaly_score'] = iso_forest.score_samples(X_scaled)

    print(f"üìä Estad√≠sticas Isolation Forest:")
    print(f"   Features usados: {features}")
    print(f"   Contaminaci√≥n esperada: {contamination*100:.1f}%")
    print(f"   Outliers detectados: {outliers.sum()} ({outliers.sum()/len(data)*100:.2f}%)")
    print(f"   Anomaly score promedio: {data['anomaly_score'].mean():.4f}")

    return outliers

# Run Isolation Forest on OHLC prices plus returns and keep the mask as a flag column.
print("\nüîç M√âTODO 3: Isolation Forest")
print("=" * 70)

features_to_use = ['Open', 'High', 'Low', 'Close', 'Returns']
df['outlier_iforest'] = detect_outliers_isolation_forest(df, features_to_use, contamination=0.05)


üîç M√âTODO 3: Isolation Forest
üìä Estad√≠sticas Isolation Forest:
   Features usados: ['Open', 'High', 'Low', 'Close', 'Returns']
   Contaminaci√≥n esperada: 5.0%
   Outliers detectados: 181 (5.01%)
   Anomaly score promedio: -0.4536


In [9]:
# Two stacked panels: closing price with flagged points (top) and the
# Isolation Forest anomaly score of every day (bottom).
fig = make_subplots(
    rows=2, cols=1,
    shared_xaxes=True,
    vertical_spacing=0.08,
    subplot_titles=('Precio de Cierre', 'Anomaly Score (Isolation Forest)'),
    row_heights=[0.6, 0.4]
)

# Top panel: full price line plus the outliers as red crosses.
fig.add_trace(go.Scatter(
    x=df.index,
    y=df['Close'],
    mode='lines',
    name='Precio',
    line=dict(color='lightblue', width=1.5)
), row=1, col=1)

fig.add_trace(go.Scatter(
    x=df[df['outlier_iforest']].index,
    y=df[df['outlier_iforest']]['Close'],
    mode='markers',
    name='Outliers',
    marker=dict(color='red', size=8, symbol='x')
), row=1, col=1)

# Bottom panel: markers are coloured by the score itself through a colour
# scale, so no per-point colour list is needed (the previously built
# `colors` list was never used and has been removed).
fig.add_trace(go.Scatter(
    x=df.index,
    y=df['anomaly_score'],
    mode='markers',
    name='Anomaly Score',
    marker=dict(color=df['anomaly_score'], colorscale='RdYlBu_r', size=4,
                colorbar=dict(title="Score", x=1.1)),
    showlegend=False
), row=2, col=1)

fig.update_xaxes(title_text="Fecha", row=2, col=1)
fig.update_yaxes(title_text="Precio (USD/oz)", row=1, col=1)
fig.update_yaxes(title_text="Anomaly Score", row=2, col=1)

fig.update_layout(
    title_text="üîç Detecci√≥n de Anomal√≠as - Isolation Forest",
    height=700,
    template='plotly_white',
    hovermode='x unified'
)

fig.show()

print("üí° Interpretaci√≥n:")
print("   - Scores negativos m√°s bajos ‚Üí Mayor anomal√≠a")
print("   - Isolation Forest detecta anomal√≠as multivariadas (OHLC + Returns)")

üí° Interpretaci√≥n:
   - Scores negativos m√°s bajos ‚Üí Mayor anomal√≠a
   - Isolation Forest detecta anomal√≠as multivariadas (OHLC + Returns)


---

## 6️⃣ Comparación de Métodos

In [10]:
# Summary of how many points each detector flagged.
print("üìä COMPARACI√ìN DE M√âTODOS DE DETECCI√ìN")
print("=" * 70)

columnas_flag = {
    'IQR': 'outlier_iqr',
    'Z-Score': 'outlier_zscore',
    'Isolation Forest': 'outlier_iforest',
}
metodos = {nombre: df[columna].sum() for nombre, columna in columnas_flag.items()}

total_registros = len(df)
for metodo, count in metodos.items():
    print(f"   {metodo:20s}: {count:4d} outliers ({count / total_registros * 100:5.2f}%)")

# Consensus: number of detectors that flagged each row; >= 2 counts as agreed.
df['outlier_count'] = df[list(columnas_flag.values())].astype(int).sum(axis=1)
df['outlier_consensus'] = df['outlier_count'] >= 2

n_consenso = df['outlier_consensus'].sum()
print(f"\nüéØ Consenso (‚â•2 m√©todos):")
print(f"   Outliers por consenso: {n_consenso} ({n_consenso/len(df)*100:.2f}%)")

# How many rows were flagged by 0, 1, 2 or 3 detectors.
print(f"\nüìä Distribuci√≥n del consenso:")
consenso_dist = df['outlier_count'].value_counts().sort_index()
for count, freq in consenso_dist.items():
    print(f"   {count} m√©todo(s): {freq:4d} casos ({freq/len(df)*100:5.2f}%)")

üìä COMPARACI√ìN DE M√âTODOS DE DETECCI√ìN
   IQR                 :  258 outliers ( 7.14%)
   Z-Score             :   69 outliers ( 1.91%)
   Isolation Forest    :  181 outliers ( 5.01%)

üéØ Consenso (‚â•2 m√©todos):
   Outliers por consenso: 132 (3.65%)

üìä Distribuci√≥n del consenso:
   0 m√©todo(s): 3307 casos (91.51%)
   1 m√©todo(s):  175 casos ( 4.84%)
   2 m√©todo(s):   63 casos ( 1.74%)
   3 m√©todo(s):   69 casos ( 1.91%)


In [11]:
# Conceptual Venn diagram: size of every exclusive region among the three detectors.
import itertools

print("\nüîç AN√ÅLISIS DE SUPERPOSICI√ìN ENTRE M√âTODOS")
print("=" * 70)

en_iqr = df['outlier_iqr']
en_z = df['outlier_zscore']
en_if = df['outlier_iforest']

# Each entry is (label, mask of rows belonging to exactly that region).
combinaciones = [
    ('IQR solo', en_iqr & ~en_z & ~en_if),
    ('Z-Score solo', ~en_iqr & en_z & ~en_if),
    ('Isolation Forest solo', ~en_iqr & ~en_z & en_if),
    ('IQR ‚à© Z-Score', en_iqr & en_z & ~en_if),
    ('IQR ‚à© Isolation Forest', en_iqr & ~en_z & en_if),
    ('Z-Score ‚à© Isolation Forest', ~en_iqr & en_z & en_if),
    ('IQR ‚à© Z-Score ‚à© Isolation Forest', en_iqr & en_z & en_if),
]

# Empty regions are skipped to keep the report short.
for nombre, mask in combinaciones:
    count = mask.sum()
    if count <= 0:
        continue
    print(f"   {nombre:35s}: {count:3d} casos")


üîç AN√ÅLISIS DE SUPERPOSICI√ìN ENTRE M√âTODOS
   IQR solo                           : 126 casos
   Isolation Forest solo              :  49 casos
   IQR ‚à© Isolation Forest             :  63 casos
   IQR ‚à© Z-Score ‚à© Isolation Forest   :  69 casos


In [12]:
# Bar chart comparing how many outliers each detector flagged.
metodos_nombres = [*metodos.keys()]
metodos_counts = [*metodos.values()]

barras = go.Bar(
    x=metodos_nombres,
    y=metodos_counts,
    marker={'color': ['steelblue', 'coral', 'lightgreen']},
    text=metodos_counts,
    textposition='outside',
)

fig = go.Figure()
fig.add_trace(barras)
fig.update_layout(
    title='üìä Comparaci√≥n de M√©todos de Detecci√≥n de Anomal√≠as',
    xaxis_title='M√©todo',
    yaxis_title='N√∫mero de Outliers Detectados',
    template='plotly_white',
    height=500,
)

fig.show()

In [13]:
# Overlay the detections of every method on the closing-price series.
fig = go.Figure()

# Base price line.
fig.add_trace(
    go.Scatter(
        x=df.index,
        y=df['Close'],
        mode='lines',
        name='Precio Normal',
        line={'color': 'lightgray', 'width': 1},
    )
)

# One marker trace per detector: (flag column, legend label, colour, symbol).
estilos_metodo = [
    ('outlier_iqr', 'IQR', 'blue', 'circle'),
    ('outlier_zscore', 'Z-Score', 'orange', 'square'),
    ('outlier_iforest', 'Isolation Forest', 'green', 'diamond'),
]
for columna, etiqueta, color, simbolo in estilos_metodo:
    detectados = df[df[columna]]
    fig.add_trace(
        go.Scatter(
            x=detectados.index,
            y=detectados['Close'],
            mode='markers',
            name=etiqueta,
            marker={'color': color, 'size': 10, 'symbol': simbolo, 'opacity': 0.6},
        )
    )

# Consensus points drawn last and larger so they stand out.
consensuados = df[df['outlier_consensus']]
fig.add_trace(
    go.Scatter(
        x=consensuados.index,
        y=consensuados['Close'],
        mode='markers',
        name='Consenso (‚â•2 m√©todos)',
        marker={'color': 'red', 'size': 15, 'symbol': 'x', 'line': {'width': 2, 'color': 'darkred'}},
    )
)

fig.update_layout(
    title='üîç Comparaci√≥n Visual de M√©todos de Detecci√≥n',
    xaxis_title='Fecha',
    yaxis_title='Precio de Cierre (USD/oz)',
    template='plotly_white',
    height=600,
    hovermode='x unified',
)

fig.show()

---

## 7️⃣ Análisis Detallado de Outliers por Consenso

In [14]:
# Extract the rows flagged by consensus (>= 2 detectors) into their own frame.
df_outliers = df[df['outlier_consensus']].copy()

print(f"üìä AN√ÅLISIS DE OUTLIERS POR CONSENSO")
print("="*70)
print(f"\nTotal de outliers: {len(df_outliers)}")
print(f"\nüîù Top 10 outliers por consenso (ordenados por fecha):")
print("="*70)

# Detail table, ordered chronologically; only the first ten rows are shown.
outliers_info = df_outliers[['Close', 'Returns', 'z_score', 'anomaly_score', 'outlier_count']].copy()
outliers_info = outliers_info.sort_index()

print(outliers_info.head(10))

# Largest moves by magnitude. The heading promises |Returns|, so the ranking
# must use the absolute return; ranking on the signed value would only list
# the biggest gains and silently drop the biggest falls.
print(f"\nüö® Top 10 outliers por magnitud de cambio (|Returns|):")
print("="*70)
top_returns = (
    df_outliers
    .assign(_abs_returns=df_outliers['Returns'].abs())  # temporary ranking key
    .nlargest(10, '_abs_returns', keep='all')
    [['Close', 'Returns', 'z_score', 'anomaly_score']]
)
print(top_returns)

üìä AN√ÅLISIS DE OUTLIERS POR CONSENSO

Total de outliers: 132

üîù Top 10 outliers por consenso (ordenados por fecha):
               Close   Returns   z_score  anomaly_score  outlier_count
Date                                                                  
2025-03-13  2986.195  0.016296  1.898915      -0.619534              2
2025-03-18  3031.528  0.010615  1.971856      -0.609536              2
2025-03-19  3049.875  0.006034  2.001377      -0.595972              2
2025-03-21  3022.705 -0.007758  1.957660      -0.605731              2
2025-03-24  3009.595 -0.004945  1.936566      -0.598301              2
2025-03-27  3055.275  0.011151  2.010066      -0.607663              2
2025-03-28  3083.898  0.009325  2.056121      -0.607332              2
2025-03-31  3120.348  0.010130  2.114769      -0.615125              2
2025-04-02  3162.475  0.015126  2.182553      -0.632493              2
2025-04-03  3112.625 -0.015889  2.102343      -0.644250              2

üö® Top 10 outliers por 

In [15]:
# Descriptive statistics: normal days versus consensus-outlier days.
print("\nüìä ESTAD√çSTICAS DE OUTLIERS")
print("=" * 70)

df_normales = df[~df['outlier_consensus']]

# Same report for prices and for returns: normal days first, then outlier days.
for etiqueta, columna in (("Precios", "Close"), ("Retornos", "Returns")):
    print(f"\n{etiqueta} en d√≠as normales:")
    print(df_normales[columna].describe())

    print(f"\n{etiqueta} en d√≠as con outliers:")
    print(df_outliers[columna].describe())


üìä ESTAD√çSTICAS DE OUTLIERS

Precios en d√≠as normales:
count    3482.000000
mean     1735.108807
std       507.638472
min      1063.059000
25%      1292.817750
50%      1733.418000
75%      1930.708000
max      3447.685000
Name: Close, dtype: float64

Precios en d√≠as con outliers:
count     132.000000
mean     3676.740583
std       381.356087
min      2981.075000
25%      3329.330000
50%      3685.277000
75%      4013.187500
max      4365.225000
Name: Close, dtype: float64

Retornos en d√≠as normales:
count    3481.000000
mean        0.000282
std         0.007364
min        -0.060532
25%        -0.002127
50%         0.000000
75%         0.002807
max         0.072794
Name: Returns, dtype: float64

Retornos en d√≠as con outliers:
count    132.000000
mean       0.002676
std        0.014101
min       -0.063265
25%       -0.005079
50%        0.002845
75%        0.010621
max        0.036377
Name: Returns, dtype: float64


In [16]:
# Temporal distribution of the consensus outliers.
# Calendar year and month are attached as helper columns on the outlier frame.
df_outliers['a√±o'] = df_outliers.index.year
df_outliers['mes'] = df_outliers.index.month

# Number of outliers per calendar year.
conteo_anual = df_outliers.groupby('a√±o').size()

barras_anuales = go.Bar(
    x=conteo_anual.index,
    y=conteo_anual.values,
    marker={'color': 'crimson'},
    text=conteo_anual.values,
    textposition='outside',
)

fig = go.Figure()
fig.add_trace(barras_anuales)
fig.update_layout(
    title='üìÖ Distribuci√≥n Temporal de Outliers por A√±o',
    xaxis_title='A√±o',
    yaxis_title='N√∫mero de Outliers',
    template='plotly_white',
    height=500,
)

fig.show()

# Five years with the most outliers.
print(f"\nüí° A√±os con m√°s outliers:")
print(conteo_anual.sort_values(ascending=False).head(5))


üí° A√±os con m√°s outliers:
a√±o
2025    132
dtype: int64


---

## 8️⃣ Análisis Contextual de Outliers Principales

In [17]:
# Report the 20 consensus outliers with the largest absolute daily return.
encabezado = [
    "üö® TOP 20 OUTLIERS M√ÅS EXTREMOS",
    "=" * 70,
    "\nCriterio: Mayor valor absoluto de retornos",
    "\nEstos eventos pueden estar asociados a:",
    "   ‚Ä¢ Crisis financieras globales",
    "   ‚Ä¢ Decisiones de pol√≠tica monetaria (FED, BCE)",
    "   ‚Ä¢ Eventos geopol√≠ticos importantes",
    "   ‚Ä¢ Anuncios econ√≥micos relevantes",
    "\n" + "=" * 70,
]
print(*encabezado, sep="\n")

# Rank by absolute return; the helper column is reused by later cells.
df_outliers['abs_returns'] = df_outliers['Returns'].abs()
top_20_outliers = (
    df_outliers
    .nlargest(20, 'abs_returns')
    .sort_values('abs_returns', ascending=False)
)

for i, (fecha, row) in enumerate(top_20_outliers.iterrows(), 1):
    retorno = row['Returns']
    # Anything that is not strictly positive is reported as a fall.
    if retorno > 0:
        direccion = "üìà ALZA"
    else:
        direccion = "üìâ CA√çDA"
    print(f"{i:2d}. {fecha.strftime('%Y-%m-%d')} ‚Üí {direccion:8s} | Precio: ${row['Close']:7.2f} | Retorno: {retorno*100:+6.2f}% | Z-Score: {row['z_score']:5.2f}")

üö® TOP 20 OUTLIERS M√ÅS EXTREMOS

Criterio: Mayor valor absoluto de retornos

Estos eventos pueden estar asociados a:
   ‚Ä¢ Crisis financieras globales
   ‚Ä¢ Decisiones de pol√≠tica monetaria (FED, BCE)
   ‚Ä¢ Eventos geopol√≠ticos importantes
   ‚Ä¢ Anuncios econ√≥micos relevantes

 1. 2025-10-21 ‚Üí üìâ CA√çDA  | Precio: $4092.62 | Retorno:  -6.33% | Z-Score:  3.68
 2. 2025-10-16 ‚Üí üìà ALZA   | Precio: $4365.22 | Retorno:  +3.64% | Z-Score:  4.12
 3. 2025-04-09 ‚Üí üìà ALZA   | Precio: $3086.49 | Retorno:  +3.57% | Z-Score:  2.06
 4. 2025-04-10 ‚Üí üìà ALZA   | Precio: $3188.78 | Retorno:  +3.26% | Z-Score:  2.22
 5. 2025-04-16 ‚Üí üìà ALZA   | Precio: $3349.89 | Retorno:  +2.98% | Z-Score:  2.48
 6. 2025-04-22 ‚Üí üìâ CA√çDA  | Precio: $3336.32 | Retorno:  -2.91% | Z-Score:  2.46
 7. 2025-10-20 ‚Üí üìà ALZA   | Precio: $4359.90 | Retorno:  +2.85% | Z-Score:  4.11
 8. 2025-11-10 ‚Üí üìà ALZA   | Precio: $4119.70 | Retorno:  +2.66% | Z-Score:  3.72
 9. 2025-10-17 ‚Üí üì

In [18]:
# Price series with the ten most extreme consensus outliers annotated.
top_10 = df_outliers.nlargest(10, 'abs_returns')

# Label for each star: date plus signed percentage return.
etiquetas_top_10 = [
    f"{fecha.strftime('%Y-%m-%d')}<br>{retorno*100:+.1f}%"
    for fecha, retorno in zip(top_10.index, top_10['Returns'])
]

fig = go.Figure()

# Full price line.
fig.add_trace(
    go.Scatter(
        x=df.index,
        y=df['Close'],
        mode='lines',
        name='Precio',
        line={'color': 'lightblue', 'width': 1.5},
    )
)

# Annotated stars on the top-10 dates.
fig.add_trace(
    go.Scatter(
        x=top_10.index,
        y=top_10['Close'],
        mode='markers+text',
        name='Top 10 Outliers',
        marker={'color': 'red', 'size': 15, 'symbol': 'star'},
        text=etiquetas_top_10,
        textposition='top center',
        textfont={'size': 9, 'color': 'darkred'},
    )
)

fig.update_layout(
    title='‚≠ê Top 10 Outliers M√°s Extremos en Serie Temporal',
    xaxis_title='Fecha',
    yaxis_title='Precio de Cierre (USD/oz)',
    template='plotly_white',
    height=600,
    hovermode='x unified',
)

fig.show()

---

## 9️⃣ Exportación de Resultados

In [19]:
# Build the consolidated outlier dataset: market columns, scores and detector flags.
columnas_export = [
    'Open', 'High', 'Low', 'Close', 'Volume',
    'Returns', 'Volatility_30',
    'z_score', 'anomaly_score', 'outlier_count',
    'outlier_iqr', 'outlier_zscore', 'outlier_iforest',
]
df_export_outliers = df_outliers[columnas_export].copy()

# Severity from the number of agreeing detectors: (0,1] / (1,2] / (2,3].
df_export_outliers['severity'] = pd.cut(
    df_export_outliers['outlier_count'],
    bins=[0, 1, 2, 3],
    labels=['Moderado', 'Alto', 'Cr√≠tico'],
)

# Direction of the move; anything not strictly positive is labelled as a fall.
df_export_outliers['direccion'] = (
    df_export_outliers['Returns'].gt(0).map({True: 'ALZA', False: 'CA√çDA'})
)

# Write the CSV next to the other processed datasets.
archivo_outliers = DATA_PROCESADO_DIR / 'outliers_precios_oro.csv'
df_export_outliers.to_csv(archivo_outliers)

print(
    f"‚úÖ Dataset de outliers exportado a: {archivo_outliers}",
    f"\nüìä Variables incluidas: {list(df_export_outliers.columns)}",
    f"\nüìà Total de outliers exportados: {len(df_export_outliers)}",
    sep="\n",
)

‚úÖ Dataset de outliers exportado a: /home/els4nchez/Videos/Harmeregildo/unificacion/datos_procesados/outliers_precios_oro.csv

üìä Variables incluidas: ['Open', 'High', 'Low', 'Close', 'Volume', 'Returns', 'Volatility_30', 'z_score', 'anomaly_score', 'outlier_count', 'outlier_iqr', 'outlier_zscore', 'outlier_iforest', 'severity', 'direccion']

üìà Total de outliers exportados: 132


In [20]:
# Export the full price dataset together with every detection column added above.
archivo_precios_completo = DATA_PROCESADO_DIR / 'precios_oro_con_outliers.csv'

df_export_main = df.copy()
df_export_main.to_csv(archivo_precios_completo)

print(
    f"\n‚úÖ Dataset completo con detecci√≥n de outliers exportado a: {archivo_precios_completo}",
    f"\nüìä Variables incluidas: {list(df_export_main.columns)}",
    f"\nüìà Total de registros: {len(df_export_main):,}",
    sep="\n",
)


‚úÖ Dataset completo con detecci√≥n de outliers exportado a: /home/els4nchez/Videos/Harmeregildo/unificacion/datos_procesados/precios_oro_con_outliers.csv

üìä Variables incluidas: ['Open', 'High', 'Low', 'Close', 'Volume', 'Returns', 'Volatility_30', 'outlier_iqr', 'z_score', 'outlier_zscore', 'anomaly_score', 'outlier_iforest', 'outlier_count', 'outlier_consensus']

üìà Total de registros: 3,614


In [21]:
# Persist summary statistics of the outlier analysis as JSON.
import json

# JSON key -> flag column for each detector (insertion order is the output order).
columnas_por_metodo = {
    'IQR': 'outlier_iqr',
    'Z-Score': 'outlier_zscore',
    'Isolation_Forest': 'outlier_iforest',
}

outlier_stats = {
    'total_registros': len(df),
    # Count and percentage of flagged rows for every detector.
    'metodos': {
        nombre: {
            'outliers': int(df[columna].sum()),
            'porcentaje': float(df[columna].sum() / len(df) * 100),
        }
        for nombre, columna in columnas_por_metodo.items()
    },
    'consenso': {
        'outliers': int(df['outlier_consensus'].sum()),
        'porcentaje': float(df['outlier_consensus'].sum() / len(df) * 100),
    },
    # Dates of the ten most extreme outliers (computed in the plotting cell above).
    'top_10_fechas': top_10.index.strftime('%Y-%m-%d').tolist(),
    'estadisticas_outliers': {
        'precio_promedio': float(df_outliers['Close'].mean()),
        'retorno_promedio': float(df_outliers['Returns'].mean()),
        'volatilidad_promedio': float(df_outliers['Volatility_30'].mean()),
    },
}

archivo_stats = DATA_PROCESADO_DIR / 'estadisticas_outliers.json'

with open(archivo_stats, 'w', encoding='utf-8') as f:
    json.dump(outlier_stats, f, indent=2, ensure_ascii=False)

print(f"‚úÖ Estad√≠sticas de outliers guardadas en: {archivo_stats}")

‚úÖ Estad√≠sticas de outliers guardadas en: /home/els4nchez/Videos/Harmeregildo/unificacion/datos_procesados/estadisticas_outliers.json


---

## 🔟 Resumen Ejecutivo

In [22]:
# Resumen ejecutivo
print("""
‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
‚ïë              üîç RESUMEN EJECUTIVO - DETECCI√ìN DE ANOMAL√çAS             ‚ïë
‚ï†‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï£
‚ïë                                                                        ‚ïë
‚ïë  ‚úÖ AN√ÅLISIS COMPLETADO EXITOSAMENTE                                   ‚ïë
‚ïë                                                                        ‚ïë
‚ïë  üìä M√âTODOS IMPLEMENTADOS:                                             ‚ïë
‚ïë                                                                        ‚ïë""")

# Metric rows of the summary box: one (label, count) pair per detector plus consensus.
# Labels keep their original padding so the rendered rows stay unchanged.
filas_resumen = (
    ("1. IQR (Interquartile Range):          ", metodos['IQR']),
    ("2. Z-Score (threshold=3):               ", metodos['Z-Score']),
    ("3. Isolation Forest (contamination=5%): ", metodos['Isolation Forest']),
    ("4. Consenso (‚â•2 m√©todos):               ", df['outlier_consensus'].sum()),
)
for etiqueta_fila, cantidad in filas_resumen:
    print(f"‚ïë  {etiqueta_fila}{cantidad:4d} outliers ({cantidad/len(df)*100:5.2f}%)      ‚ïë")

print("""
‚ïë                                                                        ‚ïë
‚ïë  üìà HALLAZGOS PRINCIPALES:                                             ‚ïë
‚ïë                                                                        ‚ïë
‚ïë  ‚Ä¢ M√©todos complementarios: cada uno detecta diferentes aspectos      ‚ïë
‚ïë  ‚Ä¢ IQR: Robusto para outliers univariados extremos                    ‚ïë
‚ïë  ‚Ä¢ Z-Score: Sensible a desviaciones est√°ndar                          ‚ïë
‚ïë  ‚Ä¢ Isolation Forest: Captura anomal√≠as multivariadas complejas        ‚ïë
‚ïë  ‚Ä¢ Consenso identifica anomal√≠as m√°s confiables                       ‚ïë
‚ïë                                                                        ‚ïë
‚ïë  üîß PREPARACI√ìN PARA AN√ÅLISIS DE CORRELACI√ìN:                          ‚ïë
‚ïë                                                                        ‚ïë
‚ïë  ‚Ä¢ Outliers etiquetados por m√©todo y severidad                        ‚ïë
‚ïë  ‚Ä¢ Fechas de anomal√≠as listas para cruce con noticias                 ‚ïë
‚ïë  ‚Ä¢ M√©tricas de anomal√≠a (Z-Score, Anomaly Score) calculadas           ‚ïë
‚ïë  ‚Ä¢ Dataset consolidado exportado                                      ‚ïë
‚ïë                                                                        ‚ïë
‚ïë  üìÇ ARCHIVOS GENERADOS:                                                ‚ïë
‚ïë                                                                        ‚ïë
‚ïë  ‚Ä¢ outliers_precios_oro.csv (outliers por consenso)                   ‚ïë
‚ïë  ‚Ä¢ precios_oro_con_outliers.csv (dataset completo con flags)          ‚ïë
‚ïë  ‚Ä¢ estadisticas_outliers.json (m√©tricas y top outliers)               ‚ïë
‚ïë                                                                        ‚ïë
‚ïë  ‚û°Ô∏è  SIGUIENTE PASO:                                                   ‚ïë
‚ïë     Notebook 05 - An√°lisis de Sentimientos (FinBERT)                 ‚ïë
‚ïë                                                                        ‚ïë
‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù
""")


‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
‚ïë              üîç RESUMEN EJECUTIVO - DETECCI√ìN DE ANOMAL√çAS             ‚ïë
‚ï†‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï£
‚ïë                                                                        ‚ïë
‚ïë  ‚úÖ AN√ÅLISIS COMPLETADO EXITOSAMENTE                                   ‚ïë
‚ïë                                                                        ‚ïë
‚ïë  üìä M√âTODOS IMPLEMENTADOS:                                             ‚ïë
‚ïë                                                                        ‚ïë
‚ïë  1. IQR (Interquartile Range):           258 outliers ( 7.14%)    