In [1]:
!pip install imbalanced-learn



In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
df = pd.read_csv('SPY_dataset_project.csv', parse_dates=['Date'], index_col='Date')
df.head()

Unnamed: 0_level_0,Close,Volume,return,signal,market_sentiment,recession_expectation,growing_sector,investor_type,news_impact,policy_uncertainty
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-06-21,424.915955,76982300,-0.005124,0,negative,medium,health,retail,neutral,low
2023-06-22,426.449768,70637200,0.00361,1,positive,high,health,retail,negative,medium
2023-06-23,423.2258,92074500,-0.00756,0,negative,medium,finance,institutional,neutral,low
2023-06-26,421.496613,72823600,-0.004086,0,negative,low,health,retail,negative,low
2023-06-27,426.117554,72813700,0.010963,1,positive,low,manufacturing,retail,positive,low


In [4]:
# Retornos
df['return']     = df['Close'].pct_change()
df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))

# SMA y EWMA
df['SMA_5']   = df['Close'].rolling(5).mean()
df['SMA_10']  = df['Close'].rolling(10).mean()
df['EWMA_5']  = df['Close'].ewm(span=5,  adjust=False).mean()
df['EWMA_10'] = df['Close'].ewm(span=10, adjust=False).mean()

# Volatilidad a 10 días
df['Volatility_10'] = df['Close'].rolling(10).std()

# RSI (14 días)
delta     = df['Close'].diff()
gain      = delta.clip(lower=0)
loss      = -delta.clip(upper=0)
avg_gain  = gain.rolling(14).mean()
avg_loss  = loss.rolling(14).mean()
rs        = avg_gain / avg_loss
df['RSI_14'] = 100 - (100 / (1 + rs))

# Bollinger Bands (20 días)
df['SMA_20']   = df['Close'].rolling(20).mean()
rolling_std    = df['Close'].rolling(20).std()
df['BB_Upper'] = df['SMA_20'] + 2 * rolling_std
df['BB_Lower'] = df['SMA_20'] - 2 * rolling_std

# Cambio de volumen y momentum
df['Volume_change'] = df['Volume'].pct_change()
df['Momentum_5']    = df['Close'] - df['Close'].shift(5)

# Eliminar filas con NaN
df.dropna(inplace=True)

In [5]:
categorical_features = [
    'market_sentiment','recession_expectation','growing_sector',
    'investor_type','news_impact','policy_uncertainty'
]
numerical_features = [
    'SMA_5','SMA_10','EWMA_5','EWMA_10','Volatility_10',
    'RSI_14','BB_Upper','BB_Lower','Volume_change','Momentum_5','log_return'
]

X = df[categorical_features + numerical_features]
y = df['signal']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [7]:
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), numerical_features)
])

pipeline1 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote',        SMOTE(random_state=42)),
    ('classifier',   LogisticRegression(solver='liblinear', random_state=42))
])

In [8]:
pipeline1.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['market_sentiment',
                                                   'recession_expectation',
                                                   'growing_sector',
                                                   'investor_type',
                                                   'news_impact',
                                                   'policy_uncertainty']),
                                                 ('num', StandardScaler(),
                                                  ['SMA_5', 'SMA_10', 'EWMA_5',
                                                   'EWMA_10', 'Volatility_10',
                                                   'RSI_14', 'BB_Upper',
                                                   'BB_Lower', 'Volume_change',
        

In [9]:
# Conjunto de entrenamiento
print("=== Training Set ===")
print(classification_report(y_train, pipeline1.predict(X_train)))
print("Confusion Matrix:\n", confusion_matrix(y_train, pipeline1.predict(X_train)))

# Conjunto de prueba
print("\n=== Test Set ===")
print(classification_report(y_test, pipeline1.predict(X_test)))
print("Confusion Matrix:\n", confusion_matrix(y_test, pipeline1.predict(X_test)))

=== Training Set ===
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       143
           1       0.98      0.98      0.98       194

    accuracy                           0.98       337
   macro avg       0.98      0.98      0.98       337
weighted avg       0.98      0.98      0.98       337

Confusion Matrix:
 [[139   4]
 [  4 190]]

=== Test Set ===
              precision    recall  f1-score   support

           0       0.97      0.94      0.95        62
           1       0.95      0.98      0.96        83

    accuracy                           0.96       145
   macro avg       0.96      0.96      0.96       145
weighted avg       0.96      0.96      0.96       145

Confusion Matrix:
 [[58  4]
 [ 2 81]]


In [10]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import pandas as pd

# 1) Generar reportes (output_dict=True) para extraer métricas
report_train = classification_report(y_train, pipeline1.predict(X_train), output_dict=True)
report_test  = classification_report(y_test,  pipeline1.predict(X_test),  output_dict=True)

# 2) Convertir a DataFrame y seleccionar columnas relevantes
df_train_metrics = pd.DataFrame(report_train).T[['precision','recall','f1-score','support']]
df_test_metrics  = pd.DataFrame(report_test).T[['precision','recall','f1-score','support']]

# 3) Añadir la accuracy general como fila adicional
acc_train = accuracy_score(y_train, pipeline1.predict(X_train))
acc_test  = accuracy_score(y_test,  pipeline1.predict(X_test))
df_train_metrics.loc['accuracy'] = [acc_train, acc_train, acc_train, y_train.shape[0]]
df_test_metrics.loc['accuracy']   = [acc_test,  acc_test,  acc_test,  y_test.shape[0]]

# 4) Mostrar tablas de métricas
print("=== Métricas – Training Set ===")
print(df_train_metrics)
print("\n=== Métricas – Test Set ===")
print(df_test_metrics)

# 5) Mostrar matrices de confusión
print("\n=== Confusion Matrix: Training Set ===")
print(confusion_matrix(y_train, pipeline1.predict(X_train)))
print("\n=== Confusion Matrix: Test Set ===")
print(confusion_matrix(y_test, pipeline1.predict(X_test)))

=== Métricas – Training Set ===
              precision    recall  f1-score  support
0              0.972028  0.972028  0.972028    143.0
1              0.979381  0.979381  0.979381    194.0
accuracy       0.976261  0.976261  0.976261    337.0
macro avg      0.975705  0.975705  0.975705    337.0
weighted avg   0.976261  0.976261  0.976261    337.0

=== Métricas – Test Set ===
              precision    recall  f1-score  support
0              0.966667  0.935484  0.950820     62.0
1              0.952941  0.975904  0.964286     83.0
accuracy       0.958621  0.958621  0.958621    145.0
macro avg      0.959804  0.955694  0.957553    145.0
weighted avg   0.958810  0.958621  0.958528    145.0

=== Confusion Matrix: Training Set ===
[[139   4]
 [  4 190]]

=== Confusion Matrix: Test Set ===
[[58  4]
 [ 2 81]]


In [11]:
# Descripción en el conjunto de entrenamiento
desc_train = X_train[numerical_features].describe().T[['mean','std','min','max']]
print("=== Indicadores Técnicos – Training Set ===")
print(desc_train)

# Descripción en el conjunto de prueba
desc_test  = X_test[numerical_features].describe().T[['mean','std','min','max']]
print("\n=== Indicadores Técnicos – Test Set ===")
print(desc_test)

=== Indicadores Técnicos – Training Set ===
                     mean        std         min         max
SMA_5          517.654802  61.842703  406.781274  609.013953
SMA_10         516.613157  61.568761  410.683267  605.662036
EWMA_5         517.644265  61.749945  407.885763  608.307475
EWMA_10        516.670783  61.488193  411.328000  606.308464
Volatility_10    5.695952   3.421980    1.111150   26.172231
RSI_14          58.323585  17.748074   19.366852   96.715886
BB_Upper       531.220184  63.836143  429.865604  624.685912
BB_Lower       498.279777  59.765673  397.083077  595.254028
Volume_change    0.054803   0.325011   -0.541542    1.163652
Momentum_5       1.874686  10.689483  -50.379974   29.859985
log_return       0.000209   0.010036   -0.060327    0.032513

=== Indicadores Técnicos – Test Set ===
                     mean        std         min         max
SMA_5          529.418059  53.188231  413.583289  606.094739
SMA_10         528.984138  53.299482  412.431314  604.671021


In [12]:
from sklearn.decomposition import PCA
X_train_proc = preprocessor.fit_transform(X_train)
pca_full = PCA().fit(X_train_proc)
cum_var = np.cumsum(pca_full.explained_variance_ratio_)
n_components_80 = int(np.argmax(cum_var >= 0.8) + 1)
print(f"Número de componentes para 80% de varianza: {n_components_80}")

Número de componentes para 80% de varianza: 8


In [13]:
from imblearn.pipeline import Pipeline  # o sklearn.pipeline.Pipeline si no usas SMOTE
from imblearn.over_sampling import SMOTE

pipeline2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA()),               # n_components lo pondremos en el grid
    ('smote', SMOTE(random_state=42)),
    ('classifier', LogisticRegression(solver='liblinear', random_state=42))
])

In [14]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

param_grid = {
    'pca__n_components': [n_components_80],
    'classifier__C': [0.01, 0.1, 1, 10]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    pipeline2,
    param_grid,
    cv=cv,
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

In [15]:
grid_search.fit(X_train, y_train)

print("Mejores parámetros:", grid_search.best_params_)
print("Mejor F1-score CV:", grid_search.best_score_)

best_model2 = grid_search.best_estimator_

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Mejores parámetros: {'classifier__C': 10, 'pca__n_components': 8}
Mejor F1-score CV: 0.8910476190476191


In [16]:
from sklearn.metrics import classification_report, confusion_matrix

print("=== Part 2: Training Set ===")
print(classification_report(y_train, best_model2.predict(X_train)))
print("Confusion Matrix:\n", confusion_matrix(y_train, best_model2.predict(X_train)))

print("\n=== Part 2: Test Set ===")
print(classification_report(y_test, best_model2.predict(X_test)))
print("Confusion Matrix:\n", confusion_matrix(y_test, best_model2.predict(X_test)))

=== Part 2: Training Set ===
              precision    recall  f1-score   support

           0       0.85      0.87      0.86       143
           1       0.91      0.89      0.90       194

    accuracy                           0.88       337
   macro avg       0.88      0.88      0.88       337
weighted avg       0.88      0.88      0.88       337

Confusion Matrix:
 [[125  18]
 [ 22 172]]

=== Part 2: Test Set ===
              precision    recall  f1-score   support

           0       0.87      0.84      0.85        62
           1       0.88      0.90      0.89        83

    accuracy                           0.88       145
   macro avg       0.87      0.87      0.87       145
weighted avg       0.88      0.88      0.88       145

Confusion Matrix:
 [[52 10]
 [ 8 75]]


### 3.1 Impacto de falsos positivos (FP) y falsos negativos (FN)

- **Falsos positivos (FP)**  
  El modelo predice “Buy” (1) cuando en realidad el siguiente retorno es negativo.  
  - Consecuencias:  
    - Se abre una posición perdedora.  
    - Se incurren costes de transacción.  
    - En entornos apalancados, un FP puede disparar drawdowns significativos.  

- **Falsos negativos (FN)**  
  El modelo predice “Do Not Buy” (0) cuando el retorno real iba a ser positivo.  
  - Consecuencias:  
    - Se pierden oportunidades de ganancia.  
    - Reduce el rendimiento global y la eficiencia del capital.  
    - En mercados con fuerte tendencia alcista, implica baja participación en la subida.

### 3.2 ¿Qué métrica debe importar más a la firma?

La elección depende de la relación pérdida/ganancia y de la tolerancia al riesgo:

| Métrica      | Definición                         | ¿Cuándo priorizarla?                                                                 |
|--------------|------------------------------------|---------------------------------------------------------------------------------------|
| **Precision**| TP / (TP + FP)                     | Cuando un FP es muy caro (costes altos, apalancamiento).                              |
| **Recall**   | TP / (TP + FN)                     | Cuando perder oportunidades de ganancia penaliza fuertemente el alpha.                |
| **F1-score** | 2·(Precision·Recall)/(Precision+Recall) | Cuando se desea un compromiso equilibrado entre FP y FN.                              |

- **Priorizar Precision**: si el coste de un FP supera con creces la ganancia de un FN.  
- **Priorizar Recall**: si maximizar capture trades rentable es crítico.  
- **F1-score**: buena métrica intermedia para balancear ambos tipos de error.

### 3.3 Conclusión: ventajas de ML sobre solo indicadores técnicos

1. **Aprendizaje de patrones complejos**  
   Combina múltiples indicadores y variables exógenas para capturar relaciones no lineales.  
2. **Adaptabilidad**  
   Reentrenas el modelo con datos nuevos para adaptarte a cambios de régimen sin recalibrar manualmente.  
3. **Interpretabilidad cuantitativa**  
   La regresión logística ofrece coeficientes que cuantifican el peso de cada indicador en la probabilidad de subida.  
4. **Backtesting y robustez**  
   Validación cruzada y SMOTE mejoran la generalización y reducen overfitting.

En definitiva, una ML Pipeline potencia tus indicadores técnicos, aportando señales más precisas, adaptables y respaldadas por métricas objetivas.