# Clasificación de tono monetario (Hawkish/Dovish/Neutral) con LSTM
**Anexo del ensayo** — Repositorio listo para GitHub.  
Última limpieza automática: 2025-10-02 13:26:23

## Descripción
Este cuaderno implementa una tubería (pipeline) de NLP para clasificar el tono de minutas del Banco de México en *hawkish*, *dovish* o *neutral* mediante una red LSTM.

## Notas de uso
- Este cuaderno se entrega **sin salidas** y con celdas de código limpias.
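- La mayoría de los comentarios de código (`# ...`) fueron eliminados para dejar el código limpio; solo se conservan algunas notas breves de orientación (por ejemplo, dónde cambiar la semilla o el archivo de entrada).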
- Revise el archivo `README.md` para instrucciones reproducibles, dependencias y estructura del repositorio.

In [None]:
!pip install nltk

In [None]:
import os, random
import numpy as np
import tensorflow as tf

def set_seed(seed: int = 13):
    """Seed every random number generator this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds Python's
    ``random`` module, NumPy's global generator and TensorFlow.

    Args:
        seed: Integer seed shared by all of the generators above.
    """
    os.environ.update(PYTHONHASHSEED=str(seed))
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)


# Change the argument (e.g. 7, 13, 21) to try a different seed.
set_seed(2)


In [None]:
import nltk

In [None]:
from pathlib import Path
import pandas as pd

# Folder that holds the labelled .txt datasets; edit this path for your machine.
carpeta = Path(r"C:\Users\Scarl\Documents\CFA\Curso\Python Data Science and AI\Unit 4\Practice ensayo")

# Sort so the printed indices are deterministic.
txt_files = sorted(carpeta.glob("*.txt"))
if not txt_files:
    # Fail here with a clear message instead of a bare IndexError below.
    raise FileNotFoundError(f"No se encontraron archivos .txt en: {carpeta}")

print("TXT encontrados:")
for i, p in enumerate(txt_files):
    print(f"[{i}] {p.name}")

idx_a_usar = 0  # <-- change this number to read a different file from the list
try:
    path = txt_files[idx_a_usar]
except IndexError:
    raise IndexError(
        f"idx_a_usar={idx_a_usar} fuera de rango: hay {len(txt_files)} archivo(s) .txt"
    ) from None
print("\nLeyendo:", path)

def leer_txt_robusto(p: Path):
    """Read a delimited text file into a DataFrame, trying several encodings.

    For each encoding in turn (UTF-8 with BOM, UTF-8, Latin-1) the file is
    first parsed with pandas' delimiter sniffing (``sep=None``). If that
    raises a ``ParserError``, the explicit separators tab, comma, semicolon
    and pipe are tried with the same encoding.

    Args:
        p: Path of the file to read.

    Returns:
        The first DataFrame that parses successfully. A fallback-separator
        result is only accepted when it has more than one column.

    Raises:
        RuntimeError: If no encoding/separator combination works; the message
            lists what failed for each attempt.
    """
    errores = []
    for enc in ("utf-8-sig", "utf-8", "latin-1"):
        try:
            return pd.read_csv(p, sep=None, engine="python", encoding=enc)
        except UnicodeDecodeError:
            errores.append((enc, "UnicodeDecodeError"))
        except pd.errors.ParserError:
            errores.append((enc, "ParserError"))
            for sep in ("\t", ",", ";", "|"):
                try:
                    candidato = pd.read_csv(p, sep=sep, engine="python", encoding=enc)
                except Exception as e:
                    # Record the failure instead of discarding it silently.
                    errores.append((enc, f"sep={sep!r}: {type(e).__name__}"))
                    continue
                # A separator that never occurs in the file still "parses",
                # but puts each whole line in a single column; that is not
                # a usable result, so keep trying.
                if candidato.shape[1] > 1:
                    return candidato
                errores.append((enc, f"sep={sep!r}: una sola columna"))
        except Exception as e:
            errores.append((enc, repr(e)))
    raise RuntimeError(f"No se pudo leer el archivo. Errores: {errores}")

# Load the selected file into a DataFrame.
df_txt = leer_txt_robusto(path)

# Show up to 160 characters per column when DataFrames are displayed.
pd.set_option("display.max_colwidth", 160)
print("Shape:", df_txt.shape)
print("Columnas:", list(df_txt.columns))
# Preview the first 10 rows (rendered as the cell output).
df_txt.head(10)

In [None]:
# Keep only the sentence text and its stance label, under shorter column names.
column_names = {"sentences": "Text", "stance_label": "Label"}
df = df_txt[list(column_names)].rename(columns=column_names)

df.head(10)

In [None]:
df.shape

In [None]:
df['Label'].value_counts()

In [None]:
df.info()

In [None]:
import re

In [None]:
import nltk
from nltk.corpus import stopwords

_ = nltk.download('stopwords', quiet=True)  # quiet=True silences the download messages
EN_STOPWORDS = set(stopwords.words('english'))  # assignment prints nothing
print(f"Stopwords cargadas: {len(EN_STOPWORDS)}")  # brief confirmation


In [None]:


# Matches every character that is not an ASCII letter, a digit or whitespace.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')

# Default stop-word set, built on first use and reused by every later call so
# the NLTK word list is not reloaded for each row.
_DEFAULT_STOP_WORDS = None


def _default_stop_words():
    """Return NLTK's English stop words plus the custom token ``'heshe'``."""
    global _DEFAULT_STOP_WORDS
    if _DEFAULT_STOP_WORDS is None:
        words = set(stopwords.words('english'))
        # Custom stop word; e.g. "he/she" becomes "heshe" once the slash is
        # stripped by the regex above.
        words.add('heshe')
        _DEFAULT_STOP_WORDS = frozenset(words)
    return _DEFAULT_STOP_WORDS


def clean_text(text, stop_words=None):
    """Normalise a sentence: strip symbols, lowercase and drop stop words.

    Args:
        text: Raw sentence. Any non-string value (for example ``None`` or a
            NaN from an empty cell) is treated as missing.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, NLTK's English stop words plus ``'heshe'`` are used.
            A collection passed explicitly is used as-is.

    Returns:
        The remaining words joined by single spaces; ``''`` for missing or
        empty input.
    """
    if not isinstance(text, str):
        return ''

    if stop_words is None:
        stop_words = _default_stop_words()

    # Remove symbols first, then lowercase, so tokens match the lowercase
    # stop-word list.
    text = _NON_ALNUM_RE.sub('', text).lower()
    return ' '.join(word for word in text.split() if word not in stop_words)

In [None]:
# Add a 'Cleaned Text' column by applying clean_text to every sentence.
df['Cleaned Text'] = df['Text'].apply(clean_text)

In [None]:
# Relabel rows whose label is exactly "irrelevant" (any letter case, optional
# surrounding whitespace) as "neutral"; every other label is left unchanged.
df['Label'] = df['Label'].str.replace(r'^\s*irrelevant\s*$', 'neutral', case=False, regex=True)

In [None]:
df

In [None]:
# Save the current DataFrame to df.xlsx, omitting the index column.
df.to_excel("df.xlsx", index=False)

In [None]:

!pip install wordcloud
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt


def generate_word_cloud(text, extra_stopwords=None):
    """Plot a word cloud for ``text`` with matplotlib.

    Args:
        text: A single string containing the words to plot.
        extra_stopwords: Optional iterable of words to exclude in addition to
            wordcloud's built-in ``STOPWORDS``. When omitted, the notebook's
            original hard-coded list is used.

    Returns:
        None. The figure is displayed; if ``text`` is empty or only
        whitespace, a message is printed and nothing is drawn.
    """
    if not text or not text.strip():
        # An empty string has no words to draw (e.g. a label with no rows);
        # bail out with a message rather than letting WordCloud fail.
        print("No hay texto para generar la nube de palabras.")
        return

    if extra_stopwords is None:
        extra_stopwords = {"http", "china", "us", "united states", "political", "politics","stock","stocks", "trump"}

    # Named `excluded` so the nltk `stopwords` module imported elsewhere in
    # this file is not shadowed inside the function.
    excluded = set(STOPWORDS)
    excluded.update(extra_stopwords)

    cloud = WordCloud(width=1600, height=800, stopwords=excluded, min_font_size=10).generate(text)

    plt.figure(figsize=(12, 12))
    plt.imshow(cloud)
    plt.axis("off")
    plt.show()

In [None]:
generate_word_cloud(" ".join(df[df['Label'] == 'hawkish']['Cleaned Text']))

In [None]:
generate_word_cloud(" ".join(df[df['Label'] == 'neutral']['Cleaned Text']))

In [None]:
generate_word_cloud(" ".join(df[df['Label'] == 'dovish']['Cleaned Text']))

In [None]:

!pip install transformers

In [None]:
from transformers import AutoTokenizer
# Only the FinBERT tokenizer is loaded here; it is used to turn text into
# token ids and to size the embedding vocabulary.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence

In [None]:
def tokenization_padding(df):
    """Encode the cleaned sentences and pad them to a common length.

    Each entry of ``df['Cleaned Text']`` is encoded with the module-level
    ``tokenizer``. The resulting id lists are stored in ``df`` (in place) as
    the ``'Encoded Text'`` column, then padded with 0 up to the longest
    sequence.

    Args:
        df: DataFrame with a ``'Cleaned Text'`` column.

    Returns:
        NumPy array of padded token ids, one row per sentence.
    """
    token_ids = list(map(tokenizer.encode, df['Cleaned Text'].tolist()))
    df['Encoded Text'] = token_ids

    id_tensors = [torch.tensor(ids) for ids in df['Encoded Text'].tolist()]
    padded = pad_sequence(id_tensors, batch_first=True, padding_value=0)
    return padded.numpy()

In [None]:
# Encode and pad every cleaned sentence; X is the feature matrix for the model.
X = tokenization_padding(df)
X

In [None]:
X.shape

In [None]:
df

In [None]:
# Class ids expected by the 3-unit softmax output of the model.
LABEL_TO_ID = {'hawkish': 0, 'dovish': 1, 'neutral': 2}

# Normalise case/whitespace before mapping so variants such as "Hawkish " are
# encoded too, instead of surviving as strings in a numeric column.
_encoded_labels = df['Label'].str.strip().str.lower().map(LABEL_TO_ID)
if _encoded_labels.isna().any():
    # Stop early: a missing or unknown label would otherwise break training.
    _unknown = df.loc[_encoded_labels.isna(), 'Label'].unique().tolist()
    raise ValueError(f"Etiquetas no reconocidas en 'Label': {_unknown}")

# Explicit integer dtype; do not rely on pandas inferring it.
df['Encoded Label'] = _encoded_labels.astype('int64')

In [None]:
y = df['Encoded Label'] 
y

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split is identical every time this cell runs, regardless
# of how much of the global NumPy random state was consumed beforehand.
SPLIT_SEED = 2

# 60% train / 40% hold-out, then the hold-out is halved into test and
# validation (20% each of the full data).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, shuffle=True, random_state=SPLIT_SEED
)
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, shuffle=True, random_state=SPLIT_SEED
)

In [None]:
# Report the shape of each split: features first, then labels.
for feature_split in (X_train, X_test, X_val):
    print(feature_split.shape)

for label_split in (y_train, y_test, y_val):
    print(label_split.shape)

In [None]:


import tensorflow as tf
from tensorflow import keras


In [None]:
import tensorflow as tf

# One integer token id per position; the sequence length is the padded width
# of X_train.
inputs = tf.keras.Input(shape=(X_train.shape[1],), dtype="int32")

# 128-dimensional embedding over the tokenizer's vocabulary. mask_zero=True
# flags id 0 as padding, which is the value used when X was padded.
x = tf.keras.layers.Embedding(
    input_dim=tokenizer.vocab_size,
    output_dim=128,
    mask_zero=True
)(inputs)

# Dropout applied to the embedded sequence (rate 0.25).
x = tf.keras.layers.SpatialDropout1D(0.25)(x)

# return_sequences=True keeps one 32-unit output per timestep so the pooling
# layers below can summarise the whole sequence.
x = tf.keras.layers.LSTM(
    32, return_sequences=True, activation='tanh', dropout=0.20
)(x)


# NOTE(review): confirm how the padding mask propagates into the two pooling
# layers below.
gmax = tf.keras.layers.GlobalMaxPooling1D()(x)
gavg = tf.keras.layers.GlobalAveragePooling1D()(x)
x = tf.keras.layers.Concatenate()([gmax, gavg])   # combined max + average pooling


# Three output units, one per class id (0, 1, 2), with softmax probabilities.
outputs = tf.keras.layers.Dense(3, activation='softmax')(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)


In [None]:
from tensorflow.keras.optimizers import Adam

In [None]:
# Targets are integer class ids (0/1/2), hence the sparse variant of
# categorical cross-entropy. The optimizer is selected by its string name.
model.compile(optimizer = 'adam', 
              loss = 'sparse_categorical_crossentropy', 
              metrics = ['accuracy'])

In [None]:
model.summary()

In [None]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight


# Train for 15 epochs with (X_val, y_val) as validation data; the returned
# object is kept in `history`.
history = model.fit(X_train,
                    y_train,
                    validation_data = (X_val, y_val),
                    batch_size = 64, # originally 32
                    verbose = 1,
                    epochs = 15)

In [None]:
results = model.evaluate(X_test, y_test)

In [None]:
# results[1] is read as the accuracy, the only metric passed to compile();
# presumably results[0] is the loss — TODO confirm.
print("Test Accuracy: {:.2f}%".format(results[1] * 100))

In [None]:
predictions = model.predict(X_test)


In [None]:
import numpy as np

# Predicted class id per test sample: index of the largest predicted score.
y_predict = [np.argmax(class_scores) for class_scores in predictions]



In [None]:
y_test

In [None]:
import seaborn as sns

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true vs. predicted class ids on the test split.
cm = confusion_matrix(y_test, y_predict)
# Draw it as a heatmap with the integer count written in each cell.
sns.heatmap(cm, fmt = 'd', annot = True)

In [None]:
from sklearn.metrics import classification_report
# Print scikit-learn's per-class text report for the test split.
print(classification_report(y_test, y_predict))