In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# Load FinBERT: the classification model and its matching tokenizer both come
# from the same Hugging Face checkpoint name.
model_name = "yiyanghkust/finbert-tone"  # FinBERT checkpoint for financial tone
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Tokenize
def preprocess_text(text):
    """Tokenize ``text`` with the module-level ``tokenizer``.

    Args:
        text: The text to encode.

    Returns:
        The tokenizer output as PyTorch tensors (``return_tensors="pt"``),
        padded and truncated to at most 512 tokens. Anything past that limit
        is not seen by the model.
    """
    return tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

In [None]:
def get_sentiment_score(text):
    """Classify the tone of ``text`` with the module-level FinBERT ``model``.

    Args:
        text: The string to score; it is tokenized by ``preprocess_text``.

    Returns:
        A tuple ``(label, probabilities)``. ``label`` is the index of the most
        probable class as a Python int, and ``probabilities`` is the list of
        softmax probabilities for the first sequence in the batch.

        Label indices follow the checkpoint's own mapping. For
        ``yiyanghkust/finbert-tone`` the model card lists 0 = neutral,
        1 = positive, 2 = negative; ``model.config.id2label`` is the
        authoritative source if a different checkpoint is loaded.
    """
    inputs = preprocess_text(text)
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = model(**inputs).logits
    probabilities = torch.softmax(logits, dim=1)
    # .item() needs exactly one element, so a single text is expected here.
    sentiment = torch.argmax(probabilities, dim=1)
    return sentiment.item(), probabilities[0].tolist()

In [None]:
import pandas as pd
import re

In [None]:
# Clean-up cell: read the source CSV whose "Speech" column is normalised below.
source_csv = "Data_1996_Today_FED.csv"
data = pd.read_csv(source_csv)
def clean_text(text):
    """Return ``text`` reduced to lowercase ASCII letters and whitespace.

    Every character that is neither an ASCII letter nor whitespace (digits,
    punctuation, accented letters, symbols) is deleted outright, not replaced
    by a space, so ``"long-term"`` becomes ``"longterm"``. The result is then
    lowercased for uniformity.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    unwanted_chars = re.compile(r'[^a-zA-Z\s]')
    letters_only = unwanted_chars.sub('', text)
    return letters_only.lower()
# Normalise the Speech column: missing values become empty strings and every
# entry is coerced to str so clean_text can be applied to each row.
speech_text = data["Speech"].fillna("").astype(str)
data["Speech"] = speech_text.apply(clean_text)
# clean_text returns a str for every str input, so the column needs no second
# fillna/astype pass and no isinstance filter after this point.

In [None]:
# Preview the first five rows; as the cell's last expression it is displayed.
data.head(n=5)

In [None]:
# Apply the FinBERT model to every speech
sentiments = []
probabilities = []

for speech in data["Speech"]:
    # A blank speech (empty or whitespace-only) has nothing to classify.
    # Record it as missing, with the same sentinel values used for failures,
    # so the model never assigns a label to an input with no content.
    if not speech.strip():
        sentiments.append(None)
        probabilities.append([None, None, None])
        continue
    try:
        sentiment, probability = get_sentiment_score(speech)
    except Exception as e:
        # Best effort: report the failing speech and keep the row aligned with
        # placeholder values so the remaining speeches are still scored.
        print(f"Error en el discurso: {speech[:30]}... - Error: {e}")
        sentiments.append(None)
        probabilities.append([None, None, None])
    else:
        sentiments.append(sentiment)
        probabilities.append(probability)

# Both lists hold one entry per row, in the same order as data["Speech"].
data["Sentiment"] = sentiments
data["Sentiment_Probabilities"] = probabilities

In [None]:
# Save the updated dataset (row index is not written to the file)
output_csv = "Data_1996_Today_FED_with_sentiment.csv"
data.to_csv(output_csv, index=False)
print("Dataset actualizado con la columna de sentimiento agregado.")