In [2]:
# train_and_save_models.py

import pandas as pd
import numpy as np
from transformers import pipeline
from sklearn.preprocessing import MinMaxScaler
from keras.src.models.sequential import Sequential
from keras.src.layers.rnn.lstm import LSTM
from keras.src.layers.core.dense import Dense
from keras.src.optimizers.adam import Adam
from keras.src.callbacks.early_stopping import EarlyStopping
import joblib  # Para guardar el scaler

In [5]:
def cargar_datos():
    dtype = {'text': 'string', 'time': 'int64'}  # Ajusta según tus datos
    df = pd.read_csv("../data/Sintetica.csv", usecols=['text', 'time'], dtype=dtype)
    
    # Convertir 'time' a formato fecha
    if pd.api.types.is_numeric_dtype(df['time']):
        df['time'] = pd.to_datetime(df['time'], unit='ms', errors='coerce')
    else:
        df['time'] = pd.to_datetime(df['time'], errors='coerce')
    
    # Eliminar fechas inválidas
    df = df.dropna(subset=['time'])
    
    return df

def clasificar_sentimientos_bert(data, sentiment_pipeline):
    textos = data['text'].astype(str).tolist()
    resultados = sentiment_pipeline(textos, batch_size=32)
    data['sentiment_label'] = ['Positivo' if res['label'] == 'POSITIVE' else 'Negativo' for res in resultados]
    return data

def preparar_datos(df, sentiment_label):
    sentiment_df = df[df['sentiment_label'] == sentiment_label]
    grouped = sentiment_df.groupby(sentiment_df['time'].dt.to_period("M")).size().reset_index(name='count')
    grouped['time'] = grouped['time'].dt.to_timestamp()
    return grouped

def crear_secuencias(data, window_size):
    X, y = [], []
    for i in range(len(data) - window_size):
        X.append(data[i:i + window_size])
        y.append(data[i + window_size])
    return np.array(X), np.array(y)

def entrenar_modelo_lstm(data, window_size=12, epochs=20, batch_size=16):
    # Escalar datos
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(data['count'].values.reshape(-1, 1))
    joblib.dump(scaler, f'scaler_{data["sentiment_label"].iloc[0]}.joblib')  # Guardar scaler

    # Crear secuencias
    X, y = crear_secuencias(scaled_data, window_size)
    X = X.reshape((X.shape[0], X.shape[1], 1))

    # Dividir en entrenamiento y prueba
    train_size = int(len(X) * 0.8)
    X_train, y_train = X[:train_size], y[:train_size]
    X_test, y_test = X[train_size:], y[train_size:]

    # Construir modelo LSTM
    model = Sequential([
        LSTM(20, activation='relu', input_shape=(window_size, 1)),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=0.01), loss='mse')

    # Entrenar modelo con Early Stopping
    early_stop = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1, callbacks=[early_stop])

    # Guardar el modelo
    model.save(f'modelo_lstm_{data["sentiment_label"].iloc[0]}.h5')

def main():
    # Cargar y procesar datos
    df = cargar_datos()

    # Inicializar el pipeline de sentimientos
    sentiment_pipeline = pipeline('sentiment-analysis')

    # Clasificar sentimientos
    df = clasificar_sentimientos_bert(df, sentiment_pipeline)

    # Preparar y entrenar modelos para cada sentimiento
    for sentiment in ['Positivo', 'Negativo']:
        sentiment_data = preparar_datos(df, sentiment)
        if len(sentiment_data) > 12:
            sentiment_data['sentiment_label'] = sentiment  # Añadir etiqueta para guardar el scaler
            entrenar_modelo_lstm(sentiment_data, window_size=12, epochs=20, batch_size=16)
            print(f"Modelo entrenado y guardado para el sentimiento: {sentiment}")
        else:
            print(f"No hay suficientes datos para entrenar el modelo para el sentimiento {sentiment}.")

if __name__ == "__main__":
    main()
# train_and_save_models.py



No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.






All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
Device set to use 0
  super().__init__(**kwargs)


Epoch 1/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - loss: 0.2753
Epoch 2/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 0.0536
Epoch 3/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0416
Epoch 4/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.0413
Epoch 5/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0412
Epoch 6/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0422
Epoch 7/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0515 
Epoch 8/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0386
Epoch 9/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0459
Epoch 10/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0413
Epoch 11/20
[1m6/6



Modelo entrenado y guardado para el sentimiento: Positivo
Epoch 1/20


  super().__init__(**kwargs)


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 0.2650
Epoch 2/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0565
Epoch 3/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0448
Epoch 4/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0472 
Epoch 5/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 0.0414
Epoch 6/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0448 
Epoch 7/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - loss: 0.0397
Epoch 8/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0364
Epoch 9/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0424
Epoch 10/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0449
Epoch 11/20
[1m6/6[0m [32m━



Modelo entrenado y guardado para el sentimiento: Negativo
