In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tokenization_module import TokenizerModule

from tensorflow.keras import layers, models, optimizers, callbacks, preprocessing

In [2]:
#Cargar datasets
train = pd.read_csv("dataset_clean_v2/train_clean_v2.csv")
val = pd.read_csv("dataset_clean_v2/validation_clean_v2.csv")
test = pd.read_csv("dataset_clean_v2/test_clean_v2.csv")

In [3]:
#Cargar tokenizador
tok = TokenizerModule()
tok.load_vectorizer("vectorizer")
encoder = tok.vectorizer
vocab = encoder.get_vocabulary()

In [4]:
#Vectorizar los datasets
X_train = tok.vectorize_texts(train['review_body'].astype(str))
X_val = tok.vectorize_texts(val['review_body'].astype(str))
X_test = tok.vectorize_texts(test['review_body'].astype(str))

#Tomar labels
Y_train = train["label"].astype('int32')
Y_val = val["label"].astype('int32')
Y_test = test["label"].astype('int32')

In [5]:
#Crear modelo
model = models.Sequential([
    tf.keras.layers.Embedding(
        input_dim=len(vocab),
        output_dim=100,
        mask_zero=True,
        embeddings_regularizer=tf.keras.regularizers.l2(1e-5)
        ),
    tf.keras.layers.LSTM(64, dropout=0.3, recurrent_dropout=0.3),
    tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [6]:
sample_text = ["Este es un texto de prueba en español, me gusta mucho hola satisfecho hola hola"]

# Vectorizar primero
sample_seq = encoder(sample_text)  # Tensor de enteros

# Predecir
predictions = model.predict(sample_seq)
print(predictions[0])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 459ms/step
[0.50318253]


In [7]:
model.compile(
    loss='binary_crossentropy',  # Cambiado a binary_crossentropy
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-4),  # Reducido
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
)

In [8]:
model.summary()

In [9]:
#Hiperparámetros
epochs = 10
batch_size = 256

In [10]:
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

In [11]:
history = model.fit(
    X_train, Y_train,
    validation_data=(X_val, Y_val),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[callback]
)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m240s[0m 382ms/step - accuracy: 0.8519 - loss: 0.3722 - precision: 0.8683 - recall: 0.8297 - val_accuracy: 0.8783 - val_loss: 0.3022 - val_precision: 0.9100 - val_recall: 0.8395
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m236s[0m 378ms/step - accuracy: 0.8967 - loss: 0.2803 - precision: 0.9033 - recall: 0.8886 - val_accuracy: 0.8898 - val_loss: 0.2755 - val_precision: 0.8979 - val_recall: 0.8795
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m237s[0m 378ms/step - accuracy: 0.9086 - loss: 0.2518 - precision: 0.9161 - recall: 0.8996 - val_accuracy: 0.8947 - val_loss: 0.2678 - val_precision: 0.9067 - val_recall: 0.8800
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m240s[0m 383ms/step - accuracy: 0.9162 - loss: 0.2327 - precision: 0.9232 - recall: 0.9079 - val_accuracy: 0.8990 - val_loss: 0.2621 - val_precision: 0.9092 - val_recall: 0.8865
Epoch 5/

In [14]:
results = model.evaluate(X_test, Y_test)
print(dict(zip(model.metrics_names, results)))

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.9087 - loss: 0.2513 - precision: 0.9123 - recall: 0.9045
{'loss': 0.25133952498435974, 'compile_metrics': 0.9087499976158142}


In [17]:
model.save("Models/model_lstm_v1.keras")