In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tokenization_module import TokenizerModule

from tensorflow.keras import layers, models, optimizers, callbacks, preprocessing

In [38]:
# Load the cleaned train / validation / test splits (read in that order).
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "dataset_clean_v2/train_clean_v2.csv",
        "dataset_clean_v2/validation_clean_v2.csv",
        "dataset_clean_v2/test_clean_v2.csv",
    )
)

In [39]:
# Load the tokenizer
tok = TokenizerModule()
# presumably restores a previously saved vectorizer from "vectorizer" —
# TODO confirm against tokenization_module
tok.load_vectorizer("vectorizer")
# Keep a direct handle on the vectorizer: it is called on raw text in the
# sample-prediction cell, and its vocabulary sizes the Embedding layer.
encoder = tok.vectorizer
vocab = encoder.get_vocabulary()

In [40]:
# Vectorize the review text of each split (train, validation, test order).
X_train, X_val, X_test = (
    tok.vectorize_texts(split['review_body'].astype(str))
    for split in (train, val, test)
)

# Pull the labels of each split as int32.
Y_train, Y_val, Y_test = (
    split["label"].astype('int32')
    for split in (train, val, test)
)

In [41]:
# Build the model: embedding -> LSTM -> small dense head -> sigmoid output.
model = models.Sequential([
    # One embedding row per vocabulary entry; index 0 is treated as padding
    # and masked for the downstream LSTM (mask_zero=True).
    layers.Embedding(
        input_dim=len(vocab),
        output_dim=100,
        mask_zero=True,
        embeddings_regularizer=tf.keras.regularizers.l2(1e-5),
    ),
    # NOTE(review): a non-zero recurrent_dropout may keep Keras from using
    # its fused cuDNN LSTM kernel on GPU — confirm if step time matters.
    layers.LSTM(64, dropout=0.3, recurrent_dropout=0.3),
    layers.Dense(
        32,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.l2(1e-4),
    ),
    layers.Dropout(0.3),
    # Single sigmoid unit: one probability per input sequence.
    layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Smoke test: push one raw sentence through the vectorizer and the model.
# As positioned in the notebook (before compile/fit) this exercises the
# freshly initialised weights, so the printed score is not a meaningful
# prediction — it only checks that the pipeline runs end to end.
sample_text = ["Este es un texto de prueba en español, me gusta mucho hola satisfecho hola hola"]

# Vectorize first
# NOTE(review): this calls `encoder` directly, whereas the datasets go through
# `tok.vectorize_texts` — confirm both apply the same preprocessing.
sample_seq = encoder(sample_text)  # integer tensor

# Predict
predictions = model.predict(sample_seq)
print(predictions[0])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 609ms/step
[0.5090112]


In [43]:
# Compile for binary classification.
# Precision/Recall get explicit names: Keras otherwise auto-generates them
# and uniquifies on every instantiation, so re-running this cell in the same
# kernel would yield "precision_1"/"recall_1" and silently change the keys
# in `history.history` and the training logs.
model.compile(
    loss='binary_crossentropy',  # loss switched to binary cross-entropy
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-4),  # reduced learning rate
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
    ],
)

In [44]:
# Print the layer-by-layer summary of the model.
model.summary()

In [45]:
# Hyperparameters
epochs = 10  # upper bound on epochs; the EarlyStopping callback passed to fit may end training sooner
batch_size = 256  # samples per gradient update, passed to model.fit

In [46]:
# Early stopping on validation loss: give up after 5 epochs without
# improvement and roll the model back to its best-scoring weights.
callback = callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [47]:
# Train the model, evaluating on the validation split after every epoch.
# The returned object is kept in `history` for later inspection.
history = model.fit(
    X_train, Y_train,
    validation_data=(X_val, Y_val),  # supplies the val_loss monitored by the callback
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[callback]  # early stopping configured in the previous cell
)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m411s[0m 654ms/step - accuracy: 0.8533 - loss: 0.3731 - precision: 0.8670 - recall: 0.8348 - val_accuracy: 0.8827 - val_loss: 0.2889 - val_precision: 0.9057 - val_recall: 0.8545
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m435s[0m 696ms/step - accuracy: 0.8967 - loss: 0.2804 - precision: 0.9045 - recall: 0.8870 - val_accuracy: 0.8867 - val_loss: 0.2790 - val_precision: 0.8993 - val_recall: 0.8710
Epoch 3/10
[1m 40/625[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m6:35[0m 676ms/step - accuracy: 0.9107 - loss: 0.2462 - precision: 0.9175 - recall: 0.9058

KeyboardInterrupt: 