In [2]:
import re
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Input, Dense, Dropout, LSTM, Bidirectional, Embedding
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
def generar_diccionario(textos, sw, N):
    """Build a word -> integer-id vocabulary from a corpus.

    A ``CountVectorizer`` limited to ``N`` features and configured with ``sw``
    as its stop-word setting is fitted on ``textos``. Every term of the fitted
    vocabulary receives an id starting at 2, in the iteration order of
    ``vocabulary_``. Ids 0 and 1 are reserved for the special entries
    ``'DESC'`` and ``'PAD'``.

    Args:
        textos: iterable of raw text documents.
        sw: value forwarded to ``CountVectorizer(stop_words=...)``.
        N: value forwarded to ``CountVectorizer(max_features=...)``.

    Returns:
        dict mapping each vocabulary term, plus ``'DESC'`` (0) and
        ``'PAD'`` (1), to its integer id.
    """
    vectorizador = CountVectorizer(stop_words=sw, max_features=N)
    vectorizador.fit(textos)
    # Numbering starts at 2 so that 0 and 1 stay free for the special entries.
    codigos = {
        termino: codigo
        for codigo, termino in enumerate(vectorizador.vocabulary_, start=2)
    }
    codigos.update({'DESC': 0, 'PAD': 1})
    return codigos

def procesar_cadena(texto, diccionario, stop_words, T):
    r"""Encode a text as a fixed-length array of vocabulary ids.

    The text is lower-cased and split into word tokens (regex ``\b\w+\b``),
    stop words are removed, and the first ``T`` remaining tokens are looked up
    in ``diccionario``. Tokens missing from the dictionary are encoded with
    ``diccionario['DESC']``; when fewer than ``T`` tokens remain, the tail is
    filled with ``diccionario['PAD']``.

    Args:
        texto: the raw text to encode.
        diccionario: mapping word -> id that also contains the ``'DESC'`` and
            ``'PAD'`` entries.
        stop_words: words to discard. Accepts the same forms that
            ``CountVectorizer(stop_words=...)`` accepts: ``None`` (no
            filtering), the string ``'english'`` (scikit-learn's built-in
            list) or any collection of words.
        T: length of the returned sequence.

    Returns:
        ``numpy`` array with ``T`` ids (empty when ``T <= 0``).

    Raises:
        ValueError: if ``stop_words`` is a string other than ``'english'``,
            mirroring scikit-learn's behaviour.
    """
    # Normalise the stop words to a set once: membership tests become O(1),
    # and None / 'english' no longer fail or degrade into a substring test
    # against the literal string.
    if stop_words is None:
        excluidas = frozenset()
    elif isinstance(stop_words, str):
        if stop_words != 'english':
            raise ValueError("not a built-in stop list: %s" % stop_words)
        from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
        excluidas = ENGLISH_STOP_WORDS
    elif isinstance(stop_words, (set, frozenset)):
        excluidas = stop_words
    else:
        excluidas = frozenset(stop_words)

    # Identify the words in the text and drop the stop words.
    palabras = [
        palabra
        for palabra in re.findall(r'\b\w+\b', texto.lower())
        if palabra not in excluidas
    ]

    # Keep at most T tokens; max() guards against a negative T turning the
    # slice into "everything but the last |T| tokens".
    longitud = max(T, 0)
    resultado = [
        diccionario[palabra] if palabra in diccionario else diccionario['DESC']
        for palabra in palabras[:longitud]
    ]

    # Right-pad short texts up to the requested length.
    faltan = longitud - len(resultado)
    if faltan > 0:
        resultado.extend([diccionario['PAD']] * faltan)
    return np.array(resultado)

In [5]:
path = "/kaggle/input/tripadvisor-reviews/tripadvisor_hotel_reviews.csv"

datos = pd.read_csv(path)

# Target variable: one-hot encode the possible ratings. The column is selected
# by name (not by position) so that the class count and the encoded target
# always come from the same data, and a numeric dtype is requested explicitly
# instead of relying on the library default.
y = pd.get_dummies(datos['Rating'], dtype='float32')
# The number of classes is the one-hot width, so the softmax layer of the
# models always matches the target shape.
num_clases = y.shape[1]

# Review texts
textos = datos['Review']
# NOTE: stop_words, N and T are not defined in this cell; they must already
# exist in the kernel when it runs.
diccionario = generar_diccionario(textos, stop_words, N)
# X holds one row per review: the encoded words, truncated or padded to the
# same length T.
X = np.array([
    procesar_cadena(texto, diccionario, stop_words, T)
    for texto in textos
])
assert X.shape == (len(textos), T), "every review must be encoded with exactly T ids"



In [6]:
# Show the encoded review matrix (one row of word ids per review).
X

array([[    2,     3,     4, ...,     1,     1,     1],
       [   77,    78,    79, ...,   131,   144,   130],
       [    2,   229,    38, ...,   277,    22,   278],
       ...,
       [   77,    55,  1628, ...,     1,     1,     1],
       [    3, 10487,  3935, ..., 15279,     0,     0],
       [   51,    52,   684, ...,     1,     1,     1]])

In [7]:
# Dimensions of X: (number of reviews, ids per review).
X.shape

(20491, 100)

In [8]:
# Baseline network without an embedding layer: the input is declared as
# X.shape[1] time steps with a single feature each, followed by a plain LSTM,
# a bidirectional LSTM and a softmax layer with one unit per class.
modelo = Sequential([
    Input((X.shape[1], 1)),
    LSTM(units=NEURONAS_CAPA, return_sequences=True),
    Bidirectional(LSTM(units=NEURONAS_CAPA)),
    Dense(units=num_clases, activation='softmax'),
])

modelo.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [9]:
# Train the baseline model; 20% of the samples are held out for validation.
modelo.fit(
    x=X,
    y=y,
    batch_size=128,
    epochs=EPOCHS,
    validation_split=0.2,
)

Epoch 1/10
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 29ms/step - accuracy: 0.4457 - loss: 1.3641 - val_accuracy: 0.5109 - val_loss: 1.2764
Epoch 2/10
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.4700 - loss: 1.3162 - val_accuracy: 0.5213 - val_loss: 1.2432
Epoch 3/10
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.4698 - loss: 1.3094 - val_accuracy: 0.5184 - val_loss: 1.2454
Epoch 4/10
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.4696 - loss: 1.2979 - val_accuracy: 0.5174 - val_loss: 1.2261
Epoch 5/10
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.4730 - loss: 1.3042 - val_accuracy: 0.5228 - val_loss: 1.2205
Epoch 6/10
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.4774 - loss: 1.2830 - val_accuracy: 0.5279 - val_loss: 1.2267
Epoch 7/10
[1m129/129

<keras.src.callbacks.history.History at 0x7d09523a2c50>

In [10]:
import numpy as np
from keras.models import Model
from keras.layers import Input, Embedding

# Toy demonstration of what an Embedding layer produces. It uses its own
# names (N_DEMO, D_DEMO, modelo_demo) so that the notebook-wide N, D and
# `modelo` are not silently replaced by the demo values.
N_DEMO = 5   # number of distinct integer codes the layer accepts
D_DEMO = 8   # length of the vector produced for each code

# Variable-length sequences of integer codes.
capa_entrada = Input(shape=(None,), dtype='int32')

embedding = Embedding(input_dim=N_DEMO, output_dim=D_DEMO)(capa_entrada)

modelo_demo = Model(capa_entrada, embedding)
modelo_demo.summary()

# A single sequence (batch of one); the repeated code 3 yields repeated vectors.
codificacion_entera = [4, 1, 3, 3, 3]
codificacion_embedding = modelo_demo.predict(np.asarray([codificacion_entera]))

print()
print(f'Representación de {codificacion_entera}')
print(codificacion_embedding)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 185ms/step

Representación de [4, 1, 3, 3, 3]
[[[-0.01378427 -0.0389277   0.04386974 -0.0036249   0.02885877
   -0.0436363   0.0236915  -0.02506156]
  [-0.00625841  0.00869348  0.02515589  0.04868311 -0.04498123
   -0.02511599  0.01106633 -0.00466142]
  [-0.03504137 -0.00300815 -0.00655142  0.01696291 -0.01972781
   -0.02173389  0.04273179  0.04724601]
  [-0.03504137 -0.00300815 -0.00655142  0.01696291 -0.01972781
   -0.02173389  0.04273179  0.04724601]
  [-0.03504137 -0.00300815 -0.00655142  0.01696291 -0.01972781
   -0.02173389  0.04273179  0.04724601]]]


In [16]:
# Hyperparameters shared by the embedding-based models below.
# NOTE(review): N and T are also read by the data-preparation cell that
# appears earlier in the notebook, so they must already be defined (with
# matching values) before that cell runs — confirm against the full notebook.

# Vocabulary cap: used as max_features when the dictionary is built and,
# plus 2 reserved codes, as the Embedding input_dim.
N = 20000
# Number of word ids kept per review (sequence length after truncation/padding).
T = 100
D = 128             # Dimension of the embedding vectors
# Number of epochs passed to the fit calls.
EPOCHS = 30
# Base number of units for the LSTM layers.
NEURONAS_CAPA = 128

In [17]:
# The embedding table needs N + 2 rows: up to N vocabulary words plus the
# two reserved codes for unknown words and padding.
modelo = Sequential([
    Embedding(input_dim=N+2, output_dim=D),
    Bidirectional(LSTM(units=NEURONAS_CAPA, return_sequences=True)),
    Bidirectional(LSTM(units=NEURONAS_CAPA)),
    Dense(units=num_clases, activation='softmax'),
])

modelo.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [18]:
# Train the embedding model; 20% of the samples are held out for validation.
modelo.fit(
    x=X,
    y=y,
    batch_size=128,
    epochs=EPOCHS,
    validation_split=0.2,
)

Epoch 1/30
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 46ms/step - accuracy: 0.4822 - loss: 1.2063 - val_accuracy: 0.6211 - val_loss: 0.8400
Epoch 2/30
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 43ms/step - accuracy: 0.6684 - loss: 0.7668 - val_accuracy: 0.6445 - val_loss: 0.8519
Epoch 3/30
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 43ms/step - accuracy: 0.7484 - loss: 0.5982 - val_accuracy: 0.6255 - val_loss: 0.8891
Epoch 4/30
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step - accuracy: 0.8141 - loss: 0.4720 - val_accuracy: 0.6072 - val_loss: 1.0565
Epoch 5/30
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step - accuracy: 0.8585 - loss: 0.3794 - val_accuracy: 0.6001 - val_loss: 1.1629
Epoch 6/30
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 41ms/step - accuracy: 0.8924 - loss: 0.3030 - val_accuracy: 0.5931 - val_loss: 1.3458
Epoch 7/30
[1m129/12

<keras.src.callbacks.history.History at 0x7d08c43a7a30>

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.regularizers import l2

# Regularised variant: half-size bidirectional LSTMs, a Dropout layer after
# each recurrent block and an L2 penalty on the output layer's kernel.
modelo = Sequential([
    Embedding(input_dim=N+2, output_dim=D),
    Bidirectional(LSTM(units=NEURONAS_CAPA//2, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(units=NEURONAS_CAPA//2)),
    Dropout(0.4),
    Dense(units=num_clases, activation='softmax', kernel_regularizer=l2(0.1)),
])

modelo.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Smaller batches than in the previous runs (64 instead of 128).
modelo.fit(
    x=X,
    y=y,
    batch_size=64,
    epochs=EPOCHS,
    validation_split=0.2,
)

Epoch 1/30
[1m257/257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 24ms/step - accuracy: 0.4634 - loss: 1.7731 - val_accuracy: 0.6192 - val_loss: 0.9506
Epoch 2/30
[1m257/257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 22ms/step - accuracy: 0.6473 - loss: 0.8956 - val_accuracy: 0.6189 - val_loss: 0.9436
Epoch 3/30
[1m257/257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 22ms/step - accuracy: 0.6958 - loss: 0.8211 - val_accuracy: 0.6202 - val_loss: 0.9358
Epoch 4/30
[1m257/257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.7497 - loss: 0.7107 - val_accuracy: 0.6245 - val_loss: 1.0003
Epoch 5/30
[1m257/257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 22ms/step - accuracy: 0.7904 - loss: 0.6371 - val_accuracy: 0.6145 - val_loss: 1.0196
Epoch 6/30
[1m257/257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 22ms/step - accuracy: 0.8261 - loss: 0.5730 - val_accuracy: 0.5977 - val_loss: 1.0559
Epoch 7/30
[1m257/25

<keras.src.callbacks.history.History at 0x7d08c5d77340>

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

# Variant with dropout applied inside the LSTM cells (inputs and recurrent
# state), one-third-size layers and an L2 penalty on the output kernel.
# NOTE(review): the non-zero recurrent_dropout is presumably why the epochs
# of this cell take roughly ten times longer than the previous ones — confirm.
modelo = Sequential([
    # Embedding layer
    Embedding(input_dim=N+2, output_dim=D),
    Bidirectional(
        LSTM(
            units=NEURONAS_CAPA//3,
            return_sequences=True,
            dropout=0.4,
            recurrent_dropout=0.4,
        )
    ),
    Bidirectional(
        LSTM(units=NEURONAS_CAPA//3, dropout=0.4, recurrent_dropout=0.4)
    ),
    Dense(units=num_clases, activation='softmax', kernel_regularizer=l2(0.1)),
])

# Stop once val_loss has not improved for 5 epochs and roll back to the
# weights of the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
modelo.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
modelo.fit(
    x=X,
    y=y,
    batch_size=128,
    epochs=EPOCHS,
    validation_split=0.2,
    callbacks=[early_stopping],
)

Epoch 1/30
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 494ms/step - accuracy: 0.4239 - loss: 2.0681 - val_accuracy: 0.5980 - val_loss: 1.1863
Epoch 2/30
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 478ms/step - accuracy: 0.5865 - loss: 1.1539 - val_accuracy: 0.6053 - val_loss: 1.0105
Epoch 3/30
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 469ms/step - accuracy: 0.6234 - loss: 0.9852 - val_accuracy: 0.6138 - val_loss: 0.9832
Epoch 4/30
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 504ms/step - accuracy: 0.6526 - loss: 0.9036 - val_accuracy: 0.6236 - val_loss: 0.9532
Epoch 5/30
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 504ms/step - accuracy: 0.6876 - loss: 0.8455 - val_accuracy: 0.6145 - val_loss: 0.9738
Epoch 6/30
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 497ms/step - accuracy: 0.7043 - loss: 0.8274 - val_accuracy: 0.6106 - val_loss: 0.9904
Epoch 7/30

<keras.src.callbacks.history.History at 0x7d08945ff9a0>