# LSTM MODEL

In [24]:
pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5
Note: you may need to restart the kernel to use updated packages.


In [25]:
# load libraries
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
from sklearn.metrics import classification_report, confusion_matrix
import keras_tuner as kt


In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# load dataset
data_t5 = pd.read_csv('C:/Users/Usuario/emotion-detection-txa/Pretrained_Model Implementation/t5_model_final.csv')


In [4]:
# set stopwords
stop_words = set(stopwords.words('english'))

# Function to remove stopwords
def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word.lower() not in stop_words])

# Remove stopwords from the text data
data_t5['text'] = data_t5['text'].apply(remove_stopwords)

# Prepare data
X = data_t5['text']
y = data_t5['predicted_label']

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(set(y_encoded))
y_encoded_cat = to_categorical(y_encoded, num_classes=num_classes)

In [5]:
data_t5_aux = data_t5
data_t5_aux['token_count'] = data_t5_aux['text'].apply(lambda x: len(x.split()))
max_length = data_t5_aux['token_count'].max()
print(max_length)

72


In [6]:
# Tokenization
max_features = 100 #parameter to optimize
tokenizer = Tokenizer(num_words = max_features, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
vocab_size = len(tokenizer.word_index) + 1

# Padding sequences
max_sequence_length = max_length  # Maximum length of input sequences
X_padded = pad_sequences(sequences, maxlen=max_sequence_length)
vocab_size = max_features

In [8]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded_cat, test_size=0.2, random_state=42)

# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_sequence_length))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64))     
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))  # Use 'softmax' for multi-class classification

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=6, batch_size=2, validation_split=0.1)



Epoch 1/6
[1m44053/44053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2521s[0m 57ms/step - accuracy: 0.4695 - loss: 1.2725 - val_accuracy: 0.5188 - val_loss: 1.1768
Epoch 2/6
[1m44053/44053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2125s[0m 48ms/step - accuracy: 0.5299 - loss: 1.1549 - val_accuracy: 0.5453 - val_loss: 1.1123
Epoch 3/6
[1m44053/44053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2141s[0m 49ms/step - accuracy: 0.5624 - loss: 1.0887 - val_accuracy: 0.5863 - val_loss: 1.0641
Epoch 4/6
[1m44053/44053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2178s[0m 49ms/step - accuracy: 0.5788 - loss: 1.0598 - val_accuracy: 0.5816 - val_loss: 1.0638
Epoch 5/6
[1m44053/44053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2232s[0m 51ms/step - accuracy: 0.5878 - loss: 1.0438 - val_accuracy: 0.5981 - val_loss: 1.0353
Epoch 6/6
[1m44053/44053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2212s[0m 50ms/step - accuracy: 0.5953 - loss: 1.0290 - val_accuracy: 0.5948 - val

<keras.src.callbacks.history.History at 0x26c9fb05280>

In [None]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')
y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)
y_test_1d = np.argmax(y_test, axis=1)
y_test_1d

[1m765/765[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 34ms/step - accuracy: 0.5953 - loss: 1.0375
Test Accuracy: 0.5991


In [22]:
# Classification report
print(classification_report(y_test_1d, y_pred))

              precision    recall  f1-score   support

           0       0.55      0.59      0.57      7338
           1       0.48      0.27      0.35      2970
           2       0.63      0.77      0.69     11001
           3       0.28      0.04      0.08       112
           4       0.72      0.37      0.49      2809
           5       0.95      0.07      0.14       244

    accuracy                           0.60     24474
   macro avg       0.60      0.35      0.38     24474
weighted avg       0.60      0.60      0.58     24474



In [26]:
# Función para construir el modelo
def build_model(hp):
    model = Sequential()
    model.add(Embedding(input_dim=max_features, output_dim=hp.Choice('output_dim', [64, 128, 256]), input_length=max_sequence_length))
    model.add(LSTM(hp.Int('lstm_units1', min_value=64, max_value=256, step=64), return_sequences=True))
    model.add(Dropout(hp.Float('dropout_rate', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(LSTM(hp.Int('lstm_units2', min_value=32, max_value=128, step=32)))
    model.add(Dropout(hp.Float('dropout_rate', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Configurar la búsqueda
tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=25,  # Número de configuraciones a probar
    executions_per_trial=2,  # Número de ejecuciones por configuración
    directory='tuner_results',
    project_name='emotion_classification'
)

# Ejecutar la búsqueda
tuner.search(X_train, y_train, epochs=5, validation_split=0.2, batch_size=32)


Trial 25 Complete [02h 22m 39s]
val_accuracy: 0.5897344052791595

Best val_accuracy So Far: 0.5988508760929108
Total elapsed time: 1d 12h 52m 29s


In [28]:
# Mostrar los mejores hiperparámetros
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print("Best hyperparameters:", best_hps.values)

Best hyperparameters: {'output_dim': 256, 'lstm_units1': 256, 'dropout_rate': 0.30000000000000004, 'lstm_units2': 64}


In [29]:
# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=max_features, output_dim=256, input_length=max_sequence_length))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(64))     
model.add(Dropout(0.3))
model.add(Dense(num_classes, activation='softmax'))  # Use 'softmax' for multi-class classification

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=7, batch_size=2, validation_split=0.1)



Epoch 1/7
[1m44053/44053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7154s[0m 162ms/step - accuracy: 0.4583 - loss: 1.2869 - val_accuracy: 0.5190 - val_loss: 1.1790
Epoch 2/7
[1m44053/44053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6512s[0m 148ms/step - accuracy: 0.5286 - loss: 1.1673 - val_accuracy: 0.5495 - val_loss: 1.1233
Epoch 3/7
[1m44053/44053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6237s[0m 142ms/step - accuracy: 0.5528 - loss: 1.1212 - val_accuracy: 0.5617 - val_loss: 1.1022
Epoch 4/7
[1m44053/44053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6308s[0m 143ms/step - accuracy: 0.5571 - loss: 1.1073 - val_accuracy: 0.5659 - val_loss: 1.0834
Epoch 5/7
[1m44053/44053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6381s[0m 145ms/step - accuracy: 0.5644 - loss: 1.0911 - val_accuracy: 0.5731 - val_loss: 1.0724
Epoch 6/7
[1m44053/44053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6511s[0m 148ms/step - accuracy: 0.5660 - loss: 1.0889 - val_accuracy: 0.5756

<keras.src.callbacks.history.History at 0x26cd19aa180>

In [30]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')
y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)
y_test_1d = np.argmax(y_test, axis=1)
y_test_1d

[1m765/765[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 265ms/step - accuracy: 0.5740 - loss: 1.0753
Test Accuracy: 0.5785
[1m765/765[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 265ms/step


array([0, 1, 0, ..., 0, 1, 2], dtype=int64)

In [31]:
# Classification report
print(classification_report(y_test_1d, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.57      0.54      7338
           1       0.53      0.16      0.24      2970
           2       0.61      0.78      0.68     11001
           3       0.41      0.06      0.11       112
           4       0.74      0.31      0.44      2809
           5       0.91      0.04      0.08       244

    accuracy                           0.58     24474
   macro avg       0.62      0.32      0.35     24474
weighted avg       0.59      0.58      0.55     24474

