In [1]:
!git clone https://github.com/PaulPark2022/Emotions-NLP-Classification-A01709885.git

Cloning into 'Emotions-NLP-Classification-A01709885'...
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 11 (delta 1), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (11/11), 709.20 KiB | 4.67 MiB/s, done.
Resolving deltas: 100% (1/1), done.


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Read a "text;label" file and separate each line into text and emotion
def load_txt(path):
    """Load a semicolon-delimited dataset file.

    Every non-blank line is expected to have the form ``<text>;<label>``.
    The line is split on its LAST semicolon, so a sentence that itself
    contains ``;`` is still parsed instead of breaking the unpacking.
    Blank lines (for example a trailing newline at end of file) are skipped.

    Args:
        path: Path of a UTF-8 encoded text file.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists of strings,
        in file order.

    Raises:
        ValueError: If a non-blank line contains no ``;`` separator.
    """
    texts, labels = [], []
    with open(path, 'r', encoding='utf-8') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Nothing to parse on an empty line.
                continue
            # rpartition splits on the right-most ';' only.
            text, sep, label = line.rpartition(';')
            if not sep:
                raise ValueError(
                    f"{path}:{line_no}: expected '<text>;<label>', got {line!r}"
                )
            texts.append(text)
            labels.append(label)
    return texts, labels

# Load the train / validation / test splits.
# The clone cell places the repository in a sub-directory of the working
# directory, so bare file names only resolve when the files also sit next to
# the notebook. Keep that location as first choice and fall back to the
# cloned repository otherwise.
# NOTE(review): assumes the .txt files live at the repository root - confirm.
import os

data_dir = '.' if os.path.exists('train.txt') else 'Emotions-NLP-Classification-A01709885'
train_texts, train_labels = load_txt(os.path.join(data_dir, 'train.txt'))
val_texts, val_labels = load_txt(os.path.join(data_dir, 'val.txt'))
test_texts, test_labels = load_txt(os.path.join(data_dir, 'test.txt'))

# Turn the string labels into integer class ids.
# The encoder is fitted on the training labels only; the validation and test
# labels are then mapped with that same fitted encoder.
le = LabelEncoder()
train_labels_enc = le.fit_transform(train_labels)
val_labels_enc, test_labels_enc = (
    le.transform(split_labels) for split_labels in (val_labels, test_labels)
)

# Tokenize the texts.
# The tokenizer is configured with num_words=10000 and an explicit
# out-of-vocabulary token, and is fitted on the training texts only.
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts)

# Convert every split with the same fitted tokenizer.
train_seq, val_seq, test_seq = map(
    tokenizer.texts_to_sequences,
    (train_texts, val_texts, test_texts),
)

# Padding: bring every sequence to exactly max_len entries, padding and
# truncating at the end of the sequence ('post').
max_len = 50
train_pad, val_pad, test_pad = (
    pad_sequences(split_seq, maxlen=max_len, padding='post', truncating='post')
    for split_seq in (train_seq, val_seq, test_seq)
)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

# `Input` is imported here because this cell's import lines do not include it.
from tensorflow.keras import Input

# One output unit per distinct label seen by the fitted encoder.
num_classes = len(le.classes_)

model = Sequential([
    # Declare the input shape explicitly instead of passing the deprecated
    # `input_length` argument to Embedding.
    Input(shape=(max_len,), dtype='int32'),
    # Reuse the tokenizer's vocabulary cap so the embedding table size cannot
    # drift away from the tokenizer configuration.
    Embedding(input_dim=tokenizer.num_words, output_dim=64),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Sparse loss: the targets are integer class ids, not one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train for 10 epochs with a batch size of 32, passing the validation split
# as validation_data. The returned History object is kept for plotting.
history = model.fit(
    x=train_pad,
    y=train_labels_enc,
    batch_size=32,
    epochs=10,
    validation_data=(val_pad, val_labels_enc),
)

In [None]:
import matplotlib.pyplot as plt

# Plot training vs. validation accuracy per epoch on an explicit Axes, with a
# title and axis labels so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='train acc')
ax.plot(history.history['val_accuracy'], label='val acc')
ax.set(title='Model accuracy', xlabel='Epoch', ylabel='Accuracy')
ax.legend()
plt.show()

In [None]:
# Evaluate on the held-out test split and report accuracy with two decimals.
loss, acc = model.evaluate(x=test_pad, y=test_labels_enc)
print(f"Test Accuracy: {acc:.2f}")

# Ideas for improvement:
# - Tune the LSTM size (32, 128)
# - Change the Dropout rate (0.3, 0.7)
# - Use a pretrained Embedding (GloVe)
# - Try a GRU instead of an LSTM

In [None]:
# Prediction helper
def predict_emotion(text):
    """Return the emotion label predicted for a single piece of text.

    The text goes through the same steps as the training data: the fitted
    ``tokenizer`` converts it to a sequence, the sequence is padded/truncated
    at the end to ``max_len``, and ``model`` scores it.

    Args:
        text: The sentence to classify.

    Returns:
        The label obtained by mapping the highest-scoring output index back
        through ``le.inverse_transform``.
    """
    padded = pad_sequences(
        tokenizer.texts_to_sequences([text]),
        maxlen=max_len,
        padding='post',
        truncating='post',
    )
    scores = model.predict(padded)
    # Only one text is passed in, so the flattened argmax is the class index.
    return le.inverse_transform([scores.argmax()])[0]

# Examples. The author's expected labels are "joy" for the first sentence and
# "sadness" for the second; the actual output depends on the trained model.
for example_text in ("I feel so happy today!", "I can't stop crying."):
    print(predict_emotion(example_text))