In [2]:
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from sklearn.preprocessing import LabelEncoder


import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [7]:
import numpy as np
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, RNN, LSTM, SimpleRNN, Dropout
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

2025-01-07 21:26:20.131445: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

In [10]:
def preprocess_texts(txts):
    preprocessed_texts = []
    stop_words = set(stopwords.words('english'))
    for txt in txts:
        words = word_tokenize(txt.lower())  # Convert to lowercase
        filtered_txt = [word for word in words if word not in stop_words and word not in string.punctuation]
        preprocessed_texts.append(filtered_txt)  # Keep tokenized words
    return preprocessed_texts

def prepare_data(sentences, labels, w2v_model, label_encoder, max_len=50):
    sequences = [
        np.array([w2v_model.wv[word] for word in sent if word in w2v_model.wv]) 
        for sent in sentences
    ]
    vectors = pad_sequences(sequences, maxlen=max_len, dtype='float32', padding='post')
    encoded_labels = label_encoder.fit_transform(labels)
    y = to_categorical(encoded_labels)
    return vectors, y

def build_lstm_model(maxlen=50, vector_size=100, num_classes=6):
    model = Sequential([
        Input(shape=(maxlen, vector_size)),  
        LSTM(128, return_sequences=True),  # Add more LSTM layers
        Dropout(0.5),
        LSTM(64),
        Dense(64, activation='relu', kernel_regularizer='l2'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [15]:
def train_word2vec(sentences, vector_size=100):
    model = Word2Vec(sentences=sentences, vector_size=vector_size, window=5, min_count=1)
    return model

In [18]:
def get_encoder(label):
    label_reshaped = np.array(label).reshape(-1, 1)
    encoder = LabelEncoder()
    encoder = encoder.fit(label_reshaped)
    return encoder

In [11]:
def load_text(path):
    texts = []
    labels =[]
    with open(path,'r') as file:
        txts = [x.split(';') for x in file.readlines()]
        for text,label in txts:
            texts.append(text)
            labels.append(label.rstrip('\n'))
    return texts,labels

In [5]:
train_path = '../data/emotion/train.txt'
test_path = '../data/emotion/test.txt'
val_path = '../data/emotion/val.txt'

In [6]:
X_train, y_train = load_text(train_path)
X_val, y_val = load_text(val_path)
X_test, y_test = load_text(test_path)

In [12]:
X_train_p = preprocess_texts(X_train)

In [13]:
X_val_p = preprocess_texts(X_val)
X_test_p = preprocess_texts(X_test)

In [16]:
w2v_model = train_word2vec(X_train_p, vector_size=100)

In [21]:
encoder = get_encoder(y_train)

  y = column_or_1d(y, warn=True)


In [22]:
X_train_vec, y_train_en = prepare_data(X_train_p, y_train, w2v_model, encoder)

In [23]:
X_train_vec.shape

(16000, 50, 100)

In [24]:
model = build_lstm_model()

In [26]:
model.summary()

In [27]:
early_stopping = EarlyStopping(monitor='val_loss', patience=10, mode='min' ,restore_best_weights=True)

In [30]:
X_val_vec, y_val_en = prepare_data(X_val_p, y_val, w2v_model, encoder)

In [31]:
EPOCHS = 150
BATCH_SIZE = 512

In [32]:
history = model.fit(X_train_vec, y_train_en, 
                        validation_data=(X_val_vec, y_val_en), epochs=EPOCHS, batch_size=BATCH_SIZE,
                       callbacks=[early_stopping])

Epoch 1/150
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 490ms/step - accuracy: 0.2897 - loss: 2.2672 - val_accuracy: 0.3520 - val_loss: 2.0049
Epoch 2/150
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 467ms/step - accuracy: 0.3263 - loss: 1.9780 - val_accuracy: 0.3520 - val_loss: 1.8626
Epoch 3/150
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 537ms/step - accuracy: 0.3215 - loss: 1.8511 - val_accuracy: 0.3520 - val_loss: 1.7628
Epoch 4/150
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 544ms/step - accuracy: 0.3254 - loss: 1.7592 - val_accuracy: 0.3520 - val_loss: 1.6990
Epoch 5/150
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 470ms/step - accuracy: 0.3181 - loss: 1.6975 - val_accuracy: 0.3520 - val_loss: 1.6565
Epoch 6/150
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 473ms/step - accuracy: 0.3309 - loss: 1.6596 - val_accuracy: 0.3520 - val_loss: 1.6310
Epoch 7/150
[1m

KeyboardInterrupt: 