# LSTM MODEL

In [2]:
pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.18.0-cp312-cp312-win_amd64.whl.metadata (3.3 kB)
Collecting tensorflow-intel==2.18.0 (from tensorflow)
  Downloading tensorflow_intel-2.18.0-cp312-cp312-win_amd64.whl.metadata (4.9 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorf

In [4]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [5]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords


In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\naomi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# load dataset
data_t5 = pd.read_csv('C:/Users/naomi/Downloads/labeled_data_reddit_text_yangswei_85.csv')

In [8]:
data_t5.head()

Unnamed: 0,subreddit,link_id,parent_id,created_utc,upvotes,text,predictions
0,remotework,1fy22yp,t3_1fy22yp,2024-10-07 13:03:07,17,Oh how offices have changed the movie office s...,joy
1,remotework,1fy22yp,t3_1fy22yp,2024-10-07 11:38:45,79,Oh how offices have changed when i was in the ...,sadness
2,remotework,1fy22yp,t3_1fy22yp,2024-10-07 11:41:54,66,Oh how offices have changed you were also sexu...,anger
3,remotework,1fy22yp,t3_1fy22yp,2024-10-07 16:17:46,11,Oh how offices have changed overwhelmingly mos...,sadness
4,remotework,1fy22yp,t3_1fy22yp,2024-10-07 11:43:37,28,Oh how offices have changed this isn't even cl...,joy


In [9]:
# set stopwords
stop_words = set(stopwords.words('english'))

# Function to remove stopwords
def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word.lower() not in stop_words])

# Remove stopwords from the text data
data_t5['text'] = data_t5['text'].apply(remove_stopwords)

# Prepare data
X = data_t5['text']
y = data_t5['predictions']

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(set(y_encoded))
y_encoded_cat = to_categorical(y_encoded, num_classes=num_classes)

In [10]:
data_t5_aux = data_t5
data_t5_aux['token_count'] = data_t5_aux['text'].apply(lambda x: len(x.split()))
max_length = data_t5_aux['token_count'].max()
print(max_length)

72


In [11]:
# Tokenization
max_features = 100 #parameter to optimize
tokenizer = Tokenizer(num_words = max_features, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
vocab_size = len(tokenizer.word_index) + 1

# Padding sequences
max_sequence_length = max_length  # Maximum length of input sequences
X_padded = pad_sequences(sequences, maxlen=max_sequence_length)
vocab_size = max_features

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
max_features = 5000
# TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=max_features)  
X_tfidf = tfidf.fit_transform(data_t5['text']).toarray()

In [13]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded_cat, test_size=0.2, random_state=42)

In [None]:
# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_sequence_length))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64))     
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))  # Use 'softmax' for multi-class classification

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=2, validation_split=0.1)



Epoch 1/10
[1m40118/40118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1167s[0m 29ms/step - accuracy: 0.5958 - loss: 1.1661 - val_accuracy: 0.6039 - val_loss: 1.1067
Epoch 2/10
[1m40118/40118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m908s[0m 23ms/step - accuracy: 0.6007 - loss: 1.1154 - val_accuracy: 0.6131 - val_loss: 1.0794
Epoch 3/10
[1m40118/40118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m840s[0m 21ms/step - accuracy: 0.6144 - loss: 1.0807 - val_accuracy: 0.6203 - val_loss: 1.0548
Epoch 4/10
[1m40118/40118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m811s[0m 20ms/step - accuracy: 0.6210 - loss: 1.0577 - val_accuracy: 0.6246 - val_loss: 1.0472
Epoch 5/10
[1m40118/40118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m841s[0m 21ms/step - accuracy: 0.6256 - loss: 1.0445 - val_accuracy: 0.6249 - val_loss: 1.0364
Epoch 6/10
[1m40118/40118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m841s[0m 21ms/step - accuracy: 0.6271 - loss: 1.0422 - val_accuracy: 0.6390 - va

<keras.src.callbacks.history.History at 0x1d050ba7c50>

In [17]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 20ms/step - accuracy: 0.6418 - loss: 1.0093
Test Accuracy: 0.6411


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred_prob = model.predict(X_test)

y_pred = np.argmax(y_pred_prob, axis=1)

y_test_1d = np.argmax(y_test, axis=1)
y_test_1d

print(classification_report(y_test_1d, y_pred))

[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 22ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


'              precision    recall  f1-score   support\n\n           0       0.55      0.21      0.31      3332\n           1       0.68      0.11      0.20      1097\n           2       0.65      0.95      0.77     13447\n           3       0.00      0.00      0.00       164\n           4       0.62      0.16      0.26      3955\n           5       0.88      0.05      0.10       294\n\n    accuracy                           0.64     22289\n   macro avg       0.56      0.25      0.27     22289\nweighted avg       0.63      0.64      0.57     22289\n'

In [None]:
print(classification_report(y_test_1d, y_pred))

              precision    recall  f1-score   support

           0       0.55      0.21      0.31      3332
           1       0.68      0.11      0.20      1097
           2       0.65      0.95      0.77     13447
           3       0.00      0.00      0.00       164
           4       0.62      0.16      0.26      3955
           5       0.88      0.05      0.10       294

    accuracy                           0.64     22289
   macro avg       0.56      0.25      0.27     22289
weighted avg       0.63      0.64      0.57     22289



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
print(classification_report(y_test_1d, y_pred))

In [20]:
pip install keras-tuner

Collecting keras-tunerNote: you may need to restart the kernel to use updated packages.

  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [21]:
import keras_tuner as kt

In [22]:
def build_model(hp):
    model = Sequential()
    model.add(Embedding(input_dim=max_features, output_dim=hp.Choice('output_dim', [64, 128, 256]), input_length=max_sequence_length))
    model.add(LSTM(hp.Int('lstm_units1', min_value=64, max_value=256, step=64), return_sequences=True))
    model.add(Dropout(hp.Float('dropout_rate', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(LSTM(hp.Int('lstm_units2', min_value=32, max_value=128, step=32)))
    model.add(Dropout(hp.Float('dropout_rate', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [23]:
tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=25,  
    executions_per_trial=2,  
    directory='tuner_results',
    project_name='emotion_classification'
)



In [24]:
tuner.search(X_train, y_train, epochs=7, validation_split=0.2, batch_size=32)

Trial 25 Complete [02h 24m 13s]
val_accuracy: 0.6335034370422363

Best val_accuracy So Far: 0.6374291777610779
Total elapsed time: 1d 11h 14m 21s


In [None]:
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print("Best hyperparameters:", best_hps.values)

In [14]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=256, input_length=max_sequence_length))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(96))     
model.add(Dropout(0.3))
model.add(Dense(num_classes, activation='softmax'))  

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=2, validation_split=0.1)



Epoch 1/10
[1m40118/40118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3737s[0m 93ms/step - accuracy: 0.5987 - loss: 1.1699 - val_accuracy: 0.6026 - val_loss: 1.1260
Epoch 2/10
[1m40118/40118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2711s[0m 68ms/step - accuracy: 0.6005 - loss: 1.1319 - val_accuracy: 0.6094 - val_loss: 1.1041
Epoch 3/10
[1m40118/40118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2757s[0m 69ms/step - accuracy: 0.6020 - loss: 1.1167 - val_accuracy: 0.6123 - val_loss: 1.0854
Epoch 4/10
[1m40118/40118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2855s[0m 71ms/step - accuracy: 0.6064 - loss: 1.0997 - val_accuracy: 0.6169 - val_loss: 1.0780
Epoch 5/10
[1m40118/40118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2761s[0m 69ms/step - accuracy: 0.6106 - loss: 1.0987 - val_accuracy: 0.6165 - val_loss: 1.0772
Epoch 6/10
[1m40118/40118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2822s[0m 70ms/step - accuracy: 0.6130 - loss: 1.0885 - val_accuracy: 0.6184

<keras.src.callbacks.history.History at 0x239104a52e0>

In [15]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 41ms/step - accuracy: 0.6206 - loss: 1.0654
Test Accuracy: 0.6191


In [18]:
y_pred_prob = model.predict(X_test)

y_pred = np.argmax(y_pred_prob, axis=1)

y_test_1d = np.argmax(y_test, axis=1)
y_test_1d

[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 37ms/step


array([2, 2, 0, ..., 0, 2, 2])

In [19]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_test_1d, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.08      0.15      3332
           1       0.74      0.02      0.04      1097
           2       0.62      1.00      0.76     13447
           3       0.00      0.00      0.00       164
           4       0.77      0.03      0.06      3955
           5       0.00      0.00      0.00       294

    accuracy                           0.62     22289
   macro avg       0.48      0.19      0.17     22289
weighted avg       0.66      0.62      0.49     22289



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
