In [None]:
%pip install tensorflow

In [17]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split


[reddit comments data from huggingFace](https://huggingface.co/datasets/beenakurian/reddit_comments_subreddit_canada)

In [18]:
import csv

texts = []
labels = []

with open('../../data/clean/reddit_sentiments.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    has_header = csv.Sniffer().has_header(csvfile.read(1024))
    csvfile.seek(0) 
    if has_header:
        next(reader)  # just skip the header
    for row in reader:
        texts.append(row[0])
        labels.append(row[1])

print("texts:", texts[0])
print("labels:", labels[0])


texts: well there are thousands of international students here illegally so we gotta ramp it up
labels: NEG


In [19]:

# change the labels to numbers
label_to_index = {"POS": 0, "NEG": 1, "NEU": 2}
labels = [label_to_index[label] for label in labels]


In [20]:
# Tokenize the texts
max_words = 1000  # We will only consider the 1000 most common words in the dataset
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
max_len = max(len(sequence) for sequence in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_len)


In [21]:

# transfer the labels to one-hot encodings
num_classes = len(label_to_index)
labels = tf.keras.utils.to_categorical(labels, num_classes)


In [23]:

# split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)


In [24]:

# create the model
embedding_dim = 50 # embedding dimension
lstm_units = 128 # number of LSTM units

model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim),  # embedding layer
    LSTM(units=lstm_units), # LSTM layer
    Dense(units=num_classes, activation='softmax') # output layer
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [25]:

# train the model
batch_size = 32
epochs = 10
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)

# save the model
model.save("../../src/model1/my_model.keras")


Epoch 1/10
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 33ms/step - accuracy: 0.5388 - loss: 0.9408 - val_accuracy: 0.5769 - val_loss: 0.8931
Epoch 2/10
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 30ms/step - accuracy: 0.6418 - loss: 0.7991 - val_accuracy: 0.6221 - val_loss: 0.7928
Epoch 3/10
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 30ms/step - accuracy: 0.7033 - loss: 0.6732 - val_accuracy: 0.6528 - val_loss: 0.7705
Epoch 4/10
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 30ms/step - accuracy: 0.7239 - loss: 0.6185 - val_accuracy: 0.6420 - val_loss: 0.7882
Epoch 5/10
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 33ms/step - accuracy: 0.7547 - loss: 0.5605 - val_accuracy: 0.6420 - val_loss: 0.7884
Epoch 6/10
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 24ms/step - accuracy: 0.7770 - loss: 0.5154 - val_accuracy: 0.6203 - val_loss: 0.8404
Epoch 7/10
[1m156/1

In [7]:
model = tf.keras.models.load_model("../../src/model1/my_model.keras")


In [26]:

# evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.6218 - loss: 1.0580
Test Loss: 1.0532234907150269
Test Accuracy: 0.629522442817688


In [27]:
def predict_emotion(sentence, model, tokenizer, max_len, label_to_index):
    
    sequence = tokenizer.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)

    prediction = model.predict(padded_sequence)
    predicted_class_index = np.argmax(prediction)

    index_to_label = {index: label for label, index in label_to_index.items()}
    predicted_class_label = index_to_label[predicted_class_index]

    print("Predicted Emotion:", predicted_class_label)


In [29]:
sentence = input("Enter a sentence: ")
predict_emotion(sentence, model, tokenizer, max_len, label_to_index)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Predicted Emotion: NEU
