In [1]:
%pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split


[reddit comments data from huggingFace](https://huggingface.co/datasets/beenakurian/reddit_comments_subreddit_canada)

In [2]:
import csv

texts = []
labels = []

with open('../../data/clean/reddit_sentiments.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    has_header = csv.Sniffer().has_header(csvfile.read(1024))
    csvfile.seek(0) 
    if has_header:
        next(reader)  # just skip the header
    for row in reader:
        texts.append(row[0])
        labels.append(row[1])

print("texts:", texts[0])
print("labels:", labels[0])


texts: well there are thousands of international students here illegally so we gotta ramp it up
labels: NEG


In [3]:

# change the labels to numbers
label_to_index = {"POS": 0, "NEG": 1, "NEU": 2}
labels = [label_to_index[label] for label in labels]


In [4]:
# Tokenize the texts
max_words = 1000  # We will only consider the 1000 most common words in the dataset
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
max_len = max(len(sequence) for sequence in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_len)


In [5]:

# transfer the labels to one-hot encodings
num_classes = len(label_to_index)
labels = tf.keras.utils.to_categorical(labels, num_classes)


In [6]:

# split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)


In [7]:

# create the model
embedding_dim = 50
lstm_units = 128

model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim),  # 移除input_length参数
    LSTM(units=lstm_units),
    Dense(units=num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [10]:

# train the model
batch_size = 32
epochs = 10
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)

# save the model
model.save("../../src/model1/my_model.keras")


Epoch 1/10


[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 29ms/step - accuracy: 0.9508 - loss: 0.1371 - val_accuracy: 0.6076 - val_loss: 2.1924
Epoch 2/10
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 25ms/step - accuracy: 0.9568 - loss: 0.1245 - val_accuracy: 0.5986 - val_loss: 2.2367
Epoch 3/10
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 57ms/step - accuracy: 0.9545 - loss: 0.1203 - val_accuracy: 0.5949 - val_loss: 2.3632
Epoch 4/10
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 132ms/step - accuracy: 0.9635 - loss: 0.1068 - val_accuracy: 0.5949 - val_loss: 2.2866
Epoch 5/10
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 95ms/step - accuracy: 0.9590 - loss: 0.1241 - val_accuracy: 0.5967 - val_loss: 2.2032
Epoch 6/10
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 61ms/step - accuracy: 0.9537 - loss: 0.1188 - val_accuracy: 0.5967 - val_loss: 2.5824
Epoch 7/10
[1m156/156[0m 

In [11]:
model = tf.keras.models.load_model("../../src/model1/my_model.keras")
# evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.5703 - loss: 2.7800
Test Loss: 2.6641271114349365
Test Accuracy: 0.5926194190979004


In [12]:
def predict_emotion(sentence, model, tokenizer, max_len, label_to_index):
    
    sequence = tokenizer.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)

    prediction = model.predict(padded_sequence)
    predicted_class_index = np.argmax(prediction)

    index_to_label = {index: label for label, index in label_to_index.items()}
    predicted_class_label = index_to_label[predicted_class_index]

    print("Predicted Emotion:", predicted_class_label)


In [14]:
sentence = input("Enter a sentence: ")
predict_emotion(sentence, model, tokenizer, max_len, label_to_index)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
Predicted Emotion: NEG
