In [30]:
# !pip install tensorflow

In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [32]:
import nltk
import re
from nltk.stem import PorterStemmer

In [33]:
from sklearn.preprocessing import LabelEncoder

In [34]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [35]:
df = pd.read_csv('train.txt', header=None, sep=';', names=['Comment', 'Emotion'], encoding='utf-8')
df.head()

Unnamed: 0,Comment,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [36]:
max(df['Comment'].apply(len))

300

In [37]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [38]:
stop_words = nltk.corpus.stopwords.words('english')

In [39]:
def clean_text(df, column, vocab_size, max_len):
  stemmer = PorterStemmer()
  corpus = []

  for text in df[column]:
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    text = text.split()
    text = [stemmer.stem(word) for word in text if word not in stop_words]
    text = ' '.join(text)
    corpus.append(text)
  one_hot_words = [one_hot(words, vocab_size) for words in corpus]
  embedded_words = pad_sequences(one_hot_words, padding='pre', maxlen=max_len)
  return embedded_words

In [40]:
X_train = clean_text(df, 'Comment', vocab_size=11000, max_len=300)

In [41]:
X_train

array([[   0,    0,    0, ..., 6818, 2405, 4347],
       [   0,    0,    0, ..., 5176, 4751,  983],
       [   0,    0,    0, ..., 2405, 1210, 9472],
       ...,
       [   0,    0,    0, ..., 3150,  886, 8952],
       [   0,    0,    0, ..., 9109, 7741, 2775],
       [   0,    0,    0, ..., 2405, 1528,  862]], dtype=int32)

In [42]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(df['Emotion'])

In [43]:
y_train = to_categorical(y_train_encoded)

In [44]:
y_train

array([[0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.]])

In [45]:
model = Sequential()
model.add(Embedding(input_dim=11000, output_dim=150, input_length=300))
model.add(Dropout(0.2))
model.add(LSTM(units=128))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(6, activation='softmax'))



In [46]:
model.summary()

In [47]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [48]:
model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=1)

Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 18ms/step - accuracy: 0.3694 - loss: 1.5264
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.8524 - loss: 0.4599
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.9282 - loss: 0.2133
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.9456 - loss: 0.1472
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.9603 - loss: 0.1117
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - accuracy: 0.9638 - loss: 0.1007
Epoch 7/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.9701 - loss: 0.0818
Epoch 8/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.9764 - loss: 0.0612
Epoch 9/10
[1m250/250[0m [32m

<keras.src.callbacks.history.History at 0x780694246810>

In [53]:
def clean_sentence(sentence):
  stemmer = PorterStemmer()
  corpus = []

  text = re.sub('[^a-zA-Z]', ' ', sentence)
  text = text.lower()
  text = text.split()
  text = [stemmer.stem(word) for word in text if word not in stop_words]
  text = ' '.join(text)
  corpus.append(text)

  one_hot_words = [one_hot(words, 11000) for words in corpus]
  embedded_words = pad_sequences(one_hot_words, padding='pre', maxlen=300)
  return embedded_words

In [54]:
sentences = [
    "I am feeling great",
    "I am feeling lonely",
    "What should I do?",
    "Don't yell at me",
    "He is really sweet"
]

In [57]:
for sentence in sentences:
  cleaned = clean_sentence(sentence)
  print(sentence, label_encoder.inverse_transform(np.argmax(model.predict(cleaned), axis=-1)))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
I am feeling great ['joy']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
I am feeling lonely ['sadness']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
What should I do? ['anger']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Don't yell at me ['anger']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
He is really sweet ['joy']


In [58]:
model.save('model.h5')

