In [1]:
import pandas as pd
import numpy as np

In [3]:
from google.colab import files
# Colab-only step: asks for a file upload through the notebook UI. In the recorded
# run the uploaded file was saved as dataset_emotion_detect_lstm.zip (see the output).
uploaded = files.upload()

Saving dataset_emotion_detect_lstm.zip to dataset_emotion_detect_lstm.zip


In [5]:
import zipfile
import os

# Extract every member of the uploaded archive into the "emotion_data" folder
with zipfile.ZipFile("/content/dataset_emotion_detect_lstm.zip", mode='r') as archive:
    archive.extractall(path="emotion_data")

# Display the extracted file names as the cell output to confirm the unzip worked
os.listdir("emotion_data")

['train.txt', 'test.txt', 'val.txt']

In [6]:
# Define a helper function.
# It reads one of the three files (train, test, val) and splits each line into a text column and an emotion-label column.

In [7]:
def load_data(file_path):
  """Read one dataset split into a two-column DataFrame.

  The file is parsed as ';'-separated text. Column names are supplied here
  ('text', 'emotion') rather than taken from the file, so the first line of
  the file is treated as data.

  Parameters:
    file_path: path of the split file to read.

  Returns:
    pandas.DataFrame with the columns 'text' and 'emotion'.
  """
  column_names = ['text', 'emotion']
  frame = pd.read_csv(file_path, sep=';', header=None, names=column_names)
  return frame

In [8]:
# Read the three dataset splits from the extracted folder, in the order train, test, val
df_train, df_test, df_val = map(
    load_data,
    ("emotion_data/train.txt", "emotion_data/test.txt", "emotion_data/val.txt"),
)

In [9]:
# Stack the train and val files into one frame used for training; ignore_index
# renumbers the rows 0..n-1 so the two original indexes do not collide.
df = pd.concat(objs=[df_train, df_val], axis=0, ignore_index=True)

In [10]:
# Peek at 5 randomly chosen rows; no random_state is passed, so the rows shown
# change from run to run.
df.sample(5)

Unnamed: 0,text,emotion
9474,i was feeling very nostalgic and very grateful,love
16995,i almost feel hesitant to write about this it ...,fear
15629,i hate that feeling cus thats really bitchy to...,anger
15013,i think for myself i feel everyone is greedy b...,anger
13329,i also feel a strong sexual current flowing th...,joy


In [11]:
# now define a function to clean data
import re

In [12]:
def clean_txt(text):
  """Normalise a sentence: lowercase it and keep only letters and spaces.

  Lowercasing makes "I am Happy" and "i am happy" identical. Every character
  that is not a-z, A-Z or a space (digits, punctuation, tabs, newlines, ...)
  is deleted outright, not replaced by a space, so "don't" becomes "dont".

  Parameters:
    text: the raw sentence (a str).

  Returns:
    The cleaned string.
  """
  return re.sub(r'[^a-zA-Z ]', '', text.lower())

In [13]:
# Clean the text column of both the training frame and the test frame with the
# same function, so the two are normalised identically.
for frame in (df, df_test):
    frame['text'] = frame['text'].apply(clean_txt)

In [14]:
from sklearn.preprocessing import LabelEncoder

In [15]:
# Encoder used to turn the emotion strings into integer class ids
label=LabelEncoder()

In [16]:
# Learn the emotion -> integer mapping from the training frame only.
df['label']=label.fit_transform(df['emotion'])
# Reuse the already-fitted mapping for the test frame. Calling fit_transform here
# would re-fit the encoder on the test labels: the test ids would then come from
# a mapping learned independently of training, and label.classes_ (used later to
# decode predictions) would be overwritten. transform() also raises if the test
# file contains an emotion that never appeared in training, instead of silently
# assigning it some id.
df_test['label']=label.transform(df_test['emotion'])

In [17]:
# Disclaimer: this model only knows the 6 emotion classes present in the dataset, but there are more emotional states than these,
# so it may do things like classify sarcasm or disgust as anger.

In [18]:
# Show which integer id the encoder assigned to each emotion name
print({emotion_name: class_id for class_id, emotion_name in enumerate(label.classes_)})

{'anger': 0, 'fear': 1, 'joy': 2, 'love': 3, 'sadness': 4, 'surprise': 5}


In [19]:
# now tokenize sentences and pad them
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [20]:
# Build the word -> integer vocabulary from the cleaned training text.
# The vocabulary is learned from df['text'], i.e. the combined train+val rows,
# before the later train/validation split, so validation sentences contribute to it.
# NOTE(review): Tokenizer() is created with no arguments (no oov_token), so words
# that only occur in the test file or at prediction time are presumably dropped
# rather than mapped to a placeholder - confirm against the Tokenizer docs.
token=Tokenizer()
token.fit_on_texts(df['text'])

In [21]:
# Convert every sentence into a list of word ids using the fitted tokenizer.
# Both the training text and the test text go through the same tokenizer, so a
# given word gets the same id in both.
x=token.texts_to_sequences(df['text'])
x_test=token.texts_to_sequences(df_test['text'])

In [22]:
# Inspect the encoded sentences (one list of word ids per row, lengths vary).
# NOTE(review): this displays the whole list; a slice such as x[:5] would keep
# the output short.
x

[[1, 139, 2, 657],
 [1,
  39,
  104,
  59,
  7,
  14,
  490,
  4,
  14,
  2940,
  551,
  31,
  59,
  60,
  127,
  146,
  76,
  1595,
  3,
  21,
  1305],
 [15, 3284, 6, 1206, 4, 285, 1, 2, 521, 455],
 [1, 22, 174, 7, 645, 26, 5, 4464, 1, 57, 46, 8, 12, 21, 69, 29, 5, 3285],
 [1, 22, 7, 1004],
 [73, 47, 7, 6, 54, 542, 312, 329, 157, 162, 8, 18],
 [73,
  47,
  325,
  34,
  7903,
  34,
  196,
  7904,
  853,
  3,
  73,
  2667,
  1469,
  6,
  159,
  1915,
  19,
  1,
  116,
  2,
  13,
  14,
  398],
 [1, 2, 28, 451, 26, 77, 28, 6, 1816, 34, 28, 782, 28, 6, 194, 266, 359],
 [1,
  20,
  47,
  24,
  5627,
  16,
  223,
  1,
  2,
  8,
  5627,
  99,
  5628,
  135,
  3,
  131,
  6,
  1027,
  4465],
 [1, 2, 635, 95],
 [1, 2, 13, 1, 20, 4, 80, 5, 710, 1, 91, 586, 310, 82],
 [1,
  38,
  2,
  8,
  478,
  21,
  6,
  587,
  412,
  3,
  8,
  1,
  39,
  969,
  4,
  20,
  68,
  826,
  9,
  2022,
  2941],
 [1, 70, 12, 88, 5, 5629, 51, 9, 194, 4, 2, 588],
 [1, 2, 351, 423, 1, 91, 31, 4466],
 [1,
  20,
  4467,
 

In [23]:
# Pad all sequences to one common length so they can be stacked into 2-D arrays
maxlen = max(map(len, x))  # length of the longest train+val sentence, in tokens
x = pad_sequences(x, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)  # same width as the training array

In [24]:
# Pull the integer class ids out of both frames as plain NumPy arrays
y, y_test = df['label'].to_numpy(), df_test['label'].to_numpy()

In [25]:
# Quick look at the integer labels of the training frame
y

array([4, 4, 0, ..., 2, 2, 2])

In [26]:
# now split the data in training and validation data
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical  # this to convert y labels into to required vector

In [27]:
# Hold out 20% of the combined train+val rows as the validation set used during
# training; random_state=42 makes the split repeatable. No stratify argument is
# passed, so class proportions are not forced to match between the two parts.
x_train,x_val,y_train,y_val=train_test_split(x,y,test_size=0.2,random_state=42)

In [28]:
# One-hot encode labels
# Fix the one-hot width explicitly. Without num_classes, to_categorical sizes each
# array from the largest label it happens to see, so a split that lacks the
# highest-numbered class would get fewer columns than the others and than the
# model's output layer. The width is taken from the number of distinct emotions
# in the training frame, which is what the label ids were derived from.
num_classes = df['emotion'].nunique()
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_val_cat = to_categorical(y_val, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)

In [29]:
# Inspect the one-hot encoded training labels (one row per sample, one column per class)
y_train_cat

array([[0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.]])

In [62]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout,Bidirectional

In [31]:
# Size of the embedding table: number of distinct words known to the tokenizer, plus one.
# NOTE(review): the +1 presumably accounts for id 0, which the tokenizer does not
# assign to any word (word ids start at 1) and which padding uses - confirm.
vocab_size=len(token.word_index) + 1
# Width of the padded input arrays, i.e. the number of tokens per sample fed to the model
max_len=x_train.shape[1]

In [32]:
# Show the vocabulary size passed to the Embedding layer (16197 in the recorded run)
vocab_size

16197

In [33]:
# Show the padded sequence length (66 in the recorded run)
max_len

66

In [34]:
# Now build the LSTM model

In [136]:
# Stacked bidirectional-LSTM classifier: embedding -> 3 BiLSTM layers -> dense head.
# No Dropout layers are active in this configuration.
model=Sequential([
    # Map each word id to a 128-dimensional trainable vector
    Embedding(input_dim=vocab_size,output_dim=128,input_length=max_len),
    # The first two recurrent layers return the full sequence so the next LSTM can consume it
    Bidirectional(LSTM(150,return_sequences=True)),
    Bidirectional(LSTM(150,return_sequences=True)),
    # The last recurrent layer returns only its final output, which feeds the dense head
    Bidirectional(LSTM(150,return_sequences=False)),
    Dense(64, activation='relu'),
    # One softmax unit per column of the one-hot training labels
    Dense(y_train_cat.shape[1],activation='softmax'),
])

In [41]:
from tensorflow.keras.layers import Bidirectional

In [137]:
from tensorflow.keras.optimizers import Adam

In [138]:
# Configure training: categorical cross-entropy matches the one-hot labels and the
# softmax output; accuracy is reported each epoch. The optimizer is selected by
# its name string, so the Adam class imported in the previous cell is not used here.
model.compile(optimizer='Adam',loss='categorical_crossentropy',metrics=['accuracy'])

In [58]:
# Print the layer-by-layer summary of the model defined above
model.summary()

In [139]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once validation accuracy stops improving, instead of always
# running every requested epoch.
early_stop = EarlyStopping(
    monitor='val_accuracy',   # metric checked after each epoch ('val_loss' is the alternative the author noted)
    patience=3,               # allow 3 epochs without improvement before stopping
    restore_best_weights=True # roll back to the weights of the best epoch, not the last one
)

In [140]:
# Train the model. The returned object is kept in `history` so the per-epoch
# metrics can be inspected afterwards.
history = model.fit(
    x_train, y_train_cat,
    # the 20% hold-out produced by train_test_split; also what early_stop monitors
    validation_data=(x_val, y_val_cat),
    # upper bound only: the early_stop callback may end training sooner
    epochs=16,
    batch_size=32,
    callbacks=[early_stop]
)

Epoch 1/16
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 29ms/step - accuracy: 0.4433 - loss: 1.4113 - val_accuracy: 0.8356 - val_loss: 0.4733
Epoch 2/16
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 28ms/step - accuracy: 0.8858 - loss: 0.3336 - val_accuracy: 0.9022 - val_loss: 0.2425
Epoch 3/16
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 28ms/step - accuracy: 0.9464 - loss: 0.1600 - val_accuracy: 0.9217 - val_loss: 0.1831
Epoch 4/16
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 27ms/step - accuracy: 0.9595 - loss: 0.1067 - val_accuracy: 0.9261 - val_loss: 0.1914
Epoch 5/16
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 28ms/step - accuracy: 0.9670 - loss: 0.0801 - val_accuracy: 0.9164 - val_loss: 0.2142
Epoch 6/16
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 28ms/step - accuracy: 0.9756 - loss: 0.0679 - val_accuracy: 0.9161 - val_loss: 0.2608
Epoch 7/16
[1m4

In [141]:
# Score the trained model on the separate test file, which was not part of the
# train/validation split above. evaluate returns the loss followed by the
# metrics given to compile (here: accuracy).
loss, acc = model.evaluate(x_test, y_test_cat)
print("Test Accuracy:", acc)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.9223 - loss: 0.2139
Test Accuracy: 0.9240000247955322


In [142]:
def predict_emotion(text):
    """Predict the emotion name for a single raw sentence.

    The sentence goes through the same steps as the training data: clean_txt,
    the fitted tokenizer `token`, and padding to `max_len`. It relies on the
    notebook-level objects `token`, `max_len`, `model` and `label`.

    Parameters:
        text: one raw sentence (str).

    Returns:
        The entry of label.classes_ with the highest predicted score.
    """
    # Wrap the single cleaned sentence in a list so the batch has exactly one row
    batch = pad_sequences(token.texts_to_sequences([clean_txt(text)]), maxlen=max_len)
    scores = model.predict(batch)
    # argmax over the whole array equals the class index because there is only one row
    return label.classes_[scores.argmax()]


In [150]:
# Try the model on a few hand-written sentences and print one predicted emotion per sentence
for sentence in (
    "I'm so excited to start college!",
    "I hate when things go wrong.",
    "Nothing makes sense anymore...",
):
    print(predict_emotion(sentence))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 478ms/step
joy
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
fear
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
sadness


In [145]:
# Save the trained model to the file my_model.h5 in the working directory.
# NOTE(review): the fitted tokenizer (`token`) and label encoder (`label`) are not
# saved in the cells shown here, yet predict_emotion needs both to use the model.
model.save("my_model.h5")



In [146]:
from tensorflow.keras.models import load_model
# Load the saved file back from disk; this rebinds `model` to the reloaded copy
model = load_model("my_model.h5")



In [147]:
from google.colab import files
# Colab-only step: send the saved model file to the browser as a download
files.download('my_model.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [148]:
# List the working directory to confirm my_model.h5 was written
!ls

dataset_emotion_detect_lstm.zip  emotion_data  my_model.h5  sample_data


In [151]:
# Print the layer-by-layer summary of the current `model` object
model.summary()