## Data Preprocessing
    Load and preprocess the dataset, remove stop words, tokenize the text, and pad sequence to a fixed length.

In [1]:
import pandas as pd
import numpy as np
import re
import string

In [2]:
# import dataset
df = pd.read_csv("dataset.csv")

In [3]:
df.head()

Unnamed: 0,Emotion,Text
0,neutral,@dannycastillo Houston ticket neitu nen kan in...
1,neutral,cant fall muhil thei lo
2,neutral,Balisage Markup Conference 2009 Program-a No T...
3,neutral,@cynthia_123 i muhil thei lo
4,neutral,Bus bl***y chu ka miss ta!!!!!!!!


In [4]:
df.shape

(11922, 2)

In [5]:
df.Emotion.value_counts()

joy        4217
sadness    2848
fear       1748
neutral    1578
anger      1531
Name: Emotion, dtype: int64

In [6]:
# text processing function 
import re
import string
def clean_text(text):
    # to lower case
    text = text.lower()
    # remove links
    text = re.sub('https:\/\/\S+', '', text) 
    # remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) 
    # remove next line     
    text = re.sub(r'[^ \w\.]', '', text) 
    # remove words containing numbers
    text = re.sub('\w*\d\w*', '', text)
    
    return text
# Create a new column called "Text" for collecting clean text
df['Text'] = df.Text.apply(lambda x: clean_text(x))

In [7]:
# Stop word in  mizo language
stop_words  = "a, i si a min em  le tak te mai e in he u tih ka va keimah keini kan keimahni nangmah nangma nangmahni ngei pawh ani amah a ta chu ni chumi anni an engnge khawi tunge hei sawmi hengte hi tawh nei ti mek leh mahse chuan emaw avang angin hma laiin tan hmunah tu nen lam kalh karah chhungah tlang hmaah hnuah chungah hnuai ah atangin chunglam hnuailam chhung pawn titawp hla zawk tichuan vawikhat hetah sawtah engtikah khawnge engati nge engtin zavai engpawh pahnihin vek tlem belh ber thildang engemawzat chutiang aih lo  ve chauh inang chuvangin aiin lutuk thei duhdan chiah don tur tunah"

# Convert to lower case
stop_words = stop_words.lower()

# convert string to list
def Convert(string):
    li = list(string.split(" "))
    return li


stop_word_list = Convert(stop_words)

# Remove stop words
df['Text'] = df['Text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_word_list)]))

In [8]:
# Lable encoding emotions
df["Emotion"] = df["Emotion"].astype('category').cat.codes # label encoding
# Create a dictionary that maps the category code to its corresponding emotion label
label_map = dict(enumerate(df["Emotion"].astype('category').cat.categories))

# Map Integer back to text


In [9]:
df.head()

Unnamed: 0,Emotion,Text
0,3,dannycastillo houston ticket neitu insumdawn t...
1,3,cant fall muhil
2,3,balisage markup conference programa no topic m...
3,3,muhil
4,3,bus bly miss


In [10]:
# Load dataset
texts = df['Text'].values
labels = df['Emotion'].values

In [11]:
!pip install tensorflow





In [12]:
import tensorflow as tf

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [14]:
# Tokenize the text
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [15]:
#pad the sequence to a fixed length
max_length = 10
padded_sequence = pad_sequences(sequences, maxlen=max_length)

In [16]:
# convert the labels to one-hot encoded vector
num_classes = len(set(labels))
one_hot_labels = np.zeros((len(labels),num_classes))
for i, label in enumerate(labels):
    one_hot_labels[i, label] = 1

In [17]:
# Load pre-trained Golove Word Embedding  file into memory
embedding_index = {}
with open('glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coef = np.asarray(values[1:], dtype = 'float32')
        embedding_index[word] = coef

In [18]:
# Create Embedding Matrix by selecting the Glove word Embeddings for the word in our vocabulary
word_index = tokenizer.word_index
embedding_dim = 100
embedding_matrix = np.zeros((len(word_index)+1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [19]:
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout
from tensorflow.keras import Sequential
model = Sequential()
model.add(Embedding(len(word_index)+1, embedding_dim, weights = [embedding_matrix], input_length = max_length, trainable = True))
model.add(LSTM(units =64, dropout = 0.2, recurrent_dropout = 0.2, return_sequences = True))
model.add(LSTM(units =32, dropout = 0.2, recurrent_dropout = 0.2))
model.add(Dense(64, activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation = 'softmax'))

In [20]:
# Compile the model
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [21]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 10, 100)           1300600   
                                                                 
 lstm (LSTM)                 (None, 10, 64)            42240     
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 dense (Dense)               (None, 64)                2112      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 5)                 325       
                                                                 
Total params: 1,357,693
Trainable params: 1,357,693
Non-

In [23]:
# Split data into training and validation set
from sklearn.model_selection import train_test_split
train_sequence, val_sequence , train_labels, val_labels = train_test_split(padded_sequence, one_hot_labels, test_size=0.2)

# Train the model

history = model.fit(train_sequence, train_labels, epochs=10, batch_size = 32, validation_data=(val_sequence, val_labels))



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
test_loss, test_acc = model.evaluate(val_sequence, val_labels)



In [25]:
#pipeline for prediction on new data
def predict_emotion(text, model, tokenizer, max_length):
    # Clean the text
    cleaned_text = clean_text(text)
    cleaned_text = ' '.join([word for word in text.split() if word not in (stop_word_list)])
    # Tokenize the text
    tokenized_text = tokenizer.texts_to_sequences([cleaned_text])
    
    # Pad the sequence to the fixed length
    padded_sequence = pad_sequences(tokenized_text, maxlen=max_length)
    
    # Make the prediction
    prediction = model.predict(padded_sequence)[0]
    
    # Get the label with the highest probability
    label_index = np.argmax(prediction)
    
    # Map the label index to its corresponding emotion label
    emotion = label_map[label_index]
    

In [26]:
# Example usage
new_text = "ama ka duh loh ber"
predicted_emotion = predict_emotion(new_text, model, tokenizer, max_length)
print(predicted_emotion)


None
