In [9]:
import numpy as np
import pandas as pd
import emoji

from keras.models import Sequential
from keras.layers import Dense, LSTM, SimpleRNN, Embedding

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [10]:
# Load the sentence/label pairs. header = None tells pandas not to treat the
# first row as column names, so the columns are labelled 0 and 1.
data = pd.read_csv('emoji_data.csv', header = None)
data.head()

Unnamed: 0,0,1
0,French macaroon is so tasty,4
1,work is horrible,3
2,I am upset,3
3,throw the ball,1
4,Good joke,2


In [11]:
# Class label -> emoji alias. The position of each alias in the list is its
# class label, so enumerate() rebuilds the {label: alias} mapping.
emoji_dict = dict(enumerate([
    ":red_heart:",                     # 0
    ":baseball:",                      # 1
    ":grinning_face_with_big_eyes:",   # 2
    ":disappointed_face:",             # 3
    ":fork_and_knife_with_plate:",     # 4
]))


def label_to_emoji(label):
    """Return the rendered emoji for a class label.

    Looks up the alias stored for ``label`` in ``emoji_dict`` and passes it
    to ``emoji.emojize``. A label that is not a key of ``emoji_dict`` raises
    ``KeyError``.
    """
    alias = emoji_dict[label]
    return emoji.emojize(alias)

In [16]:
# Show the rows whose label (column 1) is the malformed string '0v2'.
data[data[1]=='0v2']

Unnamed: 0,0,1
29,I miss you so much,0v2


In [19]:
# Replace the malformed label '0v2' with class 0. The result is assigned back
# to the column rather than calling replace(..., inplace=True) on `data[1]`:
# that chained in-place call operates on a temporary object under pandas
# Copy-on-Write and would leave `data` unchanged.
data[1] = data[1].replace('0v2', 0)

In [12]:
# Split the frame into model inputs and targets as NumPy arrays:
# column 0 holds the sentences, column 1 the class labels.
X = data[0].values
Y = data[1].values

In [20]:
# Inspect the raw labels: they are mostly strings, some with a trailing space
# (e.g. '3 '), plus the integer 0 written by the '0v2' replacement.
Y

array(['4', '3', '3 ', '1 ', '2', '1', '4', '3', '4', '1', '3', '3 ', '2',
       '2', '4', '3', '2', '3 ', '3 ', '1', '3 ', '2', '2', '2', '0', '1',
       '0', '4 ', '2', 0, '2', '0', '0', '3 ', '4', '0', '2', '1', '3',
       '1', '0', '4', '0 ', '3', '0 ', '4', '2', '3 ', '4', '2 ', '2',
       '3', '0', '2', '2', '3 ', '2', '3', '2', '2', '3 ', '3', '0 ', '2',
       '3', '0', '2', '0', '0 ', '2', '3', '2', '4', '1', '3', '3', '0',
       '0', '3', '2', '0', '3', '0', '2', '2', '4', '2', '2', '0', '0',
       '2', '3', '0', '4', '2', '1', '2', '3', '3', '2', '3', '0', '3',
       '0', '2', '0', '2', '3', '4', '3', '1', '3', '4', '3', '2', '3',
       '3', '3', '1', '4', '4', '2', '2', '1', '1', '2', '3', '2', '3',
       '4', '2', '3', '0', '2', '0', '0', '4', '3', '4', '2', '3', '2',
       '3', '4', '2', '1', '2', '4', '3', '1', '3', '2', '3', '2', '2',
       '3', '3', '2', '4', '0', '0', '0', '3', '0', '0', '1', '1', '2',
       '2', '2', '0', '3', '2', '3', '3', '1', '2', '2'

# Embeddings

Pre-trained 50-dimensional GloVe word vectors, read from __[glove.6B.50d.txt](https://www.kaggle.com/datasets/adityajn105/glove6b50d)__

In [25]:
# Read every line of the GloVe text file into memory. The context manager
# closes the file even if reading raises, which the manual open()/close()
# pair did not guarantee.
with open('glove.6B.50d.txt', 'r', encoding = 'utf8') as file:
    content = file.readlines()

# content

In [4]:
# Build the word -> vector lookup. On each line the first whitespace-separated
# field is the word and the remaining fields are parsed as floats.
embeddings = {}

for raw_line in content:
    fields = raw_line.split()
    token, vector_fields = fields[0], fields[1:]
    embeddings[token] = np.array(vector_fields, dtype = float)

In [29]:
# Fit a Keras Tokenizer on the sentences and keep its word -> integer id
# mapping; the ids are later used as row indices into the embedding matrix.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
word2index = tokenizer.word_index

In [32]:
def get_maxlen(data):
    """Return the length of the longest item in ``data``.

    ``data`` is any iterable of sized items (e.g. token-id lists).
    Returns 0 when ``data`` is empty.
    """
    return max((len(item) for item in data), default = 0)

# Longest sentence measured in tokens; used as the padded sequence length.
# The sentences are tokenised here instead of reading `Xtokens`, because that
# variable is only created in a later cell and using it at this point raises
# NameError on a fresh kernel (Restart & Run All).
maxlen = get_maxlen(tokenizer.texts_to_sequences(X))
print(maxlen)

10


In [35]:
# Turn each sentence into a list of word ids.
Xtokens = tokenizer.texts_to_sequences(X)

# Bring every sequence to exactly `maxlen` ids; padding and truncation are both
# applied at the end of the sequence ('post').
Xtrain = pad_sequences(
    Xtokens,
    maxlen = maxlen,
    padding = 'post',
    truncating = 'post',
)

Xtrain

array([[103, 104,   3, ...,   0,   0,   0],
       [106,   3, 107, ...,   0,   0,   0],
       [  1,   7, 108, ...,   0,   0,   0],
       ...,
       [ 14,   3,   5, ...,   0,   0,   0],
       [ 14, 310,  26, ...,   0,   0,   0],
       [  1,  24,  22, ...,   0,   0,   0]], dtype=int32)

In [5]:
# The raw labels mix an integer with strings that can carry trailing
# whitespace (e.g. '3 '), so normalise them to clean integers explicitly
# instead of relying on implicit coercion inside to_categorical.
labels = pd.to_numeric(pd.Series(Y).astype(str).str.strip()).astype(int)

# Pin the one-hot width to the number of known classes so it always matches
# the 5-unit softmax output layer, even if the highest class is missing from
# the data.
Ytrain = to_categorical(labels.to_numpy(), num_classes = len(emoji_dict))

# Model

In [39]:
# Dimensionality of the loaded vectors (glove.6B.50d -> 50 values per word).
embed_size = 50

# One row per tokenizer id, plus one extra row so that id 0 (the filler value
# in the padded sequences) has a row of its own, which stays all zeros.
embedding_matrix = np.zeros((len(word2index)+1, embed_size))

for word, i in word2index.items():
    # Words missing from the GloVe vocabulary keep their all-zero row instead
    # of aborting the whole cell with a KeyError.
    embed_vector = embeddings.get(word)
    if embed_vector is not None:
        embedding_matrix[i] = embed_vector

In [40]:
# Preview the matrix; row 0 is expected to be all zeros.
embedding_matrix

array([[ 0.0000e+00,  0.0000e+00,  0.0000e+00, ...,  0.0000e+00,
         0.0000e+00,  0.0000e+00],
       [ 1.1891e-01,  1.5255e-01, -8.2073e-02, ..., -5.7512e-01,
        -2.6671e-01,  9.2121e-01],
       [-1.0919e-03,  3.3324e-01,  3.5743e-01, ..., -4.5697e-01,
        -4.8969e-02,  1.1316e+00],
       ...,
       [-6.8975e-01,  6.7030e-01, -1.1030e+00, ..., -3.5800e-01,
        -2.5186e-01,  6.1610e-01],
       [ 2.7062e-01, -3.6596e-01,  9.7193e-02, ..., -7.5256e-01,
        -1.7335e-01, -2.2587e-01],
       [ 9.5281e-01, -2.0608e-01,  5.5618e-01, ..., -1.0882e-01,
        -2.2822e-01, -4.6303e-01]])

In [41]:
# Two stacked LSTMs on top of the fixed pre-trained embeddings, followed by a
# softmax classifier.
model = Sequential()

# Embedding lookup initialised from embedding_matrix and excluded from
# training (trainable = False).
model.add(Embedding(
    input_dim = len(word2index) + 1,
    output_dim = embed_size,
    input_length = maxlen,
    weights = [embedding_matrix],
    trainable = False
))

# The first LSTM returns its full output sequence so the second LSTM has a
# sequence to consume; the second returns only its final output.
model.add(LSTM(units = 16, return_sequences = True))
model.add(LSTM(units = 4))

# One softmax output per class (5 classes).
model.add(Dense(5, activation = 'softmax'))

model.compile(
    optimizer = 'adam',
    loss = 'categorical_crossentropy',
    metrics = ['accuracy']
)

In [42]:
# Train for 100 epochs on Xtrain/Ytrain; no validation data is passed, so only
# training metrics are reported.
model.fit(Xtrain, Ytrain, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7f07842b18a0>

In [50]:
# A few hand-written sentences to try the trained model on.
test = ["I feel good", "I feel loved", "I am sad"]

# Same tokenisation and padding settings as for the training data.
test_seq = tokenizer.texts_to_sequences(test)
Xtest = pad_sequences(
    test_seq,
    maxlen = maxlen,
    padding = 'post',
    truncating = 'post',
)

# Predicted class = index of the largest softmax output in each row.
y_pred = np.argmax(model.predict(Xtest), axis = 1)

for sentence, predicted_label in zip(test, y_pred):
    print(sentence, label_to_emoji(predicted_label))

I feel good 😃
I feel loved ❤️
I am sad 😞
