Importing the required libraries

In [100]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, SimpleRNN,Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

Loading a simple dataset

In [76]:
# Load the labelled sentences. header=None makes pandas name the columns 0 and 1,
# which later cells read as data[0] (text) and data[1] (label).
data = pd.read_csv("/content/Emoji_data.csv",header=None)
data

Unnamed: 0,0,1
0,French macaroon is so tasty,4
1,work is horrible,3
2,I am upset,3
3,throw the ball,1
4,Good joke,2
...,...,...
178,lets brunch some day,4
179,dance with me,2
180,she is a bully,3
181,she plays baseball,1


Installing and importing the emoji library for emoji prediction

In [77]:
!pip install emoji
import emoji



Testing our library

In [79]:
# Sanity check: render a shortcode as its emoji character.
emoji.emojize(":thumbs_up:")

'👍'

Creating a dictionary that maps each integer class label to its emoji shortcode

In [80]:
# Maps each integer class label to the emoji shortcode passed to emoji.emojize.
emoji_dict = {
    0: ":red_heart:",
    1: ":baseball:",
    2: ":grinning_face_with_big_eyes:",
    3: ":disappointed_face:",
    4: ":fork_and_knife_with_plate:"
}

# Quick check that a label resolves to an emoji character.
emoji.emojize(emoji_dict[1])

'⚾'

A function that returns the emoji for a given class label

In [81]:
def label_to_emoji(label):
  """Return the emoji character registered for ``label`` in ``emoji_dict``."""
  shortcode = emoji_dict[label]
  return emoji.emojize(shortcode)

# Try the helper on one label.
label = label_to_emoji(3)
label

'😞'

In [129]:
x = data[0].values # Column 0 of the dataset as an array: the model inputs (text)
y = data[1].values # Column 1 of the dataset as an array: the integer labels

Loading the pre-trained GloVe 100-dimensional word vectors (the file must already be present at the path below)

In [116]:
# Read every line of the GloVe file. The context manager closes the handle
# even if reading fails part-way through.
with open("/content/glove.6B.100d.txt",'r',encoding='utf8') as file:
  content=file.readlines()

Creating a dictionary that maps each word to its GloVe vector

In [117]:
# Build a word -> vector lookup from the GloVe lines: the first whitespace-separated
# field of each line is the key, the remaining fields are parsed as floats.
embedding = {}
for line in content:
  parts = line.split()
  if not parts:
    # Blank line (e.g. a trailing newline at end of file): nothing to parse.
    continue
  embedding[parts[0]] = np.array(parts[1:],dtype=float)

In [118]:
# Fit the tokenizer on the input sentences; word_index maps each word to its integer id.
token=Tokenizer()
token.fit_on_texts(x)
word_index=token.word_index

In [119]:
# Look up the integer id assigned to one word.
word_index.get("assignment")

255

In [120]:
# Convert every sentence into its list of word ids.
x_tokens=token.texts_to_sequences(x)
x_tokens

[[102, 103, 3, 8, 104],
 [105, 3, 106],
 [1, 6, 107],
 [108, 4, 34],
 [35, 30],
 [36, 3, 19, 109, 26, 49],
 [1, 110, 111],
 [31, 67, 112],
 [1, 20, 113, 27],
 [114, 68, 37, 69, 26],
 [2, 11, 115, 10, 70],
 [116, 50, 71, 51],
 [35, 38],
 [12, 12, 12, 22, 28, 8, 39],
 [1, 32, 21, 5, 117, 118],
 [119, 11, 2, 120, 40],
 [1, 20, 9, 30],
 [1, 72, 52, 53, 13, 10],
 [4, 121, 3, 122],
 [73, 3, 4, 34],
 [1, 6, 123],
 [12, 12, 12, 54],
 [14, 52, 53],
 [15, 23, 5, 124],
 [125, 3, 126, 1, 21],
 [15, 74, 127, 128, 75],
 [1, 18, 9, 4, 129, 55, 130],
 [29, 2, 24, 131],
 [2, 132, 133, 10, 134],
 [1, 24, 19, 135],
 [14, 23, 16, 5, 136],
 [32, 2, 137, 7, 76],
 [2, 138, 4, 139],
 [140, 3, 56, 13, 5, 77],
 [76, 41, 3, 141],
 [42, 8, 78],
 [29, 2, 21, 5, 34],
 [15, 74, 17, 29, 142],
 [15, 79, 26],
 [50, 57, 80, 5, 143, 43, 144],
 [145, 11, 146],
 [15, 3, 5, 35, 147],
 [72, 148, 9, 16, 51],
 [1, 44, 58],
 [27, 3, 33],
 [1, 6, 149, 59],
 [8, 40, 45, 2, 150, 151, 46, 68],
 [29, 2, 20, 9, 152, 16, 13, 43],
 [1,

Finding the maximum sequence length so that all shorter sequences can be padded to it

In [121]:
def max_len(value):
  """Return the length of the longest item in ``value`` (0 when ``value`` is empty)."""
  return max((len(item) for item in value), default=0)

# Length of the longest tokenised sentence; used as the padded length for the model inputs.
max_val = max_len(x_tokens)
max_val

10

Padding the inputs to the maximum length so they can be fed to our model

In [122]:
# Pad (or truncate) every sequence at its end so that all rows have length max_val.
x_train=pad_sequences(x_tokens,maxlen=max_val,padding='post',truncating='post')

In [123]:
y_train = to_categorical(y) # One-hot encode the integer labels for the categorical_crossentropy loss

In [124]:
embed_size = 100  # length of each vector stored in `embedding` (glove.6B.100d)
# Row i holds the GloVe vector of the word whose tokenizer id is i.
# Keras Tokenizer ids start at 1, hence the +1 on the row count; row 0 stays all-zero.
embedding_matrix = np.zeros((len(word_index)+1, embed_size))

for word, i in word_index.items():
    embed_vector = embedding.get(word)
    if embed_vector is not None:
        embedding_matrix[i] = embed_vector
    # Words absent from GloVe keep their all-zero row instead of raising KeyError.

Model Training

In [126]:
# Frozen GloVe embedding -> two stacked LSTMs -> softmax over the 5 emoji classes.
model = Sequential()
model.add(
    Embedding(
        input_dim=len(word_index) + 1,
        output_dim=embed_size,
        input_length=max_val,
        weights=[embedding_matrix],
        trainable=False,
    )
)
# The first LSTM returns its full output sequence so the second LSTM gets one vector per timestep.
model.add(LSTM(units=16, return_sequences=True))
model.add(LSTM(units=4))
model.add(Dense(5, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [127]:
# Train for 100 epochs on x_train/y_train. No validation data is passed, so the
# accuracy reported during training is measured on the training samples themselves.
model.fit(x_train, y_train, epochs = 100)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7c80de242770>

Making some predictions

In [128]:
test = ["I feel good", "I feel very bad", "lets eat dinner"]

# Encode the new sentences like the training data: same tokenizer, same padded length.
test_seq = token.texts_to_sequences(test)
Xtest = pad_sequences(test_seq, maxlen=max_val, padding='post', truncating='post')

# Keep only the index of the most probable class for each sentence.
y_pred = np.argmax(model.predict(Xtest), axis=1)

for sentence, predicted in zip(test, y_pred):
    print(sentence, label_to_emoji(predicted))

I feel good ❤️
I feel very bad 😞
lets eat dinner 🍽️


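Conclusion: Our model reaches about 97% accuracy on the training data (no held-out validation or test set was used, so this is not a measure of generalization) and predicts the expected emojis for the sample sentences above.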