In [1]:
!pip install emoji keras



In [2]:
# Importing the required packages and dependencies
import numpy as np
import pandas as pd

import emoji

from keras.models import Sequential
from keras.layers import Dense, LSTM, SimpleRNN, Embedding
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from tensorflow.keras.utils import to_categorical

In [3]:
# Load the labelled sentences. header = None tells pandas the CSV has no header
# row, so the columns get the integer labels 0 (sentence) and 1 (class label).
data = pd.read_csv("emoji_data.csv", header = None)
# Preview the first rows as the cell output
data.head()

Unnamed: 0,0,1
0,French macaroon is so tasty,4
1,work is horrible,3
2,I am upset,3
3,throw the ball,1
4,Good joke,2


In [4]:
# Class label -> emoji shortcode for each of the five emojis to be predicted
emoji_dict = {
    0: ":red_heart:",
    1: ":baseball:",
    2: ":grinning_face_with_big_eyes:",
    3: ":disappointed_face:",
    4: ":fork_and_knife_with_plate:",
}

# Helper that turns a class label into the emoji to display
def label_to_emoji(label):
  """Return the emoji for a class label.

  Looks up the shortcode stored under `label` in `emoji_dict` and passes it
  to `emoji.emojize`. A label that is not a key of `emoji_dict` raises
  KeyError.
  """
  shortcode = emoji_dict[label]
  return emoji.emojize(shortcode)

In [5]:
# Separate the sentences (column 0) from their integer class labels (column 1).
# to_numpy() is used instead of .values because it always yields a NumPy
# array, whereas .values can return an extension array depending on the dtype.
X = data[0].to_numpy()
Y = data[1].to_numpy()

In [6]:
# Sanity check: show the first 5 sentences in X
print(X[:5])

['French macaroon is so tasty' 'work is horrible' 'I am upset'
 'throw the ball' 'Good joke']


In [7]:
# Sanity check: show the first 5 class labels in Y
print(Y[:5])

[4 3 3 1 2]


# **Embeddings**

In [8]:
# Read the pre-trained GloVe vectors used for the embedding layer, one text
# line per entry. The context manager closes the file handle even if reading
# raises, which the manual open()/close() pair did not guarantee.
with open("glove.6B.100d.txt", "r", encoding = "utf8") as file:
  content = file.readlines()

In [9]:
# Peek at the first two GloVe entries: each line splits into a word followed
# by its vector components (still strings at this point)
for raw_entry in content[:2]:
  print(raw_entry.split())

['the', '-0.038194', '-0.24487', '0.72812', '-0.39961', '0.083172', '0.043953', '-0.39141', '0.3344', '-0.57545', '0.087459', '0.28787', '-0.06731', '0.30906', '-0.26384', '-0.13231', '-0.20757', '0.33395', '-0.33848', '-0.31743', '-0.48336', '0.1464', '-0.37304', '0.34577', '0.052041', '0.44946', '-0.46971', '0.02628', '-0.54155', '-0.15518', '-0.14107', '-0.039722', '0.28277', '0.14393', '0.23464', '-0.31021', '0.086173', '0.20397', '0.52624', '0.17164', '-0.082378', '-0.71787', '-0.41531', '0.20335', '-0.12763', '0.41367', '0.55187', '0.57908', '-0.33477', '-0.36559', '-0.54857', '-0.062892', '0.26584', '0.30205', '0.99775', '-0.80481', '-3.0243', '0.01254', '-0.36942', '2.2167', '0.72201', '-0.24978', '0.92136', '0.034514', '0.46745', '1.1079', '-0.19358', '-0.074575', '0.23353', '-0.052062', '-0.22044', '0.057162', '-0.15806', '-0.30798', '-0.41625', '0.37972', '0.15006', '-0.53212', '-0.2055', '-1.2526', '0.071624', '0.70565', '0.49744', '-0.42063', '0.26148', '-1.538', '-0.30223

In [10]:
# Lookup table: word -> GloVe vector
embeddings = {}

# Parse every line of the GloVe file
for entry in content:
  # The first whitespace-separated field is the word, the rest are its coefficients
  fields = entry.split()
  word, coefs = fields[0], fields[1:]
  embeddings[word] = np.array(coefs, dtype = float)

In [11]:
# Create a tokenizer with its default settings
tokenizer = Tokenizer()

# Build the vocabulary from the sentences in X
tokenizer.fit_on_texts(X)

# Word -> integer index mapping learned by the tokenizer; it is used further
# down to size and fill the embedding matrix
word2index = tokenizer.word_index

In [12]:
# Convert every sentence in X into its list of integer word indices
Xtokens = tokenizer.texts_to_sequences(X)

In [13]:
# Helper to find the length of the longest sequence
def get_maxlen(data):
  """Return the length of the longest sequence in `data`.

  `data` is any iterable of sized items (e.g. lists of token indices).
  Returns 0 when `data` is empty.
  """
  return max((len(sentence) for sentence in data), default = 0)

# Length of the longest tokenised sentence; used below as the padded length
maxlen = get_maxlen(Xtokens)
print(maxlen)

10


In [14]:
# Pad every tokenised sentence to the same length (maxlen).
# padding = "post" puts the filler zeros after the tokens, and
# truncating = "post" cuts surplus tokens from the end of longer sequences.
xtrain = pad_sequences(Xtokens,
                       maxlen = maxlen,
                       padding = "post",
                       truncating = "post")

In [15]:
# Sanity check: show the first 2 padded sequences
print(xtrain[:2])

[[103 104   3   6 105   0   0   0   0   0]
 [106   3 107   0   0   0   0   0   0   0]]


In [16]:
# One-hot encode the integer labels. num_classes is pinned to the number of
# emoji classes so the encoding always has one column per class; without it
# the width is inferred from max(Y) + 1 and would shrink if the highest label
# happened to be missing from Y.
ytrain = to_categorical(Y, num_classes = len(emoji_dict))

# **Model**

In [17]:
# The GloVe file used here holds 100-dimensional vectors, so each row has 100 columns
embd_size = 100

# One zero-filled row per tokenizer index, plus one extra row for index 0
embedding_matrix = np.zeros((len(word2index) + 1, embd_size))

# Copy in the pre-trained vector for every vocabulary word that has a GloVe
# entry; words without one keep their all-zero row
for word, i in word2index.items():
  glove_vector = embeddings.get(word)
  if glove_vector is not None:
    embedding_matrix[i] = glove_vector

In [18]:
# Sanity check: the matrix should be (vocabulary size + 1, embedding size)
print(f"Shape of the Embedding Matrix : {embedding_matrix.shape}")

Shape of the Embedding Matrix : (313, 100)


In [19]:
# Assemble the network one layer at a time
model = Sequential()

# Embedding layer initialised with the GloVe matrix and kept frozen during training
model.add(Embedding(input_dim = len(word2index) + 1,
                    output_dim = embd_size,
                    input_length = maxlen,
                    weights = [embedding_matrix],
                    trainable = False))

# Two stacked LSTMs: the first returns the full sequence so the second can consume it
model.add(LSTM(units = 16, return_sequences = True))
model.add(LSTM(units = 4))

# Softmax output with one unit per emoji class
model.add(Dense(units = 5, activation = "softmax"))

# Adam optimizer with categorical cross-entropy, matching the one-hot labels
model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])

In [20]:
# Summary of the model
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 10, 100)           31300     
                                                                 
 lstm (LSTM)                 (None, 10, 16)            7488      
                                                                 
 lstm_1 (LSTM)               (None, 4)                 336       
                                                                 
 dense (Dense)               (None, 5)                 25        
                                                                 
Total params: 39149 (152.93 KB)
Trainable params: 7849 (30.66 KB)
Non-trainable params: 31300 (122.27 KB)
_________________________________________________________________


In [29]:
# Train the model. The returned History object is kept in `history` so the
# per-epoch metrics can be inspected afterwards, instead of being discarded
# and leaving only its repr as the cell output.
history = model.fit(xtrain, ytrain, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7a6938de8fd0>

In [30]:
# Unseen sentences to try the trained model on
test = ["I feel good", "I feel very bad", "Lets eat dinner", "Hey", "Thank you", "Dinner Time", "Baseball"]

# Apply the same tokenisation and padding that was used for the training data
test_seq = tokenizer.texts_to_sequences(test)
xtest = pad_sequences(test_seq, maxlen = maxlen, padding = "post", truncating = "post")

# Predict the class scores, then keep the index of the highest score per sentence
y_pred = np.argmax(model.predict(xtest), axis = 1)

# Show each sentence next to its predicted emoji
for sentence, label in zip(test, y_pred):
  print(sentence, label_to_emoji(label))

I feel good ❤️
I feel very bad 😞
Lets eat dinner 🍽️
Hey 😃
Thank you 😃
Dinner Time 🍽️
Baseball ⚾
