![image](https://github.com/Coding-Lane/Emojify-Text/blob/main/emoji.png?raw=true)

In [31]:
import numpy as np
import pandas as pd
import emoji

from keras.models import Sequential
from keras.layers import Dense, LSTM, SimpleRNN, Embedding

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [32]:
data = pd.read_csv('emoji_data.csv', header = None)
data.head()

Unnamed: 0,0,1
0,French macaroon is so tasty,4
1,work is horrible,3
2,I am upset,3
3,throw the ball,1
4,Good joke,2


In [33]:
emoji_dict = {
    0: ":red_heart:",
    1: ":baseball:",
    2: ":grinning_face_with_big_eyes:",
    3: ":disappointed_face:",
    4: ":fork_and_knife_with_plate:"
}

def label_to_emoji(label):
    return emoji.emojize(emoji_dict[label])

In [34]:
X = data[0].values
Y = data[1].values

# Embeddings

In [35]:
file = open('glove.6B.100d.txt', 'r', encoding = 'utf8')
content = file.readlines()
file.close()

# content

In [36]:
embeddings = {}

for line in content:
    line = line.split()
    embeddings[line[0]] = np.array(line[1:], dtype = float)

In [37]:
def get_maxlen(data):
    maxlen = 0
    for sent in data:
        maxlen = max(maxlen, len(sent))
    return maxlen

In [38]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
word2index = tokenizer.word_index
word2index

{'i': 1,
 'you': 2,
 'is': 3,
 'the': 4,
 'a': 5,
 'so': 6,
 'am': 7,
 'my': 8,
 'to': 9,
 'this': 10,
 'are': 11,
 'ha': 12,
 'for': 13,
 'she': 14,
 'he': 15,
 'me': 16,
 'not': 17,
 'love': 18,
 'your': 19,
 'want': 20,
 'have': 21,
 'it': 22,
 'got': 23,
 'like': 24,
 'did': 25,
 'baseball': 26,
 'food': 27,
 'was': 28,
 'do': 29,
 'joke': 30,
 'stop': 31,
 'will': 32,
 'miss': 33,
 'life': 34,
 'ball': 35,
 'good': 36,
 'what': 37,
 'go': 38,
 'job': 39,
 'funny': 40,
 'bad': 41,
 'day': 42,
 'great': 43,
 'dinner': 44,
 'that': 45,
 'with': 46,
 'at': 47,
 'of': 48,
 'game': 49,
 'we': 50,
 'again': 51,
 'said': 52,
 'yes': 53,
 'lol': 54,
 'and': 55,
 'down': 56,
 'had': 57,
 'her': 58,
 'fun': 59,
 'smile': 60,
 'lot': 61,
 'working': 62,
 'him': 63,
 'cute': 64,
 'on': 65,
 'lets': 66,
 'messing': 67,
 'us': 68,
 'play': 69,
 'exercise': 70,
 'lost': 71,
 'never': 72,
 'where': 73,
 'can': 74,
 'well': 75,
 'much': 76,
 'valentine': 77,
 'restaurant': 78,
 'awesome': 79,
 'lik

In [39]:
Xtokens = tokenizer.texts_to_sequences(X)
maxlen = get_maxlen(Xtokens)
print(maxlen)

10


In [40]:
Xtrain = pad_sequences(Xtokens, maxlen = maxlen,  padding = 'post', truncating = 'post')

In [43]:
for i in range(len(Y)):
    # Y[i] = Y[i].replace("0v", "")    
    Y[i] = Y[i].replace("0v", "")

In [44]:
Y = Y.astype(float)
Ytrain = to_categorical(Y)
Ytrain

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0.

# Model

In [45]:
embed_size = 100
embedding_matrix = np.zeros((len(word2index)+1, embed_size))

for word, i in word2index.items():
    embed_vector = embeddings[word]
    embedding_matrix[i] = embed_vector

In [46]:
embedding_matrix

array([[ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [-0.046539,  0.61966 ,  0.56647 , ..., -0.37616 , -0.032502,
         0.8062  ],
       [-0.49886 ,  0.76602 ,  0.89751 , ..., -0.41179 ,  0.40539 ,
         0.78504 ],
       ...,
       [-0.46263 ,  0.069864,  0.69095 , ..., -0.29174 ,  0.32041 ,
         0.21202 ],
       [ 0.073242,  0.11134 ,  0.62281 , ...,  0.53417 , -0.1646  ,
        -0.27516 ],
       [ 0.29019 ,  0.80497 ,  0.31187 , ..., -0.33603 ,  0.45998 ,
        -0.11278 ]])

In [47]:
model = Sequential([
    Embedding(input_dim = len(word2index) + 1,
              output_dim = embed_size,
              input_length = maxlen,
              weights = [embedding_matrix],
              trainable = False
             ),
    
    LSTM(units = 16, return_sequences = True),
    LSTM(units = 4),
    Dense(5, activation = 'softmax')
])

model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])



In [48]:
model.fit(Xtrain, Ytrain, epochs = 100)

Epoch 1/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.2994 - loss: 1.5818
Epoch 2/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3429 - loss: 1.5523 
Epoch 3/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3975 - loss: 1.5278 
Epoch 4/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3909 - loss: 1.5121 
Epoch 5/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3658 - loss: 1.4887 
Epoch 6/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.4008 - loss: 1.4697 
Epoch 7/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.3483 - loss: 1.4636 
Epoch 8/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3879 - loss: 1.4368 
Epoch 9/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

<keras.src.callbacks.history.History at 0x2555c4634d0>

In [50]:
test = ["I love you", "I feel very bad", "lets eat dinner"]

test_seq = tokenizer.texts_to_sequences(test)
Xtest = pad_sequences(test_seq, maxlen = maxlen, padding = 'post', truncating = 'post')

y_pred = model.predict(Xtest)
y_pred = np.argmax(y_pred, axis = 1)

for i in range(len(test)):
    print(test[i], label_to_emoji(y_pred[i]))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
I love you ❤️
I feel very bad 😞
lets eat dinner 🍽️
