# Emoji Predictor

### Problem Description

For a given sentence predict a most suitable emoji that represents the sentiment present in that sentences. 

In [30]:
# Importing all required libraries.
import pandas as pd
import numpy as np
import emoji
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import *

In [31]:
# Reading dataset in csv form using pandas.
train = pd.read_csv('Datasets/Train_emoji.csv',header=None)
test = pd.read_csv('Datasets/Test_emoji.csv',header=None)

In [32]:
train.head()

Unnamed: 0,0,1,2,3
0,never talk to me again,3,,
1,I am proud of your achievements,2,,
2,It is the worst day in my life,3,,
3,Miss you so much,0,,[0]
4,food is life,4,,


In [33]:
test.head()

Unnamed: 0,0,1
0,I want to eat\t,4
1,he did not answer\t,3
2,he got a raise\t,2
3,she got me a present\t,0
4,ha ha ha it was so funny\t,2


In [34]:
# Data cleaning.
XTrain = train[0]
XTest = test[0]
YTrain = to_categorical(train[1])
YTest = to_categorical(test[1])

print(XTrain.shape)
print(XTest.shape)
print(YTrain.shape)
print(YTest.shape)

(132,)
(56,)
(132, 5)
(56, 5)


In [35]:
# Creating dictionary to map emoji to numbers for classification.
emoji_dictionary = { "0" : "\u2764\uFE0F",
                     "1" : ":baseball:",
                     "2" : ":beaming_face_with_smiling_eyes:",
                     "3" : ":downcast_face_with_sweat:",
                     "4" : ":fork_and_knife:"
}

In [36]:
# Creating embedding_index using glove.6B.50d.txt for making embedding layer. We have used the concept of transfer learning.
f = open('Embedding/glove.6B.50d.txt',encoding='utf-8')
embedding_index = {}
cnt = 0
for line in f :
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:],dtype='float32')
    embedding_index[word] = coefs
f.close()

In [37]:
# Creating function for getting output for embedding layer.
emb_dim = embedding_index["eat"].shape[0]
def embedding_output(X):
    maxLen = 10
    embedding_out = np.zeros((X.shape[0],maxLen,emb_dim))
    for i in range(X.shape[0]):
        X[i] = X[i].split()
        for j in range(len(X[i])):
            try:
                embedding_out[i][j] = embedding_index[X[i][j].lower()]
            except:
                embedding_out[i][j] = np.zeros((50,))
    return embedding_out

In [38]:
# Output for embedding layer.
emb_XTrain = embedding_output(XTrain)
emb_XTest = embedding_output(XTest)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[i] = X[i].split()


In [39]:
print(emb_XTrain.shape)
print(emb_XTest.shape)

(132, 10, 50)
(56, 10, 50)


In [40]:
# Creating an LSTM model for predicting emoji. 
model = Sequential()
model.add(LSTM(64,input_shape=(10,50),return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(64,return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(5))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 10, 64)            29440     
                                                                 
 dropout_4 (Dropout)         (None, 10, 64)            0         
                                                                 
 lstm_5 (LSTM)               (None, 64)                33024     
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 5)                 325       
                                                                 
 activation_2 (Activation)   (None, 5)                 0         
                                                                 
Total params: 62,789
Trainable params: 62,789
Non-trai

In [41]:
# Training model.
checkpoint = ModelCheckpoint('best_model.h5',monitor='val_loss',verbose=True,save_best_only=True)
earlystop = EarlyStopping(monitor='val_acc',patience=10)
hist = model.fit(emb_XTrain,YTrain,batch_size=32,epochs=150,shuffle=True,validation_split=0.1)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150


Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150


Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


In [42]:
# Finding accuracy for our model on testing dataset.
model.evaluate(emb_XTest,YTest)



[2.518242597579956, 0.6607142686843872]

P.S. : Reason for an average accuracy on validation data is due to lack of training dataset.