# Making Emoji Predictions for Sentences

In [37]:
!pip install emoji



In [38]:
import emoji

In [39]:
emoji.EMOJI_UNICODE
## Gives the dict of expression->emoji in the emoji dictionary

{':1st_place_medal:': 'ü•á',
 ':2nd_place_medal:': 'ü•à',
 ':3rd_place_medal:': 'ü•â',
 ':AB_button_(blood_type):': 'üÜé',
 ':ATM_sign:': 'üèß',
 ':A_button_(blood_type):': 'üÖ∞',
 ':Afghanistan:': 'üá¶üá´',
 ':Albania:': 'üá¶üá±',
 ':Algeria:': 'üá©üáø',
 ':American_Samoa:': 'üá¶üá∏',
 ':Andorra:': 'üá¶üá©',
 ':Angola:': 'üá¶üá¥',
 ':Anguilla:': 'üá¶üáÆ',
 ':Antarctica:': 'üá¶üá∂',
 ':Antigua_&_Barbuda:': 'üá¶üá¨',
 ':Aquarius:': '‚ôí',
 ':Argentina:': 'üá¶üá∑',
 ':Aries:': '‚ôà',
 ':Armenia:': 'üá¶üá≤',
 ':Aruba:': 'üá¶üáº',
 ':Ascension_Island:': 'üá¶üá®',
 ':Australia:': 'üá¶üá∫',
 ':Austria:': 'üá¶üáπ',
 ':Azerbaijan:': 'üá¶üáø',
 ':BACK_arrow:': 'üîô',
 ':B_button_(blood_type):': 'üÖ±',
 ':Bahamas:': 'üáßüá∏',
 ':Bahrain:': 'üáßüá≠',
 ':Bangladesh:': 'üáßüá©',
 ':Barbados:': 'üáßüáß',
 ':Belarus:': 'üáßüáæ',
 ':Belgium:': 'üáßüá™',
 ':Belize:': 'üáßüáø',
 ':Benin:': 'üáßüáØ',
 ':Bermuda:': 'üáßüá≤',
 ':Bhutan:': 'üáßüáπ'

In [40]:
# We used only these 5 emojis as our test data had only these emojis 
# (check the train and test csv files)

emoji_dictionary = {
    "0": "\u2764\uFE0F",
    "1": ":baseball:",
    "2": ":grinning_face_with_big_eyes:",
    "3": ":disappointed_face:",
    "4": ":fork_and_knife:"
}

In [41]:
# Checking out the emojis we have
for ix in emoji_dictionary.values():
    print(emoji.emojize(ix))

‚ù§Ô∏è
‚öæ
üòÉ
üòû
üç¥


## Processing the dataset

In [42]:
import numpy as np
import pandas as pd

In [43]:
train = pd.read_csv('train_emoji.csv',header=None)
test = pd.read_csv('test_emoji.csv',header=None)

In [44]:
train.head() # only the first two columns are useful for us

Unnamed: 0,0,1,2,3
0,never talk to me again,3,,
1,I am proud of your achievements,2,,
2,It is the worst day in my life,3,,
3,Miss you so much,0,,[0]
4,food is life,4,,


In [45]:
data = train.values # Converting a CSV file into numpy array
# we have 132 sentences in train dataset and an emoji label corresponding to each
print(data.shape)

(132, 4)


In [46]:
# visualizing the training dataset
for ix in range (5):
    print (data[ix][0], emoji.emojize(emoji_dictionary[str(data[ix][1])]))

never talk to me again üòû
I am proud of your achievements üòÉ
It is the worst day in my life üòû
Miss you so much ‚ù§Ô∏è
food is life üç¥


In [47]:
X_train_temp = train[0]
y_train = train[1]

X_test_temp = test[0]
y_test = test[1]

In [48]:
# Converting each sentence into a list of words
# (As we have the vector encodings for each word)
X_train = []
X_test = []
for ix in range(X_train_temp.shape[0]):
    X_train.append (X_train_temp[ix].split())

for ix in range(X_test_temp.shape[0]):
    X_test.append (X_test_temp[ix].split())

X_train = np.array (X_train)
X_test = np.array (X_test)

print(X_train.shape,X_test.shape)
print(X_test[1])

(132,) (56,)
['he', 'did', 'not', 'answer']


In [49]:
# Visualizing the Training dataset
for i in range(5):
    print(X_train[i], emoji.emojize(emoji_dictionary[str(y_train[i])]))

['never', 'talk', 'to', 'me', 'again'] üòû
['I', 'am', 'proud', 'of', 'your', 'achievements'] üòÉ
['It', 'is', 'the', 'worst', 'day', 'in', 'my', 'life'] üòû
['Miss', 'you', 'so', 'much'] ‚ù§Ô∏è
['food', 'is', 'life'] üç¥


## Converting sentences into embeddings

In [50]:
# Using the Glove vector encodings developed by Stanford
f = open('glove.6B.50d.txt',encoding='utf-8')

In [51]:
## checking out the format
for line in f:
    print(line)
    break

the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581



In [52]:
# dictionary embedding_index will have all the
# words mapped to their corresponding embedding vectors
embedding_index = {}
for line in f:
    values = line.split()
    word = values[0]
    emb = np.asarray(values[1:],dtype='float')
    embedding_index[word] = emb
f.close()

In [53]:
# Checking out the dimension of all the word embeddings
emb_dim = embedding_index['eat'].shape[0]
print(emb_dim)

50


## Converting sentences into vectors 
**(Creating the embedding layer output)**

In [54]:
def embedding_output(X):
    maxLen = 10 # no sentence have more than 10 words
    embedding_out = np.zeros((X.shape[0], maxLen, emb_dim)) 
    
    for ix in range(X.shape[0]):
        for ij in range(len(X[ix])):
            # checking if we have the current word in the dict: embedding_index
            try:
                embedding_out[ix][ij] = embedding_index[X[ix][ij].lower()]
            except:
                embedding_out[ix][ij] = np.zeros((50,))
    return embedding_out

In [55]:
embedding_matrix_train = embedding_output(X_train)
embedding_matrix_test = embedding_output(X_test)

In [56]:
print(embedding_matrix_train.shape, embedding_matrix_test.shape)

(132, 10, 50) (56, 10, 50)


## Define the LSTM/RNN model

In [57]:
from keras.layers import *
from keras.models import Sequential
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [58]:
# Converting the output format into one-hot encodings
YT = np_utils.to_categorical(y_train)
Yt = np_utils.to_categorical(y_test)

In [59]:
YT.shape

(132, 5)

In [60]:
# Using a stacked LSTM Model
# Did this first with a normal LSTM model, then tried
# with stacked LSTM to improve accuracy
model = Sequential()
model.add(LSTM(64, input_shape=(10,50), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64,return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(5, activation='softmax'))
model.summary()

# Note: Didn't add an embedding layer as it would have increased
# the number of parameters a lot
# this would have caused overfitting as input data is very small

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 10, 64)            29440     
_________________________________________________________________
dropout_5 (Dropout)          (None, 10, 64)            0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dropout_6 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 325       
Total params: 62,789
Trainable params: 62,789
Non-trainable params: 0
_________________________________________________________________


In [61]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [86]:
# Using early stop as my test data was basically overfitting which was 
# visible from the accuracy which almost reached 100% after some epochs
earlystop = EarlyStopping(monitor='val_acc',patience=15)
checkpoint = ModelCheckpoint("best_model.h5", monitor='val_loss', verbose=True, save_best_only=True)
hist = model.fit(embedding_matrix_train,YT,epochs=100,batch_size=64,validation_split=0.20, shuffle=True, callbacks=[checkpoint, earlystop])

Train on 105 samples, validate on 27 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 1.40583, saving model to best_model.h5
Epoch 2/100

Epoch 00002: val_loss did not improve from 1.40583
Epoch 3/100

Epoch 00003: val_loss did not improve from 1.40583
Epoch 4/100

Epoch 00004: val_loss did not improve from 1.40583
Epoch 5/100

Epoch 00005: val_loss did not improve from 1.40583
Epoch 6/100

Epoch 00006: val_loss did not improve from 1.40583
Epoch 7/100

Epoch 00007: val_loss did not improve from 1.40583
Epoch 8/100

Epoch 00008: val_loss did not improve from 1.40583
Epoch 9/100

Epoch 00009: val_loss did not improve from 1.40583
Epoch 10/100

Epoch 00010: val_loss did not improve from 1.40583
Epoch 11/100

Epoch 00011: val_loss did not improve from 1.40583
Epoch 12/100

Epoch 00012: val_loss did not improve from 1.40583
Epoch 13/100

Epoch 00013: val_loss did not improve from 1.40583
Epoch 14/100

Epoch 00014: val_loss did not improve from 1.40583
Epoch 15/100

Epoch 000

In [87]:
# Loading the best model
model.load_weights("best_model.h5")

In [88]:
model.evaluate(embedding_matrix_test, Yt)



[1.6176965577261788, 0.5535714200564793]

In [89]:
# predict_classes function gives us the output predictions
# in general form (not one hot encoded)
pred = model.predict_classes(embedding_matrix_test)

## Getting Emoji Output

In [90]:
for ix in range(30):
    print(' '.join(X_test[ix]), emoji.emojize(emoji_dictionary[str(pred[ix])]), emoji.emojize(emoji_dictionary[str(y_test[ix])]))

I want to eat üç¥ üç¥
he did not answer üòû üòû
he got a raise üòû üòÉ
she got me a present üòÉ ‚ù§Ô∏è
ha ha ha it was so funny üòÉ üòÉ
he is a good friend üòÉ ‚ù§Ô∏è
I am upset ‚öæ ‚ù§Ô∏è
We had such a lovely dinner tonight üòÉ ‚ù§Ô∏è
where is the food üç¥ üç¥
Stop making this joke ha ha ha üòÉ üòÉ
where is the ball ‚öæ ‚öæ
work is hard üòû üòû
This girl is messing with me ‚ù§Ô∏è üòû
are you serious ha ha üòÉ üòÉ
Let us go play baseball ‚öæ ‚öæ
This stupid grader is not working üòû üòû
work is horrible üòÉ üòû
Congratulation for having a baby üòÉ üòÉ
stop messing around üòû üòû
any suggestions for dinner üòÉ üç¥
I love taking breaks üòû ‚ù§Ô∏è
you brighten my day ‚ù§Ô∏è üòÉ
I boiled rice üç¥ üç¥
she is a bully ‚ù§Ô∏è üòû
Why are you feeling bad üòû üòû
I am upset ‚öæ üòû
I worked during my birthday üòÉ üòû
My grandmother is the love of my life ‚ù§Ô∏è ‚ù§Ô∏è
enjoy your break ‚öæ üòÉ
valentine day is near üòÉ ‚ù§Ô∏è
