## 1. Load Data

In [1]:
import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense, LSTM, SimpleRNN, Embedding

from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [2]:
# Read the dataset without treating the first row as a header, so the columns
# are addressed by position (0 and 1) in the cells below.
df = pd.read_csv('emoji_data.csv', header = None)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,0,1
0,French macaroon is so tasty,4
1,work is horrible,3
2,I am upset,3
3,throw the ball,1
4,Good joke,2


In [3]:
# Class index -> emoji; the position in the list is the class label.
emoji_dict = dict(enumerate(["❣️", "⚾", "😃", "😔", "🍽️"]))


def label_to_emoji(label):
    """Return the emoji registered for ``label``; raises KeyError for an unknown label."""
    emoji = emoji_dict[label]
    return emoji

In [4]:
# Column 0 holds the sentences and column 1 the emoji class labels
# (see the df.head() preview above); both are pulled out as numpy arrays.
X = df[0].values
Y = df[1].values

## 2. Embeddings

In [5]:
# Read every line of the GloVe text file into memory.
# The context manager guarantees the handle is closed even if reading raises,
# which the manual open()/close() pair did not.
with open('glove/glove.6B.100d.txt', 'r', encoding = 'utf8') as file:
    content = file.readlines()

# Number of lines read (one embedding entry per line).
print(len(content))

400000


In [6]:
# Build the word -> vector lookup: in each line the first whitespace-separated
# field is the word and the remaining fields are its float components.
embeddings = {}

for raw_line in content:
    fields = raw_line.split()
    word, components = fields[0], fields[1:]
    embeddings[word] = np.array(components, dtype = float)

In [7]:
# Spot-check: display the vector parsed for one word.
embeddings['men']

array([ 0.92508  ,  0.49343  , -0.053259 , -0.2788   , -0.36762  ,
        1.0179   ,  0.25543  ,  0.50571  , -0.72731  ,  0.39231  ,
        0.60123  ,  0.23257  ,  0.71127  ,  0.11146  , -0.1101   ,
       -0.070188 , -0.027965 , -0.10344  , -1.2736   , -0.29982  ,
        1.108    , -0.15707  ,  0.64281  ,  0.31294  , -0.12107  ,
       -0.0072911,  0.10496  , -1.0617   ,  0.64642  , -0.02892  ,
        0.22464  , -0.095795 , -0.56932  , -0.81811  ,  1.0645   ,
       -0.50897  , -0.35844  ,  1.3345   ,  0.087079 ,  0.84503  ,
       -0.45696  , -0.27341  ,  0.1486   , -0.40005  ,  0.073584 ,
        0.027856 , -0.019284 ,  0.10275  ,  0.045384 , -0.51425  ,
       -0.37088  , -0.39773  ,  0.11357  ,  1.6347   , -0.062116 ,
       -1.67     , -0.016011 , -0.12335  ,  1.1064   ,  0.73959  ,
       -0.39408  ,  0.82737  ,  0.3472   , -0.074331 ,  0.74837  ,
       -0.11377  ,  0.39867  ,  0.59619  ,  0.035678 ,  0.18488  ,
        0.039027 , -0.81542  , -0.10443  ,  0.088339 ,  0.1361

In [8]:
def get_maxlen(data):
    """Return the length of the longest item in ``data``.

    ``data`` is any iterable of sized items (e.g. a list of token-id lists).
    Returns 0 when ``data`` is empty.
    """
    return max((len(sent) for sent in data), default = 0)

In [9]:
# Fit a Keras Tokenizer on the training sentences and keep its
# word -> integer-id vocabulary for building the embedding matrix later.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
word2index = tokenizer.word_index

In [10]:
# Spot-check: integer id assigned to a word from the first training sentence.
word2index['macaroon']

104

In [11]:
# Convert each sentence into its list of word ids.
Xtokens = tokenizer.texts_to_sequences(X)

# Pad (and, if ever needed, truncate) at the end of each sequence so that every
# row has the length of the longest training sentence.
maxlen = get_maxlen(Xtokens)
Xtrain = pad_sequences(Xtokens, maxlen = maxlen,  padding = 'post', truncating = 'post')

In [12]:
# Show the first raw sentence next to its padded id sequence (zeros are padding).
print(f'Sentence: {X[0]}')
print(f'Token: {Xtrain[0]}')

Sentence: French macaroon is so tasty
Token: [103 104   3   6 105   0   0   0   0   0]


In [13]:
# NOTE(review): row 29 is overwritten by hand with label '0' — presumably the raw
# value for that row in emoji_data.csv is malformed; confirm against the file.
Y[29] = '0'

# One-hot encode the labels. The width is pinned to the number of emoji classes so
# it always matches the model's softmax output, instead of being inferred from the
# largest label that happens to be present in the data.
Ytrain = to_categorical(Y, num_classes = len(emoji_dict))

## 3. Building Model

In [14]:
# Dimensionality of the vectors stored in `embeddings`.
embed_size = 100
# One row per tokenizer id; the extra row 0 is the padding id and stays all-zero.
embedding_matrix = np.zeros((len(word2index)+1, embed_size))

# Copy the pre-trained vector of every vocabulary word into its row.
# Words without a pre-trained vector keep their zero row instead of aborting the
# cell with a KeyError; they are reported so the gap is visible.
missing_words = []
for word, i in word2index.items():
    embed_vector = embeddings.get(word)
    if embed_vector is None:
        missing_words.append(word)
        continue
    embedding_matrix[i] = embed_vector

if missing_words:
    print(f'{len(missing_words)} word(s) without a pre-trained vector: {missing_words}')

In [15]:
# Inspect the assembled matrix; row 0 (the padding id) is expected to be all zeros.
embedding_matrix

array([[ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [-0.046539,  0.61966 ,  0.56647 , ..., -0.37616 , -0.032502,
         0.8062  ],
       [-0.49886 ,  0.76602 ,  0.89751 , ..., -0.41179 ,  0.40539 ,
         0.78504 ],
       ...,
       [-0.46263 ,  0.069864,  0.69095 , ..., -0.29174 ,  0.32041 ,
         0.21202 ],
       [ 0.073242,  0.11134 ,  0.62281 , ...,  0.53417 , -0.1646  ,
        -0.27516 ],
       [ 0.29019 ,  0.80497 ,  0.31187 , ..., -0.33603 ,  0.45998 ,
        -0.11278 ]])

In [16]:
# Frozen pre-trained embedding -> two stacked LSTMs -> softmax over the emoji classes.
model = Sequential([
    Embedding(input_dim = len(word2index) + 1,   # +1 for the padding id 0
              output_dim = embed_size,
              input_length = maxlen,
              weights = [embedding_matrix],
              trainable = False,                 # keep the pre-trained vectors fixed
              # Sequences are post-padded with 0; masking lets the LSTMs skip the
              # padding steps instead of running over them before the final state.
              mask_zero = True
             ),

    # First LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(units = 16, return_sequences = True),
    LSTM(units = 4),
    # One output unit per emoji class.
    Dense(len(emoji_dict), activation = 'softmax')
])

# Labels are one-hot encoded, hence categorical cross-entropy.
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

In [26]:
# Sanity check: the model expects (batch, maxlen) integer id sequences.
model.input_shape

(None, 10)

In [18]:
# Sanity check: one probability per emoji class for each input row.
model.output_shape

(None, 5)

In [19]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 10, 100)           31300     
                                                                 
 lstm (LSTM)                 (None, 10, 16)            7488      
                                                                 
 lstm_1 (LSTM)               (None, 4)                 336       
                                                                 
 dense (Dense)               (None, 5)                 25        
                                                                 
Total params: 39,149
Trainable params: 7,849
Non-trainable params: 31,300
_________________________________________________________________


In [20]:
# Train for 100 epochs on the full padded dataset. No validation data is passed,
# so any accuracy reported during training is training accuracy only.
model.fit(Xtrain, Ytrain, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1b4276a98d0>

In [29]:
# Inspect the shape fed into the first LSTM layer (the embedding output).
model.get_layer('lstm').input_shape

(None, 10, 100)

## 4. Testing Model

In [21]:
# A few unseen sentences to try the trained model on.
test = ["I feel good", "Lets play football", "lets eat dinner"]

# Encode with the tokenizer fitted on the training text and pad to the training length.
test_seq = tokenizer.texts_to_sequences(test)
Xtest = pad_sequences(test_seq, maxlen = maxlen, padding = 'post', truncating = 'post')

# Index of the most probable class for each sentence.
y_pred = np.argmax(model.predict(Xtest), axis = 1)

# Print each sentence next to its predicted emoji.
for sentence, label in zip(test, y_pred):
    print(sentence, label_to_emoji(label))


I feel good 😃
Lets play football ⚾
lets eat dinner 🍽️
