In [136]:
# Import the relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import keras
import tensorflow as tf
import emoji

In [137]:
# Lookup table from class label (as a string) to the emoji shown for it.
# Label "0" stores the raw heart code points (U+2764 followed by U+FE0F)
# instead of the ":red_heart:" alias, because ":heart:" prints a black instead
# of a red heart depending on the font.
emoji_dictionary = dict([
    ("0", "\u2764\ufe0f"),
    ("1", ":baseball:"),
    ("2", ":smile:"),
    ("3", ":disappointed:"),
    ("4", ":fork_and_knife:"),
])

In [138]:
# Load the data
# Column 0 of each CSV is used as the sentence, column 1 as the integer label.
# NOTE(review): read_csv treats the first line as a header by default — confirm
# that both files really start with a header row; if they do not, their first
# example is silently dropped and header=None should be passed instead.
train = pd.read_csv('emojify_data.csv')
test = pd.read_csv('tesss.csv')

# Sentences (object arrays of strings) for both splits
X_train, X_test = (np.array(frame.iloc[:, 0]) for frame in (train, test))

# Integer class labels for both splits
Y_train, Y_test = (np.array(frame.iloc[:, 1], dtype=int) for frame in (train, test))

# One-hot encoded labels over the 5 classes, used by the Keras model
y_train, y_test = (
    tf.keras.utils.to_categorical(labels, num_classes=5)
    for labels in (Y_train, Y_test)
)

In [139]:
# Find the maximum length of a sentence, measured in words.
# Every sentence is split and counted individually: the sentence with the most
# characters is not necessarily the one with the most words, so picking the
# longest string first and counting its words can under-estimate the length.
maxLen = max(len(sentence.split()) for sentence in X_train)

In [140]:
# Function to get the emoji for a label
def label_to_emoji(label):
    """Return the rendered emoji for a class label.

    The label is converted with ``str`` and looked up in ``emoji_dictionary``;
    the stored value is then passed through ``emoji.emojize`` with
    ``language='alias'``.

    Raises:
        KeyError: if ``str(label)`` is not a key of ``emoji_dictionary``.
    """
    emoji_code = emoji_dictionary[str(label)]
    return emoji.emojize(emoji_code, language='alias')

In [141]:
# Show the first 10 training sentences next to the emoji of their label
for idx in range(10):
    sentence, label = X_train[idx], Y_train[idx]
    print(sentence, label_to_emoji(label))

work is horrible 😞
I am upset 😞
throw the ball ⚾
Good joke 😄
what is your favorite baseball game ⚾
I cooked meat 🍴
stop messing around 😞
I want chinese food 🍴
Let us go play baseball ⚾
you are failing this exercise 😞


In [142]:
# Load the pre-trained GloVe vectors
#
# Each non-empty line of the file is "<word> <v1> <v2> ...". Three structures
# are filled from it:
#   words          - the words in file order
#   word_to_index  - word -> integer index
#   embeddings     - word -> float32 vector
#
# Indices start at 1, not 0: sentences_to_indices fills unused positions with
# 0, and pretrained_embedding_layer allocates len(word_to_index) + 1 rows, so
# index 0 has to stay free for padding instead of aliasing the first word.
# Consequently words[i] is the word whose index is i + 1.
word_to_index = {}
words = []
embeddings = {}
index = 0
with open('glove.6B.100d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line (e.g. at the end of the file): nothing to parse
            continue
        index += 1
        word = values[0]
        words.append(word)
        word_to_index[word] = index
        embeddings[word] = np.asarray(values[1:], dtype='float32')

In [143]:
# Function to find the average of the word embeddings
def sentence_to_avg(sentence, embeddings):
    """Average the embedding vectors of the known words in a sentence.

    The sentence is lower-cased and split on whitespace. Words that are not
    keys of ``embeddings`` are ignored.

    Args:
        sentence: the input string.
        embeddings: non-empty dict mapping a word to a 1-D numpy vector.

    Returns:
        A 1-D float array with the same length as the embedding vectors. It is
        all zeros when none of the words is found in ``embeddings``.

    Raises:
        ValueError: if ``embeddings`` is empty, because the vector size cannot
            be determined.
    """
    if not embeddings:
        raise ValueError("embeddings must contain at least one word vector")

    # Read the vector size from a single entry; building a list of every key
    # on each call would be wasteful for a large vocabulary.
    emb_dim = next(iter(embeddings.values())).shape[0]

    avg = np.zeros((emb_dim,))
    count = 0
    for w in sentence.lower().split():
        vector = embeddings.get(w)
        if vector is not None:
            avg += vector
            count += 1

    # Leave the zero vector untouched when no word was recognised
    if count > 0:
        avg = avg / count
    return avg

In [144]:
# Softmax function
def softmax(array):
    """Return the softmax of ``array``, normalised over all of its elements.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but keeps ``np.exp`` from overflowing (which would
    produce ``inf / inf = nan``) when the scores are large.

    Args:
        array: array-like of scores.

    Returns:
        Array of the same shape whose entries are positive and sum to 1. An
        empty input yields an empty array.
    """
    scores = np.asarray(array)
    if scores.size == 0:
        # np.max is undefined on empty input; mirror exp/sum on an empty array
        return np.exp(scores)
    exps = np.exp(scores - np.max(scores))
    return exps / np.sum(exps)

In [145]:
# Function to predict the emoji for a batch of sentences
def predict(X, Y, W, b, embeddings):
    """Predict a class for every sentence in ``X`` and print the accuracy.

    Each sentence is reduced to its average word vector with
    ``sentence_to_avg`` (the same featurisation used during training), scored
    with ``W @ avg + b`` and assigned the class with the highest softmax
    probability.

    Args:
        X: array of sentences; ``X.shape[0]`` is the number of examples.
        Y: array of true labels, reshaped to ``(m, 1)`` for the comparison.
        W: weight matrix applied to the averaged sentence vector.
        b: bias added to the scores.
        embeddings: dict mapping a word to its vector.

    Returns:
        Array of shape ``(m, 1)`` holding the predicted class of each sentence.

    Side effects:
        Prints ``"Accuracy: <value>"`` where the value is the fraction of
        predictions equal to ``Y``.
    """
    m = X.shape[0]
    pred = np.zeros((m, 1))
    for j in range(m):
        avg = sentence_to_avg(X[j], embeddings)
        Z = np.dot(W, avg) + b
        A = softmax(Z)
        pred[j] = np.argmax(A)
    accuracy = np.mean(pred == Y.reshape(Y.shape[0], 1))
    print("Accuracy: " + str(accuracy))
    return pred

In [146]:
# Function to create and train the model
def model(X, Y, embeddings, learning_rate = 0.01, num_iterations = 200):
    """Train a softmax classifier on averaged word vectors with per-example SGD.

    Each sentence is represented by the average of its word embeddings. The
    weights are updated after every single example, in the order of ``X``.

    Args:
        X: array of sentences; ``X.shape[0]`` is the number of examples.
        Y: array of integer labels; the number of classes is taken as the
            number of distinct values in ``Y``.
        embeddings: dict mapping a word to its vector.
        learning_rate: step size of the gradient updates.
        num_iterations: number of passes over the training set.

    Returns:
        ``(pred, W, b)`` where ``pred`` holds the predictions of ``predict``
        for ``X`` computed with the final ``W`` and ``b``.

    Side effects:
        Every 10th epoch prints the epoch number and a cost, then the accuracy
        (via ``predict``). The printed cost is the cross-entropy of the last
        training example processed in that epoch, not an epoch average.
    """
    embedding_size = next(iter(embeddings.values())).shape[0]
    m = X.shape[0]
    n_y = len(np.unique(Y))
    W = np.random.randn(n_y, embedding_size)
    b = np.zeros((n_y,))
    y = tf.keras.utils.to_categorical(Y, num_classes=n_y)

    # The sentence averages depend neither on W nor on b, so they are computed
    # once instead of once per example per epoch.
    sentence_avgs = [sentence_to_avg(X[i], embeddings) for i in range(m)]

    pred = None
    last_eval_epoch = None
    for t in range(num_iterations):
        for i in range(m):
            avg = sentence_avgs[i]
            z = np.dot(W, avg) + b
            a = softmax(z)
            cost = -np.sum(y[i] * np.log(a))
            # Gradient of softmax + cross-entropy with respect to the scores
            dz = a - y[i]
            dW = np.outer(dz, avg)
            db = dz
            W = W - learning_rate * dW
            b = b - learning_rate * db
        if t % 10 == 0:
            print("Epoch: " + str(t) + " --- cost = " + str(cost))
            pred = predict(X, Y, W, b, embeddings)
            last_eval_epoch = t

    # Make sure the returned predictions match the returned parameters: the
    # in-loop evaluation only runs every 10th epoch (and never when
    # num_iterations is 0), so re-evaluate unless the last epoch was evaluated.
    if last_eval_epoch != num_iterations - 1:
        pred = predict(X, Y, W, b, embeddings)
    return pred, W, b

In [147]:
# Fit the averaged-embedding softmax classifier on the training set.
# W and b are kept as globals because later cells reuse them.
training_output = model(X_train, Y_train, embeddings)
pred, W, b = training_output
print(pred)

Epoch: 0 --- cost = 2.0987714537945727
Accuracy: 0.22527472527472528
Epoch: 10 --- cost = 1.1391071163700783
Accuracy: 0.489010989010989
Epoch: 20 --- cost = 0.7575083029379175
Accuracy: 0.6428571428571429
Epoch: 30 --- cost = 0.5585546350928853
Accuracy: 0.7417582417582418
Epoch: 40 --- cost = 0.42666402753596616
Accuracy: 0.8076923076923077
Epoch: 50 --- cost = 0.3362730269638698
Accuracy: 0.8461538461538461
Epoch: 60 --- cost = 0.2726578037606863
Accuracy: 0.8516483516483516
Epoch: 70 --- cost = 0.22640001245872207
Accuracy: 0.8736263736263736
Epoch: 80 --- cost = 0.19169889600362527
Accuracy: 0.8846153846153846
Epoch: 90 --- cost = 0.16503920849618733
Accuracy: 0.9010989010989011
Epoch: 100 --- cost = 0.14414649144328942
Accuracy: 0.9120879120879121
Epoch: 110 --- cost = 0.127422394372808
Accuracy: 0.9230769230769231
Epoch: 120 --- cost = 0.1137393470556019
Accuracy: 0.9230769230769231
Epoch: 130 --- cost = 0.10231934328486482
Accuracy: 0.9340659340659341
Epoch: 140 --- cost = 0.09

In [148]:
# Report the accuracy of the trained W and b on both splits.
# predict prints the accuracy itself and returns the predicted labels.
print("Training set:")
pred_train = predict(X=X_train, Y=Y_train, W=W, b=b, embeddings=embeddings)
print('Test set:')
pred_test = predict(X=X_test, Y=Y_test, W=W, b=b, embeddings=embeddings)

Training set:
Accuracy: 0.9560439560439561
Test set:
Accuracy: 0.8363636363636363


In [149]:
# Predict the emoji for a sentence
def predict_single(sentence, W=W, b=b, embeddings=embeddings):
    """Return the predicted class index for one sentence.

    The sentence is averaged with ``sentence_to_avg``, scored with
    ``W @ avg + b`` and the index of the largest softmax probability is
    returned.

    Note: the defaults for ``W``, ``b`` and ``embeddings`` are the objects the
    global names refer to when this ``def`` is executed. If the model is
    retrained afterwards, re-run this cell or pass the new values explicitly.
    """
    sentence_vector = sentence_to_avg(sentence, embeddings)
    class_probabilities = softmax(np.dot(W, sentence_vector) + b)
    return np.argmax(class_probabilities)

In [150]:
# Try the averaged-embedding classifier on one sentence and display its emoji
predicted_label = int(predict_single("Lets eat"))
label_to_emoji(predicted_label)

'🍴'

In [152]:
# Get the index of the words for a sentence
def sentences_to_indices(X, word_to_index, max_len):
    """Convert sentences into fixed-length rows of word indices.

    Each sentence is lower-cased and split on whitespace. Known words are
    written left to right into the row; remaining positions keep the value 0.

    Words missing from ``word_to_index`` are skipped (mirroring how
    ``sentence_to_avg`` ignores unknown words) instead of raising ``KeyError``,
    and words beyond ``max_len`` are dropped instead of raising ``IndexError``.

    Args:
        X: sequence of sentences (e.g. a 1-D numpy array of strings).
        word_to_index: dict mapping a word to its integer index.
        max_len: number of columns of the result.

    Returns:
        Float array of shape ``(len(X), max_len)``.
    """
    m = len(X)
    X_indices = np.zeros((m, max_len))
    for i in range(m):
        known_indices = [
            word_to_index[w]
            for w in X[i].lower().split()
            if w in word_to_index
        ]
        # Truncate sentences that have more known words than columns
        for j, word_index in enumerate(known_indices[:max_len]):
            X_indices[i, j] = word_index
    return X_indices

In [153]:
# Define the embedding layer
def pretrained_embedding_layer(embeddings, word_to_index):
    """Build a frozen Keras ``Embedding`` layer initialised from word vectors.

    The weight matrix has one more row than ``word_to_index`` has words. Row
    ``word_to_index[word]`` receives ``embeddings[word]``; rows that no word is
    mapped to stay all-zero.

    Args:
        embeddings: dict mapping a word to its vector.
        word_to_index: dict mapping a word to its row in the weight matrix.

    Returns:
        A built, non-trainable ``tf.keras.layers.Embedding`` layer holding the
        assembled matrix as its weights.
    """
    first_word = list(embeddings.keys())[0]
    emb_dim = len(embeddings[first_word])
    vocab_len = len(word_to_index) + 1

    # Assemble the weight matrix row by row
    emb_matrix = np.zeros((vocab_len, emb_dim))
    for word, row in word_to_index.items():
        emb_matrix[row, :] = embeddings[word]

    embedding_layer = tf.keras.layers.Embedding(
        input_dim=vocab_len,
        output_dim=emb_dim,
        trainable=False,
    )
    # The layer must be built before its weights can be assigned
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])
    return embedding_layer

In [154]:
# Define the model
def Emojify_V2(input_shape, word_to_vec_map, word_to_index,
               num_classes=5, lstm_units=128, dropout_rate=0.5):
    """Build the two-layer LSTM sentence classifier.

    Architecture: frozen pre-trained embedding -> LSTM (returns the whole
    sequence) -> dropout -> LSTM (returns the last output) -> dropout ->
    dense -> softmax.

    Args:
        input_shape: shape of one input example, e.g. ``(max_len,)``.
        word_to_vec_map: dict mapping a word to its vector.
        word_to_index: dict mapping a word to its integer index.
        num_classes: number of output classes (size of the dense layer).
        lstm_units: number of units in each LSTM layer.
        dropout_rate: rate of both dropout layers.

    Returns:
        An uncompiled ``tf.keras.Model`` mapping index sequences to class
        probabilities.
    """
    sentence_indices = tf.keras.layers.Input(input_shape, dtype='int32')

    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    embedded_sequence = embedding_layer(sentence_indices)

    # The first LSTM must return the full sequence so the second one can consume it
    X = tf.keras.layers.LSTM(lstm_units, return_sequences=True)(embedded_sequence)
    X = tf.keras.layers.Dropout(dropout_rate)(X)
    X = tf.keras.layers.LSTM(lstm_units, return_sequences=False)(X)
    X = tf.keras.layers.Dropout(dropout_rate)(X)
    X = tf.keras.layers.Dense(num_classes)(X)
    X = tf.keras.layers.Activation('softmax')(X)

    return tf.keras.models.Model(inputs=sentence_indices, outputs=X)

In [155]:
# Build the LSTM classifier for sentences of up to maxLen word indices.
# NOTE: this rebinds the global name `model`, which until now referred to the
# training function of the averaged-embedding classifier defined earlier.
sentence_shape = (maxLen,)
model = Emojify_V2(sentence_shape, embeddings, word_to_index)
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 10)]              0         
                                                                 
 embedding_4 (Embedding)     (None, 10, 100)           40000100  
                                                                 
 lstm_2 (LSTM)               (None, 10, 128)           117248    
                                                                 
 dropout_2 (Dropout)         (None, 10, 128)           0         
                                                                 
 lstm_3 (LSTM)               (None, 128)               131584    
                                                                 
 dropout_3 (Dropout)         (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 5)                 645 

In [156]:
# Configure training: categorical cross-entropy on the one-hot labels,
# the Adam optimizer, and accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [164]:
# Turn the training sentences into index rows, then fit the LSTM classifier.
# The fit call is the last expression, so its History object is the cell output.
X_train_indices = sentences_to_indices(X_train, word_to_index, maxLen)
model.fit(
    x=X_train_indices,
    y=y_train,
    batch_size=32,
    epochs=100,
    shuffle=True,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x1e897f6f040>

In [165]:
# Evaluate the LSTM classifier on the test set and report accuracy in percent
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len = maxLen)
evaluation = model.evaluate(x=X_test_indices, y=y_test)
loss, acc = evaluation
print("Test accuracy = ", acc*100)

Test accuracy =  83.63636136054993


In [170]:
# Predict the emoji for a single sentence with the LSTM classifier
x_test = np.array(["wow"])
# Use a dedicated name for the indices of this one sentence: assigning them to
# X_test_indices would overwrite the test-set indices, and re-running the
# evaluation cell afterwards would pair the wrong inputs with y_test.
sample_indices = sentences_to_indices(x_test, word_to_index, maxLen)
sample_label = np.argmax(model.predict(sample_indices))
print(x_test[0] +' '+  label_to_emoji(sample_label))

wow 😄
