### Checking Out Dataset

In [1]:
import pandas as pd

In [2]:
# Load the training and test splits from CSV.
# NOTE(review): read_csv uses its default header handling here, and the head()
# outputs below show a sentence/label pair as the column names, so the first
# record of each file appears to be consumed as the header row -- confirm
# whether header=None is intended.
train_data = pd.read_csv('data/train_emoji.csv')
test_data = pd.read_csv('data/tess.csv')

In [3]:
train_data.head()

Unnamed: 0,never talk to me again,3,Unnamed: 2,Unnamed: 3
0,I am proud of your achievements,2,,
1,It is the worst day in my life,3,,
2,Miss you so much,0,,[0]
3,food is life,4,,
4,I love you mum,0,,


In [4]:
test_data.head()

Unnamed: 0,I want to eat,4
0,he did not answer\t,3
1,he got a raise\t,2
2,she got me a present\t,2
3,ha ha ha it was so funny\t,2
4,he is a good friend\t,2


### IMPORTING GloVe WORD EMBEDDINGS and PRE-PROCESSING DATASET

In [5]:
import numpy as np
def read_glove_vecs(glove_file):
  """Load word vectors from a whitespace-separated text file.

  Every non-empty line is expected to hold a token followed by the components
  of its vector. Blank lines are skipped.

  Args:
    glove_file: path of the text file to read.

  Returns:
    A tuple (words_to_index, index_to_words, word_to_vec_map):
      words_to_index: word -> integer index, assigned in sorted word order
        starting at 1 (index 0 is never assigned).
      index_to_words: the inverse mapping, index -> word.
      word_to_vec_map: word -> 1-D float64 numpy array.
  """
  word_to_vec_map = {}
  with open(glove_file, 'r', errors = 'ignore') as f:
    for line in f:
      fields = line.strip().split()
      if not fields:
        # A blank line (e.g. a trailing newline at end of file) has no token;
        # indexing fields[0] would raise IndexError.
        continue
      word_to_vec_map[fields[0]] = np.array(fields[1:], dtype = np.float64)

  words_to_index = {}
  index_to_words = {}
  # The dictionary keys are exactly the set of words seen in the file.
  for i, w in enumerate(sorted(word_to_vec_map), start = 1):
    words_to_index[w] = i
    index_to_words[i] = w

  return words_to_index, index_to_words, word_to_vec_map

In [6]:
words_to_index, index_to_words, word_to_vec_map = read_glove_vecs('../../readonly/glove.6B.50d.txt')

In [7]:
words_to_index['cucumber']

113317

In [8]:
word_to_vec_map['cucumber']

array([ 0.68224 , -0.31608 , -0.95201 ,  0.47108 ,  0.56571 ,  0.13151 ,
        0.22457 ,  0.094995, -1.3237  , -0.51545 , -0.39337 ,  0.88488 ,
        0.93826 ,  0.22931 ,  0.088624, -0.53908 ,  0.23396 ,  0.73245 ,
       -0.019123, -0.26552 , -0.40433 , -1.5832  ,  1.1316  ,  0.4419  ,
       -0.48218 ,  0.4828  ,  0.14938 ,  1.1245  ,  1.0159  , -0.50213 ,
        0.83831 , -0.31303 ,  0.083242,  1.7161  ,  0.15024 ,  1.0324  ,
       -1.5005  ,  0.62348 ,  0.54508 , -0.88484 ,  0.53279 , -0.085119,
        0.02141 , -0.56629 ,  1.1463  ,  0.6464  ,  0.78318 , -0.067662,
        0.22884 , -0.042453])

In [9]:
# First column holds the sentences. `.ix` was removed in pandas 1.0;
# `.iloc` is the explicit positional indexer.
X = train_data.iloc[:,0]

In [10]:
X.shape

(131,)

In [11]:
# Second column holds the integer class labels. `.ix` was removed in
# pandas 1.0; `.iloc` is the explicit positional indexer.
Y = train_data.iloc[:,1]
Y.shape

(131,)

Converting the labels into one-hot encodings.

In [12]:
def one_hot(Y, C):
    """Convert integer class labels into one-hot rows.

    Args:
        Y: integer label(s): a scalar, a sequence, or an array of any shape.
            The input is flattened before encoding.
        C: number of classes (width of each one-hot row).

    Returns:
        numpy array of shape (number_of_labels, C) with 1.0 in the column
        given by each label and 0.0 elsewhere.
    """
    # np.asarray accepts plain Python ints, lists and pandas Series as well as
    # numpy values, instead of requiring the input to expose .reshape itself.
    return np.eye(C)[np.asarray(Y).reshape(-1)]

In [13]:
# Encode all labels in a single vectorised call. The result already has shape
# (number_of_samples, 5), so no hard-coded sample count is needed.
Y = one_hot(np.asarray(Y), 5)

In [14]:
Y.shape

(131, 5)

In [15]:
Y[0]

array([ 0.,  0.,  1.,  0.,  0.])

### Making Model

In [16]:
import keras
from keras.layers import Dense, Dropout, LSTM, Activation, Input
from keras.models import Model
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


In [17]:
def sentences_to_indices(X, word_to_index, max_len):
    """Convert sentences into fixed-width rows of word indices.

    Each sentence is lower-cased and split on whitespace; every word is
    replaced by its index. Sentences with more than max_len words are
    truncated; shorter ones are right-padded with zeros.

    Args:
        X: indexable collection of sentence strings exposing ``shape``
            (e.g. a 1-D numpy array).
        word_to_index: mapping from lower-case word to integer index.
        max_len: number of index slots per sentence.

    Returns:
        numpy float array of shape (X.shape[0], max_len).

    Raises:
        KeyError: if a word among the first max_len words of a sentence is
            missing from word_to_index.
    """
    m = X.shape[0]
    X_indices = np.zeros((m, max_len))
    for i in range(m):
        # Keep only the first max_len words: writing past the last column
        # would raise IndexError for sentences longer than max_len.
        sentence_words = X[i].lower().split()[:max_len]
        for j, w in enumerate(sentence_words):
            X_indices[i, j] = word_to_index[w]
    return X_indices

In [18]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """Build a non-trainable Keras Embedding layer holding pre-trained vectors.

    Args:
        word_to_vec_map: mapping from word to its 1-D numpy vector. All
            vectors are assumed to share one length.
        word_to_index: mapping from word to the row index it should occupy
            in the embedding matrix.

    Returns:
        An Embedding layer, already built, whose weight matrix has one row per
        index; rows not referenced by word_to_index stay all-zero.
    """
    vocab_len = len(word_to_index) + 1   #adding 1 is requirement of keras
    # Read the vector length from any entry instead of depending on one
    # particular word being present in the vocabulary.
    emb_len = next(iter(word_to_vec_map.values())).shape[0]
    emb_matrix = np.zeros((vocab_len, emb_len))

    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    embedding_layer = Embedding(vocab_len, emb_len, trainable = False)
    embedding_layer.build((None,))  #Build the embedding layer, it is required before setting the weights of the embedding layer.
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [19]:
# NOTE(review): LSTM_model builds its own embedding layer internally, so this
# standalone layer does not appear to be used by the model defined below.
embedding_layer = pretrained_embedding_layer(word_to_vec_map, words_to_index)

In [20]:
def LSTM_model(input_shape, word_to_vec_map, word_to_index):
    """Build the (uncompiled) sentence classifier.

    The network is: pre-trained embedding -> LSTM(128) -> Dropout(0.5) ->
    LSTM(128) -> Dropout(0.5) -> LSTM(64) -> Dropout(0.25) ->
    Dense(5, softmax).

    Args:
        input_shape: shape of one input sample, e.g. ``(max_len,)``.
        word_to_vec_map: mapping from word to its embedding vector.
        word_to_index: mapping from word to embedding-matrix row index.

    Returns:
        A Keras Model mapping index sequences to 5-way softmax outputs.
    """
    sentence_indices = Input(shape = input_shape, dtype = np.int32)
    # Use the word_to_index argument; the previous code silently read the
    # notebook-level global `words_to_index` and ignored this parameter.
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    embeddings = embedding_layer(sentence_indices)
    # The first two LSTMs return full sequences so they can be stacked.
    X = LSTM(128, return_sequences = True)(embeddings)
    X = Dropout(0.5)(X)
    X = LSTM(128, return_sequences = True)(X)
    X = Dropout(0.5)(X)
    # The last LSTM returns only its final state for classification.
    X = LSTM(64, return_sequences = False)(X)
    X = Dropout(0.25)(X)
    X = Dense(5, activation = 'softmax')(X)

    model = Model(inputs = sentence_indices, outputs = X)

    return model

In [21]:
# Size the model input by the largest word count. The longest sentence by
# character count is not necessarily the one with the most words.
maxlen = max(len(sentence.split()) for sentence in X)
model = LSTM_model((maxlen,), word_to_vec_map, words_to_index)

In [22]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 10, 50)            20000050  
_________________________________________________________________
lstm_1 (LSTM)                (None, 10, 128)           91648     
_________________________________________________________________
dropout_1 (Dropout)          (None, 10, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 10, 128)           131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 10, 128)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)                49408     
__________

### Visualising model

In [23]:
from keras.utils import plot_model
plot_model(model, to_file = 'LSTM_model.png', show_shapes = True)

In [24]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [25]:
# Replace the raw sentences with their padded index rows.
# NOTE(review): this rebinds X, so re-running the cell passes index rows
# (not strings) back into sentences_to_indices -- presumably it then fails on
# .lower(); confirm and consider a separate name.
X = sentences_to_indices(X, words_to_index, maxlen)

In [26]:
X.shape

(131, 10)

In [27]:
model.fit(X, Y, epochs = 50, batch_size = 32, shuffle = True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f8d5a501d30>

### Testing performance on Test set

In [28]:
# First column holds the test sentences. `.ix` was removed in pandas 1.0;
# `.iloc` is the explicit positional indexer.
X_test = test_data.iloc[:,0]
X_test = sentences_to_indices(X_test, words_to_index, maxlen)
X_test.shape

(55, 10)

In [29]:
# Second column holds the test labels. `.ix` was removed in pandas 1.0;
# `.iloc` is the explicit positional indexer.
Y_test = test_data.iloc[:,1]

In [30]:
# Encode all test labels in a single vectorised call; the result has shape
# (number_of_samples, 5).
Y_test = one_hot(np.asarray(Y_test), 5)

In [31]:
# Let numpy infer the number of samples instead of hard-coding it.
Y_test = Y_test.reshape((-1, 5))

In [32]:
loss, acc = model.evaluate(X_test, Y_test)
print()
print("Test accuracy = ", acc)

Test accuracy =  0.745454555208


### Code to convert label to emoji

In [33]:
import emoji

In [34]:
# Emoji alias (or literal code points) for each class label, keyed by the
# label written as a string.
emoji_dictionary = dict([
    ("0", "\u2764\uFE0F"),    # :heart: prints a black instead of red heart depending on the font
    ("1", ":baseball:"),
    ("2", ":smile:"),
    ("3", ":disappointed:"),
    ("4", ":fork_and_knife:"),
])


def label_to_emoji(label):
    """Return the emoji string for a class label.

    Args:
        label: class label; it is converted with ``str`` before the lookup,
            so ints and numeric strings are both accepted.

    Returns:
        The value produced by ``emoji.emojize`` for the label's entry.

    Raises:
        KeyError: if ``str(label)`` is not a key of emoji_dictionary.
    """
    # NOTE(review): the `use_aliases` keyword is reportedly gone from
    # emoji >= 2.0 (replaced by language='alias') -- confirm the installed version.
    alias = emoji_dictionary[str(label)]
    return emoji.emojize(alias, use_aliases = True)

### Predicting emoji for a text

In [35]:
x_test = np.array(['i am very happy !'])
x_test = sentences_to_indices(x_test, words_to_index, maxlen)
print(x_test[0])
print(label_to_emoji(np.argmax(model.predict(x_test))))

[  1.85457000e+05   5.29430000e+04   3.77946000e+05   1.73081000e+05
   1.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00]
😄
