In [41]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.preprocessing import sequence


In [42]:
def read_csv(filename):
  """Load sentences and their integer labels from a CSV file.

  The file must contain a 'sentence' column and a 'label' column.

  Args:
    filename: path of the CSV file to read.

  Returns:
    A tuple (X, Y): X is an array of the 'sentence' values and Y is an
    integer array of the 'label' values, in file order.
  """
  frame = pd.read_csv(filename)
  sentences = np.asarray(frame['sentence'])
  labels = np.asarray(frame['label'], dtype = int)
  return sentences, labels



In [43]:
# Load the sentence/label pairs for the training and test splits.
# Paths are relative to the notebook's working directory.
X_train, Y_train = read_csv('train.csv')
X_test, Y_test = read_csv('test.csv')



In [44]:
def label_to_emoji(label):
  """Map an integer class label to its emoji.

  Args:
    label: integer index into the five-entry emoji table below.

  Returns:
    The emoji string stored at that index.
  """
  # The position in this list is the class id used in the label column.
  emoji_by_label = ['❤️', '⚾️', '😄', '😔', '🍴']
  return emoji_by_label[label]


In [45]:
# Sanity check: show one training sentence next to the emoji for its label.
index = 5
print(X_train[index], label_to_emoji(Y_train[index]))

I love you mum ❤️


In [46]:
# Length, in words, of the longest training sentence. This is the number of
# time steps used when sentences are padded for the sequence model.
# Count words per sentence and take the maximum: picking the sentence with
# the most *characters* first (max(..., key=len)) can miss a sentence that
# has more, shorter words.
max_len = max(len(sentence.split()) for sentence in X_train)
max_len

10

## Emojifier-V1

In [47]:
# One-hot encode the integer labels (vector length = 5, one slot per emoji class).
Y_train_oh = tf.keras.utils.to_categorical(Y_train, 5)
Y_test_oh = tf.keras.utils.to_categorical(Y_test, 5)


In [48]:
# Sanity check: compare one integer label with its one-hot encoding.
index = 7
print(Y_train[index], 'is converted into one hot : ', Y_train_oh[index])

2 is converted into one hot :  [0. 0. 1. 0. 0.]


In [None]:
# Download the pre-trained GloVe 6B word vectors and unpack them into ./glove.6B
# (large download; only needs to run once per environment).
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip -d glove.6B

In [50]:
def read_glove_vectors(glove_file):
  """Parse a GloVe text file into a word -> vector dictionary.

  Each non-empty line is expected to hold a word followed by its
  whitespace-separated vector components.

  Args:
    glove_file: path of the UTF-8 encoded GloVe text file.

  Returns:
    dict mapping each word to a 1-D np.float64 array of its components.
  """
  words_to_vec = dict()
  # The context manager closes the file even if parsing fails part-way.
  with open(glove_file, encoding = 'utf8') as f:
    for line in f:
      fields = line.strip().split()
      if not fields:
        # Blank lines carry no word; skip them instead of failing on fields[0].
        continue
      words_to_vec[fields[0]] = np.array(fields[1:], dtype = np.float64)

  return words_to_vec




In [51]:
# Load the 50-dimensional GloVe vectors. The path is relative so it points at
# the directory the download cell unzips into (./glove.6B), whatever the
# working directory is, instead of assuming Colab's /content.
words_to_vec = read_glove_vectors('glove.6B/glove.6B.50d.txt')

In [52]:
# Inspect the embedding of one word to confirm the table loaded correctly.
words_to_vec['hello']


array([-0.38497 ,  0.80092 ,  0.064106, -0.28355 , -0.026759, -0.34532 ,
       -0.64253 , -0.11729 , -0.33257 ,  0.55243 , -0.087813,  0.9035  ,
        0.47102 ,  0.56657 ,  0.6985  , -0.35229 , -0.86542 ,  0.90573 ,
        0.03576 , -0.071705, -0.12327 ,  0.54923 ,  0.47005 ,  0.35572 ,
        1.2611  , -0.67581 , -0.94983 ,  0.68666 ,  0.3871  , -1.3492  ,
        0.63512 ,  0.46416 , -0.48814 ,  0.83827 , -0.9246  , -0.33722 ,
        0.53741 , -1.0616  , -0.081403, -0.67111 ,  0.30923 , -0.3923  ,
       -0.55002 , -0.68827 ,  0.58049 , -0.11626 ,  0.013139, -0.57654 ,
        0.048833,  0.67204 ])

In [53]:
def sentence_to_avg(sentence, embeddings = None, emb_dim = 50):
  """Average the word vectors of a sentence into a single feature vector.

  The sentence is lower-cased and split on whitespace. Words that are not in
  the embedding table are skipped instead of raising KeyError, and the
  average is taken over the words that were found. A sentence with no known
  words (including the empty string) yields an all-zero vector rather than
  the NaNs produced by dividing by zero.

  Args:
    sentence: text to encode.
    embeddings: mapping from word to 1-D vector of length emb_dim. Defaults
      to the notebook-level `words_to_vec` table.
    emb_dim: length of each word vector (50 for the GloVe file used here).

  Returns:
    np.ndarray of shape (emb_dim,) holding the mean vector of the known words.
  """
  if embeddings is None:
    embeddings = words_to_vec

  sum_vectors = np.zeros((emb_dim,))
  known = 0
  for w in sentence.lower().split():
    vec = embeddings.get(w)
    if vec is None:
      # Out-of-vocabulary word: contributes nothing to the average.
      continue
    sum_vectors += vec
    known += 1

  if known == 0:
    return sum_vectors
  return sum_vectors / known
    

In [54]:
# Example: averaged embedding of a short sentence.
sentence_to_avg('Pasta is my favorite food')


array([ 0.242832  ,  0.370774  , -0.524396  ,  0.018644  ,  0.568756  ,
        0.0219878 , -0.48206322, -0.152204  ,  0.235412  ,  0.1979466 ,
       -0.178818  ,  0.3203976 ,  0.3379962 ,  0.1399654 ,  0.56775044,
        0.118648  , -0.04531252,  0.335416  ,  0.149832  , -0.522814  ,
        0.095746  , -0.0468764 ,  0.5508066 ,  0.39369132,  0.275182  ,
       -1.275018  , -0.76076   ,  0.449102  ,  0.7542772 , -0.2332608 ,
        2.82554   ,  0.287742  , -0.325976  ,  0.608572  , -0.020543  ,
        0.286476  , -0.24984   ,  0.899408  ,  0.38995   , -0.270266  ,
        0.3004734 ,  0.315962  , -0.2408782 ,  0.1586226 ,  0.5400462 ,
        0.412066  , -0.1657008 , -0.253566  ,  0.3091806 ,  0.371192  ])

In [55]:
# Feature matrix for Emojifier-V1: one averaged embedding per training sentence.
X_train_avg = np.array([sentence_to_avg(sentence) for sentence in X_train])



In [56]:
# Features and one-hot labels must have the same number of rows.
print(X_train_avg.shape)
print(Y_train_oh.shape)


(132, 50)
(132, 5)


In [57]:
class EmojiNet_V1(Model):
  """Baseline classifier: a single softmax layer over a 50-value input vector."""

  def __init__(self):
    super().__init__()
    # One fully connected layer mapping the 50 input features to 5 class
    # probabilities.
    self.dense = Dense(units = 5, activation = 'softmax', input_shape = (50,))

  def call(self, x):
    """Forward pass: return the softmax layer's output for the batch x."""
    return self.dense(x)


In [58]:
# Instantiate the baseline (averaged-embedding) model.
model_v1 = EmojiNet_V1()


In [59]:
# Categorical cross-entropy is paired with the one-hot labels (Y_*_oh).
model_v1.compile(optimizer = tf.keras.optimizers.Adam(),
                 loss = 'categorical_crossentropy',
                 metrics = ['accuracy'])

In [60]:
# Train the baseline on the averaged embeddings for 400 epochs.
model_v1.fit(X_train_avg, Y_train_oh, epochs = 400, shuffle = True)

Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400
Epoch 74/400
Epoch 75/400
Epoch 76/400
Epoch 77/400
Epoch 78

<keras.callbacks.History at 0x7fd4ffbc8c10>

In [61]:
# Same featurisation as for training: one averaged embedding per test sentence.
X_test_avg = np.array([sentence_to_avg(sentence) for sentence in X_test])

In [62]:
# Returns [loss, accuracy] on the held-out test set.
model_v1.evaluate(X_test_avg,Y_test_oh)



[0.6382743120193481, 0.8035714030265808]

In [63]:
# Try the baseline model on a few hand-written sentences.
X_me = np.array(['i adore you','playing football','delicious food','not bad'])
# Labels the author expects for the sentences above (kept for reference).
Y_me = np.array([[0],[1],[4],[3]])

# Featurise exactly as for training: one averaged embedding per sentence.
X_avg_me = np.array([sentence_to_avg(text) for text in X_me])

pred = model_v1.predict(X_avg_me)

# Show each sentence with the emoji of its most probable class.
for text, class_probs in zip(X_me, pred):
  print(text, label_to_emoji(np.argmax(class_probs)))





i adore you ❤️
playing football ⚾️
delicious food 🍴
not bad 😔


## Emojifier-V2: using an RNN (LSTM)

In [96]:
class EmojiNet_V2(Model):
  """Sequence classifier: two stacked LSTMs with dropout, then a softmax layer."""

  def __init__(self):
    super().__init__()
    # First LSTM emits an output per time step so the second LSTM can
    # consume the whole sequence.
    self.lstm_1 = LSTM(128, return_sequences = True)
    self.dropout_1 = Dropout(0.2)
    # Second LSTM keeps the default of returning only its final output.
    self.lstm_2 = LSTM(128)
    self.dropout_2 = Dropout(0.2)
    # Five outputs, one probability per emoji class.
    self.dense = Dense(5, activation = 'softmax')

  def call(self, x):
    """Forward pass: LSTM -> dropout -> LSTM -> dropout -> softmax."""
    per_step = self.dropout_1(self.lstm_1(x))
    final_state = self.dropout_2(self.lstm_2(per_step))
    return self.dense(final_state)




In [97]:
# Instantiate the LSTM-based model.
model_V2 = EmojiNet_V2()


In [98]:
# Same optimizer, loss and metric as the baseline, so the two models are comparable.
model_V2.compile(optimizer = tf.keras.optimizers.Adam(),
                 loss = 'categorical_crossentropy',
                 metrics = ['accuracy'])

In [99]:
def convert_sentences_to_embeddings(X, seq_len = None, embeddings = None, emb_dim = 50):
  """Turn sentences into a zero-padded tensor of word embeddings.

  Each sentence is lower-cased and split on whitespace. Word j of sentence i
  is written to out[i, j, :]. Positions past the end of a sentence stay
  zero, as do positions of words missing from the embedding table (instead
  of raising KeyError). Sentences with more than seq_len words are truncated
  to their first seq_len words (instead of raising IndexError).

  Args:
    X: sequence (list or array) of sentence strings.
    seq_len: number of time steps per sentence. Defaults to the
      notebook-level `max_len`.
    embeddings: mapping from word to 1-D vector of length emb_dim. Defaults
      to the notebook-level `words_to_vec` table.
    emb_dim: length of each word vector (50 for the GloVe file used here).

  Returns:
    np.ndarray of shape (len(X), seq_len, emb_dim).
  """
  if seq_len is None:
    seq_len = max_len
  if embeddings is None:
    embeddings = words_to_vec

  emb_matrix = np.zeros((len(X), seq_len, emb_dim))
  for i, sentence in enumerate(X):
    # Keep only as many words as there are time steps.
    words = sentence.lower().split()[:seq_len]
    for j, word in enumerate(words):
      vec = embeddings.get(word)
      if vec is not None:
        emb_matrix[i, j, :] = vec

  return emb_matrix

In [100]:
# Example: embed a few hand-written sentences; rows past each sentence's
# last word are zero padding.
X_me = np.array(['i adore you','playing football','delicious food','not bad'])
print(convert_sentences_to_embeddings(X_me))

[[[ 1.1891e-01  1.5255e-01 -8.2073e-02 ... -5.7512e-01 -2.6671e-01
    9.2121e-01]
  [ 1.3144e-01  5.0610e-03 -1.0851e-01 ... -4.2291e-01 -4.7042e-01
    2.8576e-01]
  [-1.0919e-03  3.3324e-01  3.5743e-01 ... -4.5697e-01 -4.8969e-02
    1.1316e+00]
  ...
  [ 0.0000e+00  0.0000e+00  0.0000e+00 ...  0.0000e+00  0.0000e+00
    0.0000e+00]
  [ 0.0000e+00  0.0000e+00  0.0000e+00 ...  0.0000e+00  0.0000e+00
    0.0000e+00]
  [ 0.0000e+00  0.0000e+00  0.0000e+00 ...  0.0000e+00  0.0000e+00
    0.0000e+00]]

 [[-1.0398e+00  2.5462e-01 -1.0193e+00 ... -5.3440e-01 -6.1264e-01
    3.0941e-01]
  [-1.8209e+00  7.0094e-01 -1.1403e+00 ...  9.6903e-01 -7.6259e-02
   -5.9909e-01]
  [ 0.0000e+00  0.0000e+00  0.0000e+00 ...  0.0000e+00  0.0000e+00
    0.0000e+00]
  ...
  [ 0.0000e+00  0.0000e+00  0.0000e+00 ...  0.0000e+00  0.0000e+00
    0.0000e+00]
  [ 0.0000e+00  0.0000e+00  0.0000e+00 ...  0.0000e+00  0.0000e+00
    0.0000e+00]
  [ 0.0000e+00  0.0000e+00  0.0000e+00 ...  0.0000e+00  0.0000e+00
    0.

In [101]:
# Padded embedding tensor for the training sentences: (samples, max_len, 50).
X_train_embs = convert_sentences_to_embeddings(X_train)
X_train_embs.shape

(132, 10, 50)

In [102]:
# Train the LSTM model on the padded embedding sequences for 40 epochs.
model_V2.fit(X_train_embs, Y_train_oh, epochs = 40, batch_size = 32, shuffle = True)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7fd4ff71d050>

In [103]:
# Embed the test sentences the same way as the training set.
X_test_embs = convert_sentences_to_embeddings(X_test)
# Re-derives the one-hot test labels from Y_test (same call as in the
# Emojifier-V1 section).
Y_test_oh = tf.keras.utils.to_categorical(Y_test, 5)
# Returns [loss, accuracy] on the held-out test set.
model_V2.evaluate(X_test_embs, Y_test_oh)




[0.7444772124290466, 0.8214285969734192]

In [105]:
# Try the LSTM model on the same hand-written sentences as the baseline.
X_me = np.array(['i adore you','playing football','delicious food','not bad'])
# Labels the author expects for the sentences above (kept for reference).
Y_me = np.array([[0],[1],[4],[3]])

# Despite the name, this holds padded per-word embeddings, not averages.
X_avg_embs = convert_sentences_to_embeddings(X_me)

pred = model_V2.predict(X_avg_embs)

# Show each sentence with the emoji of its most probable class.
for text, class_probs in zip(X_me, pred):
  print(text, label_to_emoji(np.argmax(class_probs)))

i adore you ❤️
playing football ⚾️
delicious food 🍴
not bad 😔
