# This project automatically adds emojis to sentences based on the content of each sentence

In [2]:
import numpy as np
from emo_utils import *
import emoji
import matplotlib.pyplot as plt
import os

%matplotlib inline

## Building a Baseline

## Our data consists of pairs (X, Y), where X contains 132 training sentences and Y is an integer label from 0 to 4 that selects one of these emojis
<img src="images/data_set.png" style="width:700px;height:300px;">

# Loading Data-Set

In [4]:
# Load the training and test splits; each read_csv call is unpacked into a
# (sentences, labels) pair.
X_train, y_train = read_csv('data/train_emoji.csv')
# NOTE(review): 'tesss.csv' looks like a typo for a test-set file name —
# confirm it matches the file on disk before renaming anything.
X_test, y_test = read_csv('data/tesss.csv')

In [6]:
# Longest training sentence, measured in words.
# max(X_train, key=len) would select the sentence with the most *characters*,
# which is not necessarily the one with the most words, so the word count is
# computed per sentence before taking the maximum.
maximum_length_of_sentence = max(len(sentence.split()) for sentence in X_train)
print(maximum_length_of_sentence)

10


In [7]:
# Preview the first 10 training sentences next to the emoji for their label.
# Iterating over slices (instead of indexing with range(10)) keeps the cell
# from raising IndexError when the dataset has fewer than 10 rows.
for sentence, label in zip(X_train[:10], y_train[:10]):
    print(sentence, label_to_emoji(label))

never talk to me again 😞
I am proud of your achievements 😄
It is the worst day in my life 😞
Miss you so much ❤️
food is life 🍴
I love you mum ❤️
Stop saying bullshit 😞
congratulations on your acceptance 😄
The assignment is too long  😞
I want to go play ⚾


## Building V1 of the Classifier 

In [8]:
# One-hot encode the integer labels of both splits, using 5 classes each.
y_one_hot_train, y_one_hot_test = (
    convert_to_one_hot(labels, C=5) for labels in (y_train, y_test)
)

In [10]:
# Display the one-hot encoded label of training example 51.
y_one_hot_train[51]

array([0., 0., 0., 1., 0.])

## Reading word vector embeddings from GloVe

In [19]:
# Read the GloVe file and unpack the three lookups returned by read_glove_vecs:
# word -> index, index -> word, and word -> embedding vector.
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('glove.6B.50d.txt')

In [32]:
# Spot-check the GloVe lookups with one example word and one example index.
word, idx = "cucumber", 289846
print(f'The index for word {word} is ', word_to_index[word], sep='')
print(f'The word for index {idx} is ', index_to_word[idx], sep='')

# Shape of a single embedding vector, stored in the notebook-level name `size`.
size = word_to_vec_map[word].shape
print(size)

The index for word cucumber is 113317
The word for index 289846 is potatos
(50,)


In [57]:
def sentence_average(sentence, word_to_vec_map=word_to_vec_map):
    """Return the mean of the word vectors of all words in ``sentence``.

    The sentence is lower-cased and split on any run of whitespace, each word
    is looked up in ``word_to_vec_map`` and the vectors are averaged.

    Args:
        sentence: the input text.
        word_to_vec_map: non-empty mapping from lower-case word to its
            embedding vector (defaults to the map loaded in the notebook).

    Returns:
        A numpy array with the same shape as one embedding vector. A sentence
        without any words yields an all-zero vector.

    Raises:
        KeyError: if a word of the sentence is missing from ``word_to_vec_map``.
    """
    # str.split() with no separator drops empty tokens and also handles tabs
    # and newlines, so no manual filtering of '' is required.
    words = sentence.lower().split()

    if not words:
        # Avoid the 0/0 division (NaNs) an empty sentence would otherwise hit;
        # the vector shape is taken from an arbitrary entry of the map.
        reference = next(iter(word_to_vec_map.values()))
        return np.zeros(np.shape(reference))

    # The accumulator's shape comes from the embeddings themselves instead of
    # the notebook-level `size` variable, so the function does not depend on
    # an unrelated cell having been executed first.
    total = np.zeros(np.shape(word_to_vec_map[words[0]]))
    for word in words:
        total += word_to_vec_map[word]
    return total / len(words)

In [58]:
# Run sentence_average on one example sentence and print the resulting vector.
avg = sentence_average("Morrocan couscous is my favorite dish")
print("avg = \n", avg)

avg = 
 [-0.008005    0.56370833 -0.50427333  0.258865    0.55131103  0.03104983
 -0.21013718  0.16893933 -0.09590267  0.141784   -0.15708967  0.18525867
  0.6495785   0.38371117  0.21102167  0.11301667  0.02613967  0.26037767
  0.05820667 -0.01578167 -0.12078833 -0.02471267  0.4128455   0.5152061
  0.38756167 -0.898661   -0.535145    0.33501167  0.68806933 -0.2156265
  1.797155    0.10476933 -0.36775333  0.750785    0.10282583  0.348925
 -0.27262833  0.66768    -0.10706167 -0.283635    0.59580117  0.28747333
 -0.3366635   0.23393817  0.34349183  0.178405    0.1166155  -0.076433
  0.1445417   0.09808667]


## Model for training Using Cross Entropy Loss

In [59]:
def model(X, y, learning_rate=0.01, epochs=400):
    """Train a softmax classifier on averaged word vectors with per-sample SGD.

    Every sentence is reduced to one feature vector by ``sentence_average``.
    A single linear layer followed by ``softmax`` maps that vector to 5 class
    scores, and ``W``/``b`` are updated after each sample with the
    cross-entropy gradient.

    Args:
        X: array of sentences (strings), one per training example.
        y: array of integer class labels, one per sentence; it is one-hot
            encoded with 5 classes.
        learning_rate: step size of each gradient update.
        epochs: number of passes over the training set.

    Returns:
        ``(pred, W, b)`` where ``pred`` is the result of ``predict`` on
        ``(X, y)`` evaluated with the *final* parameters, ``W`` has shape
        ``(5, n_h)`` and ``b`` has shape ``(5,)``.

    Note:
        ``predict`` (from emo_utils.py) reports the accuracy itself, so the
        final evaluation adds one more "Accuracy" line after the per-100-epoch
        progress output.
    """
    np.random.seed(1)  # fixed seed keeps the weight initialisation reproducible
    m = y.shape[0]
    n_y = 5
    # Embedding dimension read from the vectors themselves rather than from the
    # notebook-level `size` variable set in an unrelated demo cell.
    n_h = next(iter(word_to_vec_map.values())).shape[0]

    W = np.random.randn(n_y, n_h) / np.sqrt(n_h)
    b = np.zeros((n_y,))
    y_one_hot = convert_to_one_hot(y, C=n_y)

    # The sentence averages do not depend on W or b, so compute them once
    # instead of once per sample per epoch.
    features = [sentence_average(X[sample]) for sample in range(m)]

    for epoch in range(epochs):
        for avg, target in zip(features, y_one_hot):
            # Forward pass.
            z = np.dot(W, avg) + b
            a = softmax(z)

            # Cross-entropy loss of this single sample.
            cost = -np.sum(np.log(a) * target)

            # Gradients of the loss with respect to z, W and b.
            dz = a - target
            dW = np.outer(dz, avg)
            db = dz

            # Stochastic gradient descent step.
            W = W - learning_rate * dW
            b = b - learning_rate * db

        if epoch % 100 == 0:
            # `cost` is the loss of the last sample seen in this epoch.
            print("Epoch: " + str(epoch) + " --- cost = " + str(cost))
            predict(X, y, W, b, word_to_vec_map)  # predict is defined in emo_utils.py

    # Evaluate with the final parameters so the returned predictions match the
    # returned W and b (previously they came from the last logged epoch, and
    # `pred` was unbound when epochs == 0).
    pred = predict(X, y, W, b, word_to_vec_map)
    return pred, W, b

In [60]:
# Scratch cell: look at the shapes and types of the training arrays.
print(X_train.shape, y_train.shape, np.eye(5)[y_train.reshape(-1)].shape, sep="\n")
print(X_train[0], type(X_train), sep="\n")

# Small hand-written label array; only its shape is inspected here.
Y = np.asarray([
    5, 0, 0, 5, 4, 4, 4, 6, 6, 4,
    1, 1, 5, 6, 6, 3, 6, 3, 4, 4,
])
print(Y.shape)

# Matching hand-written sentences, one per entry of Y.
X = np.asarray([
    'I am going to the bar tonight',
    'I love you',
    'miss you my dear',
    'Lets go party and drinks',
    'Congrats on the new job',
    'Congratulations',
    'I am so happy for you',
    'Why are you feeling bad',
    'What is wrong with you',
    'You totally deserve this prize',
    'Let us go play football',
    'Are you down for football this afternoon',
    'Work hard play harder',
    'It is suprising how people can be dumb sometimes',
    'I am very disappointed',
    'It is the best day in my life',
    'I think I will end up alone',
    'My life is so boring',
    'Good job',
    'Great so awesome',
])

print(X.shape, np.eye(5)[y_train.reshape(-1)].shape, type(X_train), sep="\n")

(132,)
(132,)
(132, 5)
never talk to me again
<class 'numpy.ndarray'>
(20,)
(20,)
(132, 5)
<class 'numpy.ndarray'>


In [62]:
pred, W, b = model(X_train, y_train)

Epoch: 0 --- cost = 1.9520498812810072
Accuracy: 0.3484848484848485
Epoch: 100 --- cost = 0.07971818726014794
Accuracy: 0.9318181818181818
Epoch: 200 --- cost = 0.04456369243681402
Accuracy: 0.9545454545454546
Epoch: 300 --- cost = 0.03432267378786059
Accuracy: 0.9696969696969697


## Prediction on training and test set

In [64]:
print("Training set:")
pred_train = predict(X_train, y_train, W, b, word_to_vec_map)
print('Test set:')
pred_test = predict(X_test, y_test, W, b, word_to_vec_map)

Training set:
Accuracy: 0.9772727272727273
Test set:
Accuracy: 0.8571428571428571


## Confusion matrix 