# Emoji Prediction

In this project, we predict an emoji for a piece of input text.
The `emoji` library is a prerequisite for this project; if it is not installed, uncomment and run the first cell.

In [None]:
# Uncomment and run once if the `emoji` package is not installed
# (%pip installs into the environment of the running kernel).
# %pip install emoji

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Neither CSV is read with a header row (header=None), so pandas labels the columns 0, 1, 2, ...
train, test = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ("train_emoji.csv", "test_emoji.csv")    # train data, then test data
)

In [None]:
import emoji

### Emoji Plot

In [None]:
# For reference only: uncomment to print the table of emoji codes.
# NOTE(review): EMOJI_UNICODE appears to exist only in older `emoji` releases - confirm against the installed version.
# emoji.EMOJI_UNICODE

In [None]:
# Class label (kept as a string key) -> text that is handed to emoji.emojize()
emoji_dictionary = {
    str(label): code
    for label, code in enumerate(
        (
            "\u2764\uFE0F",     # heart emoji, written as the literal character instead of a shortcode
            ":baseball:",
            ":beaming_face_with_smiling_eyes:",
            ":downcast_face_with_sweat:",
            ":fork_and_knife:",
        )
    )
}

In [None]:
# Render every entry once as a quick visual check of the mapping
for code in emoji_dictionary.values():
    rendered = emoji.emojize(code)
    print(rendered)

## Data cleaning and exploring

In [None]:
# NumPy array of the whole training frame; the preview below reads row[0] as the sentence and row[1] as the label
data = train.values

In [None]:
# Preview the first ten training sentences next to the emoji their label maps to
for row_idx in range(10):
    sentence = data[row_idx][0]
    label = data[row_idx][1]
    print(sentence, emoji.emojize(emoji_dictionary[str(label)]))

In [None]:
# Column 0 carries the sentence text
X_train, X_test = train[0].values, test[0].values

In [None]:
# Column 1 carries the class label that indexes emoji_dictionary
y_train, y_test = (frame[1].values for frame in (train, test))


The last two columns of the data are redundant, so only column 0 (the sentence) and column 1 (the label) are used.


In [None]:
from keras.utils import to_categorical

In [None]:
# One Hot Encoding.
# num_classes is pinned to the number of emoji classes so that both label matrices
# always have the same width; without it the width is inferred from the largest label
# present, and a split that happens to miss the highest class would come out narrower
# than the 5-unit output layer expects.
y_train = to_categorical(y_train, num_classes=len(emoji_dictionary))
y_test = to_categorical(y_test, num_classes=len(emoji_dictionary))

In [None]:
# Sanity check: the number of sentences should match the number of one-hot label rows
X_train.shape, y_train.shape

### Get Embeddings

In [None]:
# Embedding dictionary: maps each word to its GloVe vector, filled from the vector file below
embeddings = {}

with open('glove.6B.50d.txt', 'r', encoding='utf-8') as glove_file:
    for row in glove_file:
        fields = row.split()
        # The first field is the word itself; every remaining field is one component of its vector
        embeddings[fields[0]] = np.array(fields[1:], dtype="float32")

In [None]:
def getOutputEmbeddings(X, maxlen=10, embedding_dim=50, embedding_lookup=None):
    """Convert sentences into a zero-padded tensor of word embeddings.

    Parameters
    ----------
    X : indexable sequence of str (entries may also be already-split word lists)
        Sentences to embed. Every entry is replaced *in place* by its list of
        words, exactly as the original implementation did; the display cell at
        the end of the notebook reads those word lists back.
    maxlen : int, default 10
        Number of word slots per sentence. Shorter sentences are padded with
        zero vectors; words beyond ``maxlen`` are dropped instead of raising
        an IndexError.
    embedding_dim : int, default 50
        Length of a single word vector (the GloVe file used here has 50).
    embedding_lookup : mapping of str -> vector, optional
        Word-vector table. Defaults to the notebook-level ``embeddings`` dict.

    Returns
    -------
    numpy.ndarray of shape (len(X), maxlen, embedding_dim)
        Row ``i`` holds the vectors of the lower-cased words of sentence ``i``.
        Words missing from the lookup keep an all-zero vector.
    """
    if embedding_lookup is None:
        embedding_lookup = embeddings

    n_sentences = len(X)
    embedding_matrix_output = np.zeros((n_sentences, maxlen, embedding_dim))

    for i in range(n_sentences):
        # Entries that are already word lists are accepted so the cell can be re-run
        words = X[i].split() if isinstance(X[i], str) else list(X[i])
        X[i] = words

        for j, word in enumerate(words[:maxlen]):
            vec = embedding_lookup.get(word.lower())
            if vec is not None:      # out-of-vocabulary words stay as zero vectors
                embedding_matrix_output[i][j] = vec

    # Returned only after every sentence has been processed (the original returned
    # from inside the inner loop, i.e. after the very first word).
    return embedding_matrix_output
    

### Explore training data: an essential step

In [None]:
# Peek at the first few raw sentences instead of dumping the whole array
X_train[:10]

In [None]:
# Swap the raw sentences for their embedding tensors (train first, then test)
X_train, X_test = map(getOutputEmbeddings, (X_train, X_test))

In [None]:
# Shape of the training tensor returned by getOutputEmbeddings
X_train.shape

### Model

We'll use two LSTM Layers, with dropouts, followed by a dense layer (5 units) for classification. 

In [None]:
from keras.layers import *
from keras.models import Sequential

In [None]:
# Two stacked LSTM layers, each followed by dropout, then a 5-way softmax classifier
model = Sequential(
    [
        # return_sequences=True hands the full sequence of outputs to the second LSTM
        LSTM(64, input_shape=(10, 50), return_sequences=True),
        Dropout(0.2),
        LSTM(64),
        Dropout(0.2),
        Dense(5, activation='softmax'),
    ]
)
model.summary()

In [None]:
# Categorical cross-entropy pairs with the one-hot encoded labels; accuracy is tracked as 'acc'
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

### Training

In [None]:
# Train for 40 epochs; validation_split=0.1 sets aside 10% of the training data for validation
hist = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=40,
    validation_split=0.1,
    shuffle=True,
)

In [None]:
# Loss and accuracy on the data the model was fitted on
model.evaluate(x=X_train, y=y_train)

In [None]:
# Loss and accuracy on the held-out test set
model.evaluate(x=X_test, y=y_test)

### Predictions/Testing

In [None]:
# Sequential.predict_classes is not available in recent Keras/TensorFlow releases.
# Taking the argmax over the softmax output gives the same integer class indices.
y_pred = np.argmax(model.predict(X_test), axis=-1)

In [None]:
# Keep only the sentence column for display. The isinstance guard makes the cell safe to
# re-run: after the first run `test` is already the array, and indexing it with [0] would
# pick its first element instead of a column.
if isinstance(test, pd.DataFrame):
    test = test[0].values

In [None]:
# Show up to 40 test sentences next to the emoji predicted for each one.
# The count is capped by the available rows so a smaller test set cannot raise IndexError.
for i in range(min(40, len(test), len(y_pred))):
    # An entry is a list of words if getOutputEmbeddings split it in place; otherwise it is
    # still a plain string and must not be joined character by character.
    sentence = test[i] if isinstance(test[i], str) else " ".join(test[i])
    print(sentence, emoji.emojize(emoji_dictionary[str(y_pred[i])]))       # Join predicted Emoji with test string