# Problem: RNN Text Classification

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.models import Model
from keras.layers import Dense, Dropout, LSTM, GRU

### Dataset EMOJISET

Tiny dataset (X, Y) where:
- X contains 132 sentences (strings)
- Y contains an integer label between 0 and 4 corresponding to an emoji for each sentence

<img src="https://github.com/Alireza-Akhavan/rnn-notebooks/blob/master/images/data_set.png?raw=1" style="width:700px;height:300px;">


In [None]:
# Read csv file
def read_csv(file_name):
  """Load a sentence/label CSV file.

  Reads `file_name` with pandas and returns two NumPy arrays: the values of
  the "sentence" column and the values of the "label" column cast to int.
  """
  frame = pd.read_csv(file_name)
  sentences = frame["sentence"]
  labels = frame["label"]
  return np.array(sentences), np.array(labels, dtype=int)

In [None]:
# Load the training and test splits (sentences and integer labels)
(X_train, Y_train), (X_test, Y_test) = (
    read_csv("datasets/Emoji_Text_Classification/train.csv"),
    read_csv("datasets/Emoji_Text_Classification/test.csv"),
)

In [None]:
# Get max length of sentences, measured in words.
# The maximum is taken over the word counts themselves: the sentence with the
# most characters is not necessarily the one with the most words, and a value
# that is too small would make the padded embedding tensor overflow later.
max_len = max(len(sentence.split(" ")) for sentence in X_train)
max_len

In [None]:
# Replace labels with related emoji
def label_to_emoji(label):
    """Return the emoji string stored at position `label` of the emoji table."""
    emoji_table = ["❤️", "🏐", "😄", "😞", "🍴"]
    emoji = emoji_table[label]
    return emoji

index = 5
# Show one training sentence next to the emoji of its label
print(f"{X_train[index]} {label_to_emoji(Y_train[index])}")

In [None]:
# Number of sentences in each class
unique, counts = np.unique(Y_train, return_counts=True)
{label: count for label, count in zip(unique, counts)}

## Emojifier-V1

Each word is represented by a feature vector. In Emojifier-V1 we classify sentences with a simple perceptron (a single softmax layer):

- We average the word vectors of each sentence and feed that average to the perceptron, which has 50 inputs (each word has 50 features, so the average over a sentence also has 50 features) and a softmax output layer with 5 neurons.

- The pre-trained GloVe feature vectors can be downloaded from this link: http://nlp.stanford.edu/data/glove.6B.zip

<br>

<center>
<img src="https://github.com/Alireza-Akhavan/rnn-notebooks/blob/master/images/image_1.png?raw=1" style="width:900px;height:300px;">
</center>


In [None]:
# One-hot encode the integer labels; the class count is the number of
# distinct labels seen in the training set
num_classes = np.unique(Y_train).size

Y_train_oh, Y_test_oh = (
    tf.keras.utils.to_categorical(labels, num_classes)
    for labels in (Y_train, Y_test)
)

In [None]:
index = 5
# Compare an integer label with its one-hot row
print(f"{Y_train[index]} is converted into one hot {Y_train_oh[index]}")

In [None]:
# Download and extract glove.6B for feature vectors 
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip -d glov.6B

In [None]:
# Read feature vectors and save them
def read_glov_vectors(glove_file):
  """Load word vectors from a GloVe text file.

  In the text file each word is on its own line: the word comes first and is
  followed by the components of its feature vector, separated by whitespace.

  Args:
    glove_file: path of the GloVe text file (read as UTF-8).

  Returns:
    dict mapping each word (str) to its vector as a float64 NumPy array.
  """
  words_to_vec = dict()
  # The context manager closes the file even if parsing fails part-way.
  with open(glove_file, encoding="utf8") as f:
    for line in f:
      fields = line.strip().split()
      if not fields:
        continue  # tolerate blank lines instead of failing on fields[0]
      words_to_vec[fields[0]] = np.array(fields[1:], dtype=np.float64)
  return words_to_vec

In [None]:
# Load the GloVe vectors extracted earlier
glove_path = "glove.6B/glove.6B.50d.txt"
words_to_vec = read_glov_vectors(glove_path)

# Sanity check: look up the vector of one word
words_to_vec["hello"]

In [None]:
#  Convert sentences to the average of the word vectors
def sentence_to_avg(sentence):
  """Average the word vectors of a sentence.

  The sentence is lower-cased and split on whitespace; each word is looked up
  in the module-level `words_to_vec` dictionary.

  Args:
    sentence: the sentence as a string.

  Returns:
    1-D NumPy array holding the mean of the vectors of the words found in
    `words_to_vec`. Words missing from the dictionary are skipped. If no word
    is found (including an empty sentence) a zero vector is returned instead
    of dividing by zero.
  """
  # Take the vector size from the dictionary itself rather than hard-coding
  # it; fall back to 50 only when the dictionary is empty.
  sample = next(iter(words_to_vec.values()), None)
  emb_dim = 50 if sample is None else len(sample)

  sum_vectors = np.zeros((emb_dim, ))
  known_words = 0
  for w in sentence.lower().split():  # lower-case to match the vocabulary
    vec = words_to_vec.get(w)
    if vec is None:
      continue  # out-of-vocabulary word: ignore it
    sum_vectors += vec
    known_words += 1

  if known_words == 0:
    return sum_vectors  # still all zeros
  return sum_vectors / known_words

In [None]:
# Sanity check of sentence_to_avg on a sample sentence
sample_avg = sentence_to_avg("Pasta is my favorite food")
sample_avg

In [None]:
# Average word vector of every training sentence, stacked into one array
X_train_avg = np.array([sentence_to_avg(sentence) for sentence in X_train])

X_train_avg.shape, Y_train_oh.shape

In [None]:
# Create model(using perceptron)
class EmojiNet_V1(Model):
    """Emojifier-V1: a single dense softmax layer over `num_classes` outputs."""

    def __init__(self):
        super().__init__()
        # Only layer of the model: maps an input vector to class probabilities.
        self.dense = Dense(num_classes, input_shape=(50,), activation='softmax')

    def call(self, x):
        """Forward pass: apply the softmax layer to `x` and return the result."""
        return self.dense(x)

In [None]:
# Build, compile and train Emojifier-V1
model = EmojiNet_V1()

model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.fit(
    x=X_train_avg,
    y=Y_train_oh,
    epochs=200,
    shuffle=True,
)

In [None]:
# Evaluation: average the test sentences the same way, then score the model
X_test_avg = np.array([sentence_to_avg(sentence) for sentence in X_test])
model.evaluate(X_test_avg, Y_test_oh)

In [None]:
# Inference on a few hand-written sentences
X_me = np.array(["not sad", "i adore you", "i love you", "funny lol", "lets play with a ball", "food is ready", "not feeling happy and funny"])
# Labels for the sentences above (not used by the prediction loop below)
Y_me = np.array([[2], [0], [0], [2], [1], [4], [3]])

X_me_avg = np.array([sentence_to_avg(sentence) for sentence in X_me])
pred = model.predict(X_me_avg)

# Print each sentence with the emoji of its highest-scoring class
for sentence, class_scores in zip(X_me, pred):
    print(sentence, label_to_emoji(np.argmax(class_scores)))

## Emojifier-V2: Using RNNs: 

Let's build a recurrent model that takes word sequences as input. This model will be able to take word ordering into account. Emojifier-V2 will continue to use pre-trained word embeddings to represent words, but will feed them into a recurrent network whose job is to predict the most appropriate emoji. The figures below show LSTM cells; the code in this notebook uses GRU layers instead.

Run the following cell to define the model (the Keras layers it uses were imported at the top of the notebook).

<br>

<img src="https://github.com/Alireza-Akhavan/rnn-notebooks/blob/master/images/emojifier-v2.png?raw=1" style="width:700px;height:400px;"> <br>
<caption><center> **Figure 3**: Emojifier-V2. A 2-layer LSTM sequence classifier. </center></caption>

<img src="https://github.com/Alireza-Akhavan/rnn-notebooks/blob/master/images/embedding1.png?raw=1" style="width:700px;height:250px;">
<caption><center> **Figure 4**: Embedding layer. This example shows the propagation of two examples through the embedding layer. Both have been zero-padded to a length of `max_len=5`. The final dimension of the representation is  `(2,max_len,50)` because the word embeddings we are using are 50 dimensional. </center></caption>

In [None]:
# Define model
class EmojiNet_V2(Model):
    """Emojifier-V2: two stacked GRU layers, dropout and a softmax classifier.

    Presumably fed tensors of shape (batch, max_len, emb_dim) - confirm
    against the caller.
    """

    def __init__(self):
        super().__init__()

        # First recurrent layer returns the whole sequence so that the second
        # recurrent layer can consume it step by step.
        self.gru_1 = GRU(128, return_sequences=True)
        self.dropout_1 = Dropout(0.3)
        # Second recurrent layer returns only its final output.
        self.gru_2 = GRU(256)
        self.dropout_2 = Dropout(0.5)
        self.dense = Dense(num_classes, activation='softmax')

    def call(self, x):
        """Forward pass; returns the softmax output of the final dense layer."""
        sequence_features = self.gru_1(x)
        # NOTE: dropout_1 is constructed above but is not applied here.
        sentence_features = self.gru_2(sequence_features)
        regularized = self.dropout_2(sentence_features)
        return self.dense(regularized)

In [None]:
# Instantiate and compile Emojifier-V2
model = EmojiNet_V2()

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fix the size of all sentences to max_len
def convert_sentences_to_embeddings(X):
    """Turn sentences into a zero-padded tensor of word vectors.

    Each sentence is lower-cased and split on whitespace; the vector of word j
    of sentence i is written to position [i, j, :]. Reads the module-level
    `words_to_vec` dictionary and `max_len`.

    Args:
        X: sequence (e.g. a 1-D NumPy array) of sentence strings.

    Returns:
        NumPy array of shape (len(X), max_len, emb_dim). Positions past the
        end of a sentence, and positions of words missing from `words_to_vec`,
        stay all-zero. Sentences with more than `max_len` words are truncated.

    Raises:
        ValueError: if `words_to_vec` is empty.
    """
    if not words_to_vec:
        raise ValueError("words_to_vec is empty")
    # Read the vector size from any entry instead of relying on one specific
    # word being present in the vocabulary.
    emb_dim = len(next(iter(words_to_vec.values())))
    emb_matrix = np.zeros((len(X), max_len, emb_dim))
    for i, sentence in enumerate(X):
        # Slice to max_len so an over-long sentence cannot index out of bounds.
        words = sentence.lower().split()[:max_len]
        for j, word in enumerate(words):
            vec = words_to_vec.get(word)
            if vec is not None:  # unknown words keep their zero vector
                emb_matrix[i, j, :] = vec
    return emb_matrix

In [None]:
# Sanity check of convert_sentences_to_embeddings on a few short sentences
X_me = np.array(["funny lol", "lets play baseball", "food is ready for you"])
print(X_me, convert_sentences_to_embeddings(X_me), sep="\n")

In [None]:
# Embed the training sentences as zero-padded word-vector sequences
X_train_embs = convert_sentences_to_embeddings(X_train)
np.shape(X_train_embs)

In [None]:
# Train Emojifier-V2 on the embedded sentences
model.fit(
    x=X_train_embs,
    y=Y_train_oh,
    epochs=100,
    batch_size=4,
    shuffle=True,
)

In [None]:
# Evaluation: embed the test sentences, show the tensor shape, score the model
X_test_embs = convert_sentences_to_embeddings(X_test)
print(np.shape(X_test_embs))
model.evaluate(x=X_test_embs, y=Y_test_oh)

In [None]:
# Inference on a few hand-written sentences
X_me = np.array(["not happy", "i adore you", "i love you", "funny lol", "lets play with a ball", "food is ready", "not feeling happy and funny"])
# Labels for the sentences above (not used by the prediction loop below)
Y_me = np.array([[2], [0], [0], [2], [1], [4], [3]])
X_me_embed = convert_sentences_to_embeddings(X_me)

pred = model.predict(X_me_embed)

# Print each sentence with the emoji of its highest-scoring class
for sentence, class_scores in zip(X_me, pred):
    print(sentence, label_to_emoji(np.argmax(class_scores)))