In [5]:
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow.keras.layers as tfl
import os
import zipfile
import pandas as pd
import numpy as np

In [4]:
# with zipfile.ZipFile("archive (4).zip", "r") as file:
#     file.extractall("EmojiDataset")

In [17]:
glove_embeddings = {}

with open("glove.6B.100d.txt", "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        glove_embeddings[word] = np.array(values[1:], dtype="float32")

print(len(glove_embeddings))

100


In [14]:
def get_y(csv_filename):
    df = pd.read_csv(csv_filename)
    y = tf.one_hot(df.iloc[:, -1].values, depth = 20)
    return y

In [32]:
def get_avg_embedding(sentence):
    words = np.array(sentence.split())
    words_embeddings = np.array([glove_embeddings[x.lower()] if (x.lower() in glove_embeddings.keys()) else np.zeros((100,)) for x in words])
    return np.mean(words_embeddings)

def get_x_naive(csv_filename):
    df = pd.read_csv(csv_filename)
    x = df.iloc[:, -2].values
    x = np.vectorize(lambda sentence: get_avg_embedding(sentence))(x)
    return x

In [33]:
x_train_naive_df = get_x_naive("EmojiDataset/train.csv")
x_train_naive_df[:5]

array([-0.00423002, -0.00381428, -0.02088024, -0.00710395, -0.01055331])

In [15]:
y_train_df = get_y("EmojiDataset/train.csv")
y_train_df[:5]

<tf.Tensor: shape=(5, 20), dtype=float32, numpy=
array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]], dtype=float32)>

In [39]:
naive_model = tf.keras.models.Sequential([
    tfl.Dense(256, activation="relu", input_shape=(1,)),
    tfl.Dense(20, activation="softmax")
])
naive_model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [40]:
naive_model.fit(x_train_naive_df, y_train_df, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1ae2ddc4450>

Naive model does not get high accuracy, but given that random guessing would yield around 5% this is still pretty good.

<br>
<br>

## LSTM Model
Implementing an adaptation of the LSTM model described in [this github repository](https://github.com/amanchadha/coursera-deep-learning-specialization/blob/master/C5%20-%20Sequence%20Models/Week%202/Emojify/Emoji_v3a.ipynb)

In [70]:
from keras.src.utils import pad_sequences


def get_x_embed(sentence, maxlen):
    word_arr = np.array(sentence.split())
    if len(word_arr) < maxlen:
        padding = maxlen - len(word_arr)
        word_arr = np.pad(word_arr, (0, padding), mode="constant", constant_values="<OOVo>")
    if len(word_arr) > maxlen:
        word_arr = word_arr[:maxlen]
    sentence_embeddings = np.array([glove_embeddings[x.lower()] if (x.lower() in glove_embeddings.keys()) else np.zeros((100,)) for x in word_arr])

    return sentence_embeddings

def get_x(csv_filename, maxlen=50):
    df = pd.read_csv(csv_filename)
    sentences_arr = df["TEXT"].values
    embeddings = np.array([get_x_embed(x, maxlen) for x in sentences_arr])
    return embeddings

In [69]:
test_alpha = pd.read_csv("EmojiDataset/train.csv")
test_alpha.iloc[:, -2].head

<bound method NDFrame.head of 0        Vacation wasted ! #vacation2017 #photobomb #ti...
1        Oh Wynwood, you’re so funny! : @user #Wynwood ...
2        Been friends since 7th grade. Look at us now w...
3        This is what it looks like when someone loves ...
4        RT @user this white family was invited to a Bl...
                               ...                        
69995    Yes, I call Galina "my Bubie" Go follow my bea...
69996      I SEA you, Seattle @ Ballard Seafood Festival\n
69997    If one of my daughters is wearing this and ask...
69998    Guess who whoop people on THEIR homecoming?! #...
69999    We Love you Robbie @ Heritage Memorial Cemeter...
Name: TEXT, Length: 70000, dtype: object>

In [71]:
x_train = get_x("EmojiDataset/train.csv")
x_test = get_x("EmojiDataset/test.csv")
print(x_train.shape, x_test.shape)

(70000, 50, 100) (25958, 50, 100)
