In [1]:
# import csv
# import random
# import pickle
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# import matplotlib.pyplot as plt
# from scipy.stats import linregress

import datasets


In [29]:
# Notebook-wide configuration. Change values here, then Restart & Run All.
# Vector size of the GloVe file loaded further down (glove.6B.200d).
EMBEDDING_DIM = 200
# Tokens kept per sentence. Set to 16 so a fresh run reproduces the recorded
# (18000, 16) / (2000, 16) padded shapes, i.e. the data the model was actually fit on.
# The previous value of 4 came from a later re-execution of this cell and would
# truncate every sentence to four tokens on Restart & Run All.
MAXLEN = 16
# Over-long sentences lose tokens from the end; short ones get zeros appended.
TRUNCATING = 'post'
PADDING = 'post'
# Placeholder the tokenizer uses for words outside its vocabulary.
OOV_TOKEN = "<OOV>"
# NOTE(review): the two constants below are not referenced by any visible cell.
MAX_EXAMPLES = 16000
TRAINING_SPLIT = 0.9

In [3]:
# Load the 'emotion' dataset through the `datasets` library; the cells below read its
# "train", "validation" and "test" splits.
emotions = datasets.load_dataset('emotion')


In [5]:
# Have the dataset return pandas objects, then pull every split out in full.
emotions.set_format(type="pandas")
train, test, val = (emotions[split][:] for split in ("train", "test", "validation"))

# Size of the training frame (rows, columns).
train.shape


(16000, 2)

In [6]:
# Training data = train and validation splits stacked; the test split is kept separate.
train_text, y_train = (pd.concat([train[col], val[col]]) for col in ("text", "label"))
test_text, y_test = test["text"], test["label"]

In [8]:
# Number of texts after stacking the train and validation splits.
train_text.shape

(18000,)

In [9]:
def fit_tokenizer(train_sentences, oov_token):
    """
    Build a Keras Tokenizer and fit its vocabulary on the given sentences.

    Args:
        train_sentences (iterable of str): texts the vocabulary is learned from
        oov_token (str): token substituted for out-of-vocabulary words

    Returns:
        Tokenizer: the fitted tokenizer; its word_index holds the learned vocabulary
    """
    fitted = Tokenizer(oov_token=oov_token)
    fitted.fit_on_texts(train_sentences)
    return fitted

In [10]:
# Fit the tokenizer on the training text and inspect the resulting vocabulary.
tokenizer = fit_tokenizer(train_text, OOV_TOKEN)

word_index = tokenizer.word_index
VOCAB_SIZE = len(word_index)  # number of entries in word_index

print(f"Vocabulary contains {VOCAB_SIZE} words\n")
# Look up the configured token instead of a hard-coded "<OOV>" literal, so the
# report stays truthful if OOV_TOKEN is ever changed.
oov_status = "included in" if OOV_TOKEN in word_index else "NOT included in"
print(f"{OOV_TOKEN} token {oov_status} vocabulary")
print(f"\nindex of word 'i' should be {word_index['i']}")

Vocabulary contains 16197 words

<OOV> token included in vocabulary

index of word 'i' should be 2


In [11]:
def seq_pad_and_trunc(sentences, tokenizer, padding, truncating, maxlen):
    """
    Turn sentences into integer token sequences that all share one length.

    Args:
        sentences (list of string): sentences to tokenize and pad
        tokenizer (object): fitted Tokenizer used to map words to integers
        padding (string): padding mode forwarded to pad_sequences
        truncating (string): truncating mode forwarded to pad_sequences
        maxlen (int): length every output sequence is padded/truncated to

    Returns:
        array of int: the tokenized sentences, one row per sentence
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(sentences),
        maxlen=maxlen,
        padding=padding,
        truncating=truncating,
    )

In [13]:
# Tokenize and pad both text collections with identical settings.
train_pad_trunc_seq, test_pad_trunc_seq = (
    seq_pad_and_trunc(texts, tokenizer, PADDING, TRUNCATING, MAXLEN)
    for texts in (train_text, test_text)
)

# Report the resulting array shapes (rows = sentences, columns = tokens kept).
print(f"Padded and truncated training sequences have shape: {train_pad_trunc_seq.shape}\n")
print(f"Padded and truncated validation sequences have shape: {test_pad_trunc_seq.shape}")

Padded and truncated training sequences have shape: (18000, 16)

Padded and truncated validation sequences have shape: (2000, 16)


In [14]:
# Convert both label collections to NumPy arrays before they are handed to model.fit.
y_train, y_test = map(np.array, (y_train, y_test))

In [16]:
# The splits were already extracted into `train` / `test` / `val`; drop the dataset object itself.
del emotions

## Using pre-trained GloVe embeddings

In [15]:
GLOVE_FILE = 'Assets/glove.6B.200d.txt'
# GloVe text files are UTF-8. Decoding them as ISO-8859-15 never raises, but it
# silently garbles every non-ASCII word, so those keys can never match a tokenizer word.
# (An earlier `enc = 'utf-16'` assignment was dead code: it was overwritten immediately.)
enc = 'utf-8'

# Maps each word to its embedding vector (float32 NumPy array).
GLOVE_EMBEDDINGS = {}

with open(GLOVE_FILE, encoding=enc) as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ... <vN>". Strip the line ending first so the
        # last coefficient is a clean number string.
        values = line.rstrip().split(" ")
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        GLOVE_EMBEDDINGS[word] = coefs

In [17]:
# Number of word vectors read from the GloVe file.
len(GLOVE_EMBEDDINGS)

400000

In [18]:
import numpy as np
from numpy.linalg import norm

def calculate_cosine_similarity(vec1, vec2):
    """
    Return the cosine of the angle between two vectors.

    Args:
        vec1: first vector (NumPy array or array-like)
        vec2: second vector, same length as vec1

    Returns:
        The dot product divided by the product of the two Euclidean norms,
        or 0 when at least one of the vectors has zero norm.
    """
    numerator = np.dot(vec1, vec2)
    magnitude_a, magnitude_b = norm(vec1), norm(vec2)

    # A zero vector has no direction, so the similarity is defined as 0 here
    # instead of dividing by zero.
    if not (magnitude_a != 0 and magnitude_b != 0):
        return 0

    return numerator / (magnitude_a * magnitude_b)


In [19]:
# Sanity-check the loaded embeddings on one pair of related words.
test_word1, test_word2 = 'woman', 'queen'
test_vector1, test_vector2 = (GLOVE_EMBEDDINGS[w] for w in (test_word1, test_word2))

similarity = calculate_cosine_similarity(test_vector1, test_vector2)

print(f"Cosine similarity of words {test_word1} and {test_word2}:\n\n{similarity}")


Cosine similarity of words woman and queen:

0.4188632071018219


In [22]:
# One row per tokenizer index (plus row 0), one column per embedding dimension.
EMBEDDINGS_MATRIX = np.zeros((VOCAB_SIZE + 1, EMBEDDING_DIM))

for word, i in word_index.items():
    embedding_vector = GLOVE_EMBEDDINGS.get(word)
    if embedding_vector is None:
        # Word has no GloVe vector: its row stays all zeros.
        continue
    EMBEDDINGS_MATRIX[i] = embedding_vector

In [30]:
# Layer-size settings for the classifier cells that follow.
# NOTE(review): `embedding_dim` is not used by the visible create_model call, which
# passes EMBEDDING_DIM instead; the value 16 also differs from EMBEDDING_DIM.
embedding_dim = 16
# Forwarded to create_model, which does not use it in any active layer.
lstm1_dim = 64
# Units of the Bidirectional LSTM layer built by create_model.
lstm2_dim = 4
# NOTE(review): not passed to create_model in the visible call.
dense_dim = 32

def create_model(vocab_size, lstm1_dim, lstm2_dim, embedding_dim, maxlen, embeddings_matrix,
                 dense_dim=32, num_classes=6, dropout_rate=0.2):
    """
    Creates a multi-class text classifier on top of frozen, pre-trained embeddings
    
    Args:
        vocab_size (int): size of the vocabulary; the Embedding layer gets vocab_size + 1 rows
        lstm1_dim (int): currently unused by any layer - kept so existing calls keep working
        lstm2_dim (int): number of units of the bidirectional LSTM layer
        embedding_dim (int): dimensionality of the Embedding layer output
        maxlen (int): length of the input sequences
        embeddings_matrix (array): predefined weights of the embeddings; used as the
            Embedding layer's weights, so it must have vocab_size + 1 rows and
            embedding_dim columns
        dense_dim (int): number of units of the hidden Dense layer (default 32)
        num_classes (int): number of output classes of the softmax layer (default 6)
        dropout_rate (float): dropout rate applied after the LSTM (default 0.2)
    
    Returns:
        model (tf.keras Model): the compiled classifier; it is compiled with a sparse
            categorical cross-entropy loss, i.e. it expects integer class labels
    """
    
    model = tf.keras.Sequential([ 
        # Pre-trained embeddings: load the given matrix as weights and freeze it.
        # NOTE(review): recent Keras releases deprecate `input_length` - confirm it is
        # still accepted (or simply ignored) by the installed version.
        tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=maxlen, weights=[embeddings_matrix], trainable=False), 
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm2_dim)),
        tf.keras.layers.Dropout(dropout_rate),
        
        tf.keras.layers.Dense(dense_dim, activation='relu'),
        # One probability per class.
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    
    model.compile(loss='SparseCategoricalCrossentropy',
                  optimizer='adam',
                  metrics=['accuracy']) 

    return model

In [31]:
# Build the classifier from the notebook-level settings.
model = create_model(
    vocab_size=VOCAB_SIZE,
    lstm1_dim=lstm1_dim,
    lstm2_dim=lstm2_dim,
    embedding_dim=EMBEDDING_DIM,
    maxlen=MAXLEN,
    embeddings_matrix=EMBEDDINGS_MATRIX,
)

# Train for 30 epochs.
# NOTE(review): the padded test split doubles as the validation data here.
history = model.fit(
    train_pad_trunc_seq,
    y_train,
    epochs=30,
    validation_data=(test_pad_trunc_seq, y_test),
)

Epoch 1/30
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 14ms/step - accuracy: 0.3760 - loss: 1.5671 - val_accuracy: 0.5370 - val_loss: 1.2343
Epoch 2/30
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.5396 - loss: 1.2406 - val_accuracy: 0.6565 - val_loss: 0.9811
Epoch 3/30
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.6443 - loss: 0.9952 - val_accuracy: 0.7125 - val_loss: 0.8332
Epoch 4/30
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.6994 - loss: 0.8592 - val_accuracy: 0.7470 - val_loss: 0.7515
Epoch 5/30
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.7414 - loss: 0.7529 - val_accuracy: 0.7670 - val_loss: 0.6973
Epoch 6/30
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - accuracy: 0.7523 - loss: 0.7141 - val_accuracy: 0.7725 - val_loss: 0.6641
Epoch 7/30
[1m563/563[0

In [33]:
# Share of each class label in the training labels (fractions summing to 1).
pd.Series(y_train).value_counts(normalize=True)

1    0.337000
0    0.289778
3    0.135222
4    0.119389
2    0.082333
5    0.036278
Name: proportion, dtype: float64