In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tensorflow.keras.layers import TextVectorization
import tensorflow as tf
import nltk

In [2]:
dataset = pd.read_csv("Dataset_Capstone_Clean.csv")
dataset

Unnamed: 0,Text,Mood
0,i did there was a mental debate i can eat the ...,anger
1,i dont know how but i forgot how to communicat...,anger
2,i started to feel grumpy and frustrated,anger
3,i also feel insincere shallow and fake,anger
4,i feel petty for it but i think less of you as...,anger
...,...,...
39995,JamieLynnMB i had my son when i was 16hes now ...,worry
39996,sneezing is never a good sign,worry
39997,Awe i feel so left out,worry
39998,LAPPYTOP BATERRRY DYINGtryingtofind a movieto ...,worry


In [3]:
dataset.duplicated().sum()

16

In [4]:
dataset_fix=dataset.drop_duplicates()

In [6]:
dataset_fix.value_counts("Mood")


Mood
anger      5000
joy        5000
sadness    5000
happy      4999
worry      4999
fear       4998
love       4998
neutral    4990
Name: count, dtype: int64

In [7]:
label_encoder = preprocessing.LabelEncoder()
dataset_fix['N_label'] = label_encoder.fit_transform(dataset_fix["Mood"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_fix['N_label'] = label_encoder.fit_transform(dataset_fix["Mood"])


In [8]:
dataset_fix

Unnamed: 0,Text,Mood,N_label
0,i did there was a mental debate i can eat the ...,anger,0
1,i dont know how but i forgot how to communicat...,anger,0
2,i started to feel grumpy and frustrated,anger,0
3,i also feel insincere shallow and fake,anger,0
4,i feel petty for it but i think less of you as...,anger,0
...,...,...,...
39995,JamieLynnMB i had my son when i was 16hes now ...,worry,7
39996,sneezing is never a good sign,worry,7
39997,Awe i feel so left out,worry,7
39998,LAPPYTOP BATERRRY DYINGtryingtofind a movieto ...,worry,7


In [9]:
training_sentences, testing_sentences, training_labels, testing_labels = train_test_split(dataset_fix["Text"], 
                                                                                          dataset_fix["N_label"], 
                                                                                          test_size = 0.1, 
                                                                                          stratify = dataset_fix["Mood"])

In [10]:
## Check the lengths
len(training_sentences), len(testing_sentences), len(training_labels), len(testing_labels)

(35985, 3999, 35985, 3999)

In [11]:
dataset_fix["N_label"].value_counts()

N_label
0    5000
3    5000
6    5000
2    4999
7    4999
1    4998
4    4998
5    4990
Name: count, dtype: int64

In [12]:
# View the first 10 training sentences and their labels
training_sentences[:10], training_labels[:10]

(30920                                i feel lonely amp sad
 5309     i am aware of how i am i still feel insecure s...
 5312     i feel tortured and pulled into a series of tests
 6034     i began to feel really anxious and panicky whi...
 19749    i feel a craving i get excited and sometimes i...
 23348    i feel the urge to release the words during ro...
 15153    i liked feeling useful and being needed it eas...
 22245    ive achieved and they find it difficult to emp...
 26142    brieasaurus haha yeah oh well im just going to...
 5562                        i didnt feel threatened at all
 Name: Text, dtype: object,
 30920    6
 5309     1
 5312     1
 6034     1
 19749    3
 23348    4
 15153    3
 22245    4
 26142    5
 5562     1
 Name: N_label, dtype: int32)

In [13]:
# Setup text vectorization with custom variables
max_vocab_length = 80000 # max number of words to have in our vocabulary
max_length = 61 # max length our sequences will be (e.g. how many words from a Tweet does our model see?)

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [14]:
# Fit the text vectorizer to the training text
text_vectorizer.adapt(training_sentences)

In [18]:
# Choose a random sentence from the training dataset and tokenize it
import random
random_sentence = random.choice(training_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nVectorized version:")
text_vectorizer([random_sentence])

Original text:
i feel was smart as it avoided making the pages too cumbersome and additionally avoided the clumsiness of trying to introduce all the characters at once      

Vectorized version:


<tf.Tensor: shape=(1, 61), dtype=int64, numpy=
array([[    2,     4,    22,   665,    33,    13,  5908,   245,     6,
         2422,    78, 11892,     5, 31789,  5908,     6, 28856,     8,
          195,     3,  4947,    36,     6,  1242,    32,   273,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0]], dtype=int64)>

In [19]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5] # most common tokens (notice the [UNK] token for "unknown" words)
bottom_5_words = words_in_vocab[-5:] # least common tokens
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}") 
print(f"Bottom 5 least common words: {bottom_5_words}")

Number of words in vocab: 32309
Top 5 most common words: ['', '[UNK]', 'i', 'to', 'feel']
Bottom 5 least common words: ['024', '02', '010', '006', '0003']


In [20]:
tf.random.set_seed(42)
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=max_vocab_length, # set input shape
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default, intialize randomly
                             input_length=max_length, # how long is each input
                             name="embedding_1") 

embedding



<Embedding name=embedding_1, built=False>

In [27]:
# Get a random sentence from training set
random_sentence = random.choice(training_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nEmbedded version:")

# Embed the random sentence (turn it into numerical representation)
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
im feeling a tad insecure about my book launch for november      

Embedded version:


<tf.Tensor: shape=(1, 61, 128), dtype=float32, numpy=
array([[[-0.02062116,  0.01529822, -0.00808799, ..., -0.01421716,
          0.02187278, -0.02864701],
        [ 0.02396634, -0.02871766,  0.04745472, ...,  0.04146639,
          0.01451007,  0.00438291],
        [-0.02026639, -0.03737671, -0.0419379 , ...,  0.03638296,
          0.03649696,  0.03494034],
        ...,
        [ 0.00111582,  0.03377286,  0.03121131, ..., -0.02442182,
          0.03076785,  0.03606844],
        [ 0.00111582,  0.03377286,  0.03121131, ..., -0.02442182,
          0.03076785,  0.03606844],
        [ 0.00111582,  0.03377286,  0.03121131, ..., -0.02442182,
          0.03076785,  0.03606844]]], dtype=float32)>

# Model

In [29]:
# Create tensorboard callback (need to create a new one for each model)
from helper_functions import create_tensorboard_callback

# Create directory to save TensorBoard logs
SAVE_DIR = "model_logs"

In [28]:
# Set random seed and create embedding layer (new embedding layer for each model)
tf.random.set_seed(42)
from tensorflow.keras import layers
model_5_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="embedding_5")

# Create 1-dimensional convolutional layer to model sequences
from tensorflow.keras import layers
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_5_embedding(x)
x = layers.Conv1D(filters=32, kernel_size=5, activation="relu")(x)
x = layers.GlobalMaxPool1D()(x)
# x = layers.Dense(64, activation="relu")(x) # optional dense layer
outputs = layers.Dense(8, activation="sigmoid")(x)
model_5 = tf.keras.Model(inputs, outputs, name="model_5_Conv1D")

# Compile Conv1D model
model_5.compile(loss="sparse_categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Get a summary of our 1D convolution model
model_5.summary()

In [30]:
# Fit the model
model_5_history = model_5.fit(training_sentences,
                              training_labels,
                              epochs=5,
                              validation_data=(testing_sentences, testing_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR, 
                                                                     "Conv1D")])

Saving TensorBoard log files to: model_logs/Conv1D/20240531-111857
Epoch 1/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 144ms/step - accuracy: 0.4310 - loss: 1.4176 - val_accuracy: 0.7047 - val_loss: 0.6089
Epoch 2/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 169ms/step - accuracy: 0.7555 - loss: 0.5382 - val_accuracy: 0.7049 - val_loss: 0.6082
Epoch 3/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 171ms/step - accuracy: 0.8555 - loss: 0.3638 - val_accuracy: 0.7054 - val_loss: 0.6818
Epoch 4/5
[1m 398/1125[0m [32m━━━━━━━[0m[37m━━━━━━━━━━━━━[0m [1m2:03[0m 170ms/step - accuracy: 0.9105 - loss: 0.2450

KeyboardInterrupt: 