## import important libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout, Embedding
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical, pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
! wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

--2024-09-10 20:07:28--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py’


2024-09-10 20:07:29 (72.8 MB/s) - ‘helper_functions.py’ saved [10246/10246]



## Check GPU

In [3]:
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-0ce1d892-d4e9-657d-88d3-1755bdec52d8)


## Download text Data

In [4]:
# Download data (same as from Kaggle)
!wget "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip"

# Unzip data
unzip_data("nlp_getting_started.zip")

--2024-09-10 20:07:29--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.130.207, 74.125.68.207, 64.233.170.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.130.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2024-09-10 20:07:31 (708 KB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [5]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
sample_sub = pd.read_csv("sample_submission.csv")
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
# shuffle data
train_df_shuffled = train_df.sample(frac=1, random_state=42)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [7]:
# visualize text data
train_df_shuffled.target.value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,4342
1,3271


In [8]:
random_index = np.random.randint(0, len(train_df_shuffled)-5)
for _, row in train_df_shuffled[['text', 'target']][random_index: random_index + 5].iterrows():
  text = row['text']
  target = row['target']
  print(f"Target: {target}", "(real disaster)" if target > 0 else "(not real disaster)")
  print(f"Text:\n{text}\n")
  print("---\n")

Target: 0 (not real disaster)
Text:
@TayIorrMade @MegatronAFC possibly he's had injuries on both ankles though. 2011 one worse but regardless both.

---

Target: 1 (real disaster)
Text:
The Catastrophic Effects of Hiroshima and Nagasaki Atomic Bombings Still Being Felt Today http://t.co/tGcR5voFJ3

---

Target: 0 (not real disaster)
Text:
York Co. first responders compete to save lives in Û÷Badges for BloodÛª #paramedic #EMS http://t.co/E65V80FCus

---

Target: 0 (not real disaster)
Text:
The fear of the Lord is the start of knowledge: but the foolish have no use for wisdom and teaching (Amsal 1:7)

---

Target: 0 (not real disaster)
Text:
@crowdtappers @joinvroom OMG I remember the meltdown the day I did her hair like ELSA and not ANNA.... OHHHH THE HORROR!!! LOL #tangletalk

---



## split data into train and validation sets

In [9]:
from sklearn.model_selection import train_test_split
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled['text'].to_numpy(),
                                                                            train_df_shuffled['target'].to_numpy(),
                                                                            test_size = 0.1,
                                                                            random_state = 42)
train_sentences.shape, val_sentences.shape, train_labels.shape, val_labels.shape

((6851,), (762,), (6851,), (762,))

In [10]:
train_sentences

array(['@mogacola @zamtriossu i screamed after hitting tweet',
       'Imagine getting flattened by Kurt Zouma',
       '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
       ...,
       'Near them on the sand half sunk a shattered visage lies... http://t.co/0kCCG1BT06',
       "kesabaran membuahkan hasil indah pada saat tepat! life isn't about waiting for the storm to pass it's about learning to dance in the rain.",
       "@ScottDPierce @billharris_tv @HarrisGle @Beezersun I'm forfeiting this years fantasy football pool out of fear I may win n get my ass kicked"],
      dtype=object)

## Tokenization and Embedding Approaches
**Manual Tokenization and Embedding using GloVe:**

    Tokenize the text manually.
    Use GloVe embeddings for converting tokens into vectors.

**Automated Tokenization and Embedding using TextVectorization:**

    Utilize the TextVectorization layer to tokenize and generate embeddings automatically.


In [None]:
# Tokenize the text manually.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_sentences)
word2index = tokenizer.word_index

train_Xtokens = tokenizer.texts_to_sequences(train_sentences)
test_Xtokens = tokenizer.texts_to_sequences(val_sentences)

def get_maxlen(data):
    maxlen = 0
    for sent in data:
        maxlen = max(maxlen, len(sent))
    return maxlen

train_Xmaxlen = get_maxlen(train_Xtokens)
test_Xmaxlen = get_maxlen(test_Xtokens)

Xtrain = pad_sequences(train_Xtokens, maxlen = train_Xmaxlen,  padding = 'post', truncating = 'post')
Xtest = pad_sequences(test_Xtokens, maxlen = test_Xmaxlen,  padding = 'post', truncating = 'post')

train_Xmaxlen, test_Xmaxlen

(33, 31)

In [None]:
# Use GloVe embeddings for converting tokens into vectors.
!wget http://nlp.stanford.edu/data/glove.twitter.27B.zip

# unzip in my drive
unzip_data('glove.twitter.27B.zip')


--2024-09-10 14:42:16--  http://nlp.stanford.edu/data/glove.twitter.27B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.twitter.27B.zip [following]
--2024-09-10 14:42:17--  https://nlp.stanford.edu/data/glove.twitter.27B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip [following]
--2024-09-10 14:42:17--  https://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1520408563 (1.4G) [ap

In [None]:
file = open('/content/glove.twitter.27B.200d.txt', 'r', encoding = 'utf8')
content = file.readlines()
file.close()

embeddings = {}

for line in content:
    line = line.split()
    embeddings[line[0]] = np.array(line[1:], dtype = float)

embed_size = 200
embedding_matrix = np.zeros((len(word2index)+1, embed_size))

for word, i in word2index.items():
    embed_vector = embeddings.get(word) # Use get() to avoid KeyError
    if embed_vector is not None:
        embedding_matrix[i] = embed_vector
    else:
        embedding_matrix[i] = np.random.randn(embed_size) # Replace OOV with random vector

In [None]:
embedding_matrix.shape

(21058, 200)

## Automated Tokenization and Embedding using TextVectorization

In [None]:
# text vectorization
max_len = round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))
text_vectorizer = TextVectorization(max_tokens = 10000,
                                    standardize = 'lower_and_strip_punctuation',
                                    split = 'whitespace',
                                    ngrams = None,
                                    output_mode = 'int',
                                    output_sequence_length = max_len)
text_vectorizer.adapt(train_sentences)

In [None]:
# create an embedding using an embedding layer
tf.random.set_seed(42)

embedding = Embedding(input_dim = 10000,
                      output_dim = 128,
                      embeddings_initializer = 'uniform',
                      input_length = max_len,
                      name = 'embedding_1')
embedding



<Embedding name=embedding_1, built=False>

In [None]:
# Get a random sentence from training set
import random
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nEmbedded version:")

# Embed the random sentence
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
Photo: weallheartonedirection: I wouldnÛªt let David electrocute himself so IÛªm the asshole http://t.co/uWiJMEGl4E      

Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.0312425 ,  0.00034546,  0.04540445, ...,  0.02484102,
         -0.04221076, -0.03009614],
        [-0.03462382, -0.01829042,  0.049901  , ...,  0.02931268,
          0.00874907, -0.02289612],
        [-0.00831745, -0.01953418, -0.02796302, ...,  0.0079015 ,
          0.04535688, -0.04887616],
        ...,
        [ 0.01566333,  0.03144893, -0.03266732, ...,  0.01498551,
          0.02628967, -0.02459664],
        [ 0.00967594,  0.00370361,  0.03496987, ...,  0.01521616,
         -0.01336848, -0.02971851],
        [ 0.00967594,  0.00370361,  0.03496987, ...,  0.01521616,
         -0.01336848, -0.02971851]]], dtype=float32)>

## Model_0: Getting a baseline (MultinomialNB)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

model_0 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

# Fit the pipeline on the training data
model_0.fit(train_sentences, train_labels)

In [None]:
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Our baseline model achieves an accuracy of: {baseline_score*100:.2f}%")

Our baseline model achieves an accuracy of: 79.27%


In [None]:
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1])

## Evaluation of baseline model

In [None]:
from helper_functions import calculate_results
baseline_results = calculate_results(y_true = val_labels,
                                     y_pred = baseline_preds)
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

## Model_1: A simple Dense Model (using Embedding layer)

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

# Create a tensorboard callback (need to create a new one for each model)
tf.random.set_seed(42)

# create a directory to save tensorboard logs
save_dir = 'model_logs'

model_1 = Sequential([
    Input(shape = (1,), dtype = tf.string),
    text_vectorizer,
    embedding,
    layers.GlobalAveragePooling1D(),
    Dense(128, activation = 'relu'),
    Dense(64, activation = 'relu'),
    Dense(1, activation = 'sigmoid')
], name = 'model_1_dense')

model_1.compile(loss = 'binary_crossentropy',
                optimizer = tf.keras.optimizers.Adam(),
                metrics = ['accuracy'])

model_1.summary()

In [None]:
history_1 = model_1.fit(train_sentences,
                        train_labels,
                        epochs = 5,
                        validation_data = (val_sentences, val_labels),
                        callbacks = [create_tensorboard_callback(dir_name = save_dir,
                                                                 experiment_name = 'model_1_dense')])

Saving TensorBoard log files to: model_logs/model_1_dense/20240910-145313
Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 19ms/step - accuracy: 0.6577 - loss: 0.6089 - val_accuracy: 0.7769 - val_loss: 0.4675
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 23ms/step - accuracy: 0.8557 - loss: 0.3448 - val_accuracy: 0.7730 - val_loss: 0.4972
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 23ms/step - accuracy: 0.9084 - loss: 0.2372 - val_accuracy: 0.7795 - val_loss: 0.5765
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 24ms/step - accuracy: 0.9333 - loss: 0.1658 - val_accuracy: 0.7598 - val_loss: 0.7567
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 24ms/step - accuracy: 0.9514 - loss: 0.1200 - val_accuracy: 0.7625 - val_loss: 0.8420


In [None]:
model_1_pred_probs = model_1.predict(val_sentences)
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs)) # squeeze removes single dimensions
model_1_preds[:20]

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step


<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1.], dtype=float32)>

In [None]:
# Calculate model_1 metrics
model_1_results = calculate_results(y_true = val_labels,
                                    y_pred = model_1_preds)
model_1_results

{'accuracy': 76.24671916010499,
 'precision': 0.7718398984730336,
 'recall': 0.7624671916010499,
 'f1': 0.757001973617136}

## Using the same model but with (GloVe embeddings)

In [None]:
tf.random.set_seed(42)
model_1_glove = Sequential([
              Input(shape = (1,)),
              Embedding(input_dim = len(word2index) + 1,
              output_dim = embed_size,
              input_length = train_Xmaxlen,
              weights = [embedding_matrix],
              trainable = False
             ),
    layers.GlobalAveragePooling1D(),
    Dense(128, activation = 'relu'),
    Dense(64, activation = 'relu'),
    Dense(1, activation = 'sigmoid')
])

model_1_glove.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model_1_glove.summary()



In [None]:
history_2 = model_1_glove.fit(Xtrain,
                        train_labels,
                        epochs = 5,
                        validation_data = (Xtest, val_labels),
                        callbacks = [create_tensorboard_callback(dir_name = save_dir,
                                                                 experiment_name = 'model_1_dense_glove')])

Saving TensorBoard log files to: model_logs/model_1_dense_glove/20240910-145342
Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.7052 - loss: 0.5725 - val_accuracy: 0.7664 - val_loss: 0.4760
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.7935 - loss: 0.4577 - val_accuracy: 0.7717 - val_loss: 0.4587
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8040 - loss: 0.4358 - val_accuracy: 0.7769 - val_loss: 0.4539
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.8173 - loss: 0.4140 - val_accuracy: 0.7782 - val_loss: 0.4532
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.8326 - loss: 0.3874 - val_accuracy: 0.7808 - val_loss: 0.4560


In [None]:
model_1_glove_pred_probs = model_1_glove.predict(Xtest)
model_1_glove_preds = tf.squeeze(tf.round(model_1_glove_pred_probs))
model_1_glove_results = calculate_results(y_true = val_labels,
                                    y_pred = model_1_glove_preds)
model_1_glove_results

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


{'accuracy': 78.08398950131233,
 'precision': 0.7812855270102692,
 'recall': 0.7808398950131233,
 'f1': 0.7809967192089909}

In [None]:
!tensorboard dev upload --logdir ./model_logs \
  --name "First deep model on text data" \
  --description "Trying a dense model with an embedding layer and with an Glove embedding" \
  --one_shot # exits the uploader when upload has finished

2024-09-10 14:53:57.413160: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-10 14:53:57.443286: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-10 14:53:57.451149: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
ERROR: The `tensorboard dev` command is no longer available.

TensorBoard.dev has been shut down. For further information,
see the FAQ at <https://tensorboard.dev/>.


In [None]:
# %load_ext tensorboard
# %tensorboard --logdir model_logs

![](https://github.com/Xmen3em/Beginner-Projects/blob/main/Beginner-NLP-%20Projects/Natural%20Language%20Processing%20with%20Disaster%20Tweets/Screenshot%202024-09-10%20062208.png?raw=true)

## Model 2: LSTM

In [None]:
tf.random.set_seed(42)
from tensorflow.keras import layers

model_2_embedding = Embedding(input_dim = 10000,
                      output_dim = 128,
                      embeddings_initializer = 'uniform',
                      input_length = max_len,
                      name = 'embedding_2')
model_2 = Sequential([
    Input(shape = (1,), dtype = tf.string),
    text_vectorizer,
    model_2_embedding,
    layers.LSTM(64, return_sequences=True) ,
    layers.LSTM(64),
    Dense(128, activation = 'relu'),
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    Dense(64, activation = 'relu'),
    Dense(1, activation = 'sigmoid')
], name = 'model_2_LSTM')

model_2.compile(loss = 'binary_crossentropy',
                optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001),
                metrics = ['accuracy'])

model_2.summary()



In [None]:
# Fit model
model_2_history = model_2.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(save_dir,
                                                                     "LSTM")])

Saving TensorBoard log files to: model_logs/LSTM/20240910-151427
Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 66ms/step - accuracy: 0.5722 - loss: 0.6735 - val_accuracy: 0.7375 - val_loss: 0.6536
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 34ms/step - accuracy: 0.7816 - loss: 0.4795 - val_accuracy: 0.7743 - val_loss: 0.5249
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 59ms/step - accuracy: 0.8566 - loss: 0.3491 - val_accuracy: 0.7743 - val_loss: 0.4842
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 46ms/step - accuracy: 0.8975 - loss: 0.2694 - val_accuracy: 0.7638 - val_loss: 0.5787
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 36ms/step - accuracy: 0.9274 - loss: 0.2058 - val_accuracy: 0.7625 - val_loss: 0.6708


In [None]:
model_2_pred_probs = model_2.predict(val_sentences)
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_results = calculate_results(y_true = val_labels,
                                    y_pred = model_2_preds)
model_2_results

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step


{'accuracy': 76.24671916010499,
 'precision': 0.7625326572242482,
 'recall': 0.7624671916010499,
 'f1': 0.7612168110136286}

## Model 2: LSTM with GLOVE Embedding

In [None]:
tf.random.set_seed(42)
model_2_glove = Sequential([
              Input(shape = (1,)),
              Embedding(input_dim = len(word2index) + 1,
              output_dim = embed_size,
              input_length = train_Xmaxlen,
              weights = [embedding_matrix],
              trainable = False
             ),
    layers.LSTM(64, return_sequences=True) ,
    layers.LSTM(64),
    Dense(128, activation = 'relu'),
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    Dense(64, activation = 'relu'),
    Dense(1, activation = 'sigmoid')
])

model_2_glove.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model_2_glove.summary()



In [None]:
history_3 = model_2_glove.fit(Xtrain,
                        train_labels,
                        epochs = 5,
                        validation_data = (Xtest, val_labels),
                        callbacks = [create_tensorboard_callback(dir_name = save_dir,
                                                                 experiment_name = 'model_2_LSTM_glove')])

Saving TensorBoard log files to: model_logs/model_2_LSTM_glove/20240910-152319
Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 96ms/step - accuracy: 0.6797 - loss: 0.5902 - val_accuracy: 0.7730 - val_loss: 0.4938
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 60ms/step - accuracy: 0.8057 - loss: 0.4537 - val_accuracy: 0.7756 - val_loss: 0.4709
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 91ms/step - accuracy: 0.8239 - loss: 0.4119 - val_accuracy: 0.7664 - val_loss: 0.4599
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 57ms/step - accuracy: 0.8416 - loss: 0.3750 - val_accuracy: 0.7690 - val_loss: 0.4910
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 97ms/step - accuracy: 0.8618 - loss: 0.3392 - val_accuracy: 0.7966 - val_loss: 0.6100


In [None]:
model_2_glove_pred_probs = model_2_glove.predict(Xtest)
model_2_glove_preds = tf.squeeze(tf.round(model_2_glove_pred_probs))
model_2_glove_results = calculate_results(y_true = val_labels,
                                    y_pred = model_2_glove_preds)
model_2_glove_results

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 138ms/step


{'accuracy': 79.65879265091863,
 'precision': 0.7971332811059624,
 'recall': 0.7965879265091863,
 'f1': 0.795517158602831}

## Model 3: Conv1D

In [None]:
tf.random.set_seed(42)
model_3_embedding = Embedding(input_dim = 10000,
                      output_dim = 128,
                      embeddings_initializer = 'uniform',
                      input_length = max_len,
                      name = 'embedding_2')
model_3 = Sequential([
    Input(shape = (1,), dtype = tf.string),
    text_vectorizer,
    model_3_embedding,
    layers.Conv1D(filters = 64, kernel_size = 5, activation = 'relu'),
    layers.GlobalMaxPool1D(),
    Dense(128, activation = 'relu'),
    Dense(64, activation = 'relu'),
    Dense(1, activation = 'sigmoid')
    ], name = 'model_3_Conv1D')
model_3.compile(loss = 'binary_crossentropy',
                optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
                , metrics = ['accuracy'])
model_3.summary()



In [None]:
model_3_history = model_3.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(save_dir,
                                                                     "Conv1D")])

Saving TensorBoard log files to: model_logs/Conv1D/20240910-180723
Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 31ms/step - accuracy: 0.5529 - loss: 0.6875 - val_accuracy: 0.5512 - val_loss: 0.6743
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.6356 - loss: 0.6418 - val_accuracy: 0.7493 - val_loss: 0.5441
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.8389 - loss: 0.4449 - val_accuracy: 0.7690 - val_loss: 0.4776
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 22ms/step - accuracy: 0.8982 - loss: 0.2854 - val_accuracy: 0.7756 - val_loss: 0.4840
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 31ms/step - accuracy: 0.9365 - loss: 0.1878 - val_accuracy: 0.7730 - val_loss: 0.5117


In [None]:
model_3_pred_probs = model_3.predict(val_sentences)
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))
model_3_results = calculate_results(y_true = val_labels,
                                    y_pred = model_3_preds)
model_3_results

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


{'accuracy': 77.29658792650919,
 'precision': 0.7750482481613202,
 'recall': 0.7729658792650919,
 'f1': 0.7707112186521387}

## Model 3: Conv1d with Glove Embedding

In [None]:
tf.random.set_seed(42)
model_3_glove = Sequential([
              Input(shape = (1,)),
              Embedding(input_dim = len(word2index) + 1,
              output_dim = embed_size,
              input_length = train_Xmaxlen,
              weights = [embedding_matrix],
              trainable = False
              ),
    layers.Conv1D(filters = 64, kernel_size = 3, activation = 'relu', padding = 'same'),
    layers.GlobalMaxPool1D(),
    Dense(128, activation = 'relu'),
    Dropout(0.5),
    Dense(64, activation = 'relu'),
    Dense(1, activation = 'sigmoid')
    ], name = 'model_3_Conv1D_glove')
model_3_glove.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model_3_glove.summary()



In [None]:
model_3_glove_history = model_3_glove.fit(Xtrain,
                              train_labels,
                              epochs=5,
                              validation_data=(Xtest, val_labels),
                              callbacks=[create_tensorboard_callback(save_dir,
                                                                     "Conv1D_glove")])

Saving TensorBoard log files to: model_logs/Conv1D_glove/20240910-182531
Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 20ms/step - accuracy: 0.6347 - loss: 0.6458 - val_accuracy: 0.7428 - val_loss: 0.4970
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.7942 - loss: 0.4560 - val_accuracy: 0.7717 - val_loss: 0.4764
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.8436 - loss: 0.3684 - val_accuracy: 0.7861 - val_loss: 0.5092
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.8869 - loss: 0.2872 - val_accuracy: 0.7703 - val_loss: 0.5940
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.9220 - loss: 0.2065 - val_accuracy: 0.7730 - val_loss: 0.6397


![](https://github.com/Xmen3em/Beginner-Projects/blob/main/Beginner-NLP-%20Projects/Natural%20Language%20Processing%20with%20Disaster%20Tweets/all_prev_deep_models.png?raw=true)

    for all of the previous deep learning models we've built and tained,we've created and used our own embedding from scratch each time
    - try to use pretrained Embeddings