## import important libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout, Embedding
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical, pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
! wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

--2024-09-10 01:27:25--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py’


2024-09-10 01:27:25 (77.8 MB/s) - ‘helper_functions.py’ saved [10246/10246]



## Check GPU

In [3]:
!nvidia-smi -L

/bin/bash: line 1: nvidia-smi: command not found


## Download text Data

In [4]:
# Download data (same as from Kaggle)
!wget "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip"

# Unzip data
unzip_data("nlp_getting_started.zip")

--2024-09-10 01:27:33--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.202.207, 173.194.203.207, 74.125.199.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.202.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2024-09-10 01:27:33 (104 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [5]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
sample_sub = pd.read_csv("sample_submission.csv")
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
# shuffle data
train_df_shuffled = train_df.sample(frac=1, random_state=42)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [7]:
# visualize text data
train_df_shuffled.target.value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,4342
1,3271


In [10]:
random_index = np.random.randint(0, len(train_df_shuffled)-5)
for _, row in train_df_shuffled[['text', 'target']][random_index: random_index + 5].iterrows():
  text = row['text']
  target = row['target']
  print(f"Target: {target}", "(real disaster)" if target > 0 else "(not real disaster)")
  print(f"Text:\n{text}\n")
  print("---\n")

Target: 0 (not real disaster)
Text:
@Ruddyyyyyy @JamieGriff97 Jamie is too traumatised to answer http://t.co/VzgslEPkkH

---

Target: 0 (not real disaster)
Text:
.@stavernise: France agreed to repay Russia for two warships which were never delivered after economic sanctions  http://t.co/K4H8cq7puo

---

Target: 0 (not real disaster)
Text:
@HimeRuisu I'm going to ram your ass so hard I'll have to shove your face on the pillows to muffle your screams of pain and pleasure~

---

Target: 0 (not real disaster)
Text:
But even if the stars and moon collide I never want you back into my life??????.

---

Target: 0 (not real disaster)
Text:
@ssssnell yeah I agree but it's the shock factor that ropes people in if they just show devastation then it angers me

---



## split data into train and validation sets

In [11]:
from sklearn.model_selection import train_test_split
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled['text'].to_numpy(),
                                                                            train_df_shuffled['target'].to_numpy(),
                                                                            test_size = 0.1,
                                                                            random_state = 42)
train_sentences.shape, val_sentences.shape, train_labels.shape, val_labels.shape

((6851,), (762,), (6851,), (762,))

In [12]:
train_sentences

array(['@mogacola @zamtriossu i screamed after hitting tweet',
       'Imagine getting flattened by Kurt Zouma',
       '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
       ...,
       'Near them on the sand half sunk a shattered visage lies... http://t.co/0kCCG1BT06',
       "kesabaran membuahkan hasil indah pada saat tepat! life isn't about waiting for the storm to pass it's about learning to dance in the rain.",
       "@ScottDPierce @billharris_tv @HarrisGle @Beezersun I'm forfeiting this years fantasy football pool out of fear I may win n get my ass kicked"],
      dtype=object)

## Tokenization and Embedding Approaches
**Manual Tokenization and Embedding using GloVe:**

    Tokenize the text manually.
    Use GloVe embeddings for converting tokens into vectors.

**Automated Tokenization and Embedding using TextVectorization:**

    Utilize the TextVectorization layer to tokenize and generate embeddings automatically.


In [13]:
# Tokenize the text manually.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_sentences)
word2index = tokenizer.word_index

train_Xtokens = tokenizer.texts_to_sequences(train_sentences)
test_Xtokens = tokenizer.texts_to_sequences(val_sentences)

def get_maxlen(data):
    maxlen = 0
    for sent in data:
        maxlen = max(maxlen, len(sent))
    return maxlen

train_Xmaxlen = get_maxlen(train_Xtokens)
test_Xmaxlen = get_maxlen(test_Xtokens)

Xtrain = pad_sequences(train_Xtokens, maxlen = train_Xmaxlen,  padding = 'post', truncating = 'post')
Xtest = pad_sequences(test_Xtokens, maxlen = test_Xmaxlen,  padding = 'post', truncating = 'post')

train_Xmaxlen, test_Xmaxlen

(33, 31)

In [18]:
# Use GloVe embeddings for converting tokens into vectors.
!wget http://nlp.stanford.edu/data/glove.twitter.27B.zip

# unzip in my drive
unzip_data('glove.twitter.27B.zip')


--2024-09-10 01:29:57--  http://nlp.stanford.edu/data/glove.twitter.27B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.twitter.27B.zip [following]
--2024-09-10 01:29:57--  https://nlp.stanford.edu/data/glove.twitter.27B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip [following]
--2024-09-10 01:29:57--  https://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1520408563 (1.4G) [ap

In [19]:
file = open('/content/glove.twitter.27B.200d.txt', 'r', encoding = 'utf8')
content = file.readlines()
file.close()

embeddings = {}

for line in content:
    line = line.split()
    embeddings[line[0]] = np.array(line[1:], dtype = float)

embed_size = 200
embedding_matrix = np.zeros((len(word2index)+1, embed_size))

for word, i in word2index.items():
    embed_vector = embeddings.get(word) # Use get() to avoid KeyError
    if embed_vector is not None:
        embedding_matrix[i] = embed_vector
    else:
        embedding_matrix[i] = np.random.randn(embed_size) # Replace OOV with random vector

In [21]:
embedding_matrix.shape

(21058, 200)

## Automated Tokenization and Embedding using TextVectorization

In [30]:
# text vectorization
max_len = round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))
text_vectorizer = TextVectorization(max_tokens = 10000,
                                    standardize = 'lower_and_strip_punctuation',
                                    split = 'whitespace',
                                    ngrams = None,
                                    output_mode = 'int',
                                    output_sequence_length = max_len)
text_vectorizer.adapt(train_sentences)

In [31]:
# create an embedding using an embedding layer
tf.random.set_seed(42)

embedding = Embedding(input_dim = 10000,
                      output_dim = 128,
                      embeddings_initializer = 'uniform',
                      input_length = max_len,
                      name = 'embedding_1')
embedding



<Embedding name=embedding_1, built=False>

In [33]:
# Get a random sentence from training set
import random
random_sentence = random.choice(train_sentences)
print(f"Original text:\n{random_sentence}\
      \n\nEmbedded version:")

# Embed the random sentence
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
@TheBoyOfMasks 'Thanks again for letting me stay here since the manor was blown up..... Anyways how are you doing buddy?'      

Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 1.60583369e-02, -1.93522125e-03,  1.45799853e-02, ...,
          2.78151147e-02,  8.48112255e-03,  3.56712081e-02],
        [ 3.47475670e-02, -1.77157633e-02,  2.51810215e-02, ...,
         -2.88766976e-02,  5.29484823e-03, -1.71114691e-02],
        [-4.91122007e-02,  4.93815653e-02, -2.11685896e-02, ...,
         -1.30949505e-02, -3.43465097e-02, -1.39662735e-02],
        ...,
        [ 1.73287429e-02,  2.27263011e-02, -4.78691943e-02, ...,
         -4.39065360e-02,  1.37822889e-02,  4.76700068e-03],
        [-1.35733485e-02, -3.73135209e-02,  3.13850753e-02, ...,
          6.77108765e-05, -3.46353278e-02,  9.48148966e-03],
        [-7.74520636e-03, -4.45477255e-02,  4.89204414e-02, ...,
         -3.80082354e-02,  4.81168516e-02,  1.63719170e-02]]],
      dtype=float32)>

## Model_0: Getting a baseline (MultinomialNB)

In [26]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

model_0 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

# Fit the pipeline on the training data
model_0.fit(train_sentences, train_labels)

In [27]:
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Our baseline model achieves an accuracy of: {baseline_score*100:.2f}%")

Our baseline model achieves an accuracy of: 79.27%


In [28]:
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1])

## Evaluation of baseline model

In [29]:
from helper_functions import calculate_results
baseline_results = calculate_results(y_true = val_labels,
                                     y_pred = baseline_preds)
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

## Model_1: A simple Dense Model (using Embedding layer)

In [37]:
import tensorflow as tf
from tensorflow.keras import layers

# Create a tensorboard callback (need to create a new one for each model)
tf.random.set_seed(42)

# create a directory to save tensorboard logs
save_dir = 'model_logs'

model_1 = Sequential([
    Input(shape = (1,), dtype = tf.string),
    text_vectorizer,
    embedding,
    layers.GlobalAveragePooling1D(),
    Dense(128, activation = 'relu'),
    Dense(64, activation = 'relu'),
    Dense(1, activation = 'sigmoid')
], name = 'model_1_dense')

model_1.compile(loss = 'binary_crossentropy',
                optimizer = tf.keras.optimizers.Adam(),
                metrics = ['accuracy'])

model_1.summary()

In [38]:
history_1 = model_1.fit(train_sentences,
                        train_labels,
                        epochs = 5,
                        validation_data = (val_sentences, val_labels),
                        callbacks = [create_tensorboard_callback(dir_name = save_dir,
                                                                 experiment_name = 'model_1_dense')])

Saving TensorBoard log files to: model_logs/model_1_dense/20240910-015221
Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 26ms/step - accuracy: 0.6589 - loss: 0.6022 - val_accuracy: 0.7717 - val_loss: 0.4698
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 23ms/step - accuracy: 0.8629 - loss: 0.3401 - val_accuracy: 0.7664 - val_loss: 0.5101
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.9127 - loss: 0.2342 - val_accuracy: 0.7638 - val_loss: 0.6034
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.9366 - loss: 0.1650 - val_accuracy: 0.7677 - val_loss: 0.7086
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.9541 - loss: 0.1210 - val_accuracy: 0.7769 - val_loss: 0.8595


In [39]:
model_1_pred_probs = model_1.predict(val_sentences)
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs)) # squeeze removes single dimensions
model_1_preds[:20]

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step


<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([0., 1., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1.], dtype=float32)>

In [40]:
# Calculate model_1 metrics
model_1_results = calculate_results(y_true = val_labels,
                                    y_pred = model_1_preds)
model_1_results

{'accuracy': 77.69028871391076,
 'precision': 0.7831567504504889,
 'recall': 0.7769028871391076,
 'f1': 0.7731247281588467}

## Using the same model but with (GloVe embeddings)

In [60]:
tf.random.set_seed(42)
model_1_glove = Sequential([
              Input(shape = (1,)),
              Embedding(input_dim = len(word2index) + 1,
              output_dim = embed_size,
              input_length = train_Xmaxlen,
              weights = [embedding_matrix],
              trainable = False
             ),
    layers.GlobalAveragePooling1D(),
    Dense(128, activation = 'relu'),
    Dense(64, activation = 'relu'),
    Dense(1, activation = 'sigmoid')
])

model_1_glove.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model_1_glove.summary()

In [61]:
history_2 = model_1_glove.fit(Xtrain,
                        train_labels,
                        epochs = 5,
                        validation_data = (Xtest, val_labels),
                        callbacks = [create_tensorboard_callback(dir_name = save_dir,
                                                                 experiment_name = 'model_1_dense_glove')])

Saving TensorBoard log files to: model_logs/model_1_dense_glove/20240910-031027
Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.7178 - loss: 0.5707 - val_accuracy: 0.7664 - val_loss: 0.4701
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.7912 - loss: 0.4590 - val_accuracy: 0.7743 - val_loss: 0.4527
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.8039 - loss: 0.4387 - val_accuracy: 0.7782 - val_loss: 0.4477
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8145 - loss: 0.4176 - val_accuracy: 0.7730 - val_loss: 0.4477
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8297 - loss: 0.3943 - val_accuracy: 0.7677 - val_loss: 0.4523


In [62]:
model_1_glove_pred_probs = model_1.predict(val_sentences)
model_1_glove_preds = tf.squeeze(tf.round(model_1_glove_pred_probs))
model_1_glove_results = calculate_results(y_true = val_labels,
                                    y_pred = model_1_glove_preds)
model_1_glove_results

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


{'accuracy': 77.69028871391076,
 'precision': 0.7831567504504889,
 'recall': 0.7769028871391076,
 'f1': 0.7731247281588467}

In [63]:
!tensorboard dev upload --logdir ./model_logs \
  --name "First deep model on text data" \
  --description "Trying a dense model with an embedding layer and with an Glove embedding" \
  --one_shot # exits the uploader when upload has finished

2024-09-10 03:15:43.133889: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-10 03:15:43.515537: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-10 03:15:43.604842: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
ERROR: The `tensorboard dev` command is no longer available.

TensorBoard.dev has been shut down. For further information,
see the FAQ at <https://tensorboard.dev/>.


In [64]:
# %load_ext tensorboard
# %tensorboard --logdir model_logs

![](https://github.com/Xmen3em/Beginner-Projects/blob/main/Beginner-NLP-%20Projects/Natural%20Language%20Processing%20with%20Disaster%20Tweets/Screenshot%202024-09-10%20062208.png?raw=true)