## import important libraries

In [4]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout, Embedding
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical, pad_sequences
from tensorflow.keras.utils import to_categorical

In [8]:
! wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

--2024-08-13 12:47:05--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py.1’


2024-08-13 12:47:05 (118 MB/s) - ‘helper_functions.py.1’ saved [10246/10246]



## Check GPU

In [5]:
# List the GPUs visible to this runtime (one line per device).
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-2199bd10-e78e-9d71-b833-be0e3f1680f1)


## Download text Data

In [11]:
# Download data (same as from Kaggle)
!wget "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip"

# Unzip data
unzip_data("nlp_getting_started.zip")

--2024-08-13 12:48:10--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.175.207, 74.125.24.207, 142.251.10.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.175.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2024-08-13 12:48:11 (743 KB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [12]:
# Read the three CSV files, in order, into their own DataFrames.
train_df, test_df, sample_sub = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [13]:
# shuffle data
# frac=1 samples every row, i.e. returns a full permutation of train_df;
# random_state=42 makes that permutation reproducible.
# The original index labels are kept, so the index is no longer sequential.
train_df_shuffled = train_df.sample(frac=1, random_state=42)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [14]:
# visualize text data
# Count the examples per target class to see how balanced the labels are.
train_df_shuffled.target.value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,4342
1,3271


In [22]:
# Print five consecutive rows of the shuffled frame, starting at a random offset.
random_index = np.random.randint(0, len(train_df_shuffled)-5)
sample_rows = train_df_shuffled[['text', 'target']].iloc[random_index: random_index + 5]

for text, target in zip(sample_rows['text'], sample_rows['target']):
    # Any positive target is reported as a real disaster.
    if target > 0:
        verdict = "(real disaster)"
    else:
        verdict = "(not real disaster)"
    print(f"Target: {target}", verdict)
    print(f"Text:\n{text}\n")
    print("---\n")

Target: 1 (real disaster)
Text:
Massive Sinkhole Emerges In Brooklyn http://t.co/n3Ow73Oasw http://t.co/Gs9bmplbHH

---

Target: 1 (real disaster)
Text:
In #islam saving a person is equal in reward to saving all humans! Islam is the opposite of terrorism!

---

Target: 0 (not real disaster)
Text:
Photoshop Tools Crash Course - Complete Photoshop Tool Guide http://t.co/DunMvj7ITl #course http://t.co/RgdrJv63hF

---

Target: 0 (not real disaster)
Text:
A river of lava in the sky this evening! It was indeed a beautiful sunset sky tonight. (8-4-15) http://t.co/17EGMlNi80

---

Target: 1 (real disaster)
Text:

---



## split data into train and validation sets

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the shuffled tweets for validation; the fixed seed keeps the
# split reproducible across runs.
all_sentences = train_df_shuffled['text'].to_numpy()
all_labels = train_df_shuffled['target'].to_numpy()

train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    all_sentences,
    all_labels,
    test_size=0.1,
    random_state=42,
)

# Sanity-check the sizes of the four resulting arrays.
train_sentences.shape, val_sentences.shape, train_labels.shape, val_labels.shape

((6851,), (762,), (6851,), (762,))

In [24]:
# Peek at the raw training sentences (NumPy abbreviates the repr of a long array).
train_sentences

array(['@mogacola @zamtriossu i screamed after hitting tweet',
       'Imagine getting flattened by Kurt Zouma',
       '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
       ...,
       'Near them on the sand half sunk a shattered visage lies... http://t.co/0kCCG1BT06',
       "kesabaran membuahkan hasil indah pada saat tepat! life isn't about waiting for the storm to pass it's about learning to dance in the rain.",
       "@ScottDPierce @billharris_tv @HarrisGle @Beezersun I'm forfeiting this years fantasy football pool out of fear I may win n get my ass kicked"],
      dtype=object)

## Tokenization and Embedding Approaches
**Manual Tokenization and Embedding using GloVe:**

    Tokenize the text manually.
    Use GloVe embeddings for converting tokens into vectors.

**Automated Tokenization and Embedding using TextVectorization:**

    Use the TextVectorization layer to tokenize the text, then learn the embeddings with a trainable Embedding layer.


In [25]:
# Tokenize the text manually.
# The vocabulary is fitted on the training sentences only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_sentences)
# word -> integer id mapping learned from train_sentences
word2index = tokenizer.word_index

# Convert every sentence into a list of integer word ids.
train_Xtokens = tokenizer.texts_to_sequences(train_sentences)
# NOTE: despite the `test_` prefix, this is the validation split (val_sentences).
# NOTE(review): no oov_token is configured, so words unseen during fitting are
# presumably dropped from these sequences - confirm this is intended.
test_Xtokens = tokenizer.texts_to_sequences(val_sentences)

def get_maxlen(data):
    """Return the length of the longest item in `data`.

    Args:
        data: An iterable of sized items (e.g. lists of token ids).

    Returns:
        The largest `len(item)` found, or 0 when `data` is empty.
    """
    return max((len(sequence) for sequence in data), default=0)

# Longest tokenized sentence in each split (both are displayed at the end of the cell).
train_Xmaxlen = get_maxlen(train_Xtokens)
test_Xmaxlen = get_maxlen(test_Xtokens)

# Pad BOTH splits to the training length. A model built for sequences of
# train_Xmaxlen tokens needs validation inputs of the same width; padding the
# validation split to its own maximum would give the two matrices different shapes.
# Longer validation sequences, if any, are cut at the end (truncating='post').
Xtrain = pad_sequences(train_Xtokens, maxlen = train_Xmaxlen,  padding = 'post', truncating = 'post')
Xtest = pad_sequences(test_Xtokens, maxlen = train_Xmaxlen,  padding = 'post', truncating = 'post')

train_Xmaxlen, test_Xmaxlen

(33, 31)

In [38]:
# Use GloVe embeddings for converting tokens into vectors.
# The download is left commented out, so this cell expects the archive to
# already be present in the working directory.
# !wget http://nlp.stanford.edu/data/glove.twitter.27B.zip

# unzip in my drive
# NOTE(review): the `.1` suffix looks like the name wget gives a repeated
# download - confirm the archive name before running on a fresh runtime.
unzip_data('glove.twitter.27B.zip.1')


In [41]:
# Dimensionality of the GloVe vectors in the file loaded below.
embed_size = 200

# Parse the GloVe file into a {word: vector} lookup.
# The file is streamed line by line inside a context manager, so the handle is
# always closed and the raw text is never held in memory all at once.
embeddings = {}
with open('/content/glove.twitter.27B.200d.txt', 'r', encoding = 'utf8') as file:
    for line in file:
        line = line.split()
        if not line:
            # Skip blank lines instead of failing on line[0].
            continue
        # First field is the word, the remaining fields are its vector components.
        embeddings[line[0]] = np.array(line[1:], dtype = float)

# One row per tokenizer id. The matrix has len(word2index)+1 rows and the loop
# below only writes the ids found in word2index, so any unused row stays all-zero.
embedding_matrix = np.zeros((len(word2index)+1, embed_size))

# Dedicated, seeded generator so the random vectors for out-of-vocabulary words
# are reproducible across runs without touching NumPy's global random state.
oov_rng = np.random.default_rng(42)

for word, i in word2index.items():
    embed_vector = embeddings.get(word) # Use get() to avoid KeyError
    if embed_vector is not None:
        embedding_matrix[i] = embed_vector
    else:
        # Replace OOV with a random vector drawn from a standard normal distribution
        embedding_matrix[i] = oov_rng.standard_normal(embed_size)

## Model_0: Getting a baseline (MultinomialNB)

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline: TF-IDF features of the raw sentences fed into a multinomial naive Bayes classifier.
baseline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
]
model_0 = Pipeline(steps=baseline_steps)

# Train both steps on the training split.
model_0.fit(train_sentences, train_labels)

In [46]:
# Score the fitted baseline pipeline on the validation split; the value is
# multiplied by 100 below to print it as a percentage.
baseline_score = model_0.score(val_sentences, val_labels)
print(f"Our baseline model achieves an accuracy of: {baseline_score*100:.2f}%")

Our baseline model achieves an accuracy of: 79.27%


In [47]:
# Predicted class labels for every validation sentence; show the first 20.
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1])

## Evaluation of baseline model

In [48]:
# calculate_results comes from the downloaded helper_functions module.
from helper_functions import calculate_results
# Compare the baseline predictions against the validation labels.
# NOTE: in the output recorded for this cell, 'accuracy' is on a 0-100 scale
# while 'precision', 'recall' and 'f1' are on a 0-1 scale.
baseline_results = calculate_results(y_true = val_labels,
                                     y_pred = baseline_preds)
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

## Automated Tokenization and Embedding using TextVectorization

In [56]:
# text vectorization
# NOTE: despite its name, `max_len` is the rounded MEAN number of
# whitespace-separated tokens per training sentence, not the maximum.
tokens_per_sentence = [len(sentence.split()) for sentence in train_sentences]
max_len = round(sum(tokens_per_sentence) / len(train_sentences))

# Map each sentence to a fixed-length sequence of integer token ids.
text_vectorizer = TextVectorization(
    max_tokens=10000,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    ngrams=None,
    output_mode='int',
    output_sequence_length=max_len,
)

# Learn the vocabulary from the training sentences only.
text_vectorizer.adapt(train_sentences)

In [57]:
# create an embedding using an embedding layer
# Seed TensorFlow so the randomly initialised embedding weights are reproducible.
tf.random.set_seed(42)

# input_dim matches the max_tokens (10000) given to the TextVectorization layer.
# `input_length` is intentionally not passed: it is deprecated in Keras 3 and
# only triggers a warning; the layer accepts any sequence length.
embedding = Embedding(input_dim = 10000,
                      output_dim = 128,
                      embeddings_initializer = 'uniform',
                      name = 'embedding_1')
embedding



<Embedding name=embedding_1, built=False>

In [58]:
# Get a random sentence from training set
# (`random` is the standard-library module imported in the notebook's import cell)
random_sentence = random.choice(train_sentences)
# Single-line f-string: a backslash line continuation inside the literal would
# carry the next line's indentation into the printed text as stray spaces.
print(f"Original text:\n{random_sentence}\n\nEmbedded version:")

# Embed the random sentence: text -> token ids -> one vector per token position
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
if they kill off Val I'm rioting #Emmerdale      

Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.04258509, -0.04624546,  0.01630769, ..., -0.00146295,
         -0.02713509,  0.04285808],
        [-0.04254272,  0.02041656, -0.02852876, ...,  0.03995467,
         -0.0346517 ,  0.03572458],
        [-0.0187807 ,  0.03691017,  0.02101907, ...,  0.01599291,
         -0.00366716, -0.04715193],
        ...,
        [ 0.00684749,  0.00210594, -0.03678628, ..., -0.0051327 ,
         -0.04411061, -0.03383931],
        [ 0.00684749,  0.00210594, -0.03678628, ..., -0.0051327 ,
         -0.04411061, -0.03383931],
        [ 0.00684749,  0.00210594, -0.03678628, ..., -0.0051327 ,
         -0.04411061, -0.03383931]]], dtype=float32)>

**notes**
Complete the section to see how the instructor implements the models, then apply the same models in this notebook to both tokenization/embedding approaches and compare the results in TensorBoard.