## Check for GPU

In [None]:
!nvidia-smil -L

/bin/bash: nvidia-smil: command not found


## Get helper functions

In [None]:
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

# Import 
from helper_functions import unzip_data,create_tensorboard_callback,plot_loss_curves,compare_historys

--2023-01-20 02:12:07--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py’


2023-01-20 02:12:07 (63.5 MB/s) - ‘helper_functions.py’ saved [10246/10246]



## Get a text dataset

The dataset we are going to use is Kaggle's introduction-to-NLP dataset: tweets labelled as describing a real disaster (1) or not (0).

In [None]:
!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip

--2023-01-20 02:12:23--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.134.128, 173.194.213.128, 108.177.11.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.134.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2023-01-20 02:12:23 (26.9 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [None]:
# Extract the downloaded archive with the `unzip_data` helper imported from helper_functions.
# NOTE(review): presumably extracts into the current working directory, since the next
# cell reads 'train.csv' and 'test.csv' by bare filename — confirm against helper_functions.py.
unzip_data('nlp_getting_started.zip')

## Visualizing a text dataset

In [None]:
import pandas as pd

# Read the training and test CSV files into DataFrames, then preview the training data
train_df, test_df = map(pd.read_csv, ('train.csv', 'test.csv'))
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# Shuffle the training dataframe: frac=1 samples every row, and random_state=42 fixes
# the seed so the shuffled order is the same on every run.
# The original row index is kept, so the index column appears out of order in the preview.
train_df_shuffled = train_df.sample(frac=1,random_state=42)
# Preview the first rows of the shuffled frame
train_df_shuffled.head()


Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [None]:
# Preview the test dataframe; unlike the training data it has no `target` column
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [None]:
# Number of training examples per class in the `target` column (0 = not disaster, 1 = disaster)
train_df['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [None]:
# Row counts of the training and test dataframes, in that order
tuple(map(len, (train_df, test_df)))

(7613, 3263)

In [None]:
# Let's visualize 5 consecutive rows of the shuffled training data, starting at a random position
import random

# randint is inclusive on both ends, so the latest possible start still leaves 5 rows to show
random_index = random.randint(0, len(train_df_shuffled) - 5)
sample_rows = train_df_shuffled[['text', 'target']][random_index:random_index + 5]

# index=False drops the row index from each tuple, so nothing has to be unpacked into `_`
# (assigning to `_` in a notebook shadows IPython's "last output" variable)
for text, target in sample_rows.itertuples(index=False):
  print(f'Target: {target}', "(real disaster)" if target > 0 else '(not real disaster)')
  print(f"Text: {text}\n")


Target: 0 (not real disaster)
Text: *se pone a cantar crying lightning*

Target: 1 (real disaster)
Text: Hiroshima survivors fight nuclear industry in Brazil Ì¢?? video http://t.co/GLZmGBM7w0

Target: 1 (real disaster)
Text: Richmond police officer wounded suspect killed http://t.co/m9d2ElImZI

Target: 1 (real disaster)
Text: Still and Box alarm for the train derailment at 61st and Calumet struck out on the orders of 2-1-21. #ChicagoScanner

Target: 1 (real disaster)
Text: PHOTOS: The Rocky Fire has grown into California's most destructive wildfire this year. http://t.co/h9v4HoWtiP http://t.co/8IcSesHbj3



## Split data into training and validation sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Pull the tweet texts and their labels out of the shuffled dataframe as NumPy arrays
tweet_texts = train_df_shuffled['text'].to_numpy()
tweet_targets = train_df_shuffled['target'].to_numpy()

# Hold out 10% of the samples for validation; the fixed seed keeps the split reproducible
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    tweet_texts,
    tweet_targets,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Sizes of each split piece: sentences and labels should match within a split
tuple(len(split_part) for split_part in (train_sentences, val_sentences, train_labels, val_labels))

(6851, 762, 6851, 762)

In [None]:
# Check the first 10 training sentences alongside their corresponding labels
train_sentences[:10], train_labels[:10]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object), array([0, 

## Converting text into numbers

### Text vectorization (tokenization)

In [None]:
# Look at the first 5 raw training tweets before converting them into numbers
train_sentences[:5]

array(['@mogacola @zamtriossu i screamed after hitting tweet',
       'Imagine getting flattened by Kurt Zouma',
       '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
       "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
       'Somehow find you and I collide http://t.co/Ee8RpOahPk'],
      dtype=object)

In [None]:
import tensorflow as tf
# Import TextVectorization from its stable location rather than the deprecated
# `layers.experimental.preprocessing` namespace
from tensorflow.keras.layers import TextVectorization

# Spell out the main TextVectorization parameters (every value except max_tokens is the layer default).
# pad_to_max_tokens is left out: it only applies to the 'multi_hot', 'count' and 'tf_idf' output modes.
text_vectorizer = TextVectorization(max_tokens=10000, # how many different words in the vocabulary (includes the padding and OOV tokens)
                                    standardize='lower_and_strip_punctuation', # how to clean the text before splitting
                                    split='whitespace', # how to split the text into tokens
                                    ngrams=None, # create groups of n words
                                    output_mode='int', # how to map tokens to numbers
                                    output_sequence_length=None, # how long each output sequence should be (None = no fixed length)
                                    )

In [None]:
# Mean number of whitespace-separated tokens per training tweet, rounded to the nearest integer
round(sum(len(tweet.split()) for tweet in train_sentences) / len(train_sentences))

15

In [None]:
# Vectorizer settings: cap the vocabulary size and fix the length of every output sequence
max_vocab_length = 10000  # vocabulary size, counting the padding ('') and '[UNK]' entries
max_length = 15  # tokens per sequence, matching the rounded average tweet length computed earlier

text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    standardize='lower_and_strip_punctuation',
    output_mode='int',  # map each token to its integer vocabulary index
    output_sequence_length=max_length,  # pad or truncate every sequence to this length
)

In [None]:
# Fit the text vectorizer to the training data only (the validation sentences are not
# passed in), so the vocabulary is built from the training tweets alone
text_vectorizer.adapt(train_sentences)

In [None]:
# Create a sample sentence and tokenize it.
# The vectorizer is called on a list (a batch of one sentence); the result has shape
# (1, max_length), with zeros filling the positions after the last token.
sample_sentence = "There's a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [None]:
# Pick one training tweet at random and show it next to its integer-encoded form
random_sentence = random.choice(train_sentences)
vectorized_sentence = text_vectorizer([random_sentence])
print(f'Original text:\n {random_sentence} \n\nVectorized version: {vectorized_sentence}')

Original text:
 WANTED: gritty and real casualty photos of Pasta Thursdays at Amico's. Tag us or #amicospizzato #seeyouatamicos... http://t.co/MZ8VQXbKTs 

Vectorized version: [[ 974    1    7  369  712  729    6 4954 7573   17    1 2270   69   53
     1]]


In [None]:
# Pull the learned vocabulary out of the vectorizer and look at both ends of it.
# NOTE(review): the list appears to be ordered from most to least frequent token,
# after the padding ('') and '[UNK]' entries — confirm against the Keras docs.
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words, bottom_5_words = words_in_vocab[:5], words_in_vocab[-5:]

len(words_in_vocab), top_5_words, bottom_5_words

(10000,
 ['', '[UNK]', 'the', 'a', 'in'],
 ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1'])

### Creating an Embedding using an Embedding Layer

We will use TensorFlow's `Embedding` layer (`tf.keras.layers.Embedding`).

Important Parameters:
* `input_dim` = size of our vocabulary
* `output_dim` = size of the output embedding vector
* `input_length` = length of the sequences being passed to the embedding layer

In [None]:
from tensorflow.keras import layers

# Embedding layer sized to match the text vectorizer's vocabulary and sequence length
embedding = layers.Embedding(
    input_dim=max_vocab_length,  # size of the vocabulary
    output_dim=128,  # length of the vector each token is mapped to
    input_length=max_length,  # length of each incoming sequence
)
embedding

<keras.layers.core.embedding.Embedding at 0x7f146122d8b0>

In [None]:
# Pick a random training tweet, print it, then embed its tokenized form
random_sentence = random.choice(train_sentences)
print(f"Original text:\n {random_sentence}\n\n Embedded version:")

# Tokenize first, then look up one embedding vector per token position
token_ids = text_vectorizer([random_sentence])
sample_embed = embedding(token_ids)
sample_embed

Original text:
 #stormchase Violent Record Breaking EF-5 El Reno Oklahoma Tornado Nearly Runs Over ... - http://t.co/3SICroAaNz http://t.co/I27Oa0HISp

 Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.04799047,  0.04704391, -0.01427438, ...,  0.00309984,
         -0.04752575, -0.03101015],
        [ 0.03356596,  0.01658892, -0.04563714, ...,  0.02001128,
          0.03917797, -0.01133127],
        [-0.03495397, -0.02837089,  0.02765583, ...,  0.01920546,
         -0.0191936 ,  0.00265662],
        ...,
        [-0.00532752, -0.02598155,  0.03816107, ...,  0.04207392,
         -0.03088194, -0.03571664],
        [-0.02133021, -0.0128861 , -0.0207394 , ..., -0.02688187,
          0.04000927,  0.00638647],
        [-0.01828736, -0.01137853, -0.01044468, ...,  0.03657773,
          0.0083071 ,  0.02988762]]], dtype=float32)>