## Check for GPU

In [3]:
# List the GPUs visible to this runtime (-L prints one line per device)
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-e67a0ced-9e5c-1011-6219-7d3bf627f4bb)


## Get helper functions

In [4]:
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

# Import 
from helper_functions import unzip_data,create_tensorboard_callback,plot_loss_curves,compare_historys

--2023-01-23 04:23:55--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py’


2023-01-23 04:23:55 (94.8 MB/s) - ‘helper_functions.py’ saved [10246/10246]



## Get a text dataset

The dataset we are going to use is Kaggle's introduction to NLP dataset: tweets labelled as describing a real disaster (1) or not (0).

In [5]:
!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip

--2023-01-23 04:23:58--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.194.128, 74.125.68.128, 74.125.24.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.194.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2023-01-23 04:23:59 (756 KB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [6]:
# Unzip data with the helper imported from helper_functions.py
# (the following cell reads train.csv and test.csv from the working directory)
unzip_data('nlp_getting_started.zip')

## Visualizing a text dataset

In [7]:
import pandas as pd
# Load the train and test CSV files into DataFrames
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
# Preview the first rows of the training data
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
# Shuffle training dataframe: frac=1 samples every row (i.e. a full shuffle),
# and random_state=42 makes the shuffled order reproducible
train_df_shuffled = train_df.sample(frac=1,random_state=42)
train_df_shuffled.head()


Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [9]:
# What does the test dataframe look like? (the output below shows it has no `target` column)
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [10]:
# Count the training examples in each class (1 = real disaster, 0 = not a real disaster)
train_df['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [11]:
# How many total samples? (training set size, test set size)
len(train_df), len(test_df)

(7613, 3263)

In [12]:
# Let's visualize some random training examples
import random

# Pick the start of a random window of 5 consecutive rows in the shuffled dataframe
random_index = random.randint(0, len(train_df) - 5)
sample_rows = train_df_shuffled[['text', 'target']][random_index:random_index + 5]

# Walk the two columns in lockstep and print each label with its tweet
for text, target in zip(sample_rows['text'], sample_rows['target']):
  label_description = "(real disaster)" if target > 0 else '(not real disaster)'
  print(f'Target: {target}', label_description)
  print(f"Text: {text}\n")


Target: 0 (not real disaster)
Text: I HAVE GOT MORE VIDEOS THAN YOU RAPPERS GOT SONGS! http://t.co/pBLvPM6C27

Target: 1 (real disaster)
Text: @WesleyLowery ?????? how are you going to survive this devastation?

Target: 0 (not real disaster)
Text: Hollywood movie about trapped miners released in Chile http://t.co/xe0EE1Fzfh

Target: 0 (not real disaster)
Text: #sing #tsunami Beginners #computer tutorial.: http://t.co/ukQYbhxMQI Everyone Wants To Learn To Build A Pc. Re http://t.co/iDWS2ZgYsa

Target: 1 (real disaster)
Text: #Reddit updates #content #policy promises to quarantine Û÷extremely offensiveÛª communities http://t.co/EHGtZhKAn4



## Split data into training and validation sets

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 10% of the shuffled training data as a validation set.
# random_state is fixed so the same split is produced on every run.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_df_shuffled['text'].to_numpy(),
    train_df_shuffled['target'].to_numpy(),
    test_size=0.1,
    random_state=42,
)

In [15]:
# Check the lengths: sentences and labels should pair up one-to-one within each split
len(train_sentences),len(val_sentences),len(train_labels),len(val_labels)

(6851, 762, 6851, 762)

In [16]:
# Check first 10 samples (training sentences together with their labels)
train_sentences[:10], train_labels[:10]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object), array([0, 

## Converting text into numbers

### Text vectorization (tokenization)

In [17]:
# Peek at the first five raw training sentences before vectorizing them
train_sentences[:5]

array(['@mogacola @zamtriossu i screamed after hitting tweet',
       'Imagine getting flattened by Kurt Zouma',
       '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
       "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
       'Somehow find you and I collide http://t.co/Ee8RpOahPk'],
      dtype=object)

In [18]:
import tensorflow as tf
# TextVectorization is available from the stable `tf.keras.layers` namespace
# (TF 2.6+); the `layers.experimental.preprocessing` path is deprecated.
from tensorflow.keras.layers import TextVectorization

# Spell out the TextVectorization parameters explicitly. Everything except
# `max_tokens` (capped at 10,000 here) is left at its default value.
text_vectorizer = TextVectorization(max_tokens=10000, # how many different words in the vocabulary (includes the padding and OOV tokens)
                                    standardize='lower_and_strip_punctuation', # how to clean the raw text
                                    split='whitespace', # how to split text into tokens
                                    ngrams=None, # create groups of n words
                                    output_mode='int', # how to map tokens to numbers
                                    output_sequence_length=None, # how long do you want your sequence to be
                                    pad_to_max_tokens=False # default; only meaningful for 'multi_hot', 'count' and 'tf_idf' output modes
                                    )

In [19]:
# Average number of whitespace-separated tokens per training tweet, rounded to the nearest integer
round(sum(len(sentence.split()) for sentence in train_sentences) / len(train_sentences))

15

In [20]:
# Setup text vectorization variables
max_vocab_length = 10000  # maximum number of tokens kept in the vocabulary
max_length = 15  # length every tokenized sequence is padded/truncated to

# Re-create the vectorizer using the variables above
text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,  # vocabulary size cap
    standardize='lower_and_strip_punctuation',  # text clean-up applied before splitting
    output_mode='int',  # map each token to an integer index
    output_sequence_length=max_length,  # fixed output length per sentence
)

In [21]:
# Fit the text vectorizer to training data (adapt() builds the vocabulary from these sentences)
text_vectorizer.adapt(train_sentences)

In [22]:
# Create a sample sentence and tokenize it
sample_sentence = "There's a flood in my street!"
# Wrap the sentence in a list so it is processed as a batch of one (output shape (1, 15))
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [23]:
# Choose random sentence from the training dataset and tokenize it
random_sentence = random.choice(train_sentences)
vectorized_sentence = text_vectorizer([random_sentence])

# Show the raw tweet next to its integer-encoded form
print(
    f'Original text:\n {random_sentence} '
    f'\n\nVectorized version: {vectorized_sentence}'
)

Original text:
 Why are you engulfed by low self-image? Take the quiz: http://t.co/I9dSPDKrUK http://t.co/NEp5aZwKNA 

Vectorized version: [[  91   22   12  436   18  773 1368  167    2  925    1    1    0    0
     0]]


In [24]:
# Get the unique words in the vocabulary
words_in_vocab = text_vectorizer.get_vocabulary()

# First and last five entries of the vocabulary list
# (the output below shows '' and '[UNK]' occupying the first two slots)
top_5_words = words_in_vocab[:5]
bottom_5_words = words_in_vocab[-5:]

# Vocabulary size alongside both samples
len(words_in_vocab), top_5_words , bottom_5_words

(10000,
 ['', '[UNK]', 'the', 'a', 'in'],
 ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1'])

### Creating an Embedding using an Embedding Layer

We will use TensorFlow's `Embedding` layer (`tf.keras.layers.Embedding`).

Important Parameters:
* `input_dim` = size of our vocabulary
* `output_dim` = size of the output embedding vector
* `input_length` = length of the sequences being passed to the embedding layer

In [25]:
from tensorflow.keras import layers

# Embedding layer mapping each token index to a 128-dimensional vector
embedding = layers.Embedding(
    input_dim=max_vocab_length,  # size of the vocabulary
    output_dim=128,  # size of each output embedding vector
    input_length=max_length,  # length of the sequences passed to the layer
)
embedding

<keras.layers.core.embedding.Embedding at 0x7fea62abdc70>

In [26]:
# Get a random sentence from the training set
random_sentence = random.choice(train_sentences)
print(
    f"Original text:\n {random_sentence}"
    "\n\n Embedded version:"
)

# Tokenize the sentence, then pass the token indices through the embedding layer
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:

 Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.00328518, -0.02824464, -0.00636219, ...,  0.03047546,
         -0.00091954,  0.04165984],
        [-0.02913462,  0.04743263,  0.0244731 , ...,  0.03250157,
          0.02375517, -0.0132188 ],
        [ 0.00925437, -0.02402787,  0.04912831, ..., -0.0093065 ,
          0.03142693,  0.03505364],
        ...,
        [ 0.04457066,  0.02756472,  0.00386431, ..., -0.01868436,
         -0.04803326, -0.00835171],
        [ 0.04457066,  0.02756472,  0.00386431, ..., -0.01868436,
         -0.04803326, -0.00835171],
        [ 0.04457066,  0.02756472,  0.00386431, ..., -0.01868436,
         -0.04803326, -0.00835171]]], dtype=float32)>

In [27]:
# Size of the first (batch) dimension: one sentence was embedded
len(sample_embed)

1

In [28]:
# Full shape of the embedded batch; the output below reads (1, 15, 128)
sample_embed.shape

TensorShape([1, 15, 128])

In [34]:
# Embedding vector of the first token in the first (only) sentence of the batch
sample_embed[0][0]

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([-3.28518078e-03, -2.82446388e-02, -6.36218861e-03,  1.08281970e-02,
       -4.72559817e-02, -4.68125828e-02, -4.68831547e-02, -4.67510708e-02,
       -3.73336896e-02, -1.90091021e-02,  2.65049450e-02,  1.23883374e-02,
       -1.49595253e-02, -2.74367817e-02, -4.12950628e-02,  1.53226778e-03,
       -7.17234612e-03,  2.33032145e-02, -1.50591508e-02,  4.70391400e-02,
       -2.54027974e-02, -3.57864983e-02, -2.01434847e-02,  1.18152872e-02,
       -2.16678269e-02,  3.48060392e-02, -3.45406681e-03, -4.48295362e-02,
       -4.18449156e-02, -2.53971107e-02,  3.09171937e-02, -8.59089941e-03,
       -3.13071162e-02, -2.60072704e-02, -1.45009980e-02,  4.45547141e-02,
        2.84479894e-02,  1.00155473e-02,  4.58285846e-02,  1.62511133e-02,
        4.06529568e-02,  4.92997877e-02, -6.72768429e-03, -3.84535789e-02,
        1.13016963e-02,  5.18469885e-03, -1.30928755e-02,  4.64084856e-02,
       -5.54211438e-05, -4.98166792e-02,  4.77903001

In [36]:
# Integer token indices the vectorizer produces for the current `random_sentence`
text_vectorizer([random_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[5313, 6031,  777,  232,  338,   10, 3154,    1,    1,    1,    1,
        1731,    0,    0,    0]])>

## Modelling a text dataset and running series of experiments