# Learn basics in NLP with TensorFlow 

I'm gonna follow this github tutorial.

https://github.com/mrdbourke/tensorflow-deep-learning/blob/main/08_introduction_to_nlp_in_tensorflow.ipynb

Get dataset from kaggle.

In [1]:
import pandas as pd
import numpy as np

In [2]:
train_data = pd.read_csv('./dataaset/train.csv')

In [3]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


Split data into train and test

In [4]:
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_lables, val_lables = train_test_split(
    train_data["text"].to_numpy(),
    train_data["target"].to_numpy(),
    test_size=0.1
    )

In [5]:
train_sentences

array(['@MoorlandsChmbr Loads of stuff going on recently. See the blog at http://t.co/XVcO7sLxhW #sinkhole #piling http://t.co/jbVmGeg522',
       'Know them recognize them......then obliterate them! \n#gym #gymflow #gymtime #team #assassins\x89Û_ https://t.co/mUHj8CbdQb',
       'Young dancer moves about 300 youth in attendance at the GMMBC Youth Explosion this past Saturday. Inspiring! http://t.co/TMmOrvxsWz',
       ...,
       'Passengers evacuated &amp; lanes blocked off as power lines come down over a Gold Coast tram @9NewsGoldCoast http://t.co/zZweEezJuG',
       'Sassy city girl country hunk stranded in Smoky Mountain snowstorm #AoMS http://t.co/HDJS9RNtJ4 #ibooklove #bookboost',
       'I liked a @YouTube video http://t.co/jK7nPdpWRo J. Cole - Fire Squad (2014 Forest Hills Drive)'],
      dtype=object)

# Converting text into numbers

Create words to vector function.

In [6]:
from tensorflow.keras.layers import TextVectorization

In [7]:
text2vec = TextVectorization(
    max_tokens=10000, standardize='lower_and_strip_punctuation',
    split='whitespace', ngrams=None, output_mode='int',
    output_sequence_length=15, pad_to_max_tokens=False, vocabulary=None,
    idf_weights=None, sparse=False, ragged=False
)

In [8]:
text2vec.adapt(train_sentences)

See how the words 

In [9]:
sample_sentence = "There is a flood in my street!"
text2vec([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 74,   9,   3, 198,   4,  13, 771,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int64)>

Get first words

In [10]:
text2vec.get_vocabulary()[:5]

['', '[UNK]', 'the', 'a', 'in']

Get the words from 100 to 105th.

In [11]:
text2vec.get_vocabulary()[100:105]

['see', 'bomb', 'time', 'our', 'attack']

# Creating Embedding layer

We are going to use TnsorFlow's embedding layers.

https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding

In [12]:
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim = 10000, # set imput shape
                             output_dim = 128, # output shape
                             input_length = 10000 # how long is each input 
                            )

embedding

<keras.layers.embeddings.Embedding at 0x209068a1490>

Get a random sentence from the training set

In [13]:
import random
random_sentence = random.choice(train_sentences)

print(f"Original text:\n {random_sentence}\
        \n\nEmbedded version:")

# Embed the random sentence (turn it into dense vectors of fixed size)
sample_embed = embedding(text2vec([random_sentence]))
sample_embed


Original text:
 Fire Call: BRANT AV / DRUMMOND RD for Fire - *Structure - Single. Units: CAR 6 On Call Truck http://t.co/euDwNFyUeM        

Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.04951564,  0.0078002 , -0.01222025, ..., -0.02327453,
          0.0099053 ,  0.0384097 ],
        [-0.02880297,  0.02261369, -0.01718979, ...,  0.01479206,
          0.04564745, -0.03239497],
        [ 0.04460982, -0.03286145, -0.03843545, ..., -0.04534746,
         -0.02421483, -0.02914529],
        ...,
        [ 0.01917287,  0.04783763,  0.01503087, ...,  0.01635103,
          0.00802214,  0.03813008],
        [ 0.00086619,  0.00442737,  0.02149177, ...,  0.01872211,
          0.00563936,  0.02213318],
        [-0.02880297,  0.02261369, -0.01718979, ...,  0.01479206,
          0.04564745, -0.03239497]]], dtype=float32)>

In [14]:
sample_embed[0][0], sample_embed[0][0].shape, random_sentence

(<tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([-0.04951564,  0.0078002 , -0.01222025,  0.04072765,  0.04100624,
         0.03876096, -0.00401591, -0.02894153,  0.02201004,  0.03407015,
         0.04352674, -0.02901422,  0.02552651, -0.04807413, -0.04559938,
        -0.01662544, -0.00063174,  0.03865595,  0.00734339,  0.008965  ,
        -0.01906165, -0.04343814, -0.03796182, -0.01543085, -0.0078395 ,
         0.04879439,  0.03825457, -0.02467214,  0.01627273, -0.0060693 ,
         0.0122801 , -0.01302365, -0.01803945,  0.02276829,  0.04341394,
         0.04464534,  0.0080142 ,  0.03985028,  0.03155437,  0.03409494,
         0.03972944, -0.02956646,  0.02102919, -0.0357822 ,  0.02046719,
        -0.00503008, -0.00356709, -0.03743269, -0.0390722 , -0.02759565,
         0.03918496,  0.01268556,  0.00449878,  0.0463611 , -0.04795349,
        -0.04124595, -0.04676462,  0.00936396,  0.03810005,  0.01818502,
         0.01095189,  0.04293311,  0.04872977,  0.01508769, -0.01856861,
  

# Modelling a text dataset with running a series of experiment

There are some Model to learn text:

0, Naive Bayes with TF-IDF encoder (baseline)

1, Feed-forward neural network (dence model)

2, LSTM (RNN)

3, GRU (RNN)

4, Bidirectional-LSTM (RNN)

5, 1D Convolutional Neural Network

6, TensorFlow Hub Pretrained Feature Extractor

7, TensorFlow Hub Pretrained Feature Extractor (10% of data)

How are we going to approach all of these?

Use the standard steps in modeling with tensorflow:

* Create a model
* Build a model
* Fit a model
* Evaluate our model

# Create LSTM(RNN) layer

In [15]:
from tensorflow.keras import layers
import tensorflow as tf

inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text2vec(inputs)
x = embedding(x)
x = layers.LSTM(64)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_2 = tf.keras.Model(inputs, outputs, name = "model_2_LSTM")

In [16]:
model_2.summary()

Model: "model_2_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 15)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 1,329,473
Trainable params: 1,329,473
Non-trainable params: 0
____________________________________________

Compile the model

In [17]:
model_2.compile(loss="binary_crossentropy",
               optimizer=tf.keras.optimizers.Adam(),
               metrics=["accuracy"])

Fit the model

In [25]:
# Create a tensorboard callback ( need to a new one for each model)
from helper_function import create_tensorboard_callback

# Create a directory to save TensorBoard logs
SAVE_DIR = "model_logs"

In [29]:

model_2_history = model_2.fit(x=train_sentences,
                             y=train_lables,
                             epochs=5,
                             validation_data = (val_sentences, val_lables),
                             callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,
                                                                   experiment_name="model_1_LSTM")])

Saving TensorBoard log files to: model_logs/model_1_LSTM/20211229-165021
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [32]:
# Make prediction of LSTM
model_2_pred_prob = model_2.predict(val_sentences)
model_2_pred_prob[:10]

array([[0.99954623],
       [0.00243211],
       [0.04912975],
       [0.0180428 ],
       [0.375425  ],
       [0.4732651 ],
       [0.1736669 ],
       [0.9829545 ],
       [0.10995877],
       [0.11345091]], dtype=float32)

In [34]:
# Convert model 2 prediction to lable format 
model_2_pred =tf.squeeze(tf.round(model_2_pred_prob))
model_2_pred[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([1., 0., 0., 0., 0., 0., 0., 1., 0., 0.], dtype=float32)>

In [39]:
# Caliculate our model_2 results
from Evaluation import caluculate_results

model_2_results = caluculate_results(y_true=val_lables, 
                                   y_pre=model_2_pred)
model_2_results

{'accuracy': 75.7217847769029,
 'prediction': 0.7572982515398522,
 'recall': 0.7572178477690289,
 'f1': 0.7539976852065212}