# Learn the basics of NLP with TensorFlow

I'm going to follow this GitHub tutorial:

https://github.com/mrdbourke/tensorflow-deep-learning/blob/main/08_introduction_to_nlp_in_tensorflow.ipynb

Get the dataset from Kaggle.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training CSV into a DataFrame.
# NOTE(review): "dataaset" looks like a typo for "dataset" — confirm the folder name on disk.
train_data = pd.read_csv('./dataaset/train.csv')

In [3]:
# Preview the first five rows; the model below uses the `text` and `target` columns.
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


Split the data into training and validation sets.

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled texts as a validation set.
# random_state pins the shuffle so the split (and every metric computed on the
# validation set) is reproducible across kernel restarts.
train_sentences, val_sentences, train_lables, val_lables = train_test_split(
    train_data["text"].to_numpy(),
    train_data["target"].to_numpy(),
    test_size=0.1,
    random_state=42,
)

In [5]:
# Inspect the training sentences (NumPy elides the middle of a long array when displaying it).
train_sentences

array(['Hollywood movie about trapped miners released in Chile http://t.co/xe0EE1Fzfh',
       'CLEARED:incident with injury:I-495  inner loop Exit 31 - MD 97/Georgia Ave Silver Spring',
       "Oops: Bounty hunters try to raid Phoenix police chief's home: http://t.co/yPRJWMigHL -- A group of armed bounty... http://t.co/3RrKRCjYW7",
       ...,
       'Heavy smoke pouring out of buildings on fire in Port Coquitlam http://t.co/GeqkdaO4cV http://t.co/Dg0bGzeCgM',
       'RT @DianneG: Gunshot wound #9 is in the bicep. only 1 of the 10 wounds that is not in the chest/torso area.  #KerrickTrial #JonathanFerrell',
       '10News ? Water main break disrupts trolley service http://t.co/pAug7a68i0'],
      dtype=object)

# Converting text into numbers

Create a text vectorization layer that maps words to integer indices.

In [6]:
from tensorflow.keras.layers import TextVectorization

In [7]:
# Text vectorization layer: lower-cases the text, strips punctuation, splits on
# whitespace and maps every word to an integer index.
text2vec = TextVectorization(
    max_tokens=10000,  # upper bound on the vocabulary size
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    ngrams=None,
    output_mode='int',
    output_sequence_length=15,  # every sequence is padded/truncated to 15 tokens
    pad_to_max_tokens=False,
    vocabulary=None,
    idf_weights=None,
    sparse=False,
    ragged=False,
)

In [8]:
# Build the vocabulary from the training sentences.
text2vec.adapt(train_sentences)

See how the words of a sample sentence are mapped to integer token indices.

In [9]:
# Vectorize one hand-written sentence; the layer expects a batch, hence the list.
sample_sentence = "There is a flood in my street!"
text2vec([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 75,   9,   3, 203,   5,  13, 666,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int64)>

Get the first five entries of the vocabulary.

In [10]:
# Index 0 is the padding token '' and index 1 is the out-of-vocabulary token '[UNK]';
# actual words start at index 2.
text2vec.get_vocabulary()[:5]

['', '[UNK]', 'the', 'a', 'to']

Get the words at vocabulary indices 100 to 104.

In [11]:
# The slice [100:105] returns the five vocabulary entries at indices 100-104.
text2vec.get_vocabulary()[100:105]

['see', 'first', 'day', 'cant', 'world']

# Creating Embedding layer

We are going to use TensorFlow's embedding layer.

https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding

In [12]:
from tensorflow.keras import layers

# Trainable lookup table that turns each integer token index into a dense vector.
embedding = layers.Embedding(input_dim=10000,  # vocabulary size (same as max_tokens of text2vec)
                             output_dim=128,   # size of each embedding vector
                             input_length=15   # tokens per input sequence (same as output_sequence_length of text2vec)
                             )

embedding

<keras.layers.embeddings.Embedding at 0x1d9ad6a1a30>

Get a random sentence from the training set

In [13]:
import random
# Pick one training sentence at random (no seed is set in this cell, so the choice varies between runs).
random_sentence = random.choice(train_sentences)

# The backslash continues the f-string onto the next line, so that line's leading
# spaces are part of the printed text.
print(f"Original text:\n {random_sentence}\
        \n\nEmbedded version:")

# Embed the random sentence (turn it into dense vectors of fixed size)
sample_embed = embedding(text2vec([random_sentence]))
sample_embed


Original text:
 'wHeRE's mY aRsOnISt aT???'        

Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.0297563 , -0.02092905,  0.04078798, ...,  0.00711242,
         -0.04284542,  0.03391292],
        [ 0.0010883 ,  0.01953406,  0.00258931, ...,  0.02488092,
          0.04873807, -0.0099984 ],
        [ 0.01043765, -0.02641513,  0.04381008, ...,  0.01944622,
          0.03896156, -0.03296417],
        ...,
        [-0.01821476, -0.03163229,  0.02818273, ...,  0.03684529,
         -0.02070861,  0.02276877],
        [-0.01821476, -0.03163229,  0.02818273, ...,  0.03684529,
         -0.02070861,  0.02276877],
        [-0.01821476, -0.03163229,  0.02818273, ...,  0.03684529,
         -0.02070861,  0.02276877]]], dtype=float32)>

In [14]:
# Embedding vector of the first token, its shape, and the sentence it was taken from.
sample_embed[0][0], sample_embed[0][0].shape, random_sentence

(<tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([-0.0297563 , -0.02092905,  0.04078798, -0.02196819, -0.00807965,
        -0.00148211, -0.02353201, -0.01615844, -0.01918172,  0.02138889,
        -0.02645737,  0.00046874,  0.03271106,  0.00450251,  0.02479123,
        -0.03978216, -0.03750757, -0.00949186,  0.00463222,  0.01125814,
         0.04539147,  0.03123078,  0.04295513,  0.04089332,  0.02253295,
         0.01699356,  0.02225811,  0.03145471,  0.01545269,  0.04654398,
        -0.03441787,  0.03656666,  0.00861831, -0.03463018,  0.04097452,
         0.0020848 , -0.00505561, -0.03251859,  0.01918758,  0.02089768,
        -0.04322491,  0.04906039, -0.03597886, -0.0076287 , -0.03545251,
         0.03034674, -0.02888008,  0.00652199,  0.01065707, -0.03353274,
         0.03557615, -0.02449963, -0.00256044, -0.00994682,  0.02820032,
        -0.0139662 ,  0.03899192,  0.03373424,  0.03343875,  0.00955718,
        -0.04186545,  0.046738  ,  0.02758098, -0.00883343, -0.0215392 ,
  

# Modelling a text dataset by running a series of experiments

These are the models we can try on the text data:

0, Naive Bayes with TF-IDF encoder (baseline)

1, Feed-forward neural network (dense model)

2, LSTM (RNN)

3, GRU (RNN)

4, Bidirectional-LSTM (RNN)

5, 1D Convolutional Neural Network

6, TensorFlow Hub Pretrained Feature Extractor

7, TensorFlow Hub Pretrained Feature Extractor (10% of data)

How are we going to approach all of these?

Use the standard steps in modeling with tensorflow:

* Create a model
* Build a model
* Fit a model
* Evaluate our model

# Create a bidirectional RNN model (Bidirectional LSTM followed by Bidirectional GRU)

In [16]:
import tensorflow as tf
from tensorflow.keras import layers

# Model 4: raw string -> token indices -> embeddings -> stacked bidirectional RNNs -> sigmoid.
inputs = layers.Input(shape=(1,), dtype=tf.string)  # one raw text string per example
x = text2vec(inputs)  # string -> integer token indices
x = embedding(x)      # token indices -> dense vectors
# return_sequences=True keeps the per-token outputs so the next recurrent layer
# still receives a sequence.
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.GRU(64))(x)  # returns only the final output per direction
outputs = layers.Dense(1, activation="sigmoid")(x)  # single probability for target == 1
# The model name matches the experiment name used for the TensorBoard logs;
# the network is a bidirectional LSTM + GRU stack, not a plain GRU.
model_4 = tf.keras.Model(inputs, outputs, name="model_4_bidirectional")

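A Bidirectional layer outputs twice as many features as the RNN layer it wraps (here 2 × 64 = 128), because it concatenates the forward and backward passes.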

In [17]:
# Print each layer's output shape and parameter count.
model_4.summary()

Model: "model_4_GRU"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 15)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 bidirectional (Bidirectiona  (None, 15, 128)          98816     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              74496     
 nal)                                                            
                                                       

In [23]:
# Binary classification setup: binary cross-entropy on the sigmoid output,
# Adam with its default settings, accuracy as the tracked metric.
model_4.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

In [20]:
# Import the helper that builds a TensorBoard callback (a new callback is needed for each model)
from helper_function import create_tensorboard_callback

# Name of the directory under which the TensorBoard logs are saved
SAVE_DIR = "model_logs"

In [24]:
# Fit the model for 5 epochs, evaluating on the validation split after every epoch.
# Keras documents `callbacks` as a list of callback instances, so the single
# TensorBoard callback is wrapped in a list (plain parentheses do not create a tuple).
model_4_history = model_4.fit(train_sentences,
                              train_lables,
                              epochs=5,
                              validation_data=(val_sentences, val_lables),
                              callbacks=[create_tensorboard_callback(SAVE_DIR,
                                                                     "model_4_bidirectional")]
                             )

Saving TensorBoard log files to: model_logs/model_4_bidirectional/20211230-093512
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [25]:
# Make predictions on the validation sentences.
# Each row holds the single sigmoid output of the model for one sentence.
model_4_pre_probs = model_4.predict(val_sentences)
model_4_pre_probs[:10]

array([[0.25142705],
       [0.99904674],
       [0.12830713],
       [0.01412478],
       [0.9619221 ],
       [0.12754735],
       [0.21966681],
       [0.99987   ],
       [0.9997458 ],
       [0.24157801]], dtype=float32)

In [26]:
# Convert Model 4 prediction probabilities to label format:
# round each probability to 0. or 1., then squeeze away the size-1 dimension.
model_4_pre = tf.squeeze(tf.round(model_4_pre_probs))
model_4_pre[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 0., 0., 1., 0., 0., 1., 1., 0.], dtype=float32)>

In [29]:
# Calculate model 4 results
# `caluculate_results` comes from the local Evaluation module, which is not shown here.
# NOTE(review): the returned dict has a 'prediction' key next to 'recall' and 'f1' —
# presumably precision; confirm in the Evaluation module. In the output, 'accuracy'
# also appears to be on a 0-100 scale while the other values are on a 0-1 scale.
from Evaluation import caluculate_results
model_4_results = caluculate_results(y_true=val_lables,
                                    y_pre=model_4_pre)
model_4_results

{'accuracy': 75.45931758530183,
 'prediction': 0.7540931154144959,
 'recall': 0.7545931758530183,
 'f1': 0.752516366382761}