# Learn the basics of NLP with TensorFlow

I'm going to follow this GitHub tutorial:

https://github.com/mrdbourke/tensorflow-deep-learning/blob/main/08_introduction_to_nlp_in_tensorflow.ipynb

Get the dataset from Kaggle.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training CSV into a DataFrame.
# NOTE(review): "dataaset" looks like a misspelling of "dataset" — confirm the
# directory name on disk before changing the path.
train_data = pd.read_csv('./dataaset/train.csv')

In [3]:
# Preview the first rows to check the columns (id, keyword, location, text, target).
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


Split the data into training and validation sets.

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled tweets for validation.
# random_state pins the shuffle so that re-running the notebook produces the
# same split, which keeps metrics comparable between runs and experiments.
train_sentences, val_sentences, train_lables, val_lables = train_test_split(
    train_data["text"].to_numpy(),
    train_data["target"].to_numpy(),
    test_size=0.1,
    random_state=42,
)

In [5]:
# Display the training sentences (NumPy abbreviates the repr of a large array).
train_sentences

array(['Any disaster impairs mental health especially in vulnerable individuals... http://t.co/ZisuwLqRHf',
       'PM Abe pledged to make every effort to seek a world without nuclear weapons. http://t.co/CBXnHhZ6kD',
       "Forsure back in the gym tomorrow. Body isn't even at 50%. Don't wanna risk injuries.",
       ...,
       "Love how I don't get in any trouble for having people over and the house still being trashed",
       'Do you feel engulfed with low self-image? Take the quiz: http://t.co/YzDmouXQBO http://t.co/PeXfgawrG1',
       "I understand you wanting to hang out with your guy friends I'll give you your space but don't ruin my trust with you."],
      dtype=object)

# Converting text into numbers

Create a text vectorization layer that maps words to integer indices.

In [6]:
from tensorflow.keras.layers import TextVectorization

In [7]:
# Text vectorizer: lower-cases, strips punctuation, splits on whitespace and
# maps every token to an integer index.
text2vec = TextVectorization(
    max_tokens=10000,                           # cap on the vocabulary size
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    ngrams=None,
    output_mode='int',
    output_sequence_length=15,                  # pad/truncate every sample to 15 tokens
    pad_to_max_tokens=False,
    vocabulary=None,                            # learned later via adapt()
    idf_weights=None,
    sparse=False,
    ragged=False,
)

2022-01-02 11:12:51.867819: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-02 11:12:51.899593: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2022-01-02 11:12:51.899614: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-01-02 11:12:51.899898: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN

In [8]:
# Build the vectorizer's vocabulary from the training sentences only.
text2vec.adapt(train_sentences)

See how the words of a sample sentence are mapped to integer token IDs.

In [9]:
# Vectorize one hand-written sentence; the layer expects a batch, hence the list.
sample_sentence = "There is a flood in my street!"
text2vec([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 72,   9,   3, 228,   4,  13, 734,   0,   0,   0,   0,   0,   0,
          0,   0]])>

Get the first five entries of the vocabulary.

In [10]:
# First five vocabulary entries; per the output, index 0 is '' and index 1 is '[UNK]'.
text2vec.get_vocabulary()[:5]

['', '[UNK]', 'the', 'a', 'in']

Get the words at vocabulary indices 100 to 104.

In [11]:
# Five vocabulary entries starting at index 100 (the slice end, 105, is exclusive).
text2vec.get_vocabulary()[100:105]

['bomb', 'buildings', 'see', 'our', 'know']

# Creating Embedding layer

We are going to use TensorFlow's embedding layer.

https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding

In [12]:
from tensorflow.keras import layers

# Embedding layer that turns the integer token ids produced by `text2vec`
# into dense, trainable 128-dimensional vectors.
# input_length is the number of tokens per sequence, so it must equal the
# vectorizer's output_sequence_length (15) rather than the vocabulary size.
embedding = layers.Embedding(
    input_dim=10000,   # vocabulary size, same as text2vec's max_tokens
    output_dim=128,    # size of each embedding vector
    input_length=15,   # tokens per sequence, same as text2vec's output_sequence_length
)

embedding

<keras.layers.embeddings.Embedding at 0x7f6da40802e0>

Get a random sentence from the training set

In [13]:
import random

# Pick one training sentence at random and show it next to its embedding.
random_sentence = random.choice(train_sentences)
print(f"Original text:\n {random_sentence}        \n\nEmbedded version:")

# Vectorize the sentence to token ids, then look up one dense vector per token.
token_ids = text2vec([random_sentence])
sample_embed = embedding(token_ids)
sample_embed


Original text:
 Sydney Traffic HAZARD Oil spill - BANKSTOWN Stacey St at Wattle St #sydtraffic #trafficnetwork        

Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.03892614, -0.04641541,  0.04777295, ..., -0.0052767 ,
         -0.00435907, -0.0036934 ],
        [ 0.01728321,  0.03248573, -0.02044897, ..., -0.03938868,
         -0.02776018, -0.03744652],
        [-0.04819452, -0.03230914, -0.03411354, ..., -0.00883552,
         -0.03620319, -0.0242218 ],
        ...,
        [ 0.00067035,  0.03130225,  0.02520123, ..., -0.0402687 ,
         -0.01895981,  0.03321929],
        [ 0.02673699, -0.03297522,  0.02253493, ...,  0.04762368,
          0.03784218, -0.04502017],
        [ 0.02673699, -0.03297522,  0.02253493, ...,  0.04762368,
          0.03784218, -0.04502017]]], dtype=float32)>

In [14]:
# Embedding vector of the first token of the first sentence, its shape, and the source text.
sample_embed[0, 0], sample_embed[0, 0].shape, random_sentence

(<tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([-0.03892614, -0.04641541,  0.04777295, -0.00414192,  0.018943  ,
        -0.04670976, -0.03457719, -0.00282471,  0.00265484, -0.0055011 ,
        -0.02693317,  0.03918287,  0.03043142, -0.00431378, -0.03164884,
        -0.03541088, -0.04157429,  0.02756966, -0.02426277,  0.01664637,
         0.00429921,  0.03090943,  0.04107935,  0.04609969,  0.00375288,
         0.01543416, -0.00789005, -0.03775101,  0.04337377, -0.01977952,
        -0.04641226, -0.0022339 , -0.01215573, -0.01021969, -0.00839522,
         0.01770103,  0.02704338,  0.03156639,  0.02284536,  0.04737722,
         0.02939064,  0.03326209, -0.03806438,  0.0435033 , -0.0443492 ,
         0.01927627, -0.01093759,  0.03253   , -0.00324621,  0.00061215,
         0.04780847, -0.01315317, -0.03231405, -0.01922563,  0.03342343,
         0.0157779 ,  0.03171125,  0.04932895,  0.03321553,  0.0320182 ,
         0.03938054,  0.0369794 , -0.02484522,  0.04967489, -0.01517323,
  

# Modelling a text dataset by running a series of experiments

These are the models we can use to learn from text:

0, Naive Bayes with TF-IDF encoder (baseline)

1, Feed-forward neural network (dense model)

2, LSTM (RNN)

3, GRU (RNN)

4, Bidirectional-LSTM (RNN)

5, 1D Convolutional Neural Network

6, TensorFlow Hub Pretrained Feature Extractor

7, TensorFlow Hub Pretrained Feature Extractor (10% of data)

How are we going to approach all of these?

Use the standard steps in modeling with tensorflow:

* Create a model
* Build a model
* Fit a model
* Evaluate our model

# Create Tensorflow Pretrained model

Refer to this model:
* https://tfhub.dev/google/universal-sentence-encoder/4

This approach takes a lot of time on a local PC, so you may want to comment out this code.

In [19]:
import tensorflow_hub as hub
import tensorflow as tf

In [17]:
# Load the Universal Sentence Encoder from TensorFlow Hub and embed two sentences.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
embed_samples = embed(
    [
        sample_sentence,
        "When you can the universal sentence encoder on a sentence, it turns it into numbers.",
    ]
)

# Show the first 50 values of the first sentence's embedding.
print(embed_samples[0][:50])

tf.Tensor(
[-0.01602831  0.01068851  0.02425469 -0.01405769  0.01434426  0.08292625
  0.01963368  0.06160142 -0.003527   -0.01216412  0.00978648 -0.01248495
  0.01232345  0.09748451  0.06141113 -0.03728355  0.01860887 -0.04669856
  0.00413912 -0.06363905 -0.024699    0.0271369   0.02284444 -0.00210028
 -0.00630594 -0.03964957  0.02220405  0.00115079 -0.03132173  0.00119527
 -0.04012548  0.04561892 -0.01530599 -0.00175915  0.02173127 -0.08450424
  0.03340026  0.04604553 -0.02480252 -0.08681665  0.00702694 -0.00770478
 -0.01434541  0.07814164 -0.10676058 -0.05152994 -0.00858156 -0.03232234
 -0.03871094  0.02581467], shape=(50,), dtype=float32)


In [20]:
# Wrap the pretrained Universal Sentence Encoder as a frozen Keras layer.
sentence_encoder_layer = hub.KerasLayer(
    "https://tfhub.dev/google/universal-sentence-encoder/4",
    name="USE",
    trainable=False,   # keep the pretrained weights fixed
    dtype=tf.string,   # the layer consumes raw strings
    input_shape=[],    # each sample is a single scalar string
)

In [21]:
# Build the classifier with the Sequential API:
# frozen sentence encoder followed by a single sigmoid output unit.
model_6 = tf.keras.Sequential(name="model_6")
model_6.add(sentence_encoder_layer)
model_6.add(layers.Dense(1, activation="sigmoid"))

In [25]:
# Print the layer-by-layer summary with trainable / non-trainable parameter counts.
model_6.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 USE (KerasLayer)            (None, 512)               256797824 
                                                                 
 dense (Dense)               (None, 1)                 513       
                                                                 
Total params: 256,798,337
Trainable params: 513
Non-trainable params: 256,797,824
_________________________________________________________________


In [22]:
# Configure training: binary cross-entropy loss, Adam with default settings,
# and accuracy as the reported metric.
model_6.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [23]:
# Import the TensorBoard callback factory (a new callback is needed for each model)
from helper_function import create_tensorboard_callback

# Name of the directory under which TensorBoard logs are saved
SAVE_DIR = "model_logs"

In [24]:
# Train for 5 epochs, validating on the held-out sentences and logging to TensorBoard.
tensorboard_callback = create_tensorboard_callback(SAVE_DIR, "model_6_Pretrained")
model_6_history = model_6.fit(
    x=train_sentences,
    y=train_lables,
    epochs=5,
    validation_data=(val_sentences, val_lables),
    callbacks=[tensorboard_callback],
)

Saving TensorBoard log files to: model_logs/model_6_Pretrained/20220102-112717
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [27]:
# Predict on the validation sentences; the sigmoid output gives one value per sample.
model_6_pred_probs = model_6.predict(val_sentences)
# Show the first ten predicted probabilities.
model_6_pred_probs[:10]

array([[0.40397117],
       [0.860996  ],
       [0.37137312],
       [0.24236357],
       [0.5873279 ],
       [0.7531559 ],
       [0.26694798],
       [0.8313131 ],
       [0.66474056],
       [0.370508  ]], dtype=float32)

In [29]:
# Turn the sigmoid probabilities into hard 0/1 labels.
# tf.round rounds to the nearest integer; squeezing only the trailing size-1
# axis keeps the result 1-D even for a single prediction (a bare tf.squeeze
# would collapse a (1, 1) input to a scalar).
model_6_pred = tf.squeeze(tf.round(model_6_pred_probs), axis=-1)
model_6_pred[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 0., 0., 1., 1., 0., 1., 1., 0.], dtype=float32)>

In [31]:
# Score the predictions against the validation labels with the project's helper.
# NOTE(review): in the recorded output, 'accuracy' is on a 0-100 scale while the
# other metrics are on 0-1, and the 'prediction' key looks like it holds
# precision — confirm in Evaluation.py.
from Evaluation import caluculate_results
model_6_result = caluculate_results(y_true=val_lables,
                                   y_pre=model_6_pred)
model_6_result

{'accuracy': 78.4776902887139,
 'prediction': 0.7839507588245316,
 'recall': 0.7847769028871391,
 'f1': 0.783566315955472}