# Learn basics in NLP with TensorFlow 

I'm gonna follow this github tutorial.

https://github.com/mrdbourke/tensorflow-deep-learning/blob/main/08_introduction_to_nlp_in_tensorflow.ipynb

Get dataset from kaggle.

In [1]:
import pandas as pd
import numpy as np

In [2]:
train_data = pd.read_csv('./dataaset/train.csv')

In [3]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


Split data into train and test

In [4]:
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_lables, val_lables = train_test_split(
    train_data["text"].to_numpy(),
    train_data["target"].to_numpy(),
    test_size=0.1
    )

In [5]:
train_sentences

array(['Guaranteed been bitten by some mutant mosquito my ankle has blown up. Little cunts',
       '@RJG0789 idk....I feel like his movies have done more harm than good. They make us look sterotypical annddd colorism is prevalent sort of',
       'I liked a @YouTube video http://t.co/43sXG9Z6xh TREMOR IS NO JOKE!! [TREMOR DLC] [FATALITIES/X-RAY]',
       ...,
       "wo Pic of 16yr old PKK suicide bomber who detonated bomb in Turkey Army trench released http://t.co/5v29w19tFX /'/'//",
       'I feel like death',
       'Do you have a plan? Emergency Preparedness for #Families of\nChildren with Special Needs  http://t.co/RdOVqaUAx5  #autism #specialneeds'],
      dtype=object)

# Converting text into numbers

Create words to vector function.

In [6]:
from tensorflow.keras.layers import TextVectorization

In [7]:
text2vec = TextVectorization(
    max_tokens=10000, standardize='lower_and_strip_punctuation',
    split='whitespace', ngrams=None, output_mode='int',
    output_sequence_length=15, pad_to_max_tokens=False, vocabulary=None,
    idf_weights=None, sparse=False, ragged=False
)

2022-01-04 16:25:13.667310: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2022-01-04 16:25:13.667336: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-01-04 16:25:13.667577: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
text2vec.adapt(train_sentences)

See how the words 

In [9]:
sample_sentence = "There is a flood in my street!"
text2vec([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 75,   9,   3, 207,   4,  13, 760,   0,   0,   0,   0,   0,   0,
          0,   0]])>

Get first words

In [10]:
text2vec.get_vocabulary()[:5]

['', '[UNK]', 'the', 'a', 'in']

Get the words from 100 to 105th.

In [11]:
text2vec.get_vocabulary()[100:105]

['man', 'first', 'fires', 'cant', 'bomb']

# Creating Embedding layer

We are going to use TnsorFlow's embedding layers.

https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding

In [12]:
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim = 10000, # set imput shape
                             output_dim = 128, # output shape
                             input_length = 10000 # how long is each input 
                            )

embedding

<keras.layers.embeddings.Embedding at 0x7f5fe43a8f10>

Get a random sentence from the training set

In [13]:
import random
random_sentence = random.choice(train_sentences)

print(f"Original text:\n {random_sentence}\
        \n\nEmbedded version:")

# Embed the random sentence (turn it into dense vectors of fixed size)
sample_embed = embedding(text2vec([random_sentence]))
sample_embed


Original text:
 Exploration Takes Seismic Shift in #Gabon to #Somalia
http://t.co/Ltf6jL5keU http://t.co/Zlq8tHcTkW        

Embedded version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.00584246, -0.00486549, -0.03438734, ...,  0.01744742,
         -0.01539519,  0.04480382],
        [-0.02566576, -0.01726035,  0.03642262, ...,  0.03788001,
          0.02513185,  0.03414276],
        [ 0.02440382,  0.02525632, -0.01863666, ...,  0.00854806,
         -0.00353139,  0.04812253],
        ...,
        [-0.01673266,  0.01151526,  0.00124147, ...,  0.04875894,
         -0.04456549,  0.04744624],
        [-0.01673266,  0.01151526,  0.00124147, ...,  0.04875894,
         -0.04456549,  0.04744624],
        [-0.01673266,  0.01151526,  0.00124147, ...,  0.04875894,
         -0.04456549,  0.04744624]]], dtype=float32)>

In [14]:
sample_embed[0][0], sample_embed[0][0].shape, random_sentence

(<tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([-0.00584246, -0.00486549, -0.03438734,  0.00473927,  0.03942709,
        -0.00914267,  0.04823155,  0.02335917,  0.00289087, -0.0079584 ,
         0.04030385, -0.02551092,  0.01181896,  0.02185899,  0.00164554,
        -0.02360288, -0.00598522,  0.00817355, -0.01286701, -0.01950574,
         0.02077207, -0.01325671,  0.03551588, -0.04193858, -0.00425833,
         0.00488   ,  0.00186374,  0.04490009,  0.04443793, -0.00309221,
         0.00743573, -0.02974598, -0.04953986, -0.04644821, -0.03849795,
        -0.04130647,  0.03584014, -0.04506174, -0.02111856,  0.02653432,
         0.03120626,  0.0348633 , -0.04617438,  0.03508409, -0.00868416,
        -0.00882361, -0.00256375, -0.04937835, -0.045761  , -0.00064474,
         0.04234774, -0.03597004,  0.02966757,  0.04181362,  0.01648979,
         0.03530759,  0.00140101, -0.02576051,  0.03456293, -0.0436573 ,
        -0.03178125,  0.00496196,  0.00561043,  0.02661629, -0.00091805,
  

# Modelling a text dataset with running a series of experiment

There are some Model to learn text:

0, Naive Bayes with TF-IDF encoder (baseline)

1, Feed-forward neural network (dence model)

2, LSTM (RNN)

3, GRU (RNN)

4, Bidirectional-LSTM (RNN)

5, 1D Convolutional Neural Network

6, TensorFlow Hub Pretrained Feature Extractor

7, TensorFlow Hub Pretrained Feature Extractor (10% of data)

How are we going to approach all of these?

Use the standard steps in modeling with tensorflow:

* Create a model
* Build a model
* Fit a model
* Evaluate our model

# Create Tensorflow Pretrained model 10% of data

Use only 10% of data to train.

In [15]:
import tensorflow_hub as hub
import tensorflow as tf

In [17]:
# Create model
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                       input_shape=[],
                                       dtype=tf.string,
                                       trainable=False,
                                       name="USE")

In [37]:
# Create model useing the Sequence 
model_6 = tf.keras.Sequential([
    sentence_encoder_layer,
    layers.Dense(64, activation="relu"),
    layers.Dense(1, activation="sigmoid")
], name="model_6")

In [19]:
# Create a tensorboard callback ( need to a new one for each model)
from helper_function import create_tensorboard_callback

# Create a directory to save TensorBoard logs
SAVE_DIR = "model_logs"

In [27]:
train_df_shuffled = train_data.sample(frac=1, random_state=42)
train_10_percent = train_df_shuffled[["text", "target"]].sample(frac=0.1, random_state=42)
train_10_percent.head(), len(train_10_percent)

(                                                   text  target
 4955  DFR EP016 Monthly Meltdown - On Dnbheaven 2015...       0
 584   FedEx no longer to transport bioterror germs i...       0
 7411  Gunmen kill four in El Salvador bus attack: Su...       1
 5950  @camilacabello97 Internally and externally scr...       1
 5541  Radiation emergency #preparedness starts with ...       1,
 761)

In [30]:
train_sentences_10_percent = train_10_percent["text"].to_list()
train_lables_10_percent = train_10_percent["target"].to_list()

len(train_sentences_10_percent), len(train_lables_10_percent)

(761, 761)

Check the number of targets in our subset of data

In [32]:
train_10_percent["target"].value_counts()

0    413
1    348
Name: target, dtype: int64

In [33]:
train_df_shuffled["target"].value_counts()

0    4342
1    3271
Name: target, dtype: int64

Building the model

In [38]:
# Create model
model_7 = tf.keras.models.clone_model(model_6)

In [39]:
model_7.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 USE (KerasLayer)            (None, 512)               256797824 
                                                                 
 dense_1 (Dense)             (None, 64)                32832     
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 256,830,721
Trainable params: 32,897
Non-trainable params: 256,797,824
_________________________________________________________________


In [40]:
model_7.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [43]:
model_7.fit(x=train_sentences_10_percent,
            y=train_lables_10_percent,
            epochs=5,
            validation_data=(val_sentences, val_lables),
            callbacks=[create_tensorboard_callback(SAVE_DIR,
                                                 "model_7_pretrained_10_percent")])

Saving TensorBoard log files to: model_logs/model_7_pretrained_10_percent/20220104-164744
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f5f00301700>

In [45]:
# Make predictions 
model_7_pred_probs = model_7.predict(val_sentences)
model_7_pred_probs[:10]

array([[0.33813885],
       [0.0710935 ],
       [0.45362866],
       [0.30733424],
       [0.22581276],
       [0.10957441],
       [0.46320567],
       [0.0774343 ],
       [0.90216565],
       [0.12078884]], dtype=float32)

In [46]:
model_7_preds = tf.squeeze(tf.round(model_7_pred_probs))
model_7_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0.], dtype=float32)>

In [49]:
# Evaluate model 7
from Evaluation import caluculate_results
model_7_results = caluculate_results(y_true=val_lables,
                                    y_pre=model_7_preds)
model_7_results

{'accuracy': 80.31496062992126,
 'prediction': 0.8027449132960943,
 'recall': 0.8031496062992126,
 'f1': 0.8022638996116164}