# Introuduction to NLP

NLP has a goal of deriving information out of natural language (either in sequence text or speech format)

Another term for NPL problem is sequence to sequence problem (seq2seq)

In [110]:
!nvidia-smi -L

/bin/bash: line 1: nvidia-smi: command not found


In [111]:
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

--2023-07-20 09:01:05--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py.1’


2023-07-20 09:01:05 (4.73 MB/s) - ‘helper_functions.py.1’ saved [10246/10246]



In [112]:
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

In [113]:
# Get a text dataset => kaggle's Introduction to NLP dataset
!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
unzip_data('nlp_getting_started.zip')

--2023-07-20 09:01:05--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.18.128, 142.250.153.128, 142.250.145.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.18.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip.1’


2023-07-20 09:01:05 (3.00 MB/s) - ‘nlp_getting_started.zip.1’ saved [607343/607343]



# Visualising the text dataset

In [114]:
import pandas as pd

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [115]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [116]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [117]:
train_df_shuff = train_df.sample(frac=1, random_state=42)
train_df_shuff.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [118]:
train_df.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [119]:
len(train_df), len(test_df)

(7613, 3263)

In [120]:
import random

random_index = random.randint(0, len(train_df)-5)

for row in train_df_shuff[['text', 'target']][random_index: random_index + 5].itertuples():
    _, text, target = row
    print(f"Target: {target}", "real disaster" if target > 0 else "not a disaster")
    print(f"Text:\n{text}\n")
    print("---\n")

Target: 1 real disaster
Text:
Gotta love #summer in #Calgary. #yyc #hailstorm #crazyweather http://t.co/xQbWnLBBIu

---

Target: 0 not a disaster
Text:
#ActionMoviesTaughtUs things actually can explode with a loud bang...in space.

---

Target: 1 real disaster
Text:
This is why I am scared to leave my car under trees in a storm

#jamaicaplain #boston #hailstormÛ_ https://t.co/MJ8rEZOXlJ

---

Target: 1 real disaster
Text:
Hundreds feared drowned as migrant boat capsizes off Libya http://t.co/pPJi1tCNML

---

Target: 1 real disaster
Text:
reaad/ plsss 12000 Nigerian refugees repatriated from Cameroon

---



# split data into training and validation sets

In [121]:
from sklearn.model_selection import train_test_split

# use train test split to split the sets
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuff["text"].to_numpy(),
                                                                            train_df_shuff['target'].to_numpy(),
                                                                            test_size=0.1,
                                                                            random_state=42)

In [122]:
len(train_sentences), len(val_sentences), len(train_labels), len(val_labels)

(6851, 762, 6851, 762)

# Converting text to number

# Text Vectoriztion (tokenization)

In [123]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

text_vectorizer = TextVectorization(max_tokens=None,
                                    standardize='lower_and_strip_punctuation',
                                    split="whitespace",
                                    ngrams=None,
                                    output_mode="int",
                                    output_sequence_length=None,
                                    pad_to_max_tokens=False)

In [124]:
# Find the avg number of token (words) in training tweets
round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))

15

In [125]:
# Setup text vectoriztion variable
max_vocab_length = 10000
max_length = 15

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode='int',
                                    output_sequence_length=max_length)

In [126]:
text_vectorizer.adapt(train_sentences)

In [127]:
sample_sentence = 'There is flood in my street!'
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 74,   9, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [128]:
random_sentences = random.choice(train_sentences)
print(f"Original text: {random_sentences}\nVector Text: {text_vectorizer([random_sentences])}")

Original text: Nearly 35 years after their release from captivity legislation is being introduced in Congress to compensate 53Û_ http://t.co/NCjLXzFWaa
Vector Text: [[ 841 2208  141   43  131 2008   20    1 5232    9  121    1    4 2153
     5]]


In [129]:
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_vocab_word = words_in_vocab[:5]
bottom_5_vocab_word = words_in_vocab[-5:]

In [130]:
top_5_vocab_word, bottom_5_vocab_word

(['', '[UNK]', 'the', 'a', 'in'],
 ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1'])

# Creating an Embedding using an Embedding Layer

In [131]:
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=max_vocab_length,
                             output_dim=128,
                             input_length=max_length)

embedding

<keras.layers.core.embedding.Embedding at 0x7ef6b2985180>

In [132]:
random_sentence = random.choice(train_sentences)
print(f"Original Text: {random_sentence}\nEmbedded Text: {embedding(text_vectorizer([random_sentence]))}")

Original Text: I'm on 2 blood pressure meds and it's still probably through the roof! Long before the #PPact story broke I was involved in animal rescue
Embedded Text: [[[ 0.02730216 -0.02677671  0.03184432 ...  0.0004494   0.0350357
   -0.04311375]
  [-0.04665002 -0.04027241 -0.01013255 ... -0.00557783  0.04684925
   -0.01445813]
  [ 0.03375362  0.01117992  0.03260412 ...  0.04061187  0.01627241
   -0.00418261]
  ...
  [ 0.04727821  0.00615884  0.01295929 ...  0.01301715  0.01555518
    0.01403678]
  [ 0.02203308  0.03200487  0.006637   ...  0.02309107  0.0184494
    0.00305117]
  [ 0.01026043 -0.01079539 -0.02088214 ... -0.04887406  0.04852447
   -0.03750764]]]


# Modeling a text Dataset

In [133]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create tokenization
model_0 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

# Fit the pipeline
model_0.fit(train_sentences, train_labels)

In [134]:
# Evaluate our base model
baseline_score = model_0.score(val_sentences, val_labels)

In [135]:
print(f"accuracy: {baseline_score*100:.2f}%")

accuracy: 79.27%


In [136]:
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:10], val_labels[:10]

(array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0]), array([0, 0, 1, 1, 1, 1, 1, 1, 1, 0]))

# Creating an evaluation function for our model experiments

In [137]:
# Function to evaluate
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
    # Calculate model acuracy
    model_accuracy = accuracy_score(y_true, y_pred)

    model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    model_results = {
        'accuracy': model_accuracy,
        'precision': model_precision,
        'recall': model_precision,
        'fl': model_f1,
    }
    return model_results

In [138]:
baseline_results = calculate_results(val_labels, baseline_preds)
baseline_results

{'accuracy': 0.7926509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.8111390004213173,
 'fl': 0.7862189758049549}

# Model 1:  simple dense model

In [139]:
# Create a tensorboard callback
from helper_functions import create_tensorboard_callback

SAVE_DIR = 'model_logs'

In [31]:
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense")

In [32]:
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

In [33]:
model_1.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])

In [34]:
model_1_history = model_1.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR, 'model_1_dense')])

Saving TensorBoard log files to: model_logs/model_1_dense/20230720-074622
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [35]:
model_1.evaluate(val_sentences, val_labels)



[0.4763720631599426, 0.7795275449752808]

In [36]:
# converting model prediction probabilities
model_1_pred_probs = model_1.predict(val_sentences)
model_1_pred_probs[0]



array([0.41091758], dtype=float32)

In [37]:
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))

In [38]:
model_1_results = calculate_results(val_labels, model_1_preds)
model_1_results

{'accuracy': 0.7795275590551181,
 'precision': 0.7829461349391948,
 'recall': 0.7829461349391948,
 'fl': 0.7768651797484589}

# Visualization learned embedding

In [39]:
words_in_vocab = text_vectorizer.get_vocabulary()
len(words_in_vocab), words_in_vocab[:10]

(10000, ['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is'])

In [40]:
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

In [41]:
embed_weights = model_1.get_layer('embedding').get_weights()
embed_weights[0].shape

(10000, 128)


# Visualize the embedding weights

In [42]:
import io

out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(words_in_vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = embed_weights[0][index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

In [43]:
# try:
#   from google.colab import files
#   files.download('vectors.tsv')
#   files.download('metadata.tsv')
# except Exception:
#   pass

# Recurrent Neural Networks (RNN's)

RNN are useful for seqence data.

* use representation of a previous to aid the representation of a later input

### Model 2: LSTM (Long short term memory)

```Input (text) -> Tokenize -> Embedding -> Layers (RNNs/dense) -> Output (label probability)```

In [44]:
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype="string")

x = text_vectorizer(inputs)
x = embedding(x)
# print(x.shape)
# x = layers.LSTM(64, return_sequences=True)(x)
print(x.shape)
x = layers.LSTM(64)(x)
print(x.shape)
x = layers.Dense(64, activation='relu')(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
print(outputs.shape)

model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")

(None, 15, 128)
(None, 64)
(None, 1)


In [45]:
model_2.summary()

Model: "model_2_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                      

In [46]:
# Compile the model
model_2.compile(loss='binary_crossentropy',
                optimizer='Adam',
                metrics=['accuracy'])

In [47]:
model_2_history = model_2.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR,
                                                                     "model_2_LSMT")])

Saving TensorBoard log files to: model_logs/model_2_LSMT/20230720-074706
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [48]:
# Make prediction with LSTM model
model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs[:10]



array([[1.17685914e-01],
       [8.94209564e-01],
       [9.99964237e-01],
       [3.37895267e-02],
       [2.52214144e-04],
       [9.98934329e-01],
       [9.44582105e-01],
       [9.99989450e-01],
       [9.99970019e-01],
       [4.12866920e-01]], dtype=float32)

In [49]:
# convert model 2 pred probs to labels
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))

In [50]:
model_2_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [51]:
model_2_results = calculate_results(val_labels, model_2_preds)
model_2_results

{'accuracy': 0.7847769028871391,
 'precision': 0.7854004938276028,
 'recall': 0.7854004938276028,
 'fl': 0.7834945648091367}

In [52]:
baseline_results

{'accuracy': 0.7926509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.8111390004213173,
 'fl': 0.7862189758049549}

### Model 3: GRU (Gated Recurrent Unit)

In [53]:
inputs = layers.Input(shape=(1,), dtype="string")

x = text_vectorizer(inputs)
x = embedding(x)
# x = layers.GRU(64, return_sequences=True)(x)
# x = layers.LSTM(64, return_sequences=True)(x)
x = layers.GRU(64)(x)
x = layers.Dense(64, activation='relu')(x)

outputs = layers.Dense(1, activation='sigmoid')(x)

model_3 = tf.keras.Model(inputs, outputs)

In [54]:
model_3.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 gru (GRU)                   (None, 64)                37248     
                                                                 
 dense_3 (Dense)             (None, 64)                4160      
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                             

In [55]:
# Compile the model
model_3.compile(loss='binary_crossentropy',
                optimizer='Adam',
                metrics=['accuracy'])

In [56]:
history_model_3 = model_3.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR, "model_3_GRU")])

Saving TensorBoard log files to: model_logs/model_3_GRU/20230720-074752
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [57]:
model_3_pred_probs = model_3.predict(val_sentences)
model_3_pred_probs[:10]



array([[7.6531316e-03],
       [5.0697702e-01],
       [9.9997622e-01],
       [2.9903820e-01],
       [3.7667793e-04],
       [9.9998754e-01],
       [9.7573960e-01],
       [9.9999869e-01],
       [9.9999732e-01],
       [9.8698473e-01]], dtype=float32)

In [58]:
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))
model_3_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [59]:
model_3_results = calculate_results(val_labels, model_3_preds)
model_3_results, baseline_results

({'accuracy': 0.7637795275590551,
  'precision': 0.764383846466808,
  'recall': 0.764383846466808,
  'fl': 0.7621412379223811},
 {'accuracy': 0.7926509186351706,
  'precision': 0.8111390004213173,
  'recall': 0.8111390004213173,
  'fl': 0.7862189758049549})

### Model 4: Bidirectional RNN

Normal RNN's go from left to right however bidirectional RNN goes from right to left as well as left to right.

In [60]:
# Build a bidirectional model

from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype="string")

x = text_vectorizer(inputs)
x = embedding(x)
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.GRU(64))(x)

outputs = layers.Dense(1, activation='sigmoid')(x)

model_4 = tf.keras.Model(inputs, outputs, name="model_4_bidirectional")

In [61]:
# Compile the model
model_4.compile(loss='binary_crossentropy',
                optimizer='Adam',
                metrics=['accuracy'])

In [62]:
model_4.summary()

Model: "model_4_bidirectional"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 bidirectional (Bidirectiona  (None, 15, 128)          98816     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              74496     
 nal)                                                            
                                             

In [63]:
model_4_history = model_4.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR, 'model_4_bidirectional')])

Saving TensorBoard log files to: model_logs/model_4_bidirectional/20230720-074838
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [64]:
model_4_pred_probs = model_4.predict(val_sentences)
model_4_pred_probs[:10]



array([[6.4494130e-03],
       [5.3373307e-01],
       [9.9984246e-01],
       [4.1171964e-02],
       [1.0116678e-05],
       [9.9974746e-01],
       [9.2188692e-01],
       [9.9995214e-01],
       [9.9993235e-01],
       [9.9947834e-01]], dtype=float32)

In [65]:
model_4_preds = tf.squeeze(tf.round(model_4_pred_probs))
model_4_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [66]:
model_4_results = calculate_results(val_labels, model_4_preds)
model_4_results, baseline_results

({'accuracy': 0.7519685039370079,
  'precision': 0.7518863114144899,
  'recall': 0.7518863114144899,
  'fl': 0.7506628579092585},
 {'accuracy': 0.7926509186351706,
  'precision': 0.8111390004213173,
  'recall': 0.8111390004213173,
  'fl': 0.7862189758049549})

## Convolution Neural Networks for Text (and other types of sequences)

We've used CNN for images but images are typically 2D, however our text data is 1D.

```
Inputs (text) -> Tokenization -> Embedding -> Layers (Conv1d + pooling) => Output layer


# Model 5: Conv1D

In [67]:
# Test out our embedding layer, Conv1D layer and max pooling
from tensorflow.keras import layers

embedding_test = embedding(text_vectorizer(['this i a test sentence']))
conv_1d = layers.Conv1D(filters=32,
                        kernel_size=5,
                        activation='relu',
                        padding='valid')
conv_1d_output = conv_1d(embedding_test)
max_pool = layers.GlobalMaxPool1D()
max_pool_output = max_pool(conv_1d_output)

embedding_test.shape, conv_1d_output.shape, max_pool_output.shape

(TensorShape([1, 15, 128]), TensorShape([1, 11, 32]), TensorShape([1, 32]))

In [68]:
# embedding_test
# conv_1d_output

In [69]:
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype="string")

x = text_vectorizer(inputs)
x = embedding(x)
x = layers.Conv1D(filters=64,
                  kernel_size=5,
                  activation='relu',
                  padding='valid')(x)
x = layers.GlobalMaxPool1D()(x)
outputs = layers.Dense(1, activation='sigmoid')(x)

model_5 = tf.keras.Model(inputs, outputs, name="Convolution_1D_model")

In [70]:
model_5.summary()

Model: "Convolution_1D_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 conv1d_1 (Conv1D)           (None, 11, 64)            41024     
                                                                 
 global_max_pooling1d_1 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_6 (Dense)             (None, 1)        

In [71]:
model_5.compile(loss='binary_crossentropy',
                optimizer='Adam',
                metrics=['accuracy'])

In [72]:
history_model_5 = model_5.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR, "model_5_Conv1D")])

Saving TensorBoard log files to: model_logs/model_5_Conv1D/20230720-075011
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [73]:
model_5_pred_probs = model_5.predict(val_sentences)
model_5_pred_probs[:5]



array([[6.9333535e-01],
       [9.5156986e-01],
       [9.9993509e-01],
       [4.7978617e-02],
       [2.1619063e-07]], dtype=float32)

In [74]:
model_5_preds = tf.squeeze(tf.round(model_5_pred_probs))
model_5_preds[:5]

<tf.Tensor: shape=(5,), dtype=float32, numpy=array([1., 1., 1., 0., 0.], dtype=float32)>

In [75]:
model_5_results = calculate_results(val_labels, model_5_preds)
model_5_results

{'accuracy': 0.7637795275590551,
 'precision': 0.7633913718531662,
 'recall': 0.7633913718531662,
 'fl': 0.7633947072405423}

### Model 6: Tensorflow Hub Pretrained Sentence Encode

In [76]:
import tensorflow_hub as hub

embed = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')

embed_samples = embed([sample_sentence, "When you can the universal sentence encoder on a sentence, it turn in into numbers."])
print(embed_samples[0][:50])

tf.Tensor(
[-0.00767769  0.03333809  0.0320662  -0.02125602  0.04300849  0.08399192
  0.02452653  0.05380183 -0.02588227 -0.00499588  0.01928363 -0.00056099
  0.01276441  0.09088546  0.0562213  -0.04887728  0.05191502 -0.04136144
 -0.01161554 -0.05452844 -0.01740017  0.0045459   0.00692502 -0.01381139
 -0.00961125 -0.03933032  0.0197613  -0.01115342 -0.03671153  0.01013187
 -0.04107149  0.03479797 -0.03814119 -0.00185953  0.01663749 -0.08907217
  0.00592126  0.04859087 -0.03741663 -0.08817419  0.00334023  0.00904682
 -0.01701778  0.06431309 -0.10647684 -0.0381299  -0.02107327 -0.02888192
 -0.03679271  0.01302323], shape=(50,), dtype=float32)


In [77]:
from tensorflow.keras import layers

sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4", input_shape=[], dtype=tf.string, trainable=False, name="USE")

In [78]:
model_6 = tf.keras.Sequential([
    sentence_encoder_layer,
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
], name="model_6_USE")

# compile the model
model_6.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

model_6.summary()

Model: "model_6_USE"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 USE (KerasLayer)            (None, 512)               256797824 
                                                                 
 dense_7 (Dense)             (None, 64)                32832     
                                                                 
 dense_8 (Dense)             (None, 1)                 65        
                                                                 
Total params: 256,830,721
Trainable params: 32,897
Non-trainable params: 256,797,824
_________________________________________________________________


In [79]:
history_model_6 = model_6.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR, 'tf_hub_sentence_encoder')])

Saving TensorBoard log files to: model_logs/tf_hub_sentence_encoder/20230720-075130
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [80]:
model_6_pred_prods = model_6.predict(val_sentences)
model_6_preds = tf.squeeze(tf.round(model_6_pred_prods))
model_6_preds[:10]



<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 1., 1., 1., 1., 1., 0.], dtype=float32)>

In [81]:
model_6_results = calculate_results(val_labels, model_6_preds)
model_6_results, baseline_results

({'accuracy': 0.8123359580052494,
  'precision': 0.8145687227271431,
  'recall': 0.8145687227271431,
  'fl': 0.81078947666798},
 {'accuracy': 0.7926509186351706,
  'precision': 0.8111390004213173,
  'recall': 0.8111390004213173,
  'fl': 0.7862189758049549})

In [82]:
# model_61 = tf.keras.Sequential([
#     sentence_encoder_layer,
#     layers.Dense(64, activation='relu'),
#     layers.Dense(1, activation='sigmoid')
# ])

# model_61.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])
# history_model_61 = model_61.fit(train_sentences, train_labels, epochs=5, validation_data=(val_sentences, val_labels), validation_steps=len(val_sentences))

In [83]:
# model_61_preds = tf.squeeze(tf.round(model_61.predict(val_sentences)))
# calculate_results(val_labels, model_61_preds)

In [87]:
train_10_prercent = train_df_shuff[["text", "target"]].sample(frac=0.1, random_state=42)
train_sentences_10_precent = train_10_precent["text"].to_list()
train_labels_10_percent = train_10_precent['target'].to_list()

In [89]:
train_10_precent["target"].value_counts()

0    413
1    348
Name: target, dtype: int64

In [None]:
### Model 7: Predefined

In [93]:
from tensorflow.keras import layers

sentence_encoder_layer_trainable = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4", input_shape=[], dtype=tf.string, trainable=True, name="USE")

In [149]:
model_7 = tf.keras.Sequential([
    sentence_encoder_layer,
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigm
    oid')
])

model_7.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                metrics=['accuracy'])
model_7.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 USE (KerasLayer)            (None, 512)               256797824 
                                                                 
 dense_17 (Dense)            (None, 64)                32832     
                                                                 
 dense_18 (Dense)            (None, 1)                 65        
                                                                 
Total params: 256,830,721
Trainable params: 32,897
Non-trainable params: 256,797,824
_________________________________________________________________


In [150]:
# history_model_7 = model_7.fit(train_sentences_10_precent,
#                               train_labels_10_percent,
#                               epochs=5,
#                               validation_data=(val_sentences, val_labels),
#                               callbacks=[create_tensorboard_callback(SAVE_DIR, 'model_7_10%')])

In [151]:
# model_7_results = calculate_results(val_labels, tf.squeeze(tf.round(model_7.predict(val_sentences))))
# model_7_results, model_6_results, baseline_results

In [152]:
# model_7.save('./drive/MyDrive/Tensorflow/Model_7_temp')

In [153]:
# model_temp = tf.keras.models.load_model('./drive/MyDrive/Tensorflow/Model_7_temp')

In [154]:
# model_temp.summary()

In [155]:
train_10_percent_split = int(0.1 * len(train_sentences))

In [156]:
train_sentences_10_percent = train_sentences[:train_10_percent_split]
train_labels_10_percent = train_labels[:train_10_percent_split]

In [157]:
history_model_7 = model_7.fit(train_sentences_10_percent,
                              train_labels_10_percent,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR, "tf_hub_Sentence_encoder_10%_spilt")])

Saving TensorBoard log files to: model_logs/tf_hub_Sentence_encoder_10%_spilt/20230720-091501
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [158]:
model_7_results = calculate_results(val_labels, tf.squeeze(tf.round(model_7.predict(val_sentences))))
model_7_results, model_6_results, baseline_results



({'accuracy': 0.7860892388451444,
  'precision': 0.7864869352962041,
  'recall': 0.7864869352962041,
  'fl': 0.784963205498461},
 {'accuracy': 0.8123359580052494,
  'precision': 0.8145687227271431,
  'recall': 0.8145687227271431,
  'fl': 0.81078947666798},
 {'accuracy': 0.7926509186351706,
  'precision': 0.8111390004213173,
  'recall': 0.8111390004213173,
  'fl': 0.7862189758049549})