In [3]:
!wget https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json

--2023-04-05 08:14:05--  https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.10.128, 142.251.12.128, 172.217.194.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.10.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘sarcasm.json’


2023-04-05 08:14:07 (4.58 MB/s) - ‘sarcasm.json’ saved [5643545/5643545]



In [4]:
import tensorflow as tf   
from tensorflow import keras 
from keras import layers
from keras.layers import TextVectorization
import numpy as np
from sklearn.model_selection import train_test_split

In [107]:
import json 

# Load the whole dataset into memory. The encoding is pinned so the result does not
# depend on the platform's default text encoding.
with open('sarcasm.json', 'r', encoding='utf-8') as f:
  datastore = json.load(f)

In [108]:
# Peek at the container type, the record type, and the first record itself.
for description, value in (
    ("datastore type", type(datastore)),
    ("datastore[0] type", type(datastore[0])),
    ("datastore[0]", datastore[0]),
):
  print(description, value)

datastore type <class 'list'>
datastore[0] type <class 'dict'>
datastore[0] {'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5', 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers", 'is_sarcastic': 0}


In [109]:
# Split every record into its headline text and its `is_sarcastic` label,
# keeping both lists aligned by position.
sentences = [record['headline'] for record in datastore]
labels = [record['is_sarcastic'] for record in datastore]

# Sanity check: both lists must have the same length; show the first pair.
print("sentences len",  len(sentences))
print("labels len", len(labels))
print("sentences[0]", sentences[0])
print("labels[0]", labels[0])

sentences len 26709
labels len 26709
sentences[0] former versace store clerk sues over secret 'black code' for minority shoppers
labels[0] 0


In [112]:
# Longest headline measured in CHARACTERS (kept under the original name `max_length`).
# `default=0` reproduces the original result for an empty collection.
max_length = max((len(headline) for headline in sentences), default=0)
print(max_length)

# Sequence models pad/truncate per token, not per character, so also report the
# longest headline in whitespace-separated words; this is the number to compare
# against a TextVectorization `output_sequence_length`.
max_words = max((len(headline.split()) for headline in sentences), default=0)
print(max_words)

254


In [113]:
# Convert the Python lists to NumPy arrays, rebinding the same names.
sentences = np.array(sentences)
labels = np.array(labels)

In [114]:
def make_dataset(features, labels, batch_size=32, shuffle=True):
  """Build a batched `tf.data.Dataset` of `(feature, label)` pairs.

  Args:
    features: Sequence/array of inputs; its `len()` sizes the shuffle buffer.
    labels: Targets aligned element-wise with `features`.
    batch_size: Number of examples per batch. Defaults to 32, the value that
      was previously hard-coded.
    shuffle: When True (the default, matching the original behaviour) the
      examples are shuffled with a buffer covering the whole dataset before
      batching. Pass False for validation/test splits, where shuffling adds
      work without affecting the metrics.

  Returns:
    A `tf.data.Dataset` yielding `(features_batch, labels_batch)` tuples.
  """
  ds = tf.data.Dataset.from_tensor_slices((features, labels))
  if shuffle:
    # Buffer as large as the dataset, so the shuffle spans every example.
    ds = ds.shuffle(buffer_size=len(features))
  return ds.batch(batch_size=batch_size)

In [115]:
# Step 1: hold out 20% of all examples as the test split.
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    sentences, labels, test_size=0.2, random_state=42)

# Step 2: take 25% of the remaining 80% (i.e. 20% of the total) for validation.
# Note: this rebinds train_sentences/train_labels, so re-running the cell on its
# own shrinks the training split again.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_sentences, train_labels, test_size=0.25, random_state=42)

In [116]:
# One tf.data pipeline per split, built in train -> val -> test order.
train_ds, val_ds, test_ds = (
    make_dataset(split_sentences, split_labels)
    for split_sentences, split_labels in (
        (train_sentences, train_labels),
        (val_sentences, val_labels),
        (test_sentences, test_labels),
    )
)

In [117]:
# Bag-of-words baseline (unigrams):
# each text becomes a multi-hot vector over a vocabulary capped at 10,000 entries.
text_vectorization = TextVectorization(
    output_mode="multi_hot",
    max_tokens=10000,
)

In [118]:
# Learn the vocabulary from the training text only (labels are dropped).
text_only_train_ds = train_ds.map(lambda text, label: text)
text_vectorization.adapt(text_only_train_ds)


def _encode_binary_1gram(text, label):
  """Vectorize a batch of text with the unigram vectorizer; pass the label through."""
  return text_vectorization(text), label


# Apply the same encoding to all three splits.
binary_1gram_train_ds, binary_1gram_val_ds, binary_1gram_test_ds = (
    split_ds.map(_encode_binary_1gram, num_parallel_calls=4)
    for split_ds in (train_ds, val_ds, test_ds)
)

In [119]:
# Look at exactly one batch of the encoded training data.
for inputs, targets in binary_1gram_train_ds.take(1):
  print("inputs.shape:", inputs.shape)
  print("inputs.dtype:", inputs.dtype)
  print("targets.shape:", targets.shape)
  print("targets.dtype:", targets.dtype)
  print("inputs[0]:", inputs[0])
  print("targets[0]:", targets[0])

inputs.shape: (32, 10000)
inputs.dtype: <dtype: 'float32'>
targets.shape: (32,)
targets.dtype: <dtype: 'int64'>
inputs[0]: tf.Tensor([1. 0. 0. ... 0. 0. 0.], shape=(10000,), dtype=float32)
targets[0]: tf.Tensor(1, shape=(), dtype=int64)


In [120]:
def get_model(max_tokens=10000, hidden_dim=16):
  """Create and compile a small dense binary classifier.

  The Keras backend session is cleared first. The network is
  Input(max_tokens) -> Dense(hidden_dim, relu) -> Dropout(0.5) -> Dense(1, sigmoid),
  compiled with RMSprop, binary cross-entropy and an accuracy metric.

  Args:
    max_tokens: Width of the input feature vector.
    hidden_dim: Number of units in the hidden Dense layer.

  Returns:
    The compiled `keras.Model`.
  """
  tf.keras.backend.clear_session()

  model_input = keras.Input(shape=(max_tokens,))
  hidden = layers.Dense(hidden_dim, activation="relu")(model_input)
  hidden = layers.Dropout(0.5)(hidden)
  prediction = layers.Dense(1, activation="sigmoid")(hidden)

  classifier = keras.Model(model_input, prediction)
  classifier.compile(
      loss="binary_crossentropy",
      optimizer="rmsprop",
      metrics=["accuracy"],
  )
  return classifier

In [121]:
# Build a fresh dense classifier for the unigram features (get_model clears the
# Keras session first) and print its layer summary.
model = get_model()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 10000)]           0         
                                                                 
 dense (Dense)               (None, 16)                160016    
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
_________________________________________________________________


In [122]:
# Checkpoint to "binary_1gram.keras"; save_best_only=True keeps only the best epoch
# according to the callback's monitored quantity (Keras default, presumably val_loss;
# confirm against the ModelCheckpoint docs).
callbacks = [keras.callbacks.ModelCheckpoint("binary_1gram.keras",save_best_only=True)]

# Train for 10 epochs on the unigram multi-hot features, validating on the validation split.
model.fit(binary_1gram_train_ds , 
          validation_data = binary_1gram_val_ds ,
          epochs = 10,
          callbacks = callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f8299bcad60>

In [123]:
# Reload the checkpointed unigram model and score it on the test split.
model = keras.models.load_model("binary_1gram.keras")
test_loss, test_acc = model.evaluate(binary_1gram_test_ds)
print(f"Test acc: {test_acc:.3f}")

Test acc: 0.844


In [124]:
# Bigram bag-of-words: n-grams up to length 2, multi-hot encoded over a
# vocabulary capped at 10,000 entries.
text_vectorization = TextVectorization(
    output_mode="multi_hot",
    ngrams=2,
    max_tokens=10000,
)

In [125]:
# Learn the bigram vocabulary from the training text only.
text_vectorization.adapt(text_only_train_ds)


def _encode_binary_2gram(text, label):
  """Vectorize a batch of text with the bigram vectorizer; pass the label through."""
  return text_vectorization(text), label


# Apply the same encoding to all three splits.
binary_2gram_train_ds, binary_2gram_val_ds, binary_2gram_test_ds = (
    split_ds.map(_encode_binary_2gram, num_parallel_calls=4)
    for split_ds in (train_ds, val_ds, test_ds)
)

In [126]:
# Build a fresh dense classifier for the bigram multi-hot features (get_model
# clears the Keras session first) and print its layer summary.
model = get_model()
model.summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 10000)]           0         
                                                                 
 dense (Dense)               (None, 16)                160016    
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
_________________________________________________________________


In [127]:
# Checkpoint to "binary_2gram.keras", keeping only the best epoch (save_best_only=True).
callbacks = [keras.callbacks.ModelCheckpoint("binary_2gram.keras",save_best_only=True)]

# .cache() stores the vectorized batches after the first pass so later epochs skip
# the TextVectorization map.
# NOTE(review): train_ds was shuffled and batched before this cache(), so the cached
# batch order is presumably replayed unchanged on every later epoch; confirm against
# the tf.data docs and consider caching before shuffling.
model.fit(binary_2gram_train_ds.cache(),
          validation_data=binary_2gram_val_ds.cache(),
          epochs=10,
          callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f82fd128ca0>

In [128]:
# Reload the checkpointed bigram model and score it on the test split.
model = keras.models.load_model("binary_2gram.keras")
test_loss, test_acc = model.evaluate(binary_2gram_test_ds)
print(f"Test acc: {test_acc:.3f}")

Test acc: 0.842


In [None]:
# BIGRAMS WITH TF-IDF ENCODING
"""
You can also add a bit more information to this representation by counting how many
times each word or N-gram occurs, that is to say, by taking the histogram of the words
over the text.

We will use TF-IDF (term frequency, inverse document frequency) to normalize the counts, because some words like
"a", "the", ... would otherwise dominate the counts even though they are not very useful.

Understanding TF-IDF normalization
The more a given term appears in a document, the more important that term is for
understanding what the document is about. At the same time, the frequency at which
the term appears across all documents in your dataset matters too: terms that
appear in almost every document (like “the” or “a”) aren’t particularly informative,
while terms that appear only in a small subset of all texts (like “Herzog”) are very
distinctive, and thus important. TF-IDF is a metric that fuses these two ideas. It weights
a given term by taking “term frequency,” how many times the term appears in the
current document, and dividing it by a measure of “document frequency,” which
estimates how often the term comes up across the dataset. You’d compute it as follows:

    def tfidf(term, document, dataset):
        term_freq = document.count(term)
        doc_freq = math.log(sum(doc.count(term) for doc in dataset) + 1)
        return term_freq / doc_freq

"""

In [129]:
# Configure the TextVectorization layer to return TF-IDF-weighted bigram features
# over a vocabulary capped at 10,000 entries.

text_vectorization = TextVectorization(
    output_mode="tf_idf",
    max_tokens=10000,
    ngrams=2,
)

In [130]:
# Learn the bigram vocabulary and TF-IDF statistics from the training text only.
text_vectorization.adapt(text_only_train_ds)


def _encode_tfidf_2gram(text, label):
  """Vectorize a batch of text with the TF-IDF vectorizer; pass the label through."""
  return text_vectorization(text), label


# Apply the same encoding to all three splits.
tfidf_2gram_train_ds, tfidf_2gram_val_ds, tfidf_2gram_test_ds = (
    split_ds.map(_encode_tfidf_2gram, num_parallel_calls=4)
    for split_ds in (train_ds, val_ds, test_ds)
)

In [134]:
# Build a fresh dense classifier for the TF-IDF bigram features (get_model
# clears the Keras session first) and print its layer summary.
model = get_model()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 10000)]           0         
                                                                 
 dense (Dense)               (None, 16)                160016    
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
_________________________________________________________________


In [135]:
# Checkpoint to "tfidf_2gram.keras", keeping only the best epoch (save_best_only=True).
callbacks = [keras.callbacks.ModelCheckpoint("tfidf_2gram.keras",save_best_only=True)]

# .cache() stores the vectorized batches after the first pass so later epochs skip
# the TextVectorization map.
# NOTE(review): the cache sits after the shuffle/batch done in make_dataset, so the
# first epoch's batch order is presumably reused for all later epochs; confirm.
model.fit(tfidf_2gram_train_ds.cache(),
          validation_data=tfidf_2gram_val_ds.cache(),
          epochs=10,
          callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f82feaec970>

In [136]:
# Reload the checkpointed TF-IDF bigram model and score it on the test split.
model = keras.models.load_model("tfidf_2gram.keras")
test_loss, test_acc = model.evaluate(tfidf_2gram_test_ds)
print(f"Test acc: {test_acc:.3f}")

Test acc: 0.847


In [None]:
## Processing words as a sequence: the sequence-model approach

In [137]:
# Preparing integer sequence datasets
# max_length: number of token positions every encoded headline is padded/truncated to.
# NOTE(review): the 254 printed earlier in this notebook is a CHARACTER count
# (len() of each headline string), whereas output_sequence_length counts tokens,
# so 250 is likely far longer than any headline; confirm with a word-level maximum.
max_length = 250
# max_tokens: vocabulary cap; also reused by later cells as the one-hot depth and
# the embedding-matrix row count.
max_tokens = 10000

# output_mode="int" yields one integer token id per position instead of a
# bag-of-words vector.
text_vectorization = TextVectorization(
    max_tokens = max_tokens ,
    output_mode = "int",
    output_sequence_length = max_length,
)

In [138]:
# Learn the integer vocabulary from the training text only.
text_vectorization.adapt(text_only_train_ds)


def _encode_int_sequence(text, label):
  """Turn a batch of text into fixed-length integer id sequences; pass the label through."""
  return text_vectorization(text), label


# Apply the same encoding to all three splits.
int_train_ds, int_val_ds, int_test_ds = (
    split_ds.map(_encode_int_sequence, num_parallel_calls=4)
    for split_ds in (train_ds, val_ds, test_ds)
)

In [None]:
# Sequence model on one-hot inputs.
# Input: a variable-length sequence of int64 token ids per sample.
inputs = keras.Input(shape=(None,), dtype="int64")
# Expand every token id into a max_tokens-wide one-hot vector.
embedded = tf.one_hot(inputs, depth=max_tokens)
# Bidirectional LSTM with 32 units per direction, then dropout and a sigmoid output unit.
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)

model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics = ["accuracy"])

model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 tf.one_hot (TFOpLambda)     (None, None, 20000)       0         
                                                                 
 bidirectional (Bidirectiona  (None, 64)               5128448   
 l)                                                              
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 5,128,513
Trainable params: 5,128,513
Non-trainable params: 0
_________________________________________________

In [None]:
# Checkpoint to "one_hot_bidir_lstm.keras", keeping only the best epoch (save_best_only=True).
callbacks = [keras.callbacks.ModelCheckpoint("one_hot_bidir_lstm.keras",save_best_only=True)]

# Train the one-hot BiLSTM on the integer-sequence datasets for 10 epochs.
model.fit(int_train_ds, validation_data=int_val_ds,
          epochs=10,
          callbacks = callbacks )


In [None]:
"""
This model trains much more slowly than the previous ones because of the size of its input data:
each sample is a matrix of shape (max_length, max_tokens) = (250, 10000), i.e. 250 token positions per sample and 10000 possible tokens.

Instead of one-hot encoding, we will use word embeddings to encode the samples.
"""

In [None]:
""" 

Notice that with one-hot encoding, each word is processed independently of the others.
But in fact, words are tied to each other: they share information, can be synonyms, can
be interchangeable, and have semantic relations with other words.

For example, the vector representing the word "movie" and the one representing the word
"film" should be the same or close enough, since these words are interchangeable.

Word embedding encodes the semantic relations between words into a dense,
structured geometric space.

"""

In [None]:
# Learning word embeddings from the data
# Instantiating an Embedding layer

"""
The Embedding layer is best understood as a dictionary that maps integer indices
(which stand for specific words) to dense vectors. It takes integers as input, looks up
these integers in an internal dictionary, and returns the associated vectors. It’s
effectively a dictionary lookup.

"""


In [139]:
# Sequence model with an Embedding layer learned from scratch.
tf.keras.backend.clear_session()


# Input: a variable-length sequence of int64 token ids per sample.
inputs = keras.Input(shape=(None,), dtype="int64")
# input_dim is tied to max_tokens (the TextVectorization vocabulary cap) instead of a
# duplicated literal 10000, so the table always has a row for every id the vectorizer
# can emit. mask_zero=True makes downstream layers skip positions holding id 0
# (presumably the padding id produced by TextVectorization; confirm).
embedded = layers.Embedding(input_dim=max_tokens, output_dim=120, mask_zero=True)(inputs)
# Bidirectional LSTM with 32 units per direction, then a small dense head.
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dense(24, activation="relu")(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)

model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics= ["accuracy"])
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 120)         1200000   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               39168     
 l)                                                              
                                                                 
 dense (Dense)               (None, 24)                1560      
                                                                 
 dense_1 (Dense)             (None, 1)                 25        
                                                                 
Total params: 1,240,753
Trainable params: 1,240,753
Non-trainable params: 0
___________________________________________________

In [140]:
# Checkpoint the best epoch (save_best_only=True).
# NOTE(review): the file name says "gru" but the model built above uses LSTM layers;
# the name is kept because the evaluation cell loads this exact path.
callbacks = [keras.callbacks.ModelCheckpoint("embeddings_bidir_gru.keras",save_best_only=True)]

# Train the embedding + BiLSTM model on the integer-sequence datasets for 10 epochs.
model.fit(int_train_ds,
          validation_data = int_val_ds,
          epochs = 10,
          callbacks = callbacks)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f82fe97dee0>

In [141]:
# Reload the checkpointed embedding model and score it on the test split.
model = keras.models.load_model("embeddings_bidir_gru.keras")
test_loss, test_acc = model.evaluate(int_test_ds)
print(f"Test acc: {test_acc:.3f}")

Test acc: 0.863


In [142]:
# USING PRETRAINED WORD EMBEDDINGS

! wget http://nlp.stanford.edu/data/glove.6B.zip
! unzip -q glove.6B.zip

--2023-04-05 11:33:29--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-04-05 11:33:30--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-04-05 11:33:31--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [143]:
# Parsing the GloVe word-embeddings file

path_to_glove_file = "glove.6B.100d.txt"
# Maps each word to its float32 coefficient vector.
embeddings_index = {}
# Pin the encoding so parsing does not depend on the platform's default text encoding.
with open(path_to_glove_file, encoding="utf-8") as f:
  for line in f:
    # Each line is "<word> <coef_1> <coef_2> ...": split off the word once and
    # parse the remainder as space-separated floats.
    word, coefs = line.split(maxsplit=1)
    coefs = np.fromstring(coefs, "f", sep=" ")
    embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors.")

Found 400000 word vectors.


In [157]:
# Show the first (word, vector) entry of the embeddings index and stop.
for word, vector in embeddings_index.items():
  print(word, vector)
  break

the [-0.038194 -0.24487   0.72812  -0.39961   0.083172  0.043953 -0.39141
  0.3344   -0.57545   0.087459  0.28787  -0.06731   0.30906  -0.26384
 -0.13231  -0.20757   0.33395  -0.33848  -0.31743  -0.48336   0.1464
 -0.37304   0.34577   0.052041  0.44946  -0.46971   0.02628  -0.54155
 -0.15518  -0.14107  -0.039722  0.28277   0.14393   0.23464  -0.31021
  0.086173  0.20397   0.52624   0.17164  -0.082378 -0.71787  -0.41531
  0.20335  -0.12763   0.41367   0.55187   0.57908  -0.33477  -0.36559
 -0.54857  -0.062892  0.26584   0.30205   0.99775  -0.80481  -3.0243
  0.01254  -0.36942   2.2167    0.72201  -0.24978   0.92136   0.034514
  0.46745   1.1079   -0.19358  -0.074575  0.23353  -0.052062 -0.22044
  0.057162 -0.15806  -0.30798  -0.41625   0.37972   0.15006  -0.53212
 -0.2055   -1.2526    0.071624  0.70565   0.49744  -0.42063   0.26148
 -1.538    -0.30223  -0.073438 -0.28312   0.37104  -0.25217   0.016215
 -0.017099 -0.38984   0.87424  -0.72569  -0.51058  -0.52028  -0.1459
  0.8278    0.270

In [158]:
# Width of the GloVe vectors loaded from glove.6B.100d.txt.
embedding_dim = 100
# Vocabulary of the current TextVectorization layer; position in the list = token id.
vocabulary = text_vectorization.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))

# Row i holds the pretrained vector of the token with id i; tokens without a GloVe
# entry keep an all-zero row.
embedding_matrix = np.zeros((max_tokens, embedding_dim))
for word, i in word_index.items():
  # Skip ids that do not fit in the matrix. In the original code the lookup was guarded
  # by `i < max_tokens` but the assignment was not, so an out-of-range id would have
  # reused the previous word's vector (or hit an undefined name) and indexed past the
  # end of the matrix.
  if i >= max_tokens:
    continue
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector

In [159]:
# Frozen Embedding layer initialised from the GloVe-based embedding_matrix:
# trainable=False keeps the pretrained vectors fixed during training, and
# mask_zero=True makes downstream layers skip positions holding id 0.
embedding_layer = layers.Embedding(
    input_dim=max_tokens,
    output_dim=embedding_dim,
    mask_zero=True,
    trainable=False,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
)

In [161]:
# Model that uses a pretrained Embedding layer
# NOTE(review): embedding_layer was instantiated in an earlier cell, i.e. before this
# clear_session() call; confirm that reusing it afterwards is intended.
tf.keras.backend.clear_session()

# Input: a variable-length sequence of int64 token ids per sample.
inputs = keras.Input(shape=(None,), dtype="int64")
# Look up the frozen pretrained vectors for every token id.
embedded = embedding_layer(inputs)
# Bidirectional LSTM with 32 units per direction, then dropout and a sigmoid output unit.
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)

model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])

model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 100)         1000000   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               34048     
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 1,034,113
Trainable params: 34,113
Non-trainable params: 1,000,000
______________________________________________

In [162]:
# Checkpoint to "glove_embeddings_sequence_model.keras", keeping only the best epoch
# (save_best_only=True).
callbacks = [keras.callbacks.ModelCheckpoint("glove_embeddings_sequence_model.keras",save_best_only=True)]

# Train the GloVe-initialised BiLSTM on the integer-sequence datasets for 10 epochs.
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10,
          callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f8245cbc340>

In [163]:
# Reload the checkpointed GloVe model and score it on the test split.
model = keras.models.load_model("glove_embeddings_sequence_model.keras")
test_loss, test_acc = model.evaluate(int_test_ds)
print(f"Test acc: {test_acc:.3f}")

Test acc: 0.860
