In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

In [2]:
# Directory holding the Sarcasm Headlines JSON files. The default is the
# original author's local checkout; set the SARCASM_DATA_DIR environment
# variable to run the notebook on another machine without editing this cell.
DATA_DIR = os.environ.get(
    'SARCASM_DATA_DIR',
    '/home/hanlinn/00.projects/tensorflow-prepare/example-file/scarasm')

# v2 of the dataset (the file actually loaded below) and the original v1 file.
data_1_path = os.path.join(DATA_DIR, 'Sarcasm_Headlines_Dataset_v2.json')
data_path = os.path.join(DATA_DIR, 'Sarcasm_Headlines_Dataset.json')

In [8]:
# The dataset is stored as JSON Lines (one record per line), hence lines=True.
df = pd.read_json(data_1_path, lines=True)

# Preview the first rows as the cell's rich output.
df.head(n=5)

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [10]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the headlines for validation; the fixed random_state keeps
# the split identical from run to run.
train_sentence, val_sentence, train_label, val_labels = train_test_split(
    df["headline"].to_numpy(),
    df["is_sarcastic"].to_numpy(),
    test_size=0.1,
    random_state=42,
)

In [11]:
# Sanity check: sentences and labels must have matching lengths within each split.
tuple(split.shape for split in (train_sentence, train_label, val_sentence, val_labels))

((25757,), (25757,), (2862,), (2862,))

In [13]:
# Peek at one (headline, label) training pair.
(train_sentence[1], train_label[1])

('peta condemns bbc for trapping thousands of endangered animals inside tv screens',
 1)

In [15]:
# Training pipeline.
# Order matters: cache() must come BEFORE shuffle(). Caching after shuffling
# and batching freezes the first epoch's order and batch composition, so every
# later epoch would replay exactly the same batches. Caching the raw elements
# first lets shuffle() draw a fresh order on each epoch.
train_dataset = tf.data.Dataset.from_tensor_slices((train_sentence, train_label))
train_dataset = (
    train_dataset
    .cache()
    .shuffle(len(train_sentence))  # buffer covers the whole training set
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation pipeline: no shuffling, so caching the batched elements is fine.
valid_dataset = tf.data.Dataset.from_tensor_slices((val_sentence, val_labels))
valid_dataset = valid_dataset.batch(32).cache().prefetch(tf.data.AUTOTUNE)

print(f"Train : {train_dataset} \n"
      f"Test : {valid_dataset}")

Train : <PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))> 
Test : <PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>


In [16]:

max_vocab = 10_000  # Upper bound on the vocabulary size kept by the vectorizer.
# Every headline is padded/truncated to this many tokens.
# NOTE(review): 600 looks very large for news headlines - most of each
# sequence is presumably padding; confirm against the real length distribution.
max_seq_len = 600

# Maps raw strings to fixed-length sequences of integer token ids.
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab,
    output_mode='int',
    output_sequence_length=max_seq_len,
)

# Learn the vocabulary from the training headlines only, so nothing from the
# validation split leaks into the preprocessing.
vectorize_layer.adapt(train_sentence)

# Trainable lookup table turning each token id into a 5-dimensional vector.
embedding_layers = tf.keras.layers.Embedding(
    input_dim=max_vocab,
    output_dim=5,
    embeddings_initializer="uniform",
    input_length=max_seq_len,
    name="embedding_layers",
)

In [17]:
# Callbacks
# Stop training once val_loss has failed to improve for 5 consecutive epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=5,
)

# When val_loss has not improved for 3 epochs, multiply the learning rate by
# 0.2 (a 5x reduction), never going below 1e-7; verbose=1 logs each reduction.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.2,
    patience=3,
    verbose=1,
    min_lr=1e-7,
)

In [18]:
from tensorflow.keras import layers

# 1D-CNN text classifier: raw string -> token ids -> embeddings -> two
# same-padded convolutions -> global max pooling -> small dense head ->
# a single sigmoid unit.
inputs = layers.Input(shape=(1,), dtype="string")
x = embedding_layers(vectorize_layer(inputs))
x = layers.Conv1D(filters=128, kernel_size=5, padding='same', activation='elu')(x)
x = layers.Conv1D(filters=128, kernel_size=5, padding='same', activation='elu')(x)
x = layers.GlobalMaxPool1D()(x)  # collapse the sequence axis
x = layers.Dense(units=32, activation='relu')(x)
x = layers.Dense(units=16, activation='relu')(x)
outputs = layers.Dense(units=1, activation="sigmoid")(x)
model = tf.keras.Model(inputs, outputs, name="Conv")

model.summary()

# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.binary_crossentropy,
    metrics=['accuracy'],
)

Model: "Conv"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 600)              0         
 torization)                                                     
                                                                 
 embedding_layers (Embedding  (None, 600, 5)           50000     
 )                                                               
                                                                 
 conv1d (Conv1D)             (None, 600, 128)          3328      
                                                                 
 conv1d_1 (Conv1D)           (None, 600, 128)          82048     
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0      

In [19]:
from datetime import datetime

# Train for up to 10 epochs and time the run.
# The EarlyStopping / ReduceLROnPlateau callbacks defined earlier only take
# effect when handed to fit(); previously they were created but never used.
start = datetime.now()
history = model.fit(train_dataset,
                    epochs=10,
                    validation_data=valid_dataset,
                    callbacks=[early_stopping, reduce_lr],
                    verbose=1)

end = datetime.now()

print(f"The time taken to train the model is :{end - start}")

# Validation-set evaluation, as returned by Keras for the compiled loss and metrics.
results = model.evaluate(valid_dataset)

Epoch 1/10
Epoch 2/10
Epoch 3/10

KeyboardInterrupt: 

In [None]:
def calculate_accuracy_results(y_true, y_pred):
    """
    Calculates model accuracy, precision, recall and f1 score of a binary classification model.

    Args:
        y_true: true labels in the form of a 1D array
        y_pred: predicted labels in the form of a 1D array

    Returns:
        A dictionary with the keys "accuracy", "precision", "recall" and "f1".
        Accuracy is expressed as a percentage (the score multiplied by 100);
        precision, recall and f1 are the "weighted" averages as returned by
        scikit-learn.
    """
    # The docstring must be the first statement in the body to be picked up as
    # __doc__, so the function-local import comes after it.
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

    # Calculate model accuracy (as a percentage)
    model_accuracy = accuracy_score(y_true, y_pred) * 100
    # Calculate model precision, recall and f1 score using the "weighted" average.
    # zero_division=1 is passed through to scikit-learn unchanged.
    model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="weighted", zero_division=1)
    model_results = {"accuracy": model_accuracy,
                     "precision": model_precision,
                     "recall": model_recall,
                     "f1": model_f1}
    return model_results
    
# Sigmoid probabilities for every validation headline.
result_preds_probs = model.predict(val_sentence)
# Round to hard 0/1 predictions and drop the size-1 axes.
result_preds = tf.squeeze(tf.round(result_preds_probs))

# Score the hard predictions against the true validation labels.
results = calculate_accuracy_results(y_true=val_labels, y_pred=result_preds)

print(results)