In [1]:
# =====================================================================================================
# PROBLEM C4
#
# Build and train a classifier for the sarcasm dataset.
# The classifier should have a final layer with 1 neuron activated by sigmoid.
#
# Do not use lambda layers in your model.
#
# Dataset used in this problem is built by Rishabh Misra (https://rishabhmisra.github.io/publications).
#
# Desired accuracy and validation_accuracy > 75%
# =======================================================================================================


In [2]:
import json
import urllib
import urllib.request

import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [17]:
# Const
# Hyperparameters shared by the tokenizer, the padding step, the model and the data split.
vocab_size = 1000  # Tokenizer num_words and Embedding input_dim
embedding_dim = 16  # Embedding output_dim
max_length = 120  # pad_sequences maxlen and Embedding input_length
trunc_type = 'post'  # pad_sequences truncating
padding_type = 'post'  # pad_sequences padding
oov_tok = "<OOV>"  # Tokenizer oov_token
training_size = 20000  # records before this index are used for training, the rest for validation

In [4]:
# Get Data
# Download the sarcasm JSON file and store it as 'sarcasm.json' (a path relative
# to the current working directory). The download is repeated every time this cell runs.
data_url = 'https://github.com/dicodingacademy/assets/raw/main/Simulation/machine_learning/sarcasm.json'
urllib.request.urlretrieve(data_url, 'sarcasm.json')

('sarcasm.json', <http.client.HTTPMessage at 0x7fbe32f0be20>)

In [20]:
# Parse the downloaded file. The context manager closes the handle as soon as
# parsing finishes, and the explicit encoding keeps the result independent of
# the platform's default text encoding.
with open("sarcasm.json", encoding="utf-8") as temp:
    data = json.load(temp)

In [27]:
# Pull the headline text and the sarcasm label out of every record, keeping
# both lists in the same order as `data`.
sentences = [record["headline"] for record in data]
labels = [record["is_sarcastic"] for record in data]

In [32]:
# Positional split: everything before index `training_size` is used for
# training, everything from that index onwards for validation.
train_sentences, val_sentences = sentences[:training_size], sentences[training_size:]
train_labels, val_labels = labels[:training_size], labels[training_size:]

In [30]:
# Fit the tokenizer on the training headlines only; `oov_tok` is the token
# configured for out-of-vocabulary words.
token = Tokenizer(
    num_words=vocab_size,
    oov_token=oov_tok,
)
token.fit_on_texts(train_sentences)

In [33]:
def get_pad_sequences(sentences):
    """Encode texts with the fitted tokenizer and pad/truncate them.

    Args:
        sentences: Texts to encode with the module-level ``token`` tokenizer.

    Returns:
        The result of ``pad_sequences`` applied to the encoded texts, using the
        module-level ``max_length``, ``padding_type`` and ``trunc_type`` settings.
    """
    return pad_sequences(
        token.texts_to_sequences(sentences),
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )

# Encode and pad both splits with the same tokenizer and padding settings.
train_pad_sequences = get_pad_sequences(train_sentences)
val_pad_sequences = get_pad_sequences(val_sentences)

In [35]:
# Wrap the padded sequences and their labels in a batched tf.data.Dataset
def get_ds(padsequences, labels, batch_size=32):
    """Build a cached, batched and prefetched ``tf.data.Dataset``.

    Args:
        padsequences: Padded integer sequences, one row per example.
        labels: Labels aligned with ``padsequences``.
        batch_size: Number of examples per batch. Defaults to 32, the value
            that used to be hard-coded here.

    Returns:
        A ``tf.data.Dataset`` yielding ``(sequences, labels)`` batches. The
        examples are not shuffled; they keep their input order.
    """
    ds = tf.data.Dataset.from_tensor_slices((padsequences, labels))
    # Cache before batching so the per-example slicing is only done once.
    ds = ds.cache()
    ds = ds.batch(batch_size)
    return ds.prefetch(tf.data.AUTOTUNE)

# One dataset per split, each built from the padded sequences and their labels.
train_ds = get_ds(train_pad_sequences, train_labels)
val_ds = get_ds(val_pad_sequences, val_labels)

In [36]:
class thecallback(tf.keras.callbacks.Callback):
    """Stop training once training and validation accuracy both exceed a threshold.

    Args:
        threshold: Value that both ``accuracy`` and ``val_accuracy`` must
            strictly exceed before training is stopped. Defaults to 0.75.
    """

    def __init__(self, threshold=0.75):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's metrics and request a stop when the target is met.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric results for the epoch; may be ``None``.
        """
        # `logs` defaults to None, and "val_accuracy" is absent when training
        # runs without validation data; skip the check instead of raising.
        logs = logs or {}
        loggedtrain = logs.get("accuracy")
        loggedval = logs.get("val_accuracy")
        if loggedtrain is None or loggedval is None:
            return
        if loggedtrain > self.threshold and loggedval > self.threshold:
            self.model.stop_training = True

In [39]:
def get_model():
    """Build and compile the sarcasm classifier.

    The network embeds token ids, averages the embeddings over the sequence,
    then applies batch normalization, dropout, a 32-unit ReLU layer, dropout
    again, and a single sigmoid output neuron.

    Returns:
        A compiled ``tf.keras.Sequential`` model using the Adam optimizer,
        binary cross-entropy loss and the accuracy metric.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
        tf.keras.layers.GlobalAvgPool1D(),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # Keras documents `metrics` as a list; a bare string is not accepted by
    # every Keras version.
    model.compile(
        optimizer="adam",
        loss=tf.keras.losses.binary_crossentropy,
        metrics=["accuracy"],
    )
    return model

In [42]:
model = get_model()
# Train for at most 100 epochs; the callback stops earlier once its accuracy
# target is met. `callbacks` is documented as a list of Callback instances.
model.fit(train_ds, validation_data=val_ds, epochs=100, callbacks=[thecallback()])

Epoch 1/100
Epoch 2/100
Epoch 3/100


<keras.src.callbacks.History at 0x7fbe241a24c0>

In [43]:
# =====================================================================================================
# PROBLEM C4
#
# Build and train a classifier for the sarcasm dataset.
# The classifier should have a final layer with 1 neuron activated by sigmoid.
#
# Do not use lambda layers in your model.
#
# Dataset used in this problem is built by Rishabh Misra (https://rishabhmisra.github.io/publications).
#
# Desired accuracy and validation_accuracy > 75%
# =======================================================================================================

import json
import tensorflow as tf
import numpy as np
import urllib
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


def solution_C4():
    """Download the sarcasm dataset, train a binary classifier on it, and return it.

    The first ``training_size`` records are used for training and the rest for
    validation. Training runs for at most 100 epochs and stops early once both
    accuracy and validation accuracy exceed 0.75.

    Returns:
        The trained ``tf.keras.Sequential`` model, whose final layer is a
        single sigmoid-activated neuron.
    """
    # `import urllib` alone does not guarantee that the `request` submodule is
    # loaded, so import it explicitly before using it.
    import urllib.request

    data_url = 'https://github.com/dicodingacademy/assets/raw/main/Simulation/machine_learning/sarcasm.json'
    urllib.request.urlretrieve(data_url, 'sarcasm.json')

    # DO NOT CHANGE THIS CODE
    # Make sure you used all of these parameters or test may fail
    vocab_size = 1000
    embedding_dim = 16
    max_length = 120
    trunc_type = 'post'
    padding_type = 'post'
    oov_tok = "<OOV>"
    training_size = 20000

    # Close the file as soon as it has been parsed.
    with open("sarcasm.json", encoding="utf-8") as temp:
        data = json.load(temp)

    sentences = [record["headline"] for record in data]
    labels = [record["is_sarcastic"] for record in data]

    # Train Test split
    train_sentences = sentences[:training_size]
    val_sentences = sentences[training_size:]
    train_labels = labels[:training_size]
    val_labels = labels[training_size:]

    # Fit your tokenizer with training data
    token = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
    token.fit_on_texts(train_sentences)

    def get_pad_sequences(sentences):
        """Encode texts with the fitted tokenizer and pad/truncate them."""
        sequences = token.texts_to_sequences(sentences)
        return pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

    train_pad_sequences = get_pad_sequences(train_sentences)
    val_pad_sequences = get_pad_sequences(val_sentences)

    def get_ds(padsequences, labels):
        """Wrap sequences and labels in a cached, batched, prefetched dataset."""
        ds = tf.data.Dataset.from_tensor_slices((padsequences, labels))
        ds = ds.cache()
        ds = ds.batch(32)
        return ds.prefetch(tf.data.AUTOTUNE)

    # Build the datasets here: the function must not depend on `train_ds` /
    # `val_ds` existing as globals, which is only true inside a live notebook.
    train_ds = get_ds(train_pad_sequences, train_labels)
    val_ds = get_ds(val_pad_sequences, val_labels)

    class thecallback(tf.keras.callbacks.Callback):
        """Stop training once accuracy and val_accuracy both exceed 0.75."""

        def on_epoch_end(self, epoch, logs=None):
            # `logs` may be None and may lack either key; skip the check then.
            logs = logs or {}
            loggedtrain = logs.get("accuracy")
            loggedval = logs.get("val_accuracy")
            if loggedtrain is None or loggedval is None:
                return
            if loggedtrain > 0.75 and loggedval > 0.75:
                self.model.stop_training = True

    def get_model():
        """Build and compile the embedding + average-pooling classifier."""
        model = tf.keras.Sequential([
            tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
            tf.keras.layers.GlobalAvgPool1D(),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(32, activation="relu"),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(1, activation='sigmoid')
        ])

        # `metrics` is documented as a list.
        model.compile(
            optimizer="adam",
            loss=tf.keras.losses.binary_crossentropy,
            metrics=["accuracy"],
        )
        return model

    model = get_model()
    # `callbacks` is documented as a list of Callback instances.
    model.fit(train_ds, validation_data=val_ds, epochs=100, callbacks=[thecallback()])

    return model


# The code below is to save your model as a .h5 file.
# It will be saved automatically in your Submission folder.
if __name__ == '__main__':
    # DO NOT CHANGE THIS CODE
    # Train the model via solution_C4() and write it to the relative path
    # "Model/model_C4.h5".
    model = solution_C4()
    model.save("Model/model_C4.h5")


Epoch 1/100
Epoch 2/100


  saving_api.save_model(
