In [3]:
# ===================================================================================================
# PROBLEM B4
#
# Build and train a classifier for the BBC-text dataset.
# This is a multiclass classification problem.
# Do not use lambda layers in your model.
#
# The dataset used in this problem is originally published in: http://mlg.ucd.ie/datasets/bbc.html.
#
# Desired accuracy and validation_accuracy > 91%
# ===================================================================================================

In [27]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
import pandas as pd
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [34]:
# Hyperparameters used by the tokenizer, the padding step, the model and the split.

# Vocabulary / embedding
vocab_size = 1000          # passed to Tokenizer(num_words=...) and Embedding(input_dim=...)
embedding_dim = 16         # width of each embedding vector
oov_tok = "<OOV>"          # placeholder token for out-of-vocabulary words

# Sequence shaping
max_length = 120           # every sequence is padded/truncated to this length
padding_type = 'post'      # pad at the end of the sequence
trunc_type = 'post'        # truncate at the end of the sequence

# Data split
training_portion = 0.8     # fraction of rows used for training

In [4]:
# Download the BBC news CSV; later cells read its "text" and "category" columns.
bbc_csv_url = 'https://github.com/dicodingacademy/assets/raw/main/Simulation/machine_learning/bbc-text.csv'
bbc = pd.read_csv(bbc_csv_url)

In [85]:
# Encode each category name as an integer id, numbered in order of first appearance.
# Series.map plus an explicit integer cast is used instead of Series.replace:
# replacing every value of an object column relies on pandas silently downcasting
# the result, which is deprecated since pandas 2.2; without that downcast the labels
# would stay object-dtype and could not be turned into tensors.
# Re-running this cell is harmless: the ids already appear in order 0, 1, 2, ...
# so the mapping becomes the identity.
replacer = {name: idx for idx, name in enumerate(bbc["category"].unique())}
bbc["category"] = bbc["category"].map(replacer).astype("int64")

In [87]:
# Hold-out split: `training_portion` of the rows go to training, the rest to
# validation. shuffle=False keeps the rows in file order.
all_sentences = bbc["text"].to_numpy()
all_labels = bbc["category"].to_numpy()
(
    training_sentences,
    validation_sentences,
    training_labels,
    validation_labels,
) = train_test_split(
    all_sentences,
    all_labels,
    train_size=training_portion,
    shuffle=False,
)

In [88]:
# Fit the word index on the training texts only, so the validation texts do not
# influence the vocabulary.
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token=oov_tok,
)
tokenizer.fit_on_texts(training_sentences)

In [89]:
def get_paddedsequences(sentences):
    """Turn raw texts into fixed-length integer sequences.

    Each text is converted to word ids with the module-level `tokenizer`, then
    padded or truncated to `max_length` according to `padding_type` and
    `trunc_type`.

    Args:
        sentences: iterable of text strings.

    Returns:
        The array produced by `pad_sequences`, one row per input text.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(sentences),
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )

# Apply the same tokenise-and-pad step to both splits (training first, then validation).
training_pad_sequences, validation_pad_sequences = (
    get_paddedsequences(split_sentences)
    for split_sentences in (training_sentences, validation_sentences)
)

In [90]:
def get_ds(the_padsequences, labels, batch_size=32):
    """Wrap padded sequences and their labels in a batched tf.data pipeline.

    Args:
        the_padsequences: padded integer sequences, one row per example.
        labels: integer class ids aligned with `the_padsequences`.
        batch_size: number of examples per batch (defaults to 32, the value
            that was previously hard-coded).

    Returns:
        A `tf.data.Dataset` of `(sequences, labels)` batches that is cached
        per example, batched, and prefetched with `tf.data.AUTOTUNE`.
    """
    ds = tf.data.Dataset.from_tensor_slices((the_padsequences, labels))
    # Cache before batching so the cached elements are individual examples.
    return ds.cache().batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Build one input pipeline per split (training first, then validation).
train_ds, val_ds = (
    get_ds(split_sequences, split_labels)
    for split_sequences, split_labels in (
        (training_pad_sequences, training_labels),
        (validation_pad_sequences, validation_labels),
    )
)

In [91]:
def get_model():
    """Build and compile the text classifier.

    Architecture: Embedding -> global average pooling -> dropout -> Dense(32, relu)
    -> dropout -> Dense(6, softmax). Compiled with Adam and sparse categorical
    cross-entropy (labels are integer ids), tracking accuracy.

    Returns:
        The compiled `keras.Sequential` model.
    """
    model = keras.Sequential([
        keras.layers.Embedding(input_dim=vocab_size, input_length=max_length, output_dim=embedding_dim),
        keras.layers.GlobalAvgPool1D(),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(32, activation="relu"),
        keras.layers.Dropout(0.3),
        # NOTE(review): the unit count must be at least the number of distinct
        # category ids produced upstream; 6 may be one more than the dataset
        # needs -- confirm against bbc["category"].nunique().
        keras.layers.Dense(6, activation='softmax'),
    ])
    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss=keras.losses.sparse_categorical_crossentropy,
        # Keras documents `metrics` as a list; a bare string is not portable
        # across Keras versions.
        metrics=["accuracy"],
    )
    return model

In [93]:
class thecallback(tf.keras.callbacks.Callback):
    """Stop training once training and validation accuracy both exceed a threshold.

    Args:
        threshold: accuracy that both metrics must exceed (defaults to 0.91,
            the target stated in the problem description).
    """

    def __init__(self, threshold=0.91):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's metrics and request early stop when the target is met."""
        # `logs` defaults to None, and "val_accuracy" is missing when fit() is
        # called without validation data; in either case there is nothing to compare.
        logs = logs or {}
        logtrain = logs.get("accuracy")
        logval = logs.get("val_accuracy")
        if logtrain is None or logval is None:
            return
        if logtrain > self.threshold and logval > self.threshold:
            self.model.stop_training = True

In [94]:

model = get_model()
# `callbacks` is documented as a list of Callback instances, so pass one.
# Training runs for at most 100 epochs; the callback may stop it earlier.
model.fit(train_ds, validation_data=val_ds, epochs=100, callbacks=[thecallback()])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100


<keras.src.callbacks.History at 0x7f858b9d0190>

# Answer

In [95]:
# ===================================================================================================
# PROBLEM B4
#
# Build and train a classifier for the BBC-text dataset.
# This is a multiclass classification problem.
# Do not use lambda layers in your model.
#
# The dataset used in this problem is originally published in: http://mlg.ucd.ie/datasets/bbc.html.
#
# Desired accuracy and validation_accuracy > 91%
# ===================================================================================================

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd


def solution_B4():
    """Train a multiclass text classifier on the BBC-text dataset.

    Downloads the CSV, encodes the categories as integer ids, splits the rows
    into training and validation sets without shuffling, tokenises and pads the
    texts, then trains a small embedding model until both training and
    validation accuracy exceed 91% (or 100 epochs have run).

    Returns:
        The trained `keras.Sequential` model.
    """
    bbc = pd.read_csv('https://github.com/dicodingacademy/assets/raw/main/Simulation/machine_learning/bbc-text.csv')

    # DO NOT CHANGE THIS CODE
    # Make sure you used all of these parameters or you can not pass this test
    vocab_size = 1000
    embedding_dim = 16
    max_length = 120
    trunc_type = 'post'
    padding_type = 'post'
    oov_tok = "<OOV>"
    training_portion = .8

    # Encode each category name as an integer id, numbered in order of first
    # appearance. Series.map plus an explicit cast replaces Series.replace, which
    # relies on a silent downcast that is deprecated since pandas 2.2 and would
    # otherwise leave object-dtype labels.
    replacer = {name: idx for idx, name in enumerate(bbc["category"].unique())}
    bbc["category"] = bbc["category"].map(replacer).astype("int64")

    # Hold-out split; shuffle=False keeps the rows in file order.
    training_sentences, validation_sentences, training_labels, validation_labels = train_test_split(
        bbc["text"].to_numpy(),
        bbc["category"].to_numpy(),
        train_size=training_portion,
        shuffle=False,
    )

    # Fit the word index on the training texts only.
    tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
    tokenizer.fit_on_texts(training_sentences)

    def get_paddedsequences(sentences):
        """Convert texts to word-id sequences padded/truncated to `max_length`."""
        sequences = tokenizer.texts_to_sequences(sentences)
        return pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

    training_pad_sequences = get_paddedsequences(training_sentences)
    validation_pad_sequences = get_paddedsequences(validation_sentences)

    def get_ds(the_padsequences, labels):
        """Build a cached, batched (32) and prefetched dataset of (sequences, labels)."""
        ds = tf.data.Dataset.from_tensor_slices((the_padsequences, labels))
        ds = ds.cache()
        ds = ds.batch(32)
        return ds.prefetch(tf.data.AUTOTUNE)

    train_ds = get_ds(training_pad_sequences, training_labels)
    val_ds = get_ds(validation_pad_sequences, validation_labels)

    def get_model():
        """Build and compile the embedding + average-pooling classifier."""
        model = keras.Sequential([
            keras.layers.Embedding(input_dim=vocab_size, input_length=max_length, output_dim=embedding_dim),
            keras.layers.GlobalAvgPool1D(),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(32, activation="relu"),
            keras.layers.Dropout(0.3),
            # NOTE(review): the unit count must be at least the number of distinct
            # category ids; 6 may be one more than needed -- confirm.
            keras.layers.Dense(6, activation='softmax'),
        ])
        model.compile(
            optimizer=keras.optimizers.Adam(),
            loss=keras.losses.sparse_categorical_crossentropy,
            # Keras documents `metrics` as a list; a bare string is not portable
            # across Keras versions.
            metrics=["accuracy"],
        )
        return model

    class thecallback(tf.keras.callbacks.Callback):
        """Stop training once training and validation accuracy both exceed a threshold."""

        def __init__(self, threshold=0.91):
            super().__init__()
            self.threshold = threshold

        def on_epoch_end(self, epoch, logs=None):
            # `logs` may be None and "val_accuracy" may be absent; skip the
            # check instead of raising in those cases.
            logs = logs or {}
            logtrain = logs.get("accuracy")
            logval = logs.get("val_accuracy")
            if logtrain is None or logval is None:
                return
            if logtrain > self.threshold and logval > self.threshold:
                self.model.stop_training = True

    model = get_model()
    # `callbacks` is documented as a list of Callback instances.
    model.fit(train_ds, validation_data=val_ds, epochs=100, callbacks=[thecallback()])

    return model

    # The code below is to save your model as a .h5 file.
    # It will be saved automatically in your Submission folder.
# Script entry point: train the model and write it to disk as an .h5 file.
if __name__ == '__main__':
    # DO NOT CHANGE THIS CODE
    model = solution_B4()
    # NOTE(review): presumably the "Model" directory must already exist for
    # the save to succeed -- confirm before running in a fresh checkout.
    model.save("Model/model_B4.h5")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100


  saving_api.save_model(
