# Saran Optimasi untuk Model Klasifikasi Konten

Notebook ini berisi saran-saran yang telah dianalisis untuk meningkatkan akurasi model klasifikasi konten Anda, berdasarkan notebook `content_moderation_full2.ipynb`.

## 1. Impor Library dan Konfigurasi Awal
Pastikan semua library yang dibutuhkan telah diimpor dan konfigurasi awal seperti seed dan mixed precision telah diatur.

In [2]:
import os
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers, models
from tensorflow.keras.applications import EfficientNetB2 # Atau EfficientNetB0, sesuaikan
from tensorflow.keras.applications.efficientnet import preprocess_input # Sesuaikan jika base model berbeda
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from tensorflow.keras import mixed_precision
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from collections import Counter
from PIL import Image

# Set seed untuk reproduktifitas
np.random.seed(42)
tf.random.set_seed(42)

# Mixed Precision (jika menggunakan GPU yang mendukung dan TF >= 2.4)
# mixed_precision.set_global_policy('mixed_float16') 
# Catatan: Output dari notebook Anda menunjukkan tidak ada GPU yang tersedia, 
# jadi mixed precision mungkin tidak memberikan manfaat signifikan dan bisa diabaikan jika di CPU.

print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.19.0


## 2. Konfigurasi Dataset dan Parameter Model
Definisikan path dataset, ukuran gambar, batch size, dan nama kelas. Pastikan `img_size` sesuai dengan model dasar yang dipilih (misalnya, EfficientNetB2 umumnya menggunakan (260, 260)).

In [3]:
train_dir = 'converted/train'
val_dir = 'converted/valid'
test_dir = 'converted/test'

# Pilih base model dan sesuaikan img_size
BASE_MODEL_CHOICE = EfficientNetB2 # Ganti ke EfficientNetB0 jika itu yang Anda gunakan

if BASE_MODEL_CHOICE == EfficientNetB2:
    img_size = (260, 260)
    PREPROCESS_INPUT_FUNC = tf.keras.applications.efficientnet.preprocess_input
else: # Asumsi EfficientNetB0 atau default lain
    img_size = (224, 224)
    PREPROCESS_INPUT_FUNC = tf.keras.applications.efficientnet.preprocess_input # atau sesuaikan

batch_size = 32 # atau 64, bisa dieksperimen
class_names = sorted(os.listdir(train_dir))
num_classes = len(class_names)

print(f"Base Model: {BASE_MODEL_CHOICE.__name__}")
print(f"Image size: {img_size}")
print(f"Batch size: {batch_size}")
print(f"Number of classes: {num_classes}")
print("Classes:", class_names)

Base Model: EfficientNetB2
Image size: (260, 260)
Batch size: 32
Number of classes: 9
Classes: ['Accident', 'Blood', 'Blood and Gore', 'Explosion', 'Normal', 'Sexual Harassment', 'Suicide', 'Violence', 'nudity']


## 3. Data Generators dengan Augmentasi
Augmentasi data penting untuk mencegah overfitting dan meningkatkan generalisasi model. Sesuaikan parameter augmentasi jika diperlukan.

In [4]:
def create_data_generators(img_size_param, batch_size_param, train_dir_param, val_dir_param):
    train_datagen = ImageDataGenerator(
        rescale=1./255,
        rotation_range=20,
        width_shift_range=0.15,
        height_shift_range=0.15,
        zoom_range=0.15,
        horizontal_flip=True,
        brightness_range=[0.8, 1.2],
        shear_range=0.1
    )
    val_datagen = ImageDataGenerator(rescale=1./255)
    
    train_generator = train_datagen.flow_from_directory(
        train_dir_param,
        target_size=img_size_param,
        batch_size=batch_size_param,
        class_mode='categorical',
        shuffle=True
    )
    val_generator = val_datagen.flow_from_directory(
        val_dir_param,
        target_size=img_size_param,
        batch_size=batch_size_param,
        class_mode='categorical',
        shuffle=False
    )
    return train_generator, val_generator

train_generator, val_generator = create_data_generators(img_size, batch_size, train_dir, val_dir)

Found 3600 images belonging to 9 classes.
Found 919 images belonging to 9 classes.


## 4. Penanganan Ketidakseimbangan Kelas
Ketidakseimbangan kelas dapat mempengaruhi performa model. Pertimbangkan untuk menggunakan class weights standar atau Focal Loss.

In [5]:
# Opsi 1: Sklearn balanced class weights
y_train_classes = train_generator.classes
sklearn_class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_classes),
    y=y_train_classes
)
sklearn_class_weights_dict = dict(enumerate(sklearn_class_weights))
print("Sklearn Balanced Class Weights:", sklearn_class_weights_dict)

# Opsi 2: Fungsi custom class weights Anda (pastikan terdefinisi)
# def calculate_custom_class_weights(y_train, alpha=0.5, num_classes_param=num_classes):
#     class_counts = Counter(y_train)
#     total_samples = len(y_train)
#     weights = {}
#     for class_idx, count in class_counts.items():
#         standard_weight = total_samples / (num_classes_param * count)
#         if count > total_samples * 0.4: 
#             penalty_weight = standard_weight * (1 + alpha * (count / total_samples))
#             weights[class_idx] = 1.0 / penalty_weight
#         else:
#             weights[class_idx] = standard_weight
#     return weights
# custom_weights = calculate_custom_class_weights(y_train_classes, alpha=0.5)
# print("Custom Class Weights (alpha=0.5):", custom_weights)

# Pilih class weights yang akan digunakan:
active_class_weights = sklearn_class_weights_dict # atau custom_weights

# Opsi 3: Focal Loss (fungsi didefinisikan di bawah)
def focal_loss(gamma=2.0, alpha=0.25):
    def focal_loss_fixed(y_true, y_pred):
        epsilon = tf.keras.backend.epsilon()
        y_pred = tf.clip_by_value(y_pred, epsilon, 1. - epsilon)
        ce = -y_true * tf.math.log(y_pred)
        weight = alpha * y_true * tf.pow((1 - y_pred), gamma)
        fl = weight * ce
        return tf.reduce_mean(tf.reduce_sum(fl, axis=1))
    return focal_loss_fixed

Sklearn Balanced Class Weights: {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0, 8: 1.0}


## 5. Pembuatan Model
Gunakan model dasar pre-trained dan tambahkan layer kustom di atasnya. Sesuaikan jumlah layer yang di-fine-tune dan parameter regularisasi.

In [6]:
def create_selected_model(num_classes_param, img_size_param):
    base_model = BASE_MODEL_CHOICE(
        include_top=False,
        weights='imagenet',
        input_shape=img_size_param + (3,),
        pooling='avg'
    )
    
    # Sesuaikan jumlah layer yang di-fine-tune
    num_fine_tune_layers = 30 # Untuk B2, mungkin perlu disesuaikan untuk B0 atau model lain
    if len(base_model.layers) > num_fine_tune_layers:
        for layer in base_model.layers[:-num_fine_tune_layers]:
            layer.trainable = False
        for layer in base_model.layers[-num_fine_tune_layers:]:
            # Pastikan layer tidak BatchNormalization sebelum dijadikan trainable jika mixed precision aktif
            if not isinstance(layer, tf.keras.layers.BatchNormalization):
                 layer.trainable = True
    else: # Jika model lebih kecil dari num_fine_tune_layers, fine-tune semua
        for layer in base_model.layers:
            if not isinstance(layer, tf.keras.layers.BatchNormalization):
                 layer.trainable = True

    model = tf.keras.Sequential([
        layers.Input(shape=img_size_param + (3,)),
        layers.Lambda(PREPROCESS_INPUT_FUNC),
        base_model,
        layers.Dropout(0.4),
        layers.Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)),
        layers.Dropout(0.4),
        layers.Dense(num_classes_param, activation='softmax', dtype='float32') # Output float32 jika pakai mixed precision
    ])
    return model

model = create_selected_model(num_classes, img_size)




## 6. Callbacks dan Kompilasi Model
Gunakan callbacks seperti `ReduceLROnPlateau`, `EarlyStopping`, `ModelCheckpoint`, dan `WarmUpLearningRateScheduler` yang dimodifikasi. Pilih loss function dan optimizer yang sesuai.

In [None]:
INITIAL_LR = 1e-4 # Eksperimen dengan 1e-4 atau 5e-5
WARMUP_EPOCHS = 5
BASE_LR_WARMUP = 1e-7 # LR awal yang sangat kecil untuk warmup

class WarmUpLearningRateScheduler(tf.keras.callbacks.Callback):
    def __init__(self, target_lr, warmup_epochs, base_lr=1e-7, verbose=0):
        super().__init__()
        self.target_lr = target_lr
        self.warmup_epochs = warmup_epochs
        self.base_lr = base_lr
        self.verbose = verbose

    def on_epoch_begin(self, epoch, logs=None):
        if epoch < self.warmup_epochs:
            lr = self.base_lr + (self.target_lr - self.base_lr) * (epoch + 1) / self.warmup_epochs
            if hasattr(self.model.optimizer, 'lr') and isinstance(self.model.optimizer.lr, tf.Variable):
                self.model.optimizer.lr.assign(lr)
            elif hasattr(self.model.optimizer, 'learning_rate') and isinstance(self.model.optimizer.learning_rate, tf.Variable):
                self.model.optimizer.learning_rate.assign(lr)
            if self.verbose > 0:
                print(f"\nEpoch {epoch+1}: Warmup learning rate set to {lr:.7f}")
        elif epoch == self.warmup_epochs:
             if hasattr(self.model.optimizer, 'lr') and isinstance(self.model.optimizer.lr, tf.Variable):
                self.model.optimizer.lr.assign(self.target_lr)
             elif hasattr(self.model.optimizer, 'learning_rate') and isinstance(self.model.optimizer.learning_rate, tf.Variable):
                self.model.optimizer.learning_rate.assign(self.target_lr)
             if self.verbose > 0:
                print(f"\nEpoch {epoch+1}: Learning rate set to target {self.target_lr:.7f} after warmup.")

reduce_lr_cb = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-8, verbose=1)
early_stopping_cb = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1)
model_checkpoint_cb = ModelCheckpoint('best_model_optimized.keras', monitor='val_loss', save_best_only=True, mode='min', verbose=1)
warmup_scheduler_cb = WarmUpLearningRateScheduler(target_lr=INITIAL_LR, warmup_epochs=WARMUP_EPOCHS, base_lr=BASE_LR_WARMUP, verbose=1)

callbacks_list = [reduce_lr_cb, early_stopping_cb, model_checkpoint_cb, warmup_scheduler_cb]

# Pilih loss function:
# chosen_loss = tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1)
# use_class_weights_in_fit = True

chosen_loss = focal_loss(gamma=2.0, alpha=0.25) # Eksperimen dengan parameter focal loss
use_class_weights_in_fit = False # Umumnya tidak perlu dengan focal loss, tapi bisa dieksperimen

model.compile(
    optimizer=tf.keras.optimizers.AdamW(learning_rate=INITIAL_LR, weight_decay=1e-5), # Coba AdamW
    loss=chosen_loss,
    metrics=['accuracy', tf.keras.metrics.Precision(name='precision'), tf.keras.metrics.Recall(name='recall')]
)
model.summary()

## 7. Pelatihan Model
Latih model dengan data generator dan callbacks yang telah disiapkan. Tambah jumlah epoch jika `EarlyStopping` menghentikan terlalu dini.

In [None]:
print("\nStarting model training...")
EPOCHS_TO_TRAIN = 50 # EarlyStopping akan mengontrol jumlah epoch sebenarnya

history = model.fit(
    train_generator,
    epochs=EPOCHS_TO_TRAIN,
    validation_data=val_generator,
    callbacks=callbacks_list,
    verbose=1,
    class_weight=active_class_weights if use_class_weights_in_fit else None
)


Starting model training...


  self._warn_if_super_not_called()



Epoch 1: Warmup learning rate set to 0.0000401
Epoch 1/50
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.1087 - loss: 0.8636 - precision: 0.0000e+00 - recall: 0.0000e+00
Epoch 1: val_loss improved from inf to 0.80835, saving model to best_model_optimized.keras
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m361s[0m 3s/step - accuracy: 0.1087 - loss: 0.8635 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.1088 - val_loss: 0.8084 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - learning_rate: 2.0000e-04

Epoch 2: Warmup learning rate set to 0.0000801
Epoch 2/50
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.1145 - loss: 0.7937 - precision: 0.0000e+00 - recall: 0.0000e+00
Epoch 2: val_loss improved from 0.80835 to 0.74184, saving model to best_model_optimized.keras
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 2s/step - accuracy: 0.1145 - loss: 0.7936 - 

## 8. Visualisasi Hasil Training
Plot kurva akurasi dan loss untuk training dan validasi.

In [None]:
def plot_training_history(history_to_plot):
    acc = history_to_plot.history.get('accuracy', [])
    val_acc = history_to_plot.history.get('val_accuracy', [])
    loss = history_to_plot.history.get('loss', [])
    val_loss = history_to_plot.history.get('val_loss', [])

    epochs_range = range(len(acc))

    plt.figure(figsize=(14, 6))

    plt.subplot(1, 2, 1)
    if acc and val_acc:
        plt.plot(epochs_range, acc, label='Training Accuracy')
        plt.plot(epochs_range, val_acc, label='Validation Accuracy')
        plt.legend(loc='lower right')
        plt.title('Training and Validation Accuracy')
    else:
        plt.text(0.5, 0.5, 'Accuracy data not available', ha='center')

    plt.subplot(1, 2, 2)
    if loss and val_loss:
        plt.plot(epochs_range, loss, label='Training Loss')
        plt.plot(epochs_range, val_loss, label='Validation Loss')
        plt.legend(loc='upper right')
        plt.title('Training and Validation Loss')
    else:
        plt.text(0.5, 0.5, 'Loss data not available', ha='center')

    plt.tight_layout()
    plt.show()

if history:
    plot_training_history(history)

## 9. Evaluasi Model Lebih Lanjut
Gunakan fungsi evaluasi lanjutan untuk melihat metrik per kelas dan confusion matrix.

In [None]:
def advanced_evaluation(model_to_eval, test_gen, class_names_param):
    test_gen.reset() # Pastikan generator kembali ke awal
    y_pred_proba = model_to_eval.predict(test_gen, verbose=1)
    y_pred_classes_eval = np.argmax(y_pred_proba, axis=1)
    y_true_eval = test_gen.classes
    
    print("\nDetailed Classification Report:")
    print(classification_report(y_true_eval, y_pred_classes_eval, target_names=class_names_param, zero_division=0))
    
    cm = confusion_matrix(y_true_eval, y_pred_classes_eval)
    cm_normalized = confusion_matrix(y_true_eval, y_pred_classes_eval, normalize='true')
    
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))
    
    disp1 = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names_param)
    disp1.plot(ax=ax1, cmap='Blues', xticks_rotation='vertical')
    ax1.set_title("Raw Confusion Matrix")
    
    disp2 = ConfusionMatrixDisplay(confusion_matrix=cm_normalized, display_labels=class_names_param)
    disp2.plot(ax=ax2, cmap='Blues', xticks_rotation='vertical')
    ax2.set_title("Normalized Confusion Matrix")
    
    plt.tight_layout()
    plt.show()
    
    print("\nPer-class Analysis:")
    for i, class_name_eval in enumerate(class_names_param):
        class_mask = (y_true_eval == i)
        if np.sum(class_mask) > 0:
            class_accuracy = np.sum((y_pred_classes_eval == i) & class_mask) / np.sum(class_mask)
            print(f"{class_name_eval}: {class_accuracy:.3f} accuracy ({np.sum(class_mask)} samples)")

test_datagen = ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_directory(
    test_dir,
    target_size=img_size,
    batch_size=1, # Batch size 1 untuk evaluasi akurat per sampel
    class_mode='categorical',
    shuffle=False
)

# Load model terbaik (jika disimpan dalam format .keras)
# try:
#     print("Loading best model from 'best_model_optimized.keras'...")
#     model = tf.keras.models.load_model('best_model_optimized.keras', custom_objects={'focal_loss_fixed': focal_loss() if chosen_loss.__name__ == 'focal_loss' else None})
# except Exception as e:
#     print(f"Could not load .keras model: {e}. Ensure custom objects are passed if needed.")
#     print("Using the model from training history.")

print("\nEvaluating model on test set...")
eval_results = model.evaluate(test_generator, verbose=1)
print(f"\nTest Results (from model.evaluate):")
for metric_name, metric_value in zip(model.metrics_names, eval_results):
    print(f"{metric_name}: {metric_value:.4f}")

advanced_evaluation(model, test_generator, class_names)

## 10. Prediksi dengan Threshold
Fungsi untuk prediksi gambar tunggal dengan *confidence threshold* untuk mengurangi *false positives*.

In [None]:
def predict_with_threshold(model_to_predict, image_path, img_size_param, class_names_param, conf_threshold=0.7):
    try:
        img = Image.open(image_path).convert('RGB')
    except FileNotFoundError:
        print(f"Error: Image not found at {image_path}")
        return None, None
    
    img_resized = img.resize(img_size_param)
    img_array = np.expand_dims(np.array(img_resized) / 255.0, axis=0)
    
    # Jika menggunakan preprocess_input dari Lambda layer, model sudah menanganinya.
    # Jika tidak, dan preprocess_input belum diterapkan, lakukan di sini.
    # img_array_preprocessed = PREPROCESS_INPUT_FUNC(img_array * 255) # Jika rescale 1./255 sudah dilakukan, kalikan 255 dulu
    # prediction = model_to_predict.predict(img_array_preprocessed)
    prediction = model_to_predict.predict(img_array) # Asumsi model sudah ada Lambda preprocess_input

    confidence = np.max(prediction)
    predicted_class_idx = np.argmax(prediction)
    predicted_class = class_names_param[predicted_class_idx]
    
    if confidence < conf_threshold:
        predicted_class_label = f"Uncertain (Conf: {confidence:.2f}) - Original: {predicted_class}"
    else:
        predicted_class_label = predicted_class

    print(f"Predicted Class Label: {predicted_class_label}")
    print(f"Original Predicted Class (highest prob): {predicted_class}")
    print(f"Confidence: {confidence:.4f}")
    
    # Tampilkan probabilitas untuk semua kelas
    # probabilities_str = ", ".join([f"{name}: {prob:.3f}" for name, prob in zip(class_names_param, prediction[0])])
    # print(f"All probabilities: {{ {probabilities_str} }}")
    
    plt.imshow(img) # Tampilkan gambar asli
    plt.title(f"{predicted_class_label} (Confidence: {confidence:.3f})", fontsize=10)
    plt.axis('off')
    plt.show()
    
    return predicted_class, confidence

# Contoh penggunaan (ganti dengan path gambar Anda yang valid)
# Pastikan direktori dan file ada. Anda bisa mengambil path dari test_generator.filenames
if test_generator.samples > 0:
    example_image_path = os.path.join(test_dir, test_generator.filenames[0])
    print(f"\nPredicting example image: {example_image_path}")
    if os.path.exists(example_image_path):
        predict_with_threshold(model, example_image_path, img_size, class_names, conf_threshold=0.5)
    else:
        print(f"Example image not found: {example_image_path}")
else:
    print("No samples in test_generator to predict.")