Library

In [7]:
# --- Import Library yang Diperlukan ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import json # Tambahan: untuk menyimpan JSON

# Library TensorFlow dan Keras untuk memuat model
import tensorflow as tf
from tensorflow.keras.metrics import AUC, Precision, Recall, BinaryAccuracy # Metrik untuk multi-label

# Library Scikit-learn untuk metrik evaluasi
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_auc_score,
    f1_score,
    precision_recall_curve,
    PrecisionRecallDisplay
)
from sklearn.model_selection import train_test_split # Untuk split data (meskipun data sudah di-split sebelumnya, ini untuk konsistensi)

Global Configuration

In [8]:
# --- 0. Global configuration (keep consistent with training) ---
# These values must be exactly the same as the configuration used when the model was trained.
# The evaluation cells index the label columns in this order.
LABELS_FINAL = [
    'battery', 'organik', 'glass', 'cardboard',
    'metal', 'paper', 'plastic', 'trash',
]

# Target image size of the model and the seed that makes the split reproducible.
IMG_SIZE = (224, 224)
RANDOM_SEED = 42

# Dataset root: a 'dataset' folder that is a sibling of the current working directory
# (i.e. the notebook is expected to be launched from its own sub-folder of the project).
_WORKING_DIR = os.getcwd()
BASE_DIR_DATASET = os.path.abspath(os.path.join(_WORKING_DIR, os.pardir, 'dataset'))

# Preprocessed arrays and the trained checkpoint all live under the dataset root.
_CHECKPOINT_DIR = os.path.join(BASE_DIR_DATASET, 'checkpoints')
CHECKPOINT_PATH = os.path.join(_CHECKPOINT_DIR, 'best_model.h5')
X_DATA_PATH = os.path.join(BASE_DIR_DATASET, 'X_data.npy')
Y_LABELS_PATH = os.path.join(BASE_DIR_DATASET, 'Y_labels.npy')

# Evaluation artefacts are written next to the notebook; create the folder up front.
EVAL_RESULTS_DIR = os.path.join(_WORKING_DIR, 'evaluation_results')
os.makedirs(EVAL_RESULTS_DIR, exist_ok=True)

Load Data dan Model

In [9]:

print("="*50)
print("--- BAGIAN 1: MEMUAT DATA DAN MODEL UNTUK EVALUASI ---")
print("="*50)

# --- 1. Load the preprocessed data (NumPy arrays) ---
print("\n--- 1. Memuat Data X_data.npy dan Y_labels.npy ---")
try:
    X = np.load(X_DATA_PATH)
    Y = np.load(Y_LABELS_PATH)
    print(f"Data X.npy dimuat dengan bentuk: {X.shape}")
    print(f"Data Y.npy dimuat dengan bentuk: {Y.shape}")
except FileNotFoundError:
    print(f"ERROR: File '{X_DATA_PATH}' atau '{Y_LABELS_PATH}' tidak ditemukan.")
    print("Pastikan Anda sudah menjalankan skrip preprocessing dan menyimpannya.")
    # Re-raise rather than calling exit(): inside a notebook exit() asks the kernel
    # to shut down, which discards all state and hides the traceback. Raising stops
    # "Run All" at this cell and keeps the kernel alive for inspection.
    raise

# Every later cell indexes the label columns by position in LABELS_FINAL, so a
# mismatch here would silently attach metrics to the wrong label names.
if Y.ndim != 2 or Y.shape[1] != len(LABELS_FINAL):
    raise ValueError(
        f"Y_labels.npy berbentuk {Y.shape}, tetapi LABELS_FINAL berisi {len(LABELS_FINAL)} label."
    )

# --- 2. Split the data (repeat the training-time split to recover the test set) ---
print("\n--- 2. Membagi Data menjadi Train, Validation, dan Test Set (untuk konsistensi) ---")
# First split: train+val vs test (15% held out for test).
X_train_val, X_test, y_train_val, y_test = train_test_split(X, Y, test_size=0.15, random_state=RANDOM_SEED, shuffle=True)
# Second split: train vs val (not used by this notebook, kept for consistency with training).
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=RANDOM_SEED, shuffle=True)

print(f"Bentuk data uji (X_test, y_test): {X_test.shape}, {y_test.shape}")


# --- 3. Load the best trained model from the checkpoint ---
print("\n--- 3. Memuat Model Terbaik dari Checkpoint ---")
try:
    best_model = tf.keras.models.load_model(CHECKPOINT_PATH)
    print(f"Model terbaik berhasil dimuat kembali dari: {CHECKPOINT_PATH}")
    best_model.summary()
except Exception as e:
    print(f"ERROR: Gagal memuat model terbaik dari '{CHECKPOINT_PATH}'.")
    print(f"Pesan Error: {e}")
    print("Pastikan Anda sudah menjalankan skrip pelatihan dan model disimpan dengan benar.")
    # Same reasoning as above: surface the original exception instead of killing the kernel.
    raise

--- BAGIAN 1: MEMUAT DATA DAN MODEL UNTUK EVALUASI ---

--- 1. Memuat Data X_data.npy dan Y_labels.npy ---
Data X.npy dimuat dengan bentuk: (4992, 224, 224, 3)
Data Y.npy dimuat dengan bentuk: (4992, 8)

--- 2. Membagi Data menjadi Train, Validation, dan Test Set (untuk konsistensi) ---
Bentuk data uji (X_test, y_test): (749, 224, 224, 3), (749, 8)

--- 3. Memuat Model Terbaik dari Checkpoint ---




Model terbaik berhasil dimuat kembali dari: d:\Kuroya\Kuliah\sampah-multilabel-ai\dataset\checkpoints\best_model.h5


Prediksi dan Evaluasi Dasar

In [10]:
print("\n\n" + "="*50)
print("--- BAGIAN 2: PREDIKSI DAN EVALUASI DASAR ---")
print("="*50)

# --- 1. Predict per-label probabilities on the test set ---
print("\n--- 1. Melakukan Prediksi Probabilitas pada Test Set ---")
y_pred_proba = best_model.predict(X_test, verbose=1)
print(f"Bentuk probabilitas prediksi: {y_pred_proba.shape}")
print("Contoh Probabilitas Prediksi (5 sampel pertama):\n", y_pred_proba[:5].round(3))


# --- 2. Standard thresholding (0.5) ---
# Convert probabilities into binary labels using a fixed 0.5 threshold.
print("\n--- 2. Menerapkan Threshold 0.5 untuk Prediksi Biner ---")
threshold = 0.5
y_pred_thresholded = (y_pred_proba > threshold).astype(int)
print("Contoh Prediksi Biner (5 sampel pertama):\n", y_pred_thresholded[:5])
print("Contoh Label Sebenarnya (5 sampel pertama):\n", y_test[:5].astype(int))

# --- 3. Evaluate the model on the test set (Keras metrics) ---
print("\n--- 3. Evaluasi Model dengan Metrik yang Digunakan Saat Pelatihan ---")
# Ask Keras for a {metric name: value} mapping. Pairing `metrics_names` with the
# list returned by evaluate() via zip() is fragile: in recent Keras versions
# `metrics_names` can collapse the compiled metrics into a single entry, so zip()
# would mislabel or silently drop values.
evaluation_by_name = best_model.evaluate(X_test, y_test, verbose=1, return_dict=True)

# Keep the two original names available, now guaranteed to be aligned.
metrics_names = list(evaluation_by_name.keys())
evaluation_results = list(evaluation_by_name.values())

print("\nHasil Evaluasi pada Test Set:")
for name, value in evaluation_by_name.items():
    print(f"- {name}: {value:.4f}")



--- BAGIAN 2: PREDIKSI DAN EVALUASI DASAR ---

--- 1. Melakukan Prediksi Probabilitas pada Test Set ---
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 686ms/step
Bentuk probabilitas prediksi: (749, 8)
Contoh Probabilitas Prediksi (5 sampel pertama):
 [[0.295 0.931 0.292 0.383 0.285 0.369 0.18  0.512]
 [0.711 0.25  0.48  0.427 0.534 0.293 0.188 0.4  ]
 [0.24  0.114 0.274 0.845 0.373 0.398 0.519 0.514]
 [0.108 0.692 0.381 0.378 0.267 0.626 0.419 0.789]
 [0.016 0.071 0.145 0.978 0.111 0.528 0.293 0.445]]

--- 2. Menerapkan Threshold 0.5 untuk Prediksi Biner ---
Contoh Prediksi Biner (5 sampel pertama):
 [[0 1 0 0 0 0 0 1]
 [1 0 0 0 1 0 0 0]
 [0 0 0 1 0 0 1 1]
 [0 1 0 0 0 1 0 1]
 [0 0 0 1 0 1 0 0]]
Contoh Label Sebenarnya (5 sampel pertama):
 [[1 1 1 0 0 1 0 0]
 [1 0 1 1 0 1 0 0]
 [0 0 1 1 1 0 0 1]
 [0 1 0 0 1 0 1 1]
 [0 0 0 1 0 0 1 1]]

--- 3. Evaluasi Model dengan Metrik yang Digunakan Saat Pelatihan ---
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m

Evaluasi Mendalam Multi-Label

In [11]:
banner_line = "=" * 50
print("\n\n" + banner_line)
print("--- BAGIAN 3: EVALUASI MENDALAM MODEL MULTI-LABEL ---")
print(banner_line)

# --- 1. Classification report: precision, recall and F1 per label (threshold 0.5) ---
print("\n--- 1. Classification Report (per Label, Threshold 0.5) ---")
report_05 = classification_report(
    y_test,
    y_pred_thresholded,
    target_names=LABELS_FINAL,
    zero_division=0,
)
print(report_05)

# Persist the plain-text report alongside the other evaluation artefacts.
report_05_filename = os.path.join(EVAL_RESULTS_DIR, "classification_report_threshold_05.txt")
with open(report_05_filename, "w") as report_file:
    report_file.write(report_05)
print(f"Classification Report (threshold 0.5) disimpan ke: {report_05_filename}")


# --- 2. Aggregate metric summary (taken from the classification report) ---
print("\n--- 2. Ringkasan Metrik Agregat (dari Classification Report) ---")
report_dict_05 = classification_report(
    y_test,
    y_pred_thresholded,
    target_names=LABELS_FINAL,
    output_dict=True,
    zero_division=0,
)

# The report keys are the lower-cased titles ('micro avg', 'macro avg', 'weighted avg').
# Every group after the first is separated from the previous one by a blank line.
for position, title in enumerate(("Micro Avg", "Macro Avg", "Weighted Avg")):
    averaged = report_dict_05[title.lower()]
    lead = "\n" if position else ""
    print(f"{lead}{title} Precision: {averaged['precision']:.4f}")
    print(f"{title} Recall:    {averaged['recall']:.4f}")
    print(f"{title} F1-Score:  {averaged['f1-score']:.4f}")

# --- 3. Overall ROC AUC (macro and weighted), computed from the raw probabilities ---
print("\n--- 3. Overall ROC AUC Score (Macro & Weighted) ---")
overall_auc_macro, overall_auc_weighted = (
    roc_auc_score(y_test, y_pred_proba, average=averaging)
    for averaging in ('macro', 'weighted')
)
print(f"Overall ROC AUC (Macro): {overall_auc_macro:.4f}")
print(f"Overall ROC AUC (Weighted): {overall_auc_weighted:.4f}")


# --- 4. One confusion matrix per label (threshold 0.5) ---
print("\n--- 4. Confusion Matrix (per Label, Threshold 0.5) ---")
for column, label in enumerate(LABELS_FINAL):
    true_column = y_test[:, column]
    predicted_column = y_pred_thresholded[:, column]
    cm = confusion_matrix(true_column, predicted_column)

    print(f"\nLabel: {label}")
    print(cm)

    # Each label is treated as its own binary problem: "Not <label>" vs "<label>".
    class_names = ["Not " + label, label]
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(cmap=plt.cm.Blues)
    plt.title(f'Confusion Matrix - {label}')
    plt.grid(False)

    # Save the plot, then close it so figures do not pile up in memory across labels.
    cm_filename = os.path.join(EVAL_RESULTS_DIR, f"confusion_matrix_{label}_05_threshold.png")
    plt.savefig(cm_filename)
    plt.close()
    print(f"Confusion Matrix untuk {label} disimpan ke: {cm_filename}")

# --- 5. ROC AUC per label ---
print("\n--- 5. ROC AUC Score (per Label) ---")
for column, label in enumerate(LABELS_FINAL):
    try:
        label_auc = roc_auc_score(y_test[:, column], y_pred_proba[:, column])
    except ValueError:
        # roc_auc_score raised ValueError for this column; report it and move on
        # so that one label does not stop the rest of the evaluation.
        print(f"- {label:<10}: AUC tidak dapat dihitung (label terlalu imbalanced atau konstan)")
        continue
    print(f"- {label:<10}: AUC = {label_auc:.4f}")

# --- 6. Search the F1-maximising threshold for every label ---
# NOTE(review): the thresholds are selected on y_test, the same split the reports
# further down are computed on, so the "optimal" numbers are optimistic. Consider
# tuning on the validation split instead - TODO confirm with the training setup.
print("\n--- 6. Mencari Threshold Optimal (Maksimalkan F1-Score) per Label ---")
optimal_thresholds = {}
for column, label in enumerate(LABELS_FINAL):
    precisions, recalls, thresholds_arr = precision_recall_curve(
        y_test[:, column], y_pred_proba[:, column]
    )
    # F1 at every operating point; the epsilon keeps the division defined when
    # precision and recall are both zero.
    f1_numerator = 2 * (precisions * recalls)
    f1_denominator = precisions + recalls + 1e-10
    f1_scores = f1_numerator / f1_denominator

    if len(f1_scores) == 0:
        print(f"- {label:<10}: Tidak dapat menemukan F1-score optimal (data tidak cukup bervariasi).")
        continue

    optimal_idx = np.argmax(f1_scores)
    # The precision/recall arrays may be longer than the threshold array, so an
    # index past its end falls back to the last threshold.
    if optimal_idx >= len(thresholds_arr):
        chosen_threshold = thresholds_arr[-1]
    else:
        chosen_threshold = thresholds_arr[optimal_idx]
    # Store a plain Python float so the dictionary is JSON-serialisable.
    optimal_thresholds[label] = float(chosen_threshold)
    print(f"- {label:<10}: Optimal Threshold = {optimal_thresholds[label]:.4f} (F1-score maks: {f1_scores[optimal_idx]:.4f})")

# Persist the thresholds next to the other evaluation artefacts.
optimal_thresholds_filename = os.path.join(EVAL_RESULTS_DIR, "optimal_thresholds.json")
with open(optimal_thresholds_filename, "w") as thresholds_file:
    json.dump(optimal_thresholds, thresholds_file, indent=2)
print(f"Threshold optimal disimpan ke: {optimal_thresholds_filename}")


# --- 7. Classification report using the per-label optimal thresholds ---
print("\n--- 7. Classification Report (per Label, Threshold Optimal) ---")
y_pred_optimal_thresholded = np.zeros_like(y_pred_proba, dtype=int)
for i, label in enumerate(LABELS_FINAL):
    if label in optimal_thresholds:
        # The thresholds come from precision_recall_curve, where a threshold t means
        # "predict positive when score >= t". Using a strict '>' here would drop the
        # sample(s) sitting exactly on the chosen threshold, so the realised F1 would
        # differ from the maximum found during the search.
        y_pred_optimal_thresholded[:, i] = (y_pred_proba[:, i] >= optimal_thresholds[label]).astype(int)
    else:
        # No tuned threshold for this label: fall back to the fixed 0.5 rule used
        # for the baseline predictions.
        y_pred_optimal_thresholded[:, i] = (y_pred_proba[:, i] > 0.5).astype(int)

report_optimal = classification_report(y_test, y_pred_optimal_thresholded, target_names=LABELS_FINAL, zero_division=0)
print(report_optimal)

# Persist the plain-text report for the optimal thresholds.
report_optimal_filename = os.path.join(EVAL_RESULTS_DIR, "classification_report_threshold_optimal.txt")
with open(report_optimal_filename, "w") as f:
    f.write(report_optimal)
print(f"Classification Report (threshold optimal) disimpan ke: {report_optimal_filename}")


# --- 8. F1 comparison: fixed 0.5 threshold vs per-label optimal threshold ---
print("\n--- 8. Perbandingan F1-Score (0.5 threshold vs Optimal threshold) ---")
report_05_dict = classification_report(
    y_test,
    y_pred_thresholded,
    target_names=LABELS_FINAL,
    output_dict=True,
    zero_division=0,
)
report_optimal_dict = classification_report(
    y_test,
    y_pred_optimal_thresholded,
    target_names=LABELS_FINAL,
    output_dict=True,
    zero_division=0,
)

for label in LABELS_FINAL:
    # A label missing from a report counts as F1 = 0.0.
    f1_05 = 0.0
    if label in report_05_dict:
        f1_05 = report_05_dict[label]['f1-score']
    f1_opt = 0.0
    if label in report_optimal_dict:
        f1_opt = report_optimal_dict[label]['f1-score']
    print(f"- {label:<10}: F1 (0.5) = {f1_05:.4f}, F1 (Optimal) = {f1_opt:.4f}")

# Macro F1 computed by hand: plain mean of the per-label F1 values that are present.
optimal_f1_values = []
for label in LABELS_FINAL:
    if label not in report_optimal_dict:
        continue
    label_entry = report_optimal_dict[label]
    if 'f1-score' not in label_entry:
        continue
    optimal_f1_values.append(label_entry['f1-score'])
macro_f1_optimal = np.mean(optimal_f1_values)
print(f"\nMacro F1-score (Optimal Threshold): {macro_f1_optimal:.4f}")


# --- 9. Performance broken down by the number of true labels per image ---
print("\n--- 9. Analisis Performa Berdasarkan Jumlah Label per Gambar ---")

num_true_labels_test = np.sum(y_test, axis=1)
unique_num_labels = np.unique(num_true_labels_test)

# One record per label count; turned into a table when the results are exported.
performance_by_num_labels = []

for n_labels in sorted(unique_num_labels):
    # Images without any true label are left out of this breakdown.
    if n_labels == 0:
        continue

    indices = np.flatnonzero(num_true_labels_test == n_labels)
    sample_count = len(indices)
    if sample_count == 0:
        continue

    # Element-wise agreement between truth and prediction over every (image, label) cell.
    cell_matches = y_test[indices] == y_pred_optimal_thresholded[indices]
    subset_binary_accuracy = np.mean(cell_matches)

    print(f"\nGambar dengan {n_labels} label:")
    print(f"  - Jumlah Sampel: {sample_count}")
    print(f"  - Binary Accuracy (optimal threshold): {subset_binary_accuracy:.4f}")
    performance_by_num_labels.append({
        'num_labels': n_labels,
        'num_samples': sample_count,
        'binary_accuracy_optimal_threshold': subset_binary_accuracy,
    })

# --- Export the per-label metrics (optimal thresholds) to CSV ---
print("\n--- Logging Metrik Per Label ke CSV ---")
label_metrics_df = pd.DataFrame.from_dict(report_optimal_dict, orient='index')
# 'support' is a sample count rather than a metric, so it is dropped.
label_metrics_df = label_metrics_df.drop(columns=['support'], errors='ignore')
# Keep only the real label rows. For a multi-label report the aggregate rows are
# 'micro avg', 'macro avg', 'weighted avg' and 'samples avg'; 'accuracy' appears
# instead of 'micro avg' in the single-label case. errors='ignore' makes the drop
# safe whichever of them are present.
aggregate_rows = ['accuracy', 'micro avg', 'macro avg', 'weighted avg', 'samples avg']
label_metrics_df = label_metrics_df.drop(index=aggregate_rows, errors='ignore')
label_metrics_df.index.name = 'label'  # name of the index column in the CSV

label_metrics_filename = os.path.join(EVAL_RESULTS_DIR, "label_metrics_optimal_threshold.csv")
label_metrics_df.to_csv(label_metrics_filename)
print(f"Metrik per label (optimal threshold) disimpan ke: {label_metrics_filename}")

# Also export the breakdown by number of true labels per image.
performance_num_labels_df = pd.DataFrame(performance_by_num_labels)
performance_num_labels_filename = os.path.join(EVAL_RESULTS_DIR, "performance_by_num_labels.csv")
performance_num_labels_df.to_csv(performance_num_labels_filename, index=False)
print(f"Performa berdasarkan jumlah label disimpan ke: {performance_num_labels_filename}")


# --- 10. Precision-recall curve per label ---
print("\n--- 10. Visualisasi Precision-Recall Curve (per Label) ---")

for i, label in enumerate(LABELS_FINAL):
    precisions, recalls, _unused_thresholds = precision_recall_curve(y_test[:, i], y_pred_proba[:, i])

    # Create the figure explicitly and hand its axes to the display. Calling
    # disp.plot() without `ax` opens a second figure, which left the 7x6 figure
    # empty and never closed (one stray "<Figure ... with 0 Axes>" per label)
    # while the saved curve ignored the requested size.
    fig, ax = plt.subplots(figsize=(7, 6))
    disp = PrecisionRecallDisplay(precision=precisions, recall=recalls)
    disp.plot(ax=ax)
    ax.set_title(f'Precision-Recall Curve - {label}')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.grid(True)

    # Save the curve, then close this exact figure to release its memory.
    pr_curve_filename = os.path.join(EVAL_RESULTS_DIR, f"precision_recall_curve_{label}.png")
    fig.savefig(pr_curve_filename)
    plt.close(fig)
    print(f"Precision-Recall Curve untuk {label} disimpan ke: {pr_curve_filename}")


# --- 11. Visualise a handful of misclassified test images ---
print("\n--- 11. Visualisasi Beberapa Kesalahan Prediksi ---")

max_errors = 5      # stop after this many misclassified samples have been saved
samples_shown = 0

for i in range(len(y_test)):
    true = y_test[i]
    pred = y_pred_optimal_thresholded[i]

    # Only samples whose full label vector differs from the prediction are of interest.
    if np.array_equal(true, pred):
        continue

    sample_number = samples_shown + 1
    print(f"\n[Contoh {sample_number}] Index: {i}")

    # Translate the binary vectors back into label names.
    true_labels = [name for j, name in enumerate(LABELS_FINAL) if true[j] == 1]
    predicted_labels = [name for j, name in enumerate(LABELS_FINAL) if pred[j] == 1]

    print(f"  True Labels     : {true_labels}")
    print(f"  Predicted Labels: {predicted_labels}")

    title_text = "True: " + ', '.join(true_labels) + "\nPred: " + ', '.join(predicted_labels)
    plt.figure(figsize=(8, 8))
    plt.imshow(X_test[i])
    plt.axis('off')
    plt.title(title_text, fontsize=12, color='red', wrap=True)

    # Save the annotated image, then close the figure.
    error_img_filename = os.path.join(EVAL_RESULTS_DIR, f"error_sample_{sample_number}.png")
    plt.savefig(error_img_filename)
    plt.close()
    print(f"Gambar kesalahan {sample_number} disimpan ke: {error_img_filename}")

    samples_shown = sample_number
    if samples_shown >= max_errors:
        break

print("\n--- Evaluasi Model Multi-Label Selesai ---")
print(f"\nSemua hasil evaluasi (laporan, threshold, grafik, gambar kesalahan) tersimpan di direktori: {EVAL_RESULTS_DIR}")



--- BAGIAN 3: EVALUASI MENDALAM MODEL MULTI-LABEL ---

--- 1. Classification Report (per Label, Threshold 0.5) ---
              precision    recall  f1-score   support

     battery       0.91      0.45      0.60       260
     organik       0.89      0.74      0.81       268
       glass       0.73      0.27      0.40       273
   cardboard       0.73      0.65      0.69       291
       metal       0.74      0.41      0.53       290
       paper       0.74      0.33      0.46       275
     plastic       0.73      0.32      0.45       279
       trash       0.59      0.37      0.46       277

   micro avg       0.76      0.44      0.56      2213
   macro avg       0.76      0.44      0.55      2213
weighted avg       0.76      0.44      0.55      2213
 samples avg       0.72      0.45      0.53      2213

Classification Report (threshold 0.5) disimpan ke: d:\Kuroya\Kuliah\sampah-multilabel-ai\notebook\evaluation_results\classification_report_threshold_05.txt

--- 2. Ringkasan Metr

<Figure size 700x600 with 0 Axes>

<Figure size 700x600 with 0 Axes>

<Figure size 700x600 with 0 Axes>

<Figure size 700x600 with 0 Axes>

<Figure size 700x600 with 0 Axes>

<Figure size 700x600 with 0 Axes>

<Figure size 700x600 with 0 Axes>

<Figure size 700x600 with 0 Axes>