<a href="https://colab.research.google.com/github/Triniti0/klasifikasi-penyakit-daun-cabai/blob/main/Mcemar_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# McNemar test untuk model baseline dan model dengan trade-off terbaik

In [2]:
from google.colab import drive
drive.mount('/content/drive')

import os
import json
import numpy as np
import tensorflow as tf
from statsmodels.stats.contingency_tables import mcnemar
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input

Mounted at /content/drive


In [3]:
baseline_model = tf.keras.models.load_model(
    "/content/drive/MyDrive/Seminar Hasil/Experiments/Baseline/Baseline_Model.h5"
)

best_model = tf.keras.models.load_model(
    "/content/drive/MyDrive/Seminar Hasil/Experiments/Partial_FineTuning/Best_Model_Partial_FineTuning_LR0.0001_OPTsgd_BS16_DR0.2_SCHstep.h5"
)



In [4]:
IMG_SIZE = 224
BATCH_SIZE = 32

TEST_PATH = "/content/drive/MyDrive/Seminar Hasil/dataset_processed/test"

test_ds = tf.keras.utils.image_dataset_from_directory(
    TEST_PATH,
    image_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    shuffle=False
)

class_names = test_ds.class_names
print("Class order:", class_names)

def preprocess(x, y):
    return preprocess_input(x), y

test_ds = test_ds.map(preprocess)

Found 620 files belonging to 8 classes.
Class order: ['Anthracnose', 'Bacterial Spot', 'Cercospora Leaf Spot', 'Curl Virus', 'Healthy Leaf', 'Nutrition Deficiency', 'White spot', 'yellow disease']


In [5]:
baseline_loss, baseline_acc = baseline_model.evaluate(test_ds, verbose=0)
best_loss, best_acc = best_model.evaluate(test_ds, verbose=0)

print("Baseline Accuracy:", baseline_acc)
print("Optimized Accuracy:", best_acc)

Baseline Accuracy: 0.8725806474685669
Optimized Accuracy: 0.9967741966247559


In [6]:
y_true = []
y_pred_baseline = []
y_pred_best = []

for images, labels in test_ds:
    pred_base = baseline_model.predict(images, verbose=0)
    pred_best = best_model.predict(images, verbose=0)

    y_true.extend(labels.numpy())
    y_pred_baseline.extend(np.argmax(pred_base, axis=1))
    y_pred_best.extend(np.argmax(pred_best, axis=1))

y_true = np.array(y_true)
y_pred_baseline = np.array(y_pred_baseline)
y_pred_best = np.array(y_pred_best)

In [7]:
print("Manual Baseline Acc:", np.mean(y_pred_baseline == y_true))
print("Manual Optimized Acc:", np.mean(y_pred_best == y_true))
print("Total Samples:", len(y_true))

Manual Baseline Acc: 0.8725806451612903
Manual Optimized Acc: 0.9967741935483871
Total Samples: 620


In [8]:
baseline_correct = (y_pred_baseline == y_true)
optimized_correct = (y_pred_best == y_true)

n11 = np.sum(baseline_correct & optimized_correct)
n00 = np.sum(~baseline_correct & ~optimized_correct)
n01 = np.sum(~baseline_correct & optimized_correct)
n10 = np.sum(baseline_correct & ~optimized_correct)

print("\nContingency Table:")
print("n11 (keduanya benar):", n11)
print("n00 (keduanya salah):", n00)
print("n01 (baseline salah, optimasi benar):", n01)
print("n10 (baseline benar, optimasi salah):", n10)


Contingency Table:
n11 (keduanya benar): 540
n00 (keduanya salah): 1
n01 (baseline salah, optimasi benar): 78
n10 (baseline benar, optimasi salah): 1


In [9]:
table = [[n11, n01],
         [n10, n00]]

result = mcnemar(table, exact=False, correction=True)

print("\nMcNemar Statistic:", result.statistic)
print("p-value:", result.pvalue)


McNemar Statistic: 73.11392405063292
p-value: 1.2237811471935489e-17


In [10]:
alpha = 0.05

if result.pvalue < alpha:
    print("\nPerbedaan SIGNIFIKAN secara statistik (p < 0.05)")
else:
    print("\nTidak ada perbedaan signifikan secara statistik (p â‰¥ 0.05)")


Perbedaan SIGNIFIKAN secara statistik (p < 0.05)


# McNemar test untuk semua model yang digunakan (McNemar Pairwise)

In [11]:
model_feature = tf.keras.models.load_model("/content/drive/MyDrive/Seminar Hasil/Experiments/Feature_Extraction/best_LR0.001_OPTadam_BS32_DR0.3.h5")
model_partial = tf.keras.models.load_model("/content/drive/MyDrive/Seminar Hasil/Experiments/Partial_FineTuning/Best_Model_Partial_FineTuning_LR0.0001_OPTsgd_BS16_DR0.2_SCHstep.h5")
model_progressive = tf.keras.models.load_model("/content/drive/MyDrive/Seminar Hasil/Experiments/Progressive_FineTuning/Best_Model_Progressive_FineTuning_EPOCHS20.h5")
model_two_stage = tf.keras.models.load_model("/content/drive/MyDrive/Seminar Hasil/Experiments/Two_Stage_Training/Best_Model_Two_Stage_Training.h5")



In [12]:
y_true = []

pred_feature = []
pred_partial = []
pred_progressive = []
pred_two_stage = []

for images, labels in test_ds:
    y_true.extend(labels.numpy())

    pred_feature.extend(np.argmax(model_feature.predict(images, verbose=0), axis=1))
    pred_partial.extend(np.argmax(model_partial.predict(images, verbose=0), axis=1))
    pred_progressive.extend(np.argmax(model_progressive.predict(images, verbose=0), axis=1))
    pred_two_stage.extend(np.argmax(model_two_stage.predict(images, verbose=0), axis=1))

y_true = np.array(y_true)
pred_feature = np.array(pred_feature)
pred_partial = np.array(pred_partial)
pred_progressive = np.array(pred_progressive)
pred_two_stage = np.array(pred_two_stage)



In [13]:
from statsmodels.stats.contingency_tables import mcnemar

def mcnemar_test(pred1, pred2, y_true, name1, name2):

    correct1 = (pred1 == y_true)
    correct2 = (pred2 == y_true)

    n01 = np.sum(~correct1 & correct2)
    n10 = np.sum(correct1 & ~correct2)

    table = [[0, n01],
             [n10, 0]]

    result = mcnemar(table, exact=False, correction=True)

    print(f"\n{name1} vs {name2}")
    print("n01:", n01)
    print("n10:", n10)
    print("p-value:", result.pvalue)

    return result.pvalue

In [14]:
p_values = []

p_values.append(mcnemar_test(pred_partial, pred_progressive, y_true, "Partial", "Progressive"))
p_values.append(mcnemar_test(pred_partial, pred_feature, y_true, "Partial", "Feature"))
p_values.append(mcnemar_test(pred_partial, pred_two_stage, y_true, "Partial", "Two-Stage"))
p_values.append(mcnemar_test(pred_progressive, pred_feature, y_true, "Progressive", "Feature"))
p_values.append(mcnemar_test(pred_progressive, pred_two_stage, y_true, "Progressive", "Two-Stage"))
p_values.append(mcnemar_test(pred_feature, pred_two_stage, y_true, "Feature", "Two-Stage"))


Partial vs Progressive
n01: 0
n10: 1
p-value: 1.0

Partial vs Feature
n01: 0
n10: 4
p-value: 0.13361440253771584

Partial vs Two-Stage
n01: 0
n10: 4
p-value: 0.13361440253771584

Progressive vs Feature
n01: 0
n10: 3
p-value: 0.24821307898992026

Progressive vs Two-Stage
n01: 1
n10: 4
p-value: 0.37109336952269756

Feature vs Two-Stage
n01: 3
n10: 3
p-value: 0.6830913983096086


In [15]:
#bonferonni correction
alpha = 0.05
bonf_alpha = alpha / 6

print("\nBonferroni corrected alpha:", bonf_alpha)

for i, p in enumerate(p_values):
    if p < bonf_alpha:
        print(f"Comparison {i+1} SIGNIFICANT")
    else:
        print(f"Comparison {i+1} NOT significant")


Bonferroni corrected alpha: 0.008333333333333333
Comparison 1 NOT significant
Comparison 2 NOT significant
Comparison 3 NOT significant
Comparison 4 NOT significant
Comparison 5 NOT significant
Comparison 6 NOT significant


In [16]:
#menghitung effect size antar model
def pairwise_effect_size(pred1, pred2, y_true, name1, name2):

    correct1 = (pred1 == y_true)
    correct2 = (pred2 == y_true)

    n01 = np.sum(~correct1 & correct2)
    n10 = np.sum(correct1 & ~correct2)
    N = len(y_true)

    # Odds Ratio
    if n10 == 0:
        odds_ratio = np.inf
    else:
        odds_ratio = n01 / n10

    # Cohen's g
    cohens_g = (n01 - n10) / N

    print(f"\nEffect Size: {name1} vs {name2}")
    print("n01:", n01)
    print("n10:", n10)
    print("Odds Ratio:", odds_ratio)
    print("Cohen's g:", cohens_g)

    return odds_ratio, cohens_g

In [17]:
pairwise_effect_size(pred_partial, pred_progressive, y_true, "Partial", "Progressive")
pairwise_effect_size(pred_partial, pred_feature, y_true, "Partial", "Feature")
pairwise_effect_size(pred_partial, pred_two_stage, y_true, "Partial", "Two-Stage")
pairwise_effect_size(pred_progressive, pred_feature, y_true, "Progressive", "Feature")
pairwise_effect_size(pred_progressive, pred_two_stage, y_true, "Progressive", "Two-Stage")
pairwise_effect_size(pred_feature, pred_two_stage, y_true, "Feature", "Two-Stage")


Effect Size: Partial vs Progressive
n01: 0
n10: 1
Odds Ratio: 0.0
Cohen's g: -0.0016129032258064516

Effect Size: Partial vs Feature
n01: 0
n10: 4
Odds Ratio: 0.0
Cohen's g: -0.0064516129032258064

Effect Size: Partial vs Two-Stage
n01: 0
n10: 4
Odds Ratio: 0.0
Cohen's g: -0.0064516129032258064

Effect Size: Progressive vs Feature
n01: 0
n10: 3
Odds Ratio: 0.0
Cohen's g: -0.004838709677419355

Effect Size: Progressive vs Two-Stage
n01: 1
n10: 4
Odds Ratio: 0.25
Cohen's g: -0.004838709677419355

Effect Size: Feature vs Two-Stage
n01: 3
n10: 3
Odds Ratio: 1.0
Cohen's g: 0.0


(np.float64(1.0), np.float64(0.0))