# Statistical testing in binary classification

## Neccessary stuff
1. Train two or more models.
1. Evaluate trained models on one or more test datasets (it's a little bit tricky when you want to make several datasets from one).
1. Ask a question you want to answer about this models.
1. Choose statistical test equivalent to your question.
1. Measure neccessary things for this test.
1. Assume some significance level, e.g. $\alpha = 0.05$. 
1. Calculate p-value for the test.

In [1]:
%load_ext dotenv
%dotenv

In [60]:
import os
import shutil
from random import shuffle
import numpy as np
import tensorflow as tf
import scipy
from tensorflow.keras import Input
from tensorflow.keras.utils import load_img
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv2D, Dense, MaxPooling2D, Flatten, Dropout
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from statsmodels.stats.contingency_tables import mcnemar
from sklearn.metrics import accuracy_score
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## Two models, one data set, what can I do?

### Data

In [4]:
import os

# The path to store trained models
models_dir = './models/'
if not os.path.exists(models_dir):
    os.mkdir(models_dir)

# The path to the directory where the original dataset was uncompressed
original_dataset_dir = './Dogs-vs-Cats-1'
original_cat_dir = './Dogs-vs-Cats-1/Cat'
original_dog_dir = './Dogs-vs-Cats-1/Dog'

# The directory where we will store our smaller dataset
base_dir = './Dogs-vs-Cats-1/working'
if not os.path.exists(base_dir):
    os.mkdir(base_dir)

# Directories for our training, validation and test splits
train_dir = os.path.join(base_dir, 'train')
if not os.path.exists(train_dir):
    os.mkdir(train_dir)
validation_dir = os.path.join(base_dir, 'validation')
if not os.path.exists(validation_dir):
    os.mkdir(validation_dir)
test_dir = os.path.join(base_dir, 'test')
if not os.path.exists(test_dir):
    os.mkdir(test_dir)

# Directory with training cat pictures
train_cats_dir = os.path.join(train_dir, 'cats')
if not os.path.exists(train_cats_dir):
    os.mkdir(train_cats_dir)

# Directory with training dog pictures
train_dogs_dir = os.path.join(train_dir, 'dogs')
if not os.path.exists(train_dogs_dir):
    os.mkdir(train_dogs_dir)

# Directory with validation cat pictures
validation_cats_dir = os.path.join(validation_dir, 'cats')
if not os.path.exists(validation_cats_dir):
    os.mkdir(validation_cats_dir)
    
# Directory with validation dog pictures
validation_dogs_dir = os.path.join(validation_dir, 'dogs')
if not os.path.exists(validation_dogs_dir):
    os.mkdir(validation_dogs_dir)

# Directory with test cat pictures
test_cats_dir = os.path.join(test_dir, 'cats')
if not os.path.exists(test_cats_dir):
    os.mkdir(test_cats_dir)

# Directory with test dog pictures
test_dogs_dir = os.path.join(test_dir, 'dogs')
if not os.path.exists(test_dogs_dir):
    os.mkdir(test_dogs_dir)

In [16]:
# Copy first 1000 cat images to train_cats_dir
fnames = ['{}.jpg'.format(i) for i in range(1001)]
for fname in fnames:
    if fname == '666.jpg':
        continue
    src = os.path.join(original_cat_dir, fname)
    dst = os.path.join(train_cats_dir, fname)
    shutil.copyfile(src, dst)

# Copy next 500 cat images to validation_cats_dir
fnames = ['{}.jpg'.format(i) for i in range(1001, 1501)]
for fname in fnames:
    src = os.path.join(original_cat_dir, fname)
    dst = os.path.join(validation_cats_dir, fname)
    shutil.copyfile(src, dst)
    
# Copy next 500 cat images to test_cats_dir
fnames = ['{}.jpg'.format(i) for i in range(1501, 2001)]
for fname in fnames:
    src = os.path.join(original_cat_dir, fname)
    dst = os.path.join(test_cats_dir, fname)
    shutil.copyfile(src, dst)
    
# Copy first 1000 dog images to train_dogs_dir
fnames = ['{}.jpg'.format(i) for i in range(1000)]
for fname in fnames:
    src = os.path.join(original_dog_dir, fname)
    dst = os.path.join(train_dogs_dir, fname)
    shutil.copyfile(src, dst)
    
# Copy next 500 dog images to validation_dogs_dir
fnames = ['{}.jpg'.format(i) for i in range(1000, 1500)]
for fname in fnames:
    src = os.path.join(original_dog_dir, fname)
    dst = os.path.join(validation_dogs_dir, fname)
    shutil.copyfile(src, dst)
    
# Copy next 500 dog images to test_dogs_dir
fnames = ['{}.jpg'.format(i) for i in range(1500, 2000)]
for fname in fnames:
    src = os.path.join(original_dog_dir, fname)
    dst = os.path.join(test_dogs_dir, fname)
    shutil.copyfile(src, dst)

In [46]:
img_rows = 150
img_cols = 150

# data generators
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=50,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,)
test_datagen = ImageDataGenerator(rescale=1./255)

batch_size = 32

train_generator = train_datagen.flow_from_directory(
        train_dir,
        target_size=(img_rows, img_cols),
        batch_size=batch_size, 
        class_mode='binary')

val_generator = test_datagen.flow_from_directory(
        validation_dir,
        target_size=(img_rows, img_cols),
        batch_size=batch_size, 
        class_mode='binary')

test_generator = test_datagen.flow_from_directory(
        test_dir,
        target_size=(img_rows, img_cols),
        batch_size=batch_size,
        shuffle=False,
        class_mode='binary')

Found 2000 images belonging to 2 classes.
Found 1000 images belonging to 2 classes.
Found 1000 images belonging to 2 classes.


### Models

In [30]:
# First model
model1 = Sequential()
model1.add(Conv2D(32, (3, 3), activation='relu', input_shape=(img_rows, img_cols, 3)))
model1.add(MaxPooling2D((2, 2)))
model1.add(Conv2D(64, (3, 3), activation='relu'))
model1.add(MaxPooling2D((2, 2)))
model1.add(Conv2D(128, (3, 3), activation='relu'))
model1.add(MaxPooling2D((2, 2)))
model1.add(Conv2D(256, (3, 3), activation='relu'))
model1.add(MaxPooling2D((2, 2)))
model1.add(Conv2D(256, (3, 3), activation='relu'))
model1.add(MaxPooling2D((2, 2)))
model1.add(Flatten())
model1.add(Dropout(0.5))
model1.add(Dense(512, activation='relu'))
model1.add(Dense(1, activation='sigmoid'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [31]:
model1.compile(loss='binary_crossentropy',
              optimizer=RMSprop(learning_rate=3e-4),
              metrics=['acc'])

model1.fit(
      train_generator,
      batch_size=batch_size,
      epochs=30,
      validation_data=val_generator)

Epoch 1/30
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 473ms/step - acc: 0.5028 - loss: 0.6946 - val_acc: 0.5010 - val_loss: 0.6925
Epoch 2/30
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 465ms/step - acc: 0.5406 - loss: 0.6914 - val_acc: 0.5220 - val_loss: 0.6869
Epoch 3/30
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 464ms/step - acc: 0.5575 - loss: 0.6885 - val_acc: 0.5690 - val_loss: 0.6763
Epoch 4/30
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 455ms/step - acc: 0.5764 - loss: 0.6805 - val_acc: 0.6270 - val_loss: 0.6659
Epoch 5/30
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 461ms/step - acc: 0.5898 - loss: 0.6742 - val_acc: 0.5860 - val_loss: 0.6650
Epoch 6/30
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 451ms/step - acc: 0.6102 - loss: 0.6653 - val_acc: 0.6640 - val_loss: 0.6384
Epoch 7/30
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 466ms/

<keras.src.callbacks.history.History at 0x7fe858359060>

In [32]:
model1.save(models_dir+"binary_model1.h5")



In [6]:
# Second model
model2 = Sequential()
model2.add(Conv2D(32, (3, 3), activation='relu', input_shape=(img_rows, img_cols, 3)))
model2.add(MaxPooling2D((2, 2)))
model2.add(Conv2D(64, (3, 3), activation='relu'))
model2.add(MaxPooling2D((2, 2)))
model2.add(Conv2D(128, (3, 3), activation='relu'))
model2.add(MaxPooling2D((2, 2)))
model2.add(Conv2D(256, (3, 3), activation='relu'))
model2.add(MaxPooling2D((2, 2)))
model2.add(Conv2D(256, (3, 3), activation='relu'))
model2.add(MaxPooling2D((2, 2)))
model2.add(Flatten())
model2.add(Dropout(0.5))
model2.add(Dense(512, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2024-06-04 17:58:47.745390: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-06-04 17:58:47.745619: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-06-04 17:58:47.745806: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.co

In [8]:
model2.compile(loss='binary_crossentropy',
              optimizer=Adam(learning_rate=3e-4),
              metrics=['acc'])

model2.fit(
      train_generator,
      batch_size=batch_size,
      epochs=30,
      validation_data=val_generator)

Epoch 1/30


  self._warn_if_super_not_called()
I0000 00:00:1717516750.160268   20110 service.cc:145] XLA service 0x7b60a4005120 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1717516750.160312   20110 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce GTX 1650 Ti, Compute Capability 7.5
2024-06-04 17:59:10.239191: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-06-04 17:59:10.543940: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907


[1m 1/63[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m10:33[0m 10s/step - acc: 0.6562 - loss: 0.6889

I0000 00:00:1717516757.767481   20110 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 252ms/step - acc: 0.5286 - loss: 0.6947 - val_acc: 0.5000 - val_loss: 0.6943
Epoch 2/30
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 129ms/step - acc: 0.4966 - loss: 0.6931 - val_acc: 0.5150 - val_loss: 0.6912
Epoch 3/30
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 132ms/step - acc: 0.5422 - loss: 0.6908 - val_acc: 0.5040 - val_loss: 0.6872
Epoch 4/30
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 134ms/step - acc: 0.5752 - loss: 0.6818 - val_acc: 0.5590 - val_loss: 0.6643
Epoch 5/30
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 131ms/step - acc: 0.6185 - loss: 0.6555 - val_acc: 0.6460 - val_loss: 0.6345
Epoch 6/30
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 139ms/step - acc: 0.6139 - loss: 0.6446 - val_acc: 0.6630 - val_loss: 0.6288
Epoch 7/30
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 148ms/step - acc: 0.6

<keras.src.callbacks.history.History at 0x7b61ee3a7a00>

In [9]:
model2.save(models_dir+"binary_model2.h5")



### Want to know if one model is better than other?
Perform McNemar's test! It's simple statistic based on *False Negatives* clasified only one of the two models. The statistic of this test is calucated by equation below:
$$\chi^2 = \frac{(|b-c|-1)^2}{b+c}$$
* b - *False Negatives* clasified by first model, but not by the second,
* c - *False Negatives* clasified by second model, but not the first.

This test follows chi-squared distribution with one degree of freedom when $b+c\geq20$, and binomial distribution in the other case.  

In [34]:
# Load models
model1 = load_model(models_dir+"binary_model1.h5")
model2 = load_model(models_dir+"binary_model2.h5")



In [48]:
# Predict 
y_true = test_generator.labels
size = len(y_true)
y_pred1 = np.array([model1.predict(test_generator) > 0.5], dtype=int).reshape((size,))
y_pred2 = np.array([model2.predict(test_generator) > 0.5], dtype=int).reshape((size,))

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step


In [53]:
# Calculate b and c

b = 0
c = 0
for i in range(size):
    if y_true[i] == 1:
        if y_pred1[i] == 0 and y_pred2[i] == 1:
            b += 1
        elif y_pred1[i] == 1 and y_pred2[i] == 0:
            c += 1 

print("False negatives classified by first model, but not by the second:", b)
print("False negatives classified by second model, but not by the first:", c)

False negatives classified by first model, but not by the second: 52
False negatives classified by second model, but not by the first: 19


In [61]:
# Calulate test statistic
chi2 = (abs(b-c) - 1)**2/(b+c)
p = 2 * scipy.stats.chi2.sf(chi2, 1)
print(p)

0.00029208831796774206
