In [1]:
# Setup working dir and install Kaggle CLI
!pip -q install -U kaggle

import os, zipfile, random, shutil
from pathlib import Path

# Every artifact of this notebook (ZIP, extracted images, checkpoints) lives under BASE.
# Create the folder if it is missing, then make it the current working directory so that
# the relative paths used by the following cells resolve inside it.
BASE = Path("/content/cifake_hw7")
os.makedirs(BASE, exist_ok=True)
os.chdir(BASE)
print("Working directory:", os.getcwd())


Working directory: /content/cifake_hw7


In [2]:
from google.colab import files

# Look for a dataset ZIP in the current working directory (the same place the
# unzip cell searches, and where `files.upload()` saves uploads). Only prompt
# for an upload when no ZIP is already present.
zips = sorted(Path(".").glob("*.zip"))
if not zips:
    print("No ZIP found. Please upload the dataset ZIP you downloaded from Kaggle.")
    uploaded = files.upload()  # choose the downloaded ZIP file
    # Nothing else needed; next cell will find it.
else:
    print("Found existing ZIP:", zips[0].name)


No ZIP found. Please upload the dataset ZIP you downloaded from Kaggle.


Saving archive (1).zip to archive (1).zip


In [3]:
# Unzip the first ZIP found (alphabetically) in the working directory.
zip_candidates = sorted(Path(".").glob("*.zip"))
assert zip_candidates, "No ZIP file found. Run the upload cell (Cell 2) first."
zip_path = zip_candidates[0]
print("Unzipping:", zip_path.name)

with zipfile.ZipFile(zip_path, "r") as zf:
    zf.extractall(Path("."))

# The rest of the notebook expects the archive to contain train/ and test/
# at its top level; fail with a clear message if the layout is different.
for split in ("train", "test"):
    assert Path(split).is_dir(), f"Expected a '{split}' directory after extracting {zip_path.name}"

print("Top-level dirs:", [p.name for p in Path('.').iterdir() if p.is_dir()])
print("train subdirs:", [p.name for p in Path('train').iterdir() if p.is_dir()])
print("test subdirs :", [p.name for p in Path('test').iterdir() if p.is_dir()])


Unzipping: archive (1).zip
Top-level dirs: ['test', 'train']
train subdirs: ['FAKE', 'REAL']
test subdirs : ['FAKE', 'REAL']


In [4]:
# Create validation split: move 10,000 from train/REAL -> validation/REAL
# and 10,000 from train/FAKE -> validation/FAKE
random.seed(1)

train_dir = Path("train")
val_dir = Path("validation")
val_dir.mkdir(exist_ok=True)

for cls in ["REAL", "FAKE"]:
    (val_dir/cls).mkdir(parents=True, exist_ok=True)
    src = train_dir/cls
    # Directory listing order is arbitrary, so sort the candidates first:
    # otherwise the fixed seed above does not select the same files on every run,
    # and a re-run after re-extracting the archive could move a different subset.
    imgs = sorted(p for p in src.iterdir() if p.is_file())
    # Guard against running the split twice on an already-reduced train folder.
    assert len(imgs) >= 50000, f"Expected 50k in {src}, found {len(imgs)}"
    pick = random.sample(imgs, 10000)
    for p in pick:
        shutil.move(str(p), str(val_dir/cls/p.name))
    print(f"{cls}: moved {len(pick)} → validation; remaining in train: {len(list(src.iterdir()))}")


REAL: moved 10000 → validation; remaining in train: 40000
FAKE: moved 10000 → validation; remaining in train: 40000


In [5]:
def count_images(split_dir):
    """Count the entries in the REAL and FAKE subfolders of a split directory.

    Args:
        split_dir: Path (or string) of the split folder, e.g. "train".

    Returns:
        None if ``split_dir`` does not exist; otherwise a dict with keys
        "REAL" and "FAKE" (in that order). Each value is the number of entries
        matching ``*.*`` in that subfolder (names containing a dot), or 0 when
        the subfolder is missing.
    """
    root = Path(split_dir)
    if not root.exists():
        return None

    counts = {}
    for label in ("REAL", "FAKE"):
        class_dir = root / label
        if class_dir.exists():
            counts[label] = len(list(class_dir.glob("*.*")))
        else:
            counts[label] = 0
    return counts

# Report the per-class counts of every split.
# Expected after the validation split:
#   train:      REAL~40k, FAKE~40k
#   validation: REAL=10k, FAKE=10k
#   test:       REAL=10k, FAKE=10k
for heading, split_name in (
    ("train counts     :", "train"),
    ("validation counts:", "validation"),
    ("test counts      :", "test"),
):
    print(heading, count_images(split_name))


train counts     : {'REAL': 40000, 'FAKE': 40000}
validation counts: {'REAL': 10000, 'FAKE': 10000}
test counts      : {'REAL': 10000, 'FAKE': 10000}


In [6]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Notebook-wide configuration: tune these values here, not in later cells.
AUTOTUNE = tf.data.AUTOTUNE

IMG_SIZE = (32, 32)   # CIFAKE paper Figure 5 uses 32x32 inputs
BATCH    = 256        # adjust if you run out of RAM
SEED     = 1

# Seed the Python, NumPy and TensorFlow random generators in one call so that
# weight initialisation (not just dataset shuffling) is repeatable across runs.
keras.utils.set_random_seed(SEED)


In [8]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# AUTOTUNE, IMG_SIZE, BATCH and SEED are defined once in the configuration cell
# above; they are deliberately not redefined here to keep a single source of truth.


def _load_split(directory, shuffle, seed=None):
    """Load one image split from `directory` as a batched binary-label dataset.

    Labels are inferred from the class subfolder names; images are resized to
    IMG_SIZE and grouped into batches of BATCH. `shuffle` and `seed` are passed
    straight through to the Keras loader.
    """
    return tf.keras.preprocessing.image_dataset_from_directory(
        directory,
        labels="inferred",
        label_mode="binary",
        image_size=IMG_SIZE,
        batch_size=BATCH,
        shuffle=shuffle,
        seed=seed,
    )


# Load datasets (only the training split is shuffled)
train_ds = _load_split("train", shuffle=True, seed=SEED)
val_ds = _load_split("validation", shuffle=False)
test_ds = _load_split("test", shuffle=False)

# ✅ Capture class names BEFORE caching/prefetching
# (the datasets returned by cache()/prefetch() no longer carry `class_names`).
class_names = train_ds.class_names
print("Classes:", class_names)

# Now add caching & prefetching for performance.
# cache() replays exactly the elements produced during the first pass, so without
# a shuffle AFTER the cache every epoch would see the same batches in the same
# order. Shuffling the cached batches restores a different batch order per epoch
# (batch composition stays as cached). The buffer is larger than the number of
# training batches (313 with BATCH=256), so the batch order is fully shuffled.
SHUFFLE_BUFFER = 512
train_ds = train_ds.cache().shuffle(SHUFFLE_BUFFER, seed=SEED).prefetch(AUTOTUNE)
val_ds   = val_ds.cache().prefetch(AUTOTUNE)
test_ds  = test_ds.cache().prefetch(AUTOTUNE)


Found 80000 files belonging to 2 classes.
Found 20000 files belonging to 2 classes.
Found 20000 files belonging to 2 classes.
Classes: ['FAKE', 'REAL']


In [9]:
def build_cifake_fig5(input_shape=(32,32,3)):
    """Build the small CNN used for REAL-vs-FAKE binary classification.

    Architecture, in order: rescale pixels by 1/255, two blocks of
    (3x3 Conv with 32 filters, ReLU, "same" padding -> max pooling),
    flatten, a 64-unit ReLU dense layer, and a single sigmoid output unit.

    Args:
        input_shape: Shape of one input image as (height, width, channels).

    Returns:
        An uncompiled ``keras.Model`` named "cifake_fig5_cnn".
    """
    inputs = keras.Input(shape=input_shape)

    # Layers are created in the same order in which they are applied.
    stack = [
        layers.Rescaling(1./255),
        layers.Conv2D(32, 3, activation="relu", padding="same"),
        layers.MaxPooling2D(),
        layers.Conv2D(32, 3, activation="relu", padding="same"),
        layers.MaxPooling2D(),
        layers.Flatten(),
        layers.Dense(64, activation="relu"),
        layers.Dense(1, activation="sigmoid"),  # binary output
    ]

    tensor = inputs
    for layer in stack:
        tensor = layer(tensor)

    return keras.Model(inputs, tensor, name="cifake_fig5_cnn")

# Instantiate the network for RGB images of the configured size and show its layers.
model = build_cifake_fig5(input_shape=(*IMG_SIZE, 3))
model.summary()


In [10]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    loss="binary_crossentropy",
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    metrics=["accuracy"],
)

# Keep only the best full model (not just weights), judged by validation loss.
checkpoint_path = "best_cifake.keras"
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor="val_loss",
    save_best_only=True,
    save_weights_only=False,
)

# Optional: an EarlyStopping callback (monitor="val_loss", patience=3,
# restore_best_weights=True) can be appended to this list to stop training early.
callbacks = [checkpoint_cb]


In [11]:
# Train on the training split and evaluate on the validation split after each epoch;
# the callbacks configured earlier run during training.
n_epochs = 30
history = model.fit(
    train_ds,
    epochs=n_epochs,
    validation_data=val_ds,
    callbacks=callbacks,
)


Epoch 1/30
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 250ms/step - accuracy: 0.7441 - loss: 0.4988 - val_accuracy: 0.8673 - val_loss: 0.3083
Epoch 2/30
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 212ms/step - accuracy: 0.8802 - loss: 0.2875 - val_accuracy: 0.9099 - val_loss: 0.2259
Epoch 3/30
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 200ms/step - accuracy: 0.9067 - loss: 0.2316 - val_accuracy: 0.9179 - val_loss: 0.2077
Epoch 4/30
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 205ms/step - accuracy: 0.9193 - loss: 0.2031 - val_accuracy: 0.9303 - val_loss: 0.1808
Epoch 5/30
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 209ms/step - accuracy: 0.9284 - loss: 0.1843 - val_accuracy: 0.9340 - val_loss: 0.1730
Epoch 6/30
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 199ms/step - accuracy: 0.9346 - loss: 0.1687 - val_accuracy: 0.9334 - val_loss: 0.1696
Epoch 7/30

In [12]:
# Reload the checkpointed model and score it on the held-out test split.
best_model = keras.models.load_model(checkpoint_path)
test_metrics = best_model.evaluate(test_ds, verbose=2)
test_loss, test_acc = test_metrics
print(f"Test accuracy: {test_acc:.4f} | Test loss: {test_loss:.4f}")

# Report whether the 92% test-accuracy requirement is met.
verdict = (
    "✅ Requirement met: ≥ 92% test accuracy."
    if test_acc >= 0.92
    else "⚠️ Accuracy below 92%. Consider re-running or tweaking batch size/lr."
)
print(verdict)


79/79 - 8s - 106ms/step - accuracy: 0.9460 - loss: 0.1461
Test accuracy: 0.9460 | Test loss: 0.1461
✅ Requirement met: ≥ 92% test accuracy.
