<a href="https://colab.research.google.com/github/SarveshPatil99/Adversarial-Robustness-Enhancement/blob/main/TML_Dataset_Curation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Directory Structure Creation for All Datasets

In [1]:
import os

def create_dataset_structure(base_path='original'):
    categories = ['train', 'val', 'test']
    types = ['real', 'fake']

    for category in categories:
        for image_type in types:
            # Construct the path to the directory
            dir_path = os.path.join(base_path, category, image_type)
            # Create the directory with nested structure
            # exist_ok=True is used to avoid error if the directory already exists
            os.makedirs(dir_path, exist_ok=True)
            print(f"Directory created: {dir_path}")

if __name__ == "__main__":
    create_dataset_structure()

Directory created: original/train/real
Directory created: original/train/fake
Directory created: original/val/real
Directory created: original/val/fake
Directory created: original/test/real
Directory created: original/test/fake


In [2]:
import os

def create_noise_dataset_structure(base_paths):
    categories = ['train', 'val', 'test']
    types = ['real', 'fake']

    for base_path in base_paths:
        for category in categories:
            for image_type in types:
                # Construct the path to the directory
                dir_path = os.path.join(base_path, category, image_type)
                # Create the directory with nested structure
                # exist_ok=True is used to avoid error if the directory already exists
                os.makedirs(dir_path, exist_ok=True)
                print(f"Directory created: {dir_path}")

if __name__ == "__main__":
    noises = ['gaussian', 'uniform', 'salt_and_pepper', 'speckle']
    # Create a separate root folder for each type of noise
    base_paths = [f'noise_{noise}' for noise in noises]
    create_noise_dataset_structure(base_paths)

Directory created: noise_gaussian/train/real
Directory created: noise_gaussian/train/fake
Directory created: noise_gaussian/val/real
Directory created: noise_gaussian/val/fake
Directory created: noise_gaussian/test/real
Directory created: noise_gaussian/test/fake
Directory created: noise_uniform/train/real
Directory created: noise_uniform/train/fake
Directory created: noise_uniform/val/real
Directory created: noise_uniform/val/fake
Directory created: noise_uniform/test/real
Directory created: noise_uniform/test/fake
Directory created: noise_salt_and_pepper/train/real
Directory created: noise_salt_and_pepper/train/fake
Directory created: noise_salt_and_pepper/val/real
Directory created: noise_salt_and_pepper/val/fake
Directory created: noise_salt_and_pepper/test/real
Directory created: noise_salt_and_pepper/test/fake
Directory created: noise_speckle/train/real
Directory created: noise_speckle/train/fake
Directory created: noise_speckle/val/real
Directory created: noise_speckle/val/fake


In [3]:
import os

def create_mixed_dataset_structure(base_path, ratios):
    categories = ['train', 'val']
    types = ['real', 'fake']

    for ratio in ratios:
            # Construct the base path including the noise ratio
            ratio_path = f"{base_path}_{ratio}"
            for category in categories:
                for image_type in types:
                    # Construct the path to the directory
                    dir_path = os.path.join(ratio_path, category, image_type)
                    # Create the directory with nested structure
                    # exist_ok=True is used to avoid error if the directory already exists
                    os.makedirs(dir_path, exist_ok=True)
                    print(f"Directory created: {dir_path}")

if __name__ == "__main__":
    # Names of the noises will be determined after Phase II, but we prepare for all possibilities.
    noises = ['']
    # Ratios of original to noisy data
    ratios = ['1_1', '1_2', '2_1']

    # Create a base path for mixed datasets
    base_path = 'mixed_noise'
    create_mixed_dataset_structure(base_path, ratios)

Directory created: mixed_noise_1_1/train/real
Directory created: mixed_noise_1_1/train/fake
Directory created: mixed_noise_1_1/val/real
Directory created: mixed_noise_1_1/val/fake
Directory created: mixed_noise_1_2/train/real
Directory created: mixed_noise_1_2/train/fake
Directory created: mixed_noise_1_2/val/real
Directory created: mixed_noise_1_2/val/fake
Directory created: mixed_noise_2_1/train/real
Directory created: mixed_noise_2_1/train/fake
Directory created: mixed_noise_2_1/val/real
Directory created: mixed_noise_2_1/val/fake


In [4]:
import os

# Define the adversarial attack types
adversarial_attacks = ['fgsm', 'bim', 'pgd']

# Directory structure for the test phase
dir_structure = {
    'test': ['real', 'fake']
}

# Create base directories for each adversarial attack type
for attack in adversarial_attacks:
    # Create a base directory for the current attack (e.g., 'fgsm_test')
    base_attack_dir = f'adversarial_{attack}'

    for phase in dir_structure:
        # Create a sub-directory for test (e.g., 'fgsm_test/test')
        phase_dir = os.path.join(base_attack_dir, phase)

        for category in dir_structure[phase]:
            # Create sub-directories for real and fake categories (e.g., 'fgsm_test/test/real')
            category_dir = os.path.join(phase_dir, category)
            os.makedirs(category_dir, exist_ok=True)
            print(f"Created directory: {category_dir}")

Created directory: adversarial_fgsm/test/real
Created directory: adversarial_fgsm/test/fake
Created directory: adversarial_bim/test/real
Created directory: adversarial_bim/test/fake
Created directory: adversarial_pgd/test/real
Created directory: adversarial_pgd/test/fake


### Real Dataset Curation

In [5]:
!rm -r ~/.kaggle
!mkdir ~/.kaggle
!cp ./kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

rm: cannot remove '/root/.kaggle': No such file or directory


In [6]:
!kaggle datasets download -d denislukovnikov/ffhq256-images-only

Downloading ffhq256-images-only.zip to /content
100% 6.90G/6.91G [01:13<00:00, 118MB/s] 
100% 6.91G/6.91G [01:13<00:00, 100MB/s]


In [None]:
!unzip -q ffhq256-images-only.zip

In [10]:
import os
import shutil

# Define the source and destination directories
source_dir = 'ffhq256'
destination_dir = 'real_images_directory'

# Create the destination directory if it doesn't already exist
os.makedirs(destination_dir, exist_ok=True)

# Iterate over the first 10,000 images and move them to the destination
for i in range(10000):
    # Format the filename with leading zeros
    file_name = f"{i:05d}.png"
    # Define the source and destination file paths
    source_file = os.path.join(source_dir, file_name)
    destination_file = os.path.join(destination_dir, file_name)

    # Move the file
    shutil.move(source_file, destination_file)

print(f"First 10,000 images moved to {destination_dir}.")

First 5,000 images moved to real_images_directory.


In [11]:
import os
import shutil

# Define the base path for the 'original' directory
base_original_dir = 'original'

# Define the directories for train, validation, and test sets
train_dir = os.path.join(base_original_dir, 'train', 'real')
val_dir = os.path.join(base_original_dir, 'val', 'real')
test_dir = os.path.join(base_original_dir, 'test', 'real')

# Create the directories if they don't exist
os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Source directory containing all real images
source_dir = 'real_images_directory'

# Move the first 8000 images to the train directory
for i in range(8000):
    file_name = f"{i:05d}.png"
    source_file = os.path.join(source_dir, file_name)
    destination_file = os.path.join(train_dir, file_name)
    shutil.move(source_file, destination_file)

# Move the next 1000 images to the validation directory
for i in range(8000, 9000):
    file_name = f"{i:05d}.png"
    source_file = os.path.join(source_dir, file_name)
    destination_file = os.path.join(val_dir, file_name)
    shutil.move(source_file, destination_file)

# Move the last 1000 images to the test directory
for i in range(9000, 10000):
    file_name = f"{i:05d}.png"
    source_file = os.path.join(source_dir, file_name)
    destination_file = os.path.join(test_dir, file_name)
    shutil.move(source_file, destination_file)

print("Images distributed to train, val, and test directories.")

Images distributed to train, val, and test directories.


In [12]:
import os

def count_images_in_directory(directory):
    # Initialize a counter
    count = 0
    # Walk through all files and directories within the given directory
    for root, dirs, files in os.walk(directory):
        # Count the number of PNG files
        count += len([file for file in files if file.endswith('.png')])
        print(f"There are {count} images in '{root}' directory.")
        # Reset the counter after printing out the count for the current directory
        count = 0

# Define the base path for the 'original' directory
base_original_dir = 'original'

# Call the function with the path to the 'original' directory
count_images_in_directory(base_original_dir)

There are 0 images in 'original' directory.
There are 0 images in 'original/test' directory.
There are 0 images in 'original/test/fake' directory.
There are 1000 images in 'original/test/real' directory.
There are 0 images in 'original/train' directory.
There are 0 images in 'original/train/fake' directory.
There are 8000 images in 'original/train/real' directory.
There are 0 images in 'original/val' directory.
There are 0 images in 'original/val/fake' directory.
There are 1000 images in 'original/val/real' directory.


### Fake Dataset Curation

In [13]:
!git clone https://github.com/NVlabs/stylegan3.git

Cloning into 'stylegan3'...
remote: Enumerating objects: 212, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 212 (delta 0), reused 1 (delta 0), pack-reused 207[K
Receiving objects: 100% (212/212), 4.17 MiB | 30.30 MiB/s, done.
Resolving deltas: 100% (99/99), done.


In [14]:
import os
os.chdir('stylegan3/')

In [15]:
!sudo apt-get install ninja-build

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  ninja-build
0 upgraded, 1 newly installed, 0 to remove and 19 not upgraded.
Need to get 111 kB of archives.
After this operation, 358 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 ninja-build amd64 1.10.1-1 [111 kB]
Fetched 111 kB in 1s (136 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package ninja-build.
(Reading database ... 120874 files and directorie

In [None]:
# Generate an image using pre-trained AFHQv2 model ("Ours" in Figure 1, left).
# stylegan3-t-ffhq-1024x1024.pkl
#stylegan3-r-ffhq-1024x1024.pkl
!python gen_images.py --outdir=out --seeds=0-9999 \
    --network=https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/stylegan3-r-ffhq-1024x1024.pkl

In [17]:
!mv out ../

In [18]:
os.chdir('../')

In [21]:
import os
import shutil
from PIL import Image

# Define the base path for the 'original' directory
base_original_dir = 'original'

# Define the directories for train, validation, and test sets
train_dir = os.path.join(base_original_dir, 'train', 'fake')
val_dir = os.path.join(base_original_dir, 'val', 'fake')
test_dir = os.path.join(base_original_dir, 'test', 'fake')

# Create the directories if they don't exist
os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Source directory containing all synthetic images
source_dir = 'out'

# Function to resize and move images
def resize_and_move(start_index, end_index, source_folder, destination_folder):
    for i in range(start_index, end_index):
        # Adjust for the filename format 'seed0000.png'
        file_name = f"seed{i:04d}.png"
        source_file = os.path.join(source_folder, file_name)

        # Check if the file exists to avoid errors
        if os.path.isfile(source_file):
            # Open the image, resize it, and save it to the destination folder
            with Image.open(source_file) as img:
                img = img.resize((256, 256), Image.BICUBIC)
                img.save(os.path.join(destination_folder, file_name))
            # Optionally delete the original if needed
            # os.remove(source_file)

# Move the first 4000 images to the train directory after resizing
resize_and_move(0, 8000, source_dir, train_dir)

# Move the next 500 images to the validation directory after resizing
resize_and_move(8000, 9000, source_dir, val_dir)

# Move the last 500 images to the test directory after resizing
resize_and_move(9000, 10000, source_dir, test_dir)

print("Synthetic images resized and distributed to train, val, and test directories.")


Synthetic images resized and distributed to train, val, and test directories.


In [23]:
import os

def count_images_in_directory(directory):
    # Initialize a counter
    count = 0
    # Walk through all files and directories within the given directory
    for root, dirs, files in os.walk(directory):
        # Count the number of PNG files
        count += len([file for file in files if file.endswith('.png')])
        print(f"There are {count} images in '{root}' directory.")
        # Reset the counter after printing out the count for the current directory
        count = 0

# Define the base path for the 'original' directory
base_original_dir = 'original'

# Call the function with the path to the 'original' directory
count_images_in_directory(base_original_dir)

There are 0 images in 'original' directory.
There are 0 images in 'original/test' directory.
There are 1000 images in 'original/test/fake' directory.
There are 1000 images in 'original/test/real' directory.
There are 0 images in 'original/train' directory.
There are 8000 images in 'original/train/fake' directory.
There are 8000 images in 'original/train/real' directory.
There are 0 images in 'original/val' directory.
There are 1000 images in 'original/val/fake' directory.
There are 1000 images in 'original/val/real' directory.


### Original Dataset Compiling

In [24]:
!zip -r original_dataset_stylegan3_10000.zip original/

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: original/train/real/00617.png (deflated 0%)
  adding: original/train/real/07358.png (deflated 0%)
  adding: original/train/real/01775.png (deflated 0%)
  adding: original/train/real/02152.png (deflated 0%)
  adding: original/train/real/04878.png (deflated 0%)
  adding: original/train/real/03806.png (deflated 0%)
  adding: original/train/real/06380.png (deflated 0%)
  adding: original/train/real/00578.png (deflated 0%)
  adding: original/train/real/04409.png (deflated 0%)
  adding: original/train/real/00753.png (deflated 0%)
  adding: original/train/real/06727.png (deflated 0%)
  adding: original/train/real/05957.png (deflated 0%)
  adding: original/train/real/04537.png (deflated 0%)
  adding: original/train/real/01056.png (deflated 0%)
  adding: original/train/real/07570.png (deflated 0%)
  adding: original/train/real/00803.png (deflated 0%)
  adding: original/train/real/05760.png (deflated 0%)
  adding: origina

In [25]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [26]:
!cp original_dataset_stylegan3_10000.zip /content/drive/MyDrive/

### Initial Deep Learning Model Benchmarking

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense
from tensorflow.keras.models import Model

# Load the pre-trained ResNet50 model
base_model = ResNet50(include_top=False, input_shape=(256, 256, 3))

# Freeze the layers of the base model
for layer in base_model.layers:
    layer.trainable = True

# Add new top layers
x = GlobalAveragePooling2D()(base_model.output)
predictions = Dense(2, activation='softmax')(x)  # Assuming binary classification (real vs fake)
model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Setup data generators
train_datagen = ImageDataGenerator(rescale=1./255)
val_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
    'original/train/',
    target_size=(256, 256),
    batch_size=32,
    class_mode='categorical'
)

validation_generator = val_datagen.flow_from_directory(
    'original/val/',
    target_size=(256, 256),
    batch_size=32,
    class_mode='categorical'
)

# Train the model
history = model.fit(
    train_generator,
    steps_per_epoch=train_generator.samples // train_generator.batch_size,
    validation_data=validation_generator,
    validation_steps=validation_generator.samples // validation_generator.batch_size,
    epochs=10
)

# Evaluate the model on the test set
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_directory(
    'original/test/',
    target_size=(256, 256),
    batch_size=32,
    class_mode='categorical',
    shuffle=False
)

# Evaluate on test data
results = model.evaluate(test_generator)
print(f"Test Loss: {results[0]}, Test Accuracy: {results[1]}")

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
Found 16000 images belonging to 2 classes.
Found 2000 images belonging to 2 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10