In [None]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models, optimizers
from tqdm import tqdm

# Attach Google Drive to the Colab runtime so the dataset files can be read
from google.colab import drive
drive.mount('/content/drive')

# Dataset folder on Drive (kept for reference; the paths below are spelled out in full)
mhist_zip_path = 'drive/MyDrive/mhist_dataset'

# Read the annotation table that accompanies the images
annotations_path = "drive/MyDrive/mhist_dataset/annotations.csv"
annotations_df = pd.read_csv(annotations_path, sep=',')

# Column of annotations.csv that holds the class label for each image
label_column = 'Majority Vote Label'

# The 'Partition' column decides which rows belong to the train and test sets
partition = annotations_df['Partition']
train_annotations = annotations_df[partition == 'train']
test_annotations = annotations_df[partition == 'test']

# Learn the string -> integer label mapping from the training rows only,
# then apply that same mapping to both partitions
label_encoder = LabelEncoder()
label_encoder.fit(train_annotations[label_column].values)
train_labels = label_encoder.transform(train_annotations[label_column].values)
test_labels = label_encoder.transform(test_annotations[label_column].values)

# Function to load and preprocess images
def load_and_preprocess_image(image_path, target_size=(64, 64)):
    """Load one image file as a normalised RGB array.

    Args:
        image_path: Path of the image file to read.
        target_size: ``(width, height)`` the image is resized to. Defaults to
            ``(64, 64)``, matching the ``input_shape`` of the model built in
            this script.

    Returns:
        A ``float32`` array of shape ``(height, width, 3)`` with values in
        ``[0, 1]``, or ``None`` when ``image_path`` does not exist.
    """
    try:
        # The context manager releases the file handle once the pixels are
        # read; without it every call leaves a file open.
        with Image.open(image_path) as img:
            # Force three channels so grayscale / RGBA / palette files still
            # produce the shape the model expects.
            rgb = img.convert("RGB").resize(target_size)
            # float32 halves memory compared with NumPy's default float64.
            return np.asarray(rgb, dtype=np.float32) / 255.0
    except FileNotFoundError:
        return None

# Build the full path of every training image listed in the annotations
images_dir = "drive/MyDrive/mhist_dataset/images"
image_paths = [
    os.path.join(images_dir, file_name)
    for file_name in train_annotations['Image Name']
]

# Decode every file once; entries are None where the loader could not find the file
decoded = list(map(load_and_preprocess_image, image_paths))

# Positions that decoded successfully, used to keep images and labels aligned
valid_indices = [pos for pos, img in enumerate(decoded) if img is not None]
loaded_images = [decoded[pos] for pos in valid_indices]
train_labels = train_labels[valid_indices]

# Hold out 20% of the loaded training images for validation (fixed seed for repeatability).
# NOTE: despite their names, train_image_paths / val_image_paths hold image arrays, not paths.
train_image_paths, val_image_paths, train_labels, val_labels = train_test_split(
    loaded_images,
    train_labels,
    test_size=0.2,
    random_state=42,
)

# Number of classes, taken from the fitted label encoder rather than from the
# post-split training subset, so a class that happens to land only in the
# validation split is still counted.
num_classes = len(label_encoder.classes_)

# Number of condensed (synthetic) images to learn; used as the first
# dimension of the condensed image array.
K = 200

# Number of outer iterations of the Gradient Matching algorithm
T = 10

# Learning rate for the condensed samples
ηS = 0.1

# Number of optimization steps for the condensed samples per outer iteration
ζS = 1

# Learning rate for the model
ηθ = 0.01

# Number of optimization steps for the model per outer iteration
ζθ = 50

# Mini-batch size for real-data batches
batch_size = 128

# Small convolutional classifier assembled layer by layer:
# three convolution stages (the first two followed by 2x2 max-pooling),
# then a dense head that ends in a softmax over num_classes outputs.
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.Flatten())
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(num_classes, activation='softmax'))

# Plain SGD with integer-label cross-entropy; accuracy is tracked as a metric
model.compile(
    optimizer=optimizers.SGD(learning_rate=0.01),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# ---------------------------------------------------------------------------
# Dataset condensation by gradient matching, then evaluation.
#
# K synthetic images are learned so that the gradient they induce on the
# model's weights matches the gradient induced by real training images of
# the same class. A freshly initialised copy of the model is then trained on
# the synthetic images only and scored on the real test partition.
# ---------------------------------------------------------------------------


def _cross_entropy(probabilities, class_ids):
    """Return the mean cross-entropy of softmax outputs against integer labels.

    Built from elementary ops because the image update differentiates through
    this loss twice, and the fused sparse softmax cross-entropy op does not
    support second-order gradients.
    """
    one_hot = tf.one_hot(class_ids, depth=probabilities.shape[-1], dtype=probabilities.dtype)
    true_class_prob = tf.reduce_sum(one_hot * probabilities, axis=-1)
    return -tf.reduce_mean(tf.math.log(tf.clip_by_value(true_class_prob, 1e-7, 1.0)))


def _gradient_distance(reference_grads, candidate_grads):
    """Return the sum over variables of (1 - cosine similarity) of two gradient lists."""
    distance = 0.0
    for ref, cand in zip(reference_grads, candidate_grads):
        ref = tf.reshape(ref, [-1])
        cand = tf.reshape(cand, [-1])
        dot = tf.reduce_sum(ref * cand)
        # The epsilon sits inside the square root so the derivative stays
        # finite even when one of the gradients is all zeros.
        norms = tf.sqrt(tf.reduce_sum(ref * ref) * tf.reduce_sum(cand * cand) + 1e-12)
        distance += 1.0 - dot / norms
    return distance


# Stack the real training data once instead of rebuilding the array every step
real_images = np.asarray(train_image_paths, dtype=np.float32)
real_labels = np.asarray(train_labels)
real_index_by_class = [np.flatnonzero(real_labels == c) for c in range(num_classes)]

# Initialize condensed images with Gaussian noise. Labels are assigned
# round-robin, so the images of class c are exactly condensed_images[c::num_classes].
condensed_images = np.random.normal(loc=0, scale=1, size=(K, 64, 64, 3))
condensed_images_tensor = tf.Variable(condensed_images.astype(np.float32))
condensed_labels = np.arange(K) % num_classes

# One optimizer for the whole run, using the configured model learning rate
model_optimizer = optimizers.SGD(learning_rate=ηθ)

# Never request more real samples than exist (choice(..., replace=False) would raise)
real_batch_size = min(batch_size, len(real_images))

# Gradient Matching algorithm
for iteration in tqdm(range(T)):
    # --- Update condensed samples -----------------------------------------
    for _ in range(ζS):
        for class_id, real_pool in enumerate(real_index_by_class):
            # Skip classes with no real samples or no synthetic images
            if real_pool.size == 0 or class_id >= K:
                continue

            # Reference gradient: weight gradient on a real batch of this class
            picked = np.random.choice(
                real_pool, size=min(batch_size, real_pool.size), replace=False
            )
            with tf.GradientTape() as tape:
                real_loss = _cross_entropy(model(real_images[picked]), real_labels[picked])
            real_grads = tape.gradient(real_loss, model.trainable_variables)

            # Nested tapes: the inner tape yields the weight gradient of the
            # synthetic batch; the outer tape differentiates the mismatch of
            # that gradient with respect to the synthetic pixels.
            with tf.GradientTape() as image_tape:
                synthetic = tf.convert_to_tensor(condensed_images_tensor)[class_id::num_classes]
                with tf.GradientTape() as weight_tape:
                    synthetic_loss = _cross_entropy(
                        model(synthetic), condensed_labels[class_id::num_classes]
                    )
                synthetic_grads = weight_tape.gradient(synthetic_loss, model.trainable_variables)
                mismatch = _gradient_distance(real_grads, synthetic_grads)
            image_grad = image_tape.gradient(mismatch, condensed_images_tensor)
            condensed_images_tensor.assign_sub(ηS * image_grad)

    # --- Update model -----------------------------------------------------
    # NOTE(review): the model is stepped on real mini-batches, as this script
    # did before; the published gradient-matching algorithm steps it on the
    # condensed set instead -- confirm which is intended.
    for _ in range(ζθ):
        indices = np.random.choice(len(real_images), real_batch_size, replace=False)
        with tf.GradientTape() as tape:
            predictions = model(real_images[indices])
            # Mean loss, so the step size does not scale with the batch size
            loss = _cross_entropy(predictions, real_labels[indices])
        gradients = tape.gradient(loss, model.trainable_variables)
        model_optimizer.apply_gradients(zip(gradients, model.trainable_variables))

# Convert the learned images back to a NumPy array
condensed_images = condensed_images_tensor.numpy()

# clone_model copies the architecture with freshly initialised weights. The
# weights trained above are deliberately NOT copied over, so the reported
# accuracy reflects training on the condensed set alone.
condensed_model = models.clone_model(model)
condensed_model.compile(
    optimizer=optimizers.SGD(learning_rate=0.01),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Load and preprocess the test images
test_image_paths = [os.path.join(images_dir, img_name) for img_name in test_annotations['Image Name']]
test_images = [load_and_preprocess_image(img_path) for img_path in test_image_paths]

# Encode the test labels, then drop unreadable images and their labels
# together so the two stay aligned
test_labels = label_encoder.transform(test_annotations[label_column].values)
test_valid = [pos for pos, img in enumerate(test_images) if img is not None]
test_labels = test_labels[test_valid]
test_images = np.array([test_images[pos] for pos in test_valid])

# Train on the condensed dataset with its own labels; the real validation
# split monitors progress, and no test label is seen during training.
condensed_model.fit(
    condensed_images,
    condensed_labels,
    epochs=20,
    batch_size=128,
    validation_data=(np.asarray(val_image_paths, dtype=np.float32), np.asarray(val_labels)),
)

# Evaluate on the real testing data
condensed_test_accuracy = condensed_model.evaluate(test_images, test_labels, verbose=2)[1]
print("Condensed Test Accuracy:", condensed_test_accuracy)

