In [None]:
# Importing required libraries
import os
import numpy as np
import matplotlib.pyplot as plt
import random
import shutil
import zipfile

# Paths for dataset: source folder with one sub-folder per class, and the
# working folder that will receive the shuffled copy.
balanced_path = "/kaggle/input/rgb-normalizedataset"
shuffled_path = "/kaggle/working/ShuffledDataset"
os.makedirs(shuffled_path, exist_ok=True)

# Map categories to numeric labels
labels_map = {"WithoutMask": 0, "WithMask": 1}

# Bar colour per class, keyed by class name rather than by bar position.
# os.listdir returns entries in arbitrary order, so positional colours could
# end up on the wrong class.
# NOTE(review): red=WithoutMask / green=WithMask follows the labels_map order;
# confirm this is the intended pairing.
class_colors = {"WithoutMask": "red", "WithMask": "green"}

# Count the .npy files in each class folder (before shuffling).
# Only sub-directories are counted (a stray top-level file would make the
# inner os.listdir raise), and they are visited in sorted order so the plot
# and the printed dict are reproducible.
class_counts = {
    cat: sum(name.endswith(".npy") for name in os.listdir(os.path.join(balanced_path, cat)))
    for cat in sorted(os.listdir(balanced_path))
    if os.path.isdir(os.path.join(balanced_path, cat))
}

# Plot class distribution (before shuffle)
plt.bar(
    list(class_counts),
    list(class_counts.values()),
    color=[class_colors.get(cat, "gray") for cat in class_counts],
)
plt.title(" Class Distribution (Before Label Encoding & Shuffle)")
plt.ylabel("Number of Images")
plt.show()
print("Before Label Encoding:", class_counts)

# Store file paths and their labels (parallel lists: all_data[i] <-> all_labels[i])
all_data = []
all_labels = []

# Visit each class folder (WithoutMask, WithMask) and each file inside it in
# sorted order: os.listdir returns entries in arbitrary order, so sorting makes
# the pre-shuffle ordering reproducible across runs and file systems.
for category in sorted(os.listdir(balanced_path)):
    folder = os.path.join(balanced_path, category)
    if not os.path.isdir(folder):
        # A stray file at the dataset top level is not a class folder.
        continue
    files = sorted(f for f in os.listdir(folder) if f.endswith(".npy"))
    all_data.extend(os.path.join(folder, f) for f in files)
    # labels_map[category] raises KeyError for a class folder with no mapping.
    all_labels.extend(labels_map[category] for _ in files)

# Convert lists to numpy arrays so both can be reordered with one index array.
# dtype=int keeps the labels integral even when no files were found.
all_data = np.array(all_data)
all_labels = np.array(all_labels, dtype=int)

# Backup labels before shuffling (used later for the before/after plots)
before_labels = all_labels.copy()

# Shuffle dataset: permute one index array and apply it to both arrays so
# every path stays paired with its label.
indices = np.arange(len(all_data))
np.random.shuffle(indices)
all_data = all_data[indices]
all_labels = all_labels[indices]

# Copy shuffled files into new per-class folders under shuffled_path.
# The folder name is looked up from labels_map (inverted) instead of being
# hard-coded, so the mapping is defined in exactly one place.
label_to_category = {label: category for category, label in labels_map.items()}

# Create one destination folder per label that actually occurs, once, rather
# than calling os.makedirs for every file.
dst_folders = {}
for present_label in np.unique(all_labels):
    dst_folder = os.path.join(shuffled_path, label_to_category[int(present_label)])
    os.makedirs(dst_folder, exist_ok=True)
    dst_folders[int(present_label)] = dst_folder

for file_path, file_label in zip(all_data, all_labels):
    shutil.copy(
        file_path,
        os.path.join(dst_folders[int(file_label)], os.path.basename(file_path)),
    )

# Count the .npy files per class folder in the shuffled dataset.
# Only sub-directories are counted, in sorted order, so the result does not
# depend on the arbitrary order returned by os.listdir.
shuffled_counts = {
    cat: sum(name.endswith(".npy") for name in os.listdir(os.path.join(shuffled_path, cat)))
    for cat in sorted(os.listdir(shuffled_path))
    if os.path.isdir(os.path.join(shuffled_path, cat))
}

# Plot class distribution after shuffle. Colours are chosen by class name so a
# class keeps its colour whatever the bar order is.
# NOTE(review): red=WithoutMask / green=WithMask follows the labels_map order;
# confirm this is the intended pairing.
shuffled_colors = {"WithoutMask": "red", "WithMask": "green"}
plt.bar(
    list(shuffled_counts),
    list(shuffled_counts.values()),
    color=[shuffled_colors.get(cat, "gray") for cat in shuffled_counts],
)
plt.title("📊 Class Distribution (After Label Encoding & Shuffle)")
plt.ylabel("Number of Images")
plt.show()
print("After Label Encoding & Shuffle:", shuffled_counts)

# Heatmaps of the label sequence: one single-row strip for the order before
# the shuffle and one for the order after it. Each strip colours every image
# index by its class label.
for strip_labels, strip_title in (
    (before_labels, "📊 Filenames Order Before Shuffle (Heatmap)"),
    (all_labels, "📊 Filenames Order After Shuffle (Heatmap)"),
):
    plt.figure(figsize=(12, 2))
    # Wrapping the 1-D label array in a list gives imshow a one-row image.
    plt.imshow([strip_labels], cmap="coolwarm", aspect="auto")
    plt.title(strip_title)
    plt.yticks([])
    plt.xlabel("Image Index")
    plt.colorbar(label="Class (0=WithoutMask, 1=WithMask)")
    plt.show()

# Scatter plot comparison (before vs after shuffle): both label sequences are
# drawn against the same image-index axis.
indices = np.arange(len(all_labels))
plt.figure(figsize=(12, 4))
for series, series_name, series_color in (
    (before_labels, "Before Shuffle", "blue"),
    (all_labels, "After Shuffle", "green"),
):
    plt.scatter(indices, series, alpha=0.3, label=series_name, color=series_color)
plt.title("Dataset Order Before vs After Shuffle")
plt.xlabel("Image Index")
plt.ylabel("Class Label (0=WithoutMask, 1=WithMask)")
plt.legend()
plt.show()


In [None]:
# Location of the final zip archive
zip_path = "/kaggle/working/FinaShuffledDataset.zip"

# Add every file found under shuffled_path to a DEFLATE-compressed archive.
# Entries are stored under their path relative to shuffled_path, so the
# archive holds the class folders directly rather than the absolute path.
with zipfile.ZipFile(zip_path, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    for folder, _subdirs, names in os.walk(shuffled_path):
        for name in names:
            source = os.path.join(folder, name)
            archive.write(source, arcname=os.path.relpath(source, shuffled_path))

# Report where the archive was written
print(f"\n Final shuffled dataset saved at: {zip_path}")