# Data Preparation

In [1]:
# Imports used throughout this notebook
import torch  # Core library for deep learning and tensor operations
import torchvision  # Library for computer vision tasks and datasets
from torchvision import datasets, transforms  # Dataset loaders and image transforms used for data preparation
import os  # Directory creation and path joining
from torch.utils.data import random_split # Utility function from PyTorch for randomly splitting a dataset into subsets
import pandas as pd  # Library for data manipulation and analysis; used here for saving datasets as CSV files
# Transform applied to every image when it is loaded: converts the image to a tensor.
# NOTE: the "Data Splitting" cell assigns `transform` again with the same value.
transform = transforms.ToTensor()

## Data Splitting

In [2]:
# Root directory for the raw dataset files (relative to the working directory)
data_dir = "../data"

# Create the directory if it is missing; no error if it already exists
os.makedirs(data_dir, exist_ok=True)

# Transform applied to every image on load (converts the image to a tensor)
transform = transforms.ToTensor()

# Load the full training dataset (downloaded into data_dir when not already present)
full_train_data = datasets.FashionMNIST(
    root=data_dir,
    train=True,
    download=True,
    transform=transform
)

# Split ratio: 80% for training, the remainder for validation.
# validation_size is computed by subtraction so the two sizes always sum to
# len(full_train_data), even when the 80% figure is truncated by int().
train_size = int(0.8 * len(full_train_data))
validation_size = len(full_train_data) - train_size

# Fixed seed for the split. Without an explicit generator, random_split draws
# from the global RNG, so every kernel restart would put different samples in
# the training and validation sets (and in the CSV files exported from them).
split_seed = 42
split_generator = torch.Generator().manual_seed(split_seed)

# Split the dataset into training and validation sets (reproducibly)
train_data, validation_data = random_split(
    full_train_data,
    [train_size, validation_size],
    generator=split_generator,
)

# Load the held-out test dataset (train=False selects the test partition)
test_data = datasets.FashionMNIST(
    root=data_dir,
    train=False,
    download=True,
    transform=transform
)

print("\n✅ Dataset loading and splitting completed successfully!")
print(f"📂 Data directory: {data_dir}")

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ../data/FashionMNIST/raw/train-images-idx3-ubyte.gz


100%|██████████████████████████████████████████████████████████████████████████████████| 26.4M/26.4M [00:48<00:00, 543kB/s]


Extracting ../data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ../data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ../data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████████████████████████████████████████████████████████████████████████████| 29.5k/29.5k [00:00<00:00, 246kB/s]


Extracting ../data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ../data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ../data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████████████████████████████████████████████████████████████████████████████| 4.42M/4.42M [00:07<00:00, 615kB/s]


Extracting ../data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ../data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ../data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


100%|█████████████████████████████████████████████████████████████████████████████████| 5.15k/5.15k [00:00<00:00, 9.41MB/s]


Extracting ../data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ../data/FashionMNIST/raw


✅ Dataset loading and splitting completed successfully!
📂 Data directory: ../data


In [3]:
# Report how many samples ended up in each split (one line per split)
print(
    f"🟢 Training set size: {len(train_data)} samples",
    f"🟡 Validation set size: {len(validation_data)} samples",
    f"🔵 Test set size: {len(test_data)} samples",
    sep="\n",
)

🟢 Training set size: 48000 samples
🟡 Validation set size: 12000 samples
🔵 Test set size: 10000 samples


In [4]:
# Directory where the CSV exports are written (relative to the working directory).
# save_to_csv reads this module-level name when building the output path.
csv_dir = "../data_preparation"

# Create the output directory if it is missing; exist_ok=True makes re-running this cell safe
os.makedirs(csv_dir, exist_ok=True)

# Function to save a dataset as a CSV file
def save_to_csv(dataset, filename, output_dir=None):
    """Write an image dataset to a CSV file with one row per sample.

    Each row holds the label followed by the flattened pixel values, under the
    columns ``label, pixel_0, pixel_1, ...``. All rows are collected in memory
    before the file is written.

    Args:
        dataset: Iterable of ``(image_tensor, label)`` pairs.
        filename: Name of the CSV file to create inside the output directory.
        output_dir: Directory to write into. Defaults to the notebook-level
            ``csv_dir`` when omitted. It is created if it does not exist.
    """
    if output_dir is None:
        output_dir = csv_dir

    data_list = []
    num_pixels = 0  # stays 0 for an empty dataset, giving a header-only CSV
    for img, label in dataset:
        # reshape(-1) flattens like view(-1) but also accepts non-contiguous tensors
        img_flat = img.reshape(-1).tolist()
        num_pixels = len(img_flat)
        data_list.append([label] + img_flat)

    # Column names come from the tracked pixel count rather than the loop
    # variable, which is undefined when the dataset is empty.
    columns = ['label'] + [f'pixel_{i}' for i in range(num_pixels)]
    df = pd.DataFrame(data_list, columns=columns)

    # Make sure the target directory exists so the function works on its own
    os.makedirs(output_dir, exist_ok=True)
    csv_path = os.path.join(output_dir, filename)
    df.to_csv(csv_path, index=False)
    print(f"✅ Saved {filename} to {output_dir}")

# Export each split to its own CSV file, in training / validation / test order
for split, csv_name in (
    (train_data, "train_data.csv"),
    (validation_data, "validation_data.csv"),
    (test_data, "test_data.csv"),
):
    save_to_csv(split, csv_name)

print("\n🎉 Data successfully saved to the data_preparation directory!")

✅ Saved train_data.csv to ../data_preparation
✅ Saved validation_data.csv to ../data_preparation
✅ Saved test_data.csv to ../data_preparation

🎉 Data successfully saved to the data_preparation directory!
