<a href="https://colab.research.google.com/github/ParitKansal/Pytorch/blob/main/3%2C%20Dataset_and_Dataloader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.datasets import make_classification
import torch

In [2]:
# Step 1: Build a tiny, reproducible toy problem with scikit-learn:
# 10 samples, 2 features (both informative, none redundant), 2 classes.
X, y = make_classification(
    n_samples=10, n_features=2,
    n_informative=2, n_redundant=0,
    n_classes=2,
    random_state=42,  # fixed seed so the printed batches below stay the same
)

In [3]:
X.shape, y.shape

((10, 2), (10,))

In [4]:
# Convert the data to float32 PyTorch tensors.
# torch.as_tensor is used instead of torch.tensor so that re-running this cell
# (when X and y are already tensors) is a no-op rather than a copy-construct
# that triggers a UserWarning. On the first run the NumPy arrays are converted
# to float32 exactly as before.
X = torch.as_tensor(X, dtype=torch.float32)
y = torch.as_tensor(y, dtype=torch.float32)

In [5]:
from torch.utils.data import Dataset, DataLoader

In [6]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Map-style dataset over paired, index-aligned features and labels.

    Args:
        features: Indexable collection of samples; its length defines the
            dataset length.
        labels: Indexable collection of targets, aligned with ``features``.
        transform: Optional callable applied to every sample returned by
            ``__getitem__``. When omitted, samples are multiplied by 100,
            which is what this class did before ``transform`` was honoured,
            so existing ``CustomDataset(X, y)`` calls behave the same.
    """

    def __init__(self, features, labels, transform=None):
        self.features = features
        self.labels = labels
        # Keep the transform: it used to be accepted and then silently dropped.
        self.transform = transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, index):
        """Return the ``(sample, label)`` pair at ``index``, sample transformed."""
        sample, label = self.features[index], self.labels[index]
        if self.transform is not None:
            sample = self.transform(sample)
        else:
            # Default demo transformation (kept for backward compatibility).
            sample = sample * 100
        return sample, label

In [7]:
# Wrap the feature/label tensors so they can be served by a DataLoader.
dataset = CustomDataset(X, y)

In [8]:
# Batches of 2 samples; shuffle=False keeps the samples in dataset order.
dataloader = DataLoader(dataset, batch_size=2, shuffle=False)

In [9]:
# Walk through every batch and show its features and labels,
# separated by a rule of 50 dashes.
for batch_features, batch_labels in dataloader:
    print(batch_features)
    print(batch_labels)
    print("-"*50)

tensor([[ 106.8339,  -97.0073],
        [-114.0215,  -83.8792]])
tensor([1., 0.])
--------------------------------------------------
tensor([[-289.5397,  197.6862],
        [ -72.0634,  -96.0592]])
tensor([0., 0.])
--------------------------------------------------
tensor([[-196.2874,  -99.2251],
        [ -93.8205,  -54.3048]])
tensor([0., 1.])
--------------------------------------------------
tensor([[ 172.7259, -118.5827],
        [ 177.7366,  151.1576]])
tensor([1., 1.])
--------------------------------------------------
tensor([[ 189.9693,   83.4445],
        [ -58.7231, -197.1718]])
tensor([1., 0.])
--------------------------------------------------


In [10]:
import os
import csv
from torchvision import datasets
from torchvision.transforms import ToTensor
from PIL import Image

# Step 1: Fetch FashionMNIST (downloaded into ./data when missing).
# ToTensor() is applied to every image on access.
training_data = datasets.FashionMNIST(
    root="data", train=True, download=True, transform=ToTensor(),
)

# Same settings, but the held-out test split.
test_data = datasets.FashionMNIST(
    root="data", train=False, download=True, transform=ToTensor(),
)

# Step 2: Output layout — image folders per split plus a folder for the CSV files
output_dir = "fashion_mnist_images"
train_dir, test_dir, labels_dir = (
    os.path.join(output_dir, leaf)
    for leaf in ("train_images", "test_images", "labels")
)

# Make sure every target folder exists (no error when already present)
for _folder in (train_dir, test_dir, labels_dir):
    os.makedirs(_folder, exist_ok=True)

# Step 3: Function to save images and create a CSV file
def save_images_and_csv(dataset, image_directory, csv_directory, csv_filename):
    """
    Save images from the dataset into the specified directory and create a CSV file
    mapping filenames to their labels.

    Each sample is written as ``<index>.png`` (index = position in ``dataset``)
    and one ``file_name,label`` row is appended to the CSV after a header row.

    Args:
        dataset: Iterable of ``(image, label)`` pairs. ``image`` must expose
            ``.numpy()``; values are assumed to lie in [0, 1] (as produced by
            ``ToTensor``) — TODO confirm for other datasets.
        image_directory: Directory to save the images (created if missing).
        csv_directory: Directory to save the CSV file (created if missing).
        csv_filename: Name of the CSV file.
    """
    # Create the targets here so the function does not depend on the caller
    # having prepared the directory tree beforehand.
    os.makedirs(image_directory, exist_ok=True)
    os.makedirs(csv_directory, exist_ok=True)

    csv_path = os.path.join(csv_directory, csv_filename)
    with open(csv_path, mode='w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["file_name", "label"])  # Write header row

        for index, (image, label) in enumerate(dataset):
            # Scale to 0..255. Round and clip before the cast: astype('uint8')
            # alone truncates toward zero and wraps out-of-range values.
            pixels = (image.numpy().squeeze() * 255).round().clip(0, 255).astype('uint8')
            # Define file name
            file_name = f"{index}.png"
            # Save the image
            Image.fromarray(pixels).save(os.path.join(image_directory, file_name))
            # Write the file name and label to the CSV
            writer.writerow([file_name, label])

# Step 4: Save training data
save_images_and_csv(training_data, train_dir, labels_dir, "train_labels.csv")

# Step 5: Save testing data
save_images_and_csv(test_data, test_dir, labels_dir, "test_labels.csv")

# Report the directories that were actually written. The previous message
# named "<output_dir>/train" and "<output_dir>/test", which are never created.
print(f"Images saved in {train_dir} and {test_dir}.")
print(f"Labels saved in {labels_dir}.")

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to data/FashionMNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 26.4M/26.4M [00:01<00:00, 18.7MB/s]


Extracting data/FashionMNIST/raw/train-images-idx3-ubyte.gz to data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 29.5k/29.5k [00:00<00:00, 281kB/s]


Extracting data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 4.42M/4.42M [00:00<00:00, 5.26MB/s]


Extracting data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 5.15k/5.15k [00:00<00:00, 2.93MB/s]


Extracting data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw

Images saved in fashion_mnist_images/train and fashion_mnist_images/test.
Labels saved in fashion_mnist_images/labels.


In [11]:
import os
import pandas as pd
from torchvision.io import read_image
class CustomImageDataset(Dataset):
    """Image dataset driven by a CSV annotation file.

    The CSV is read with pandas; column 0 holds the image file name
    (relative to ``img_dir``) and column 1 holds the label.

    Args:
        annotations_file: Path to the CSV file.
        img_dir: Directory that the file names in the CSV are relative to.
        transform: Optional callable applied to each loaded image.
        target_transform: Optional callable applied to each label.
    """

    def __init__(self, annotations_file, img_dir, transform=None, target_transform=None):
        self.transform, self.target_transform = transform, target_transform
        self.img_dir = img_dir
        self.img_labels = pd.read_csv(annotations_file)

    def __len__(self):
        """Number of annotated images (one per CSV row)."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Load the image for row ``idx`` and return ``(image, label)``."""
        annotations = self.img_labels
        file_name = annotations.iloc[idx, 0]
        image = read_image(os.path.join(self.img_dir, file_name))
        label = annotations.iloc[idx, 1]

        # Optional post-processing hooks; skipped when unset/falsy.
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        return image, label

# Training split: labels come from the exported CSV, images from the exported folder.
dataset = CustomImageDataset("fashion_mnist_images/labels/train_labels.csv", "fashion_mnist_images/train_images")

In [12]:
# Batches of 64 images, reshuffled on every pass over the data.
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

In [13]:
# Sanity check: look at the shapes of the first batch only, then stop.
for batch_features, batch_labels in dataloader:
    print(batch_features.shape)
    print(batch_labels.shape)
    break

torch.Size([64, 1, 28, 28])
torch.Size([64])


In [14]:
import os
import csv
from torchvision import datasets
from torchvision.transforms import ToTensor
from PIL import Image

# Step 1: Fetch FashionMNIST (downloaded into ./data when missing).
# ToTensor() is applied to every image on access.
training_data = datasets.FashionMNIST(
    root="data", train=True, download=True, transform=ToTensor(),
)

# Same settings, but the held-out test split.
test_data = datasets.FashionMNIST(
    root="data", train=False, download=True, transform=ToTensor(),
)

# Step 2: Output layout — image folders per split plus a folder for the CSV files
output_dir = "fashion_mnist_images"
train_dir, test_dir, labels_dir = (
    os.path.join(output_dir, leaf)
    for leaf in ("train_images", "test_images", "labels")
)

# Make sure every target folder exists (no error when already present)
for _folder in (train_dir, test_dir, labels_dir):
    os.makedirs(_folder, exist_ok=True)

# Step 3: Function to save images and create a CSV file
def save_images_and_csv(dataset, base_image_directory, csv_directory, csv_filename):
    """
    Save images from the dataset into labeled subdirectories and create a CSV file
    mapping filenames to their labels.

    Each sample is written to ``<base_image_directory>/<label>/<index>.png``
    (index = position in ``dataset``). The CSV stores the image path relative
    to ``base_image_directory`` together with the label, after a header row.

    Args:
        dataset: Iterable of ``(image, label)`` pairs. ``image`` must expose
            ``.numpy()``; values are assumed to lie in [0, 1] (as produced by
            ``ToTensor``) — TODO confirm for other datasets.
        base_image_directory: Base directory to save the images.
        csv_directory: Directory to save the CSV file (created if missing).
        csv_filename: Name of the CSV file.
    """
    # Do not rely on the caller having created the CSV directory.
    os.makedirs(csv_directory, exist_ok=True)

    csv_path = os.path.join(csv_directory, csv_filename)
    # Label folders already created in this call; avoids one makedirs per image.
    created_label_dirs = set()

    with open(csv_path, mode='w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["file_name", "label"])  # Write header row

        for index, (image, label) in enumerate(dataset):
            # Scale to 0..255. Round and clip before the cast: astype('uint8')
            # alone truncates toward zero and wraps out-of-range values.
            pixels = (image.numpy().squeeze() * 255).round().clip(0, 255).astype('uint8')

            # Create subdirectory for the label the first time it is seen
            label_dir = os.path.join(base_image_directory, str(label))
            if label_dir not in created_label_dirs:
                os.makedirs(label_dir, exist_ok=True)
                created_label_dirs.add(label_dir)

            # Define file name
            file_name = f"{index}.png"
            file_path = os.path.join(label_dir, file_name)

            # Save the image
            Image.fromarray(pixels).save(file_path)

            # Write the path (relative to the base image directory) and label to the CSV
            writer.writerow([os.path.relpath(file_path, base_image_directory), label])

# Steps 4 & 5: Export the training split, then the test split, each with its own CSV
for _split_data, _split_dir, _split_csv in (
    (training_data, train_dir, "train_labels.csv"),
    (test_data, test_dir, "test_labels.csv"),
):
    save_images_and_csv(_split_data, _split_dir, labels_dir, _split_csv)

# Tell the reader where everything ended up
print(f"Images saved in labeled directories under {train_dir} and {test_dir}.")
print(f"Labels saved in {labels_dir}.")

Images saved in labeled directories under fashion_mnist_images/train_images and fashion_mnist_images/test_images.
Labels saved in fashion_mnist_images/labels.


In [15]:
import os
from torch.utils.data import Dataset
from torchvision.io import read_image

class CustomImageDataset(Dataset):
    """Image dataset read from a folder tree of the form ``<data_dir>/<label>/<image>``.

    Only sub-directories whose name parses as an integer are treated as label
    folders; everything else in ``data_dir`` (loose files, tool folders such as
    ``.ipynb_checkpoints``) is ignored. Label folders are visited in ascending
    label order and files in sorted name order, so sample indices are
    reproducible across runs and platforms.
    """

    def __init__(self, data_dir, transform=None, target_transform=None):
        """
        Args:
            data_dir (str): Directory containing image subdirectories organized by label.
            transform (callable, optional): Optional transform to apply to images.
            target_transform (callable, optional): Optional transform to apply to labels.
        """
        self.image_paths = []
        self.labels = []

        # Collect the label folders first so they can be ordered numerically.
        label_dirs = []
        for entry in os.listdir(data_dir):
            entry_path = os.path.join(data_dir, entry)
            if not os.path.isdir(entry_path):
                continue
            try:
                label = int(entry)  # Convert label folder name to integer
            except ValueError:
                # Not a label folder; skip it instead of crashing.
                continue
            label_dirs.append((label, entry_path))

        # os.listdir returns entries in arbitrary order, hence the explicit sorts.
        for label, label_path in sorted(label_dirs):
            for img_name in sorted(os.listdir(label_path)):
                img_path = os.path.join(label_path, img_name)
                if not os.path.isfile(img_path):
                    continue  # nested directories cannot be decoded as images
                self.image_paths.append(img_path)
                self.labels.append(label)

        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the number of image files that were found."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Decode the image at ``idx`` and return ``(image, label)``."""
        img_path = self.image_paths[idx]
        image = read_image(img_path)
        label = self.labels[idx]

        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)

        return image, label

# Example usage
# Relative path (same location the export cells write to) instead of the
# Colab-only absolute "/content/..." path, so the notebook also runs elsewhere.
data_dir = "fashion_mnist_images/train_images"
dataset = CustomImageDataset(data_dir)

In [16]:
# Shuffled batches of 64 from the folder-backed dataset
dataloader = DataLoader(dataset, shuffle=True, batch_size=64)

# Sanity check: print the image-batch shape of the first batch only
for batch_features, batch_labels in dataloader:
    print(batch_features.shape)
    break

torch.Size([64, 1, 28, 28])
