<a href="https://colab.research.google.com/github/Shakilkhan24/Playground_DL/blob/main/HF_DATA_TO_PYTORCH_CUSTOM_DATA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, Dataset

# Load the "mrpc" configuration of the GLUE benchmark; the result is indexed
# by split name ("train", "validation") further down.
dataset = load_dataset(path="glue", name="mrpc")

In [None]:

# `tokenize_function` reads this module-level tokenizer, so it has to exist
# before `dataset.map` runs.
# NOTE(review): the checkpoint name is an assumption based on the DistilBert
# imports above -- keep it in sync with whichever model gets fine-tuned.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(example):
    """Tokenize a batch of sentence pairs.

    Args:
        example: Mapping with "sentence1" and "sentence2" entries. Because it
            is used with ``batched=True`` below, each entry is a list of
            strings rather than a single string.

    Returns:
        The tokenizer output for the paired sentences. Sequences longer than
        the tokenizer's maximum length are truncated; no padding is applied
        here, so encoded sequences keep their individual lengths.
    """
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


# Adds the tokenizer's output columns to every split of the dataset.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Convert tokenized dataset to PyTorch Dataset
class TorchDataset(Dataset):
    """Expose a column-addressable tokenized dataset as a PyTorch ``Dataset``.

    ``examples`` must support ``examples[column][index]`` for the columns
    "input_ids", "attention_mask" and "label". Values are handed back exactly
    as stored; no tensor conversion or padding happens here.
    """

    # (output key, source column) pairs, in the order the fields are emitted.
    # Note the rename: the stored "label" column is returned as "labels".
    _FIELD_TO_COLUMN = (
        ("input_ids", "input_ids"),
        ("attention_mask", "attention_mask"),
        ("labels", "label"),
    )

    def __init__(self, examples):
        self.examples = examples

    def __len__(self):
        # The number of examples is taken from the length of the token column.
        token_column = self.examples["input_ids"]
        return len(token_column)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of model-input fields."""
        return {
            field: self.examples[column][idx]
            for field, column in self._FIELD_TO_COLUMN
        }

def _pad_collate(batch):
    """Collate variable-length tokenized examples into padded tensors.

    The dataset items hold plain Python lists whose lengths differ from one
    example to the next. PyTorch's default collate function refuses sequences
    of unequal length, so each batch is padded to its own longest sequence.

    Args:
        batch: List of dicts with "input_ids", "attention_mask" and "labels".

    Returns:
        Dict with "input_ids" and "attention_mask" as ``(batch, seq)`` long
        tensors and "labels" as a ``(batch,)`` long tensor.
    """
    token_rows = [torch.as_tensor(item["input_ids"], dtype=torch.long) for item in batch]
    mask_rows = [torch.as_tensor(item["attention_mask"], dtype=torch.long) for item in batch]
    labels = torch.as_tensor([item["labels"] for item in batch], dtype=torch.long)

    pad = torch.nn.utils.rnn.pad_sequence
    return {
        # Padded positions get attention_mask 0, so the model is told to
        # ignore them. NOTE(review): 0 is assumed to be an acceptable pad
        # token id for the tokenizer in use -- confirm.
        "input_ids": pad(token_rows, batch_first=True, padding_value=0),
        "attention_mask": pad(mask_rows, batch_first=True, padding_value=0),
        "labels": labels,
    }


train_dataset = TorchDataset(tokenized_dataset["train"])
eval_dataset = TorchDataset(tokenized_dataset["validation"])

# Only the training loader shuffles; evaluation keeps dataset order.
train_dataloader = DataLoader(
    train_dataset, batch_size=8, shuffle=True, collate_fn=_pad_collate
)
eval_dataloader = DataLoader(eval_dataset, batch_size=8, collate_fn=_pad_collate)


In [None]:
import torch
from datasets import load_dataset
from torchvision import transforms
from PIL import Image
from torch.utils.data import Dataset

class CustomImageDataset(Dataset):
    """PyTorch ``Dataset`` over one split of a Hugging Face image dataset.

    Each item is an ``(image, label)`` pair. The image comes from the row's
    "image" column when one is present, otherwise it is opened from the path
    in "image_file_path". The label is read from "label" or, failing that,
    "labels".
    NOTE(review): the Hub's "beans" dataset is believed to expose "image" and
    "labels" columns -- confirm against the dataset card.
    """

    # Label column names tried in order when reading a row.
    _LABEL_KEYS = ("label", "labels")

    def __init__(self, dataset_name, split_name, transform=None):
        """Load ``split_name`` of ``dataset_name`` and remember the transform.

        Args:
            dataset_name: Dataset identifier passed to ``load_dataset``.
            split_name: Split to load, e.g. "train".
            transform: Optional callable applied to every image on access.
        """
        self.dataset = load_dataset(dataset_name, split=split_name)
        self.transform = transform

    def __len__(self) -> int:
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for row ``idx``.

        Raises:
            KeyError: If the row has no usable image or label column.
        """
        # Fetch the row a single time; indexing the dataset again for the
        # label would repeat the whole row lookup (and any decoding it does).
        row = self.dataset[idx]

        image = row["image"] if "image" in row else None
        if image is None:
            # Read the pixel data inside the context manager so the file
            # handle is released right away instead of staying open per item.
            with Image.open(row["image_file_path"]) as opened:
                opened.load()
            image = opened

        for key in self._LABEL_KEYS:
            if key in row:
                label = row[key]
                break
        else:
            raise KeyError(
                f"row has none of the expected label columns {self._LABEL_KEYS}"
            )

        if self.transform is not None:
            image = self.transform(image)

        return image, label

# Preprocessing applied to every image, in order:
#   1. resize to a fixed 224x224,
#   2. convert the PIL image to a tensor,
#   3. normalize each of the three RGB channels with the mean / standard
#      deviation listed below.
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Training split of "beans" with the preprocessing pipeline attached.
dataset = CustomImageDataset("beans", split_name="train", transform=transform)

# Quick check: pull the first sample and report what came back.
sample_image, sample_label = dataset[0]
print("Sample image shape:", sample_image.shape)
print("Sample label:", sample_label)