# ---> torchvision <---  need to learn

##### Normal Example

In [None]:
from torch.utils.data import Dataset
import torch

class MyDataset(Dataset):
    """Map-style dataset pairing ``data[i]`` with ``labels[i]``.

    Args:
        data: Sized, indexable collection of samples.
        labels: Indexable collection of targets, looked up with the same index.
        transform: Optional callable applied to a sample before it is returned.
    """

    def __init__(self, data, labels, transform=None):
        # Any number of constructor arguments may be accepted here, like a normal function.
        self.transform = transform
        self.labels = labels
        self.data = data

    def __len__(self):
        """Return how many samples the dataset holds (``len(data)``)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(sample, label)`` for ``idx``; the sample is transformed when a transform is set."""
        sample = self.data[idx]
        label = self.labels[idx]
        if not self.transform:
            return sample, label
        return self.transform(sample), label

from torch.utils.data import DataLoader

# NOTE(review): `data` and `labels` are not defined anywhere above in this file;
# they must already exist before this line runs.
dataset = MyDataset(data, labels, transform=None)

# Batches of 32 samples, shuffled, loaded with 4 worker subprocesses.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)
""" 
    Parameters:
        1. dataset: The Dataset object that holds your data.

        2. batch_size: The number of samples to return in each batch.

        3. shuffle: If True, data is shuffled before every epoch.

        4. num_workers: The number of subprocesses to use for data loading. 0 means no additional subprocesses, and a higher number means parallel loading of data.

        5. collate_fn: A function that allows you to customize how the data is merged into a batch.

        6. drop_last: If True, drops the last batch if it is smaller than the specified batch size.

        7. pin_memory: If True, batches are copied into pinned (page-locked) host memory before being returned, which speeds up the later transfer to a GPU.
"""

# Each iteration unpacks one batch from the DataLoader into a batch of samples
# and a batch of labels.
for data_batch, label_batch in dataloader:
    # Training loop or validation loop
    pass



##### Tabular data

In [None]:
import pandas as pd
from torch.utils.data import Dataset
import torch

class TabularDataset(Dataset):
    """Dataset backed by a CSV file whose ``target`` column holds the label.

    Every column other than ``target`` is treated as a feature.

    Args:
        csv_file: CSV source, passed straight to ``pandas.read_csv``.
        transform: Optional callable applied to the feature tensor.
    """

    def __init__(self, csv_file, transform=None):
        frame = pd.read_csv(csv_file)
        self.data = frame
        # Split the table into feature columns and the label column
        self.features = frame.drop(columns='target')
        self.labels = frame['target']
        self.transform = transform

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for row ``idx`` as float32 and long tensors."""
        row = self.features.iloc[idx]
        sample = torch.tensor(row.values, dtype=torch.float32)
        target = self.labels.iloc[idx]
        label = torch.tensor(target, dtype=torch.long)  # long dtype: classification target
        if not self.transform:
            return sample, label
        return self.transform(sample), label

# Load 'data.csv' (which must contain a 'target' column) and serve it in shuffled batches of 32.
# DataLoader is not imported in this cell; it relies on the earlier
# `from torch.utils.data import DataLoader`.
dataset = TabularDataset(csv_file='data.csv')
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


##### Json data

In [None]:
import json
from torch.utils.data import Dataset
import torch

class JSONDataset(Dataset):
    """Dataset over a JSON file containing a list of records with a ``'text'`` key.

    Args:
        json_file: Path to the JSON file; it is read fully at construction.
        transform: Optional callable. For raw string text it receives the
            string itself (this is where tokenization belongs); for text that
            is already numeric it receives the tensor built from it.
    """

    def __init__(self, json_file, transform=None):
        # Read JSON data; JSON is UTF-8 by specification, so do not depend on the locale default
        with open(json_file, 'r', encoding='utf-8') as file:
            self.data = json.load(file)
        self.transform = transform

    def __len__(self):
        """Return the number of records loaded from the file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample for record ``idx``.

        Returns:
            * numeric ``'text'`` (e.g. a list of token ids): a tensor, passed
              through ``transform`` when one is set;
            * string ``'text'`` with a transform: the transform's output,
              converted to a tensor when it is a list or tuple;
            * string ``'text'`` without a transform: the raw string.
        """
        text = self.data[idx]['text']

        if isinstance(text, str):
            # torch.tensor() cannot hold a str, so the transform (tokenizer) must run
            # on the raw text before any tensor conversion.
            if self.transform is None:
                return text
            sample = self.transform(text)
            if isinstance(sample, (list, tuple)):
                sample = torch.tensor(sample)
            return sample

        sample = torch.tensor(text)
        if self.transform:
            sample = self.transform(sample)
        return sample

# Load 'data.json' (each entry must have a 'text' key) and serve it in shuffled batches of 32.
# DataLoader comes from the earlier `from torch.utils.data import DataLoader`.
dataset = JSONDataset(json_file='data.json')
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


##### Image data

In [None]:
from torchvision import datasets, transforms
from torch.utils.data import Dataset
from PIL import Image
import os

class ImageFolderDataset(Dataset):
    """Dataset over a directory tree laid out as ``root_dir/<class_name>/<image file>``.

    Args:
        root_dir: Directory whose immediate subdirectories are the classes.
        transform: Optional callable applied to the opened PIL image.

    Attributes set at construction:
        classes: Sorted subdirectory names; a class's label is its position here.
        image_paths: List of ``(path, class_index)`` pairs, one per file.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        # os.listdir() returns entries in arbitrary order and also lists plain files.
        # Sort so class indices are reproducible, and keep only directories so a stray
        # file in root_dir (e.g. a README) is not mistaken for a class.
        self.classes = sorted(
            entry for entry in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, entry))
        )
        self.image_paths = []

        for class_idx, class_name in enumerate(self.classes):
            class_dir = os.path.join(root_dir, class_name)
            # Sorted for a deterministic sample order within each class
            for img_name in sorted(os.listdir(class_dir)):
                img_path = os.path.join(class_dir, img_name)
                self.image_paths.append((img_path, class_idx))

    def __len__(self):
        """Return the total number of files found across all class directories."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Open the image at ``idx`` and return ``(image, class_index)``, transformed if a transform is set."""
        img_path, label = self.image_paths[idx]
        img = Image.open(img_path)

        if self.transform:
            img = self.transform(img)

        return img, label

# Resize every image to 128x128, then convert it to a tensor.
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])

# '/images' is expected to contain one subdirectory per class.
# DataLoader comes from the earlier `from torch.utils.data import DataLoader`.
dataset = ImageFolderDataset(root_dir='/images', transform=transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


##### Image preprocessing task on loaded data example

In [None]:
from torchvision import transforms
from PIL import Image
from torch.utils.data import Dataset

class ImageDataset(Dataset):
    """Dataset that opens each image from its file path when it is accessed.

    Args:
        image_paths (list): Paths to the image files.
        labels (list): Label for each image, looked up with the same index.
        transform (callable, optional): Applied to the opened image before it is returned.
    """

    def __init__(self, image_paths, labels, transform=None):
        self.transform = transform
        self.labels = labels
        self.image_paths = image_paths

    def __len__(self):
        """Return the number of image paths."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for ``idx``; the image is transformed when a transform is set."""
        path = self.image_paths[idx]
        image = Image.open(path)
        label = self.labels[idx]

        # Without a transform the opened image is handed back as is
        if not self.transform:
            return image, label
        return self.transform(image), label

# Define transformations to apply
# Preprocessing pipeline, applied in order to every image the dataset opens.
# NOTE(review): Normalize is given three mean/std values, so it presumably expects
# 3-channel tensors; the values look like the usual ImageNet statistics — confirm.
transform = transforms.Compose([
    transforms.Resize((128, 128)),               # Resize the image
    transforms.RandomHorizontalFlip(),           # Random horizontal flip for augmentation
    transforms.ToTensor(),                       # Convert to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize
])

# Instantiate dataset with transformations (placeholder paths, one label per path)
image_dataset = ImageDataset(image_paths=["path/to/image1.jpg", "path/to/image2.jpg"], 
                             labels=[0, 1], transform=transform)

# DataLoader: shuffled batches of up to 32 samples
dataloader = DataLoader(image_dataset, batch_size=32, shuffle=True)


##### Tabular data preprocessing task on loaded data example

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset
import torch

class TabularDataset(Dataset):
    """CSV-backed dataset whose feature columns are standardized once at construction.

    The ``target`` column is the label; every other column is a feature.
    """

    def __init__(self, csv_file, transform=None):
        """
        Args:
            csv_file (str): Path to the CSV file.
            transform (callable, optional): Applied to each feature tensor on access.
        """
        frame = pd.read_csv(csv_file)
        raw_features = frame.drop(columns='target')  # everything except the label column

        self.data = frame
        self.labels = frame['target']
        self.transform = transform

        # Standardize the features up front; the fitted scaler is kept on the instance
        self.scaler = StandardScaler()
        self.features = self.scaler.fit_transform(raw_features)

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for row ``idx`` as float32 and long tensors."""
        row = self.features[idx]
        sample = torch.tensor(row, dtype=torch.float32)
        label = torch.tensor(self.labels.iloc[idx], dtype=torch.long)

        # Optional per-sample transform on the already-scaled features
        if not self.transform:
            return sample, label
        return self.transform(sample), label

# Instantiate dataset
# Instantiate dataset from 'data.csv' (which must contain a 'target' column); no per-sample transform
dataset = TabularDataset(csv_file="data.csv", transform=None)

# DataLoader: shuffled batches of 32
# (DataLoader comes from the earlier `from torch.utils.data import DataLoader`)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


##### LLM or text-based data preprocessing on loaded data example

In [None]:
from transformers import BertTokenizer
from torch.utils.data import Dataset
import torch

class TextDataset(Dataset):
    """Dataset that tokenizes raw text when an item is accessed.

    Each item is ``(input_ids, attention_mask, label)``. ``label`` is ``None``
    when the dataset has no labels; the default ``DataLoader`` collate function
    cannot batch ``None``, so an unlabeled dataset needs a custom ``collate_fn``.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_length=512):
        """
        Args:
            texts (list): List of raw text samples.
            labels (sequence, optional): Label for each text (for supervised learning).
                May be a list, array or tensor; ``None`` or an empty sequence means unlabeled.
            tokenizer: Object with an ``encode_plus`` method. Defaults to the
                pretrained ``bert-base-uncased`` ``BertTokenizer``.
            max_length (int): Length every tokenized sequence is padded or truncated to.
        """
        self.texts = texts
        self.labels = labels
        # Compare against None explicitly so the fallback does not depend on the
        # truthiness of the tokenizer object.
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # Load default tokenizer
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of text samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize text ``idx`` and return ``(input_ids, attention_mask, label)``."""
        text = self.texts[idx]
        # `if self.labels` raises for multi-element arrays/tensors, so test for
        # presence explicitly; an empty sequence still counts as "no labels".
        has_labels = self.labels is not None and len(self.labels) > 0
        label = self.labels[idx] if has_labels else None

        # Tokenize the text (convert text into token IDs, with padding/truncation)
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,  # Add [CLS] and [SEP] tokens for BERT
            max_length=self.max_length,
            padding='max_length',    # Pad sequences to the max length
            truncation=True,         # Truncate sequences longer than max_length
            return_tensors='pt',     # Return PyTorch tensors
        )

        # Drop only the leading batch dimension. A bare squeeze() would also remove
        # the sequence dimension when max_length == 1.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        # Return tokenized data and corresponding label
        return input_ids, attention_mask, label

# Example texts (list of sentences) and labels (for supervised tasks)
texts = ["This is the first sentence.", "Here is another sentence."]
labels = [0, 1]  # Example labels for a classification task

# Instantiate the dataset; tokenizer=None makes TextDataset load its default
# 'bert-base-uncased' tokenizer, and every sequence is padded/truncated to 32 tokens
dataset = TextDataset(texts=texts, labels=labels, tokenizer=None, max_length=32)

# DataLoader: shuffled batches of 2
# (DataLoader comes from the earlier `from torch.utils.data import DataLoader`)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)


#### Custom collate function

In PyTorch, a custom collate function is used when you want to customize how data is collated (i.e., combined into batches) from a dataset. By default, PyTorch's DataLoader will automatically stack tensors in a batch, but there are situations where you need to manipulate or modify the data in more complex ways before batching it, such as when the data is of different shapes, or when you want to apply special handling for certain data types (e.g., padding sequences for NLP tasks).

What is the role of the collate function?
    When the DataLoader iterates through the dataset, it retrieves samples one by one (from the __getitem__ method). The collate function combines these samples into a batch, which is then passed to the model. The default collate function simply stacks samples into a batch, but if your data has more complex structures, you might need to write a custom collate function.

When to use a custom collate function?
    You may need a custom collate function if:
        Data with variable lengths: In NLP, for example, text sequences may have different lengths, and padding might be needed to make all sequences the same length.

    Non-tensor data: 
        If you have non-tensor data (e.g., images with different sizes), you might need to handle them before batching.

    Custom transformations: 
        When you want to apply special transformations or custom data processing during batching.

    Handling complex data structures: 
        If your dataset consists of complex structures like dictionaries or tuples.

How to create a custom collate function?
A custom collate function takes in a list of samples (which are returned by the __getitem__ method of your dataset) and combines them into a single batch. It should return a batch that your model can work with.



In [None]:
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn.functional as F

# Define a simple Dataset class for NLP (example with variable-length sentences)
class MyDataset(Dataset):
    """Minimal text dataset: item ``i`` is the pair ``(texts[i], labels[i])``.

    No tokenization or padding happens here; sentences are returned as stored.
    """

    def __init__(self, texts, labels):
        self.labels = labels
        self.texts = texts

    def __len__(self):
        """Return the number of sentences."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair stored at ``idx``."""
        text = self.texts[idx]
        label = self.labels[idx]
        return text, label

# Example data: three sentences of different lengths and one integer label per sentence
texts = ["Hello", "How are you?", "Goodbye"]
labels = [0, 1, 0]

# Define a custom collate function
def custom_collate_fn(batch):
    """
    Custom collate function that pads sentences to the same length and
    returns a batch of input sequences and their labels.

    A sentence given as a ``str`` is encoded character by character as Unicode
    code points (``ord``), because a tensor cannot hold string elements. A
    sentence that is already a sequence of numbers (e.g. token ids) is used
    as is. Shorter sentences are right-padded with 0.

    Args:
        batch (list of tuples): Each element in the batch is a tuple (text, label).

    Returns:
        tuple: ``(input_tensor, label_tensor)``. ``input_tensor`` has one row
        per sample, padded to the longest sentence in the batch. Both tensors
        are empty for an empty batch.
    """

    def _encode(sentence):
        # Turn one sentence into a list of numbers that torch.tensor can hold.
        if isinstance(sentence, str):
            return [ord(ch) for ch in sentence]
        return list(sentence)

    encoded = [_encode(sentence) for sentence, _ in batch]
    labels = [label for _, label in batch]

    # Step 1: Find the maximum sentence length in the batch (0 for an empty batch)
    max_length = max((len(seq) for seq in encoded), default=0)

    # Step 2: Pad all sentences to that length with 0s
    padded_texts = [seq + [0] * (max_length - len(seq)) for seq in encoded]

    # Convert lists to tensors
    input_tensor = torch.tensor(padded_texts)
    label_tensor = torch.tensor(labels)

    return input_tensor, label_tensor

# Create the dataset and dataloader; batches of 2 are assembled by custom_collate_fn
# instead of the default collate function (no shuffling requested)
dataset = MyDataset(texts, labels)
dataloader = DataLoader(dataset, batch_size=2, collate_fn=custom_collate_fn)

# Iterate through the DataLoader
# NOTE(review): the loop variable `labels` rebinds the module-level `labels` list
# defined above; after the loop it refers to the last batch's label tensor.
for inputs, labels in dataloader:
    print("Input Tensor:", inputs)
    print("Label Tensor:", labels)
