<a href="https://colab.research.google.com/github/PARTHIBAN-007/PyTorch/blob/main/PyTorch_Dataloader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from collections import Counter

import numpy as np
import torch
import torch.nn as nn
import torchvision
from PIL import Image
# Dataset/DataLoader live in torch.utils.data; ImageFolder lives in
# torchvision.datasets.
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from torchvision.datasets import ImageFolder
from tqdm.auto import tqdm

In [None]:
class DogsVsCats(Dataset):
    """Map-style dataset over a folder holding ``dogs/`` and ``cats/`` images.

    Args:
        path_to_folder: Directory containing the ``dogs`` and ``cats``
            sub-folders.

    Indexing returns an ``(image, label)`` pair: ``image`` is the output of
    ``transforms.ToTensor()`` and ``label`` is ``0`` for a file taken from
    ``dogs/`` and ``1`` for a file taken from ``cats/``.
    """

    def __init__(self, path_to_folder):
        path_to_dogs = os.path.join(path_to_folder, "dogs")
        path_to_cats = os.path.join(path_to_folder, "cats")

        # Sorted so the sample order does not depend on filesystem listing order.
        path_to_dog_files = [
            os.path.join(path_to_dogs, file) for file in sorted(os.listdir(path_to_dogs))
        ]
        path_to_cat_files = [
            os.path.join(path_to_cats, file) for file in sorted(os.listdir(path_to_cats))
        ]

        self.dog_label, self.cat_label = 0, 1
        self.training_files = path_to_dog_files + path_to_cat_files

        # Record each label from the folder the file was listed in. Matching a
        # substring such as "Dog" against the path is case-sensitive and never
        # matches the lowercase "dogs" folder, which labelled everything "cat".
        self.labels = (
            [self.dog_label] * len(path_to_dog_files)
            + [self.cat_label] * len(path_to_cat_files)
        )

        self.transform = transforms.ToTensor()

    def __len__(self):
        """Return the number of image files found in both folders."""
        return len(self.training_files)

    def __getitem__(self, idx):
        """Load, convert and return the ``(image, label)`` pair at ``idx``."""
        path_to_image = self.training_files[idx]
        label = self.labels[idx]

        # The context manager releases the file handle; convert("RGB") gives
        # every sample three channels regardless of the file's colour mode.
        with Image.open(path_to_image) as raw_image:
            image = self.transform(raw_image.convert("RGB"))

        return image, label

# Build the dataset from the dogs/cats image folder.
dogvscat = DogsVsCats("../../data/dogsvscats/")

print(f"Total Training samples: {len(dogvscat)}")

# Inspect only the first sample: the loop exits after one iteration.
for image, labels in dogvscat:
    print(f"Image Label: {labels}", f"Image Shape: {image.shape}", sep="\n")
    break


### Built-in PyTorch DataLoader

In [None]:
# Wrap the dataset in a DataLoader that yields batches of 16 samples in
# dataset order (no shuffling).
# NOTE(review): default collation stacks the image tensors, so this presumably
# requires every image to have the same size - confirm against the data.
dogsvscatsloader = DataLoader(
    dogvscat,
    batch_size=16,
    shuffle=False,
)

# Look at the first batch only.
for image, labels in dogsvscatsloader:
    # The loop variable is `labels`; printing `label` raised a NameError.
    print(f"Image label: {labels}")
    print(f"Image Shape: {image.shape}")
    break


In [None]:

# Preprocessing pipeline: resize to a fixed 224x224, randomly mirror half of
# the images, convert to a tensor, then normalise each of the three channels.
# NOTE(review): the mean/std values look like the commonly published ImageNet
# channel statistics - confirm that is the intent.
img_transforms = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

class DogsVsCats(Dataset):
    """Map-style dataset over a folder holding ``dogs/`` and ``cats/`` images.

    Args:
        path_to_folder: Directory containing the ``dogs`` and ``cats``
            sub-folders.
        transform: Optional callable applied to each loaded PIL image. When
            omitted, ``transforms.ToTensor()`` is used, which keeps the
            previous single-argument behaviour.

    Indexing returns an ``(image, label)`` pair where ``label`` is ``0`` for a
    file taken from ``dogs/`` and ``1`` for a file taken from ``cats/``.
    """

    def __init__(self, path_to_folder, transform=None):
        path_to_dogs = os.path.join(path_to_folder, "dogs")
        path_to_cats = os.path.join(path_to_folder, "cats")

        # Sorted so the sample order does not depend on filesystem listing order.
        path_to_dog_files = [
            os.path.join(path_to_dogs, file) for file in sorted(os.listdir(path_to_dogs))
        ]
        path_to_cat_files = [
            os.path.join(path_to_cats, file) for file in sorted(os.listdir(path_to_cats))
        ]

        self.dog_label, self.cat_label = 0, 1
        self.training_files = path_to_dog_files + path_to_cat_files

        # Record each label from the folder the file was listed in. Matching a
        # substring such as "Dog" against the path is case-sensitive and never
        # matches the lowercase "dogs" folder, which labelled everything "cat".
        self.labels = (
            [self.dog_label] * len(path_to_dog_files)
            + [self.cat_label] * len(path_to_cat_files)
        )

        self.transform = transforms.ToTensor() if transform is None else transform

    def __len__(self):
        """Return the number of image files found in both folders."""
        return len(self.training_files)

    def __getitem__(self, idx):
        """Load, transform and return the ``(image, label)`` pair at ``idx``."""
        path_to_image = self.training_files[idx]
        label = self.labels[idx]

        # The context manager releases the file handle; convert("RGB") gives
        # every sample three channels, matching the 3-value mean/std that the
        # normalisation step in img_transforms is configured with.
        with Image.open(path_to_image) as raw_image:
            image = self.transform(raw_image.convert("RGB"))

        return image, label


# Pass the resize/normalise pipeline so every sample has the same shape and
# the shuffled batches below can be stacked.
dogvscat = DogsVsCats("../../data/dogsvscats/", transform=img_transforms)

dogsvscatsloader = DataLoader(dogvscat,
                              batch_size=16,
                              shuffle=True)

# Look at the first shuffled batch only.
for images, labels in dogsvscatsloader:
    print(images.shape)
    print(labels)
    break

In [None]:
# 90/10 split: the training share is rounded down and the test share takes
# whatever is left, so the two sizes always add up to the dataset length.
train_samples = int(len(dogvscat) * 0.9)
test_samples = len(dogvscat) - train_samples

print(f"Number of Training Samples: {train_samples} ")
print(f"Number of Test Samples: {test_samples} ")

# Randomly partition the dataset into two subsets of the sizes above.
train_dataset, test_dataset = torch.utils.data.random_split(
    dogvscat,
    [train_samples, test_samples],
)

# Training batches are reshuffled; test batches keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Show the first batch from each loader.
for images, labels in train_loader:
    print(images.shape, labels, sep="\n")
    break

for images, labels in test_loader:
    print(images.shape, labels, sep="\n")
    break

In [None]:
# Built-in dataset: torchvision's ImageFolder replaces the hand-written class.
# NOTE(review): ImageFolder derives class indices from the sub-folder names, so
# they may not match the dog=0 / cat=1 convention of the custom dataset - check
# the printed `classes` list.
dogvscat = ImageFolder(root="./dogvscat",
                       transform=img_transforms)

print(dogvscat.classes)

# Recompute the split sizes from THIS dataset: random_split requires the
# lengths to sum to len(dataset), which sizes from another dataset may not.
train_samples = int(0.9 * len(dogvscat))
test_samples = len(dogvscat) - train_samples

train_dataset, test_dataset = torch.utils.data.random_split(
    dogvscat, lengths=[train_samples, test_samples]
)

train_loader = DataLoader(train_dataset,
                          batch_size=16,
                          shuffle=True)

# Wrap the test *dataset*; wrapping the previous `test_loader` would iterate
# over batches of batches.
test_loader = DataLoader(test_dataset,
                         batch_size=16,
                         shuffle=False)

# Show the first batch from each loader.
for images, labels in train_loader:
    print(images.shape)
    print(labels)
    break

for images, labels in test_loader:
    print(images.shape)
    print(labels)
    break

In [None]:
# Build a character-level vocabulary from the first line of every review file.
path_to_data = "/aclImdb/train"

path_to_pos_fld = os.path.join(path_to_data, "pos")
path_to_neg_fld = os.path.join(path_to_data, "neg")

path_to_pos_txt = [os.path.join(path_to_pos_fld, file) for file in os.listdir(path_to_pos_fld)]
path_to_neg_txt = [os.path.join(path_to_neg_fld, file) for file in os.listdir(path_to_neg_fld)]

training_files = path_to_pos_txt + path_to_neg_txt

# Collect the pieces and join once instead of growing a string with `+=`
# inside the loop.
first_lines = []
for file in tqdm(training_files):
    # Explicit encoding so decoding does not depend on the platform default.
    with open(file, "r", encoding="utf-8") as f:
        # readline() returns "" for an empty file, where readlines()[0]
        # raised IndexError.
        text = f.readline()
    first_lines.append(text)
alltxt = "".join(first_lines)

# Keep only characters seen more than 1500 times; anything rarer is expected
# to be mapped to "<UNK>" when the text is tokenized.
unique_counts = dict(Counter(alltxt))
characters = sorted(key for key, value in unique_counts.items() if value > 1500)

# Special tokens go last so they take the two highest indices.
characters.append("<UNK>")
characters.append("<PAD>")

char2idx = {c: i for i, c in enumerate(characters)}
idx2char = {i: c for i, c in enumerate(characters)}

print(char2idx)

In [None]:
class IMDBDataset(Dataset):
    """Character-tokenized review dataset read from ``pos/`` and ``neg/`` folders.

    Args:
        path_to_data: Directory containing the ``pos`` and ``neg`` sub-folders.
        char2idx: Mapping from character to integer id. It must contain
            ``"<UNK>"`` if any text holds a character missing from the mapping.

    Indexing returns ``(sample, label)``: ``sample`` is a 1-D ``torch.long``
    tensor of character ids for the first line of the file, and ``label`` is
    ``1`` for a file taken from ``pos/`` and ``0`` for one taken from ``neg/``.
    """

    def __init__(self, path_to_data, char2idx):
        path_to_pos_fld = os.path.join(path_to_data, "pos")
        path_to_neg_fld = os.path.join(path_to_data, "neg")

        # Sorted so the sample order does not depend on filesystem listing order.
        path_to_pos_txt = [
            os.path.join(path_to_pos_fld, file) for file in sorted(os.listdir(path_to_pos_fld))
        ]
        path_to_neg_txt = [
            os.path.join(path_to_neg_fld, file) for file in sorted(os.listdir(path_to_neg_fld))
        ]

        self.training_files = path_to_pos_txt + path_to_neg_txt

        # Record each label from the folder the file was listed in. Testing for
        # "neg" anywhere in the full path mislabels every sample when the root
        # directory itself contains that substring.
        self.labels = [1] * len(path_to_pos_txt) + [0] * len(path_to_neg_txt)

        self.tokenizer = char2idx

    def __len__(self):
        """Return the number of review files found in both folders."""
        return len(self.training_files)

    def __getitem__(self, idx):
        """Read, tokenize and return the ``(sample, label)`` pair at ``idx``."""
        path_to_txt = self.training_files[idx]

        # Explicit encoding avoids platform-dependent decoding; readline()
        # yields "" for an empty file, where readlines()[0] raised IndexError.
        with open(path_to_txt, "r", encoding="utf-8") as f:
            txt = f.readline()

        vocab = self.tokenizer
        # "<UNK>" is looked up only when an unknown character actually occurs.
        tokenized = [vocab[char] if char in vocab else vocab["<UNK>"] for char in txt]

        # Explicit dtype: an empty list would otherwise become a float tensor.
        sample = torch.tensor(tokenized, dtype=torch.long)

        return sample, self.labels[idx]
        
        
    
# Training split of the review data, tokenized with the vocabulary built above.
imdbdataset = IMDBDataset(path_to_data="/aclImdb/train", char2idx=char2idx)

In [None]:
### Pad sequence

# Three 1-D tensors of ones with lengths 10, 8 and 2.
a, b, c = (torch.ones(length) for length in (10, 8, 2))

# Stack them batch-first into one tensor, filling the shorter rows with 999.
padded = nn.utils.rnn.pad_sequence(
    [a, b, c],
    batch_first=True,
    padding_value=999,
)

print(padded, padded.shape, sep="\n")

In [None]:
def data_collator(batch, *, pad_value=None):
    """Collate ``(text, label)`` samples into one padded batch.

    Args:
        batch: Iterable of ``(text, label)`` pairs, where each ``text`` is a
            1-D tensor (lengths may differ) and each ``label`` is a number.
        pad_value: Value used to fill the shorter sequences. When omitted, the
            module-level ``char2idx["<PAD>"]`` is used, which is the original
            behaviour; pass a value (for example through
            ``functools.partial``) to use the collator with another vocabulary.

    Returns:
        ``(texts, labels)``: ``texts`` is the batch-first padded tensor of
        sequences and ``labels`` is a 1-D tensor with one entry per sample.
    """
    if pad_value is None:
        # Resolved at call time so a vocabulary built later is still picked up.
        pad_value = char2idx["<PAD>"]

    texts = [text for text, _ in batch]
    labels = torch.tensor([label for _, label in batch])

    texts = nn.utils.rnn.pad_sequence(texts, batch_first=True, padding_value=pad_value)
    return texts, labels

In [None]:
# With the custom collator: every batch is padded to the length of its
# longest sequence by data_collator.
imdbloader = DataLoader(
    imdbdataset,
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
)

# Print the shape of the first padded batch only.
for text, label in imdbloader:
    print(text.shape)
    break