# COURSE: A deep understanding of deep learning
## SECTION: Data matrices and loaders
### LECTURE: Anatomy of a torch dataset and dataloader
#### TEACHER: Mike X Cohen, sincxpress.com
##### COURSE URL: udemy.com/course/deeplearning_x/?couponCode=202401
##### Modified from original code

In [1]:
# import libraries
import numpy as np
import torch
from torch.utils.data import DataLoader,TensorDataset

# Datasets

In [2]:
# simulate a small dataset in numpy:
# rows are observations, columns are features
nObservations = 100
nFeatures = 20

# randn(a,b) is shorthand for standard_normal((a,b)); both draw from N(0,1)
data = np.random.standard_normal( (nObservations,nFeatures) )

In [4]:
# Convert the numpy array to a pytorch tensor (the float64 dtype carries over)
dataT = torch.tensor( data )

# summarize the numpy version: numpy reports dimensions via .shape
print('Numpy data:', type(data), data.shape, data.dtype, ' ', sep='\n')

# summarize the torch version: torch offers .size() as well as .shape
print('Tensor data:', type(dataT), dataT.size(), dataT.dtype, ' ', sep='\n')

Numpy data:
<class 'numpy.ndarray'>
(100, 20)
float64
 
Tensor data:
<class 'torch.Tensor'>
torch.Size([100, 20])
torch.float64
 


In [5]:
# Sometimes you need to convert data types

# .to(torch.float32) is the long form of .float(): single-precision floats
dataT2 = torch.tensor( data ).to(torch.float32)
print(dataT2.dtype)

# .to(torch.int64) is the long form of .long(): 64-bit integers
# (the fractional part is discarded in the conversion)
dataT3 = torch.tensor( data ).to(torch.int64)
print(dataT3.dtype)

torch.float32
torch.int64


In [9]:
# show both converted tensors: the float32 version, then the int64 version
print(dataT2, dataT3, sep='\n')

tensor([[ 1.6046, -0.4972,  0.6407,  ..., -0.5713,  1.0585, -1.2979],
        [-0.6741, -0.3466,  0.2241,  ...,  0.4458, -1.5515,  2.3055],
        [ 0.9924, -2.3596,  1.5630,  ...,  1.1902, -0.5177, -0.6143],
        ...,
        [ 1.7630,  0.8127, -0.0171,  ...,  0.3745, -1.7686,  0.2046],
        [ 0.8156, -1.4926, -0.0838,  ...,  0.6181, -0.5963,  1.3386],
        [ 0.5590,  1.5688,  0.6729,  ..., -0.2202,  0.3429,  0.4807]])
tensor([[ 1,  0,  0,  ...,  0,  1, -1],
        [ 0,  0,  0,  ...,  0, -1,  2],
        [ 0, -2,  1,  ...,  1,  0,  0],
        ...,
        [ 1,  0,  0,  ...,  0, -1,  0],
        [ 0, -1,  0,  ...,  0,  0,  1],
        [ 0,  1,  0,  ...,  0,  0,  0]])


In [13]:
# Convert tensor into PyTorch Datasets

# TensorDataset accepts only tensors, so the numpy array cannot be used:
# dataset = TensorDataset(data) # this creates ERROR: not a tensor!
dataset = TensorDataset(dataT)

# dataset.tensors is a tuple holding every tensor given to TensorDataset
# (typically data and labels); so far it only contains the data
# NOTE: a bare final expression is displayed only in a notebook/REPL
dataset.tensors

(tensor([[ 1.6046, -0.4972,  0.6407,  ..., -0.5713,  1.0585, -1.2979],
         [-0.6741, -0.3466,  0.2241,  ...,  0.4458, -1.5515,  2.3055],
         [ 0.9924, -2.3596,  1.5630,  ...,  1.1902, -0.5177, -0.6143],
         ...,
         [ 1.7630,  0.8127, -0.0171,  ...,  0.3745, -1.7686,  0.2046],
         [ 0.8156, -1.4926, -0.0838,  ...,  0.6181, -0.5963,  1.3386],
         [ 0.5590,  1.5688,  0.6729,  ..., -0.2202,  0.3429,  0.4807]],
        dtype=torch.float64),)

In [21]:
# Let's try again with labels:
# ceil of nObservations evenly spaced values in [.01,4] gives classes 1-4,
# reshaped straight into an actual matrix (column vector, N-by-1)
labels = torch.ceil(torch.linspace(.01,4,nObservations)).reshape(-1,1)
print( labels.T )

# now make another dataset that pairs the data with the labels
dataset = TensorDataset(dataT,labels)
for storedTensor in dataset.tensors:
  print( storedTensor.size() )

# for comparison:
# this is a 1D array of shape (N,), not a 2D matrix like dataset.tensors[1]
print( np.random.randint(5,size=nObservations).shape )

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
         2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 3., 3., 3., 3.,
         3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
         3., 3., 3., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
         4., 4., 4., 4., 4., 4., 4., 4., 4., 4.]])
torch.Size([100, 20])
torch.Size([100, 1])
(100,)


# DataLoaders

In [30]:
# create a dataloader object that serves the dataset in mini-batches
# (optional extras to experiment with: shuffle=True, drop_last=True)
batchsize = 25
dataloader = DataLoader(dataset=dataset,batch_size=batchsize)

# the full dataset remains accessible through dataloader.dataset.tensors
dataloader.dataset.tensors[0].shape

torch.Size([100, 20])

In [25]:
# pytorch Dataloader: an iterable that breaks dataset into mini-batches

for dat,labs in dataloader:
  # sizes of each batch, one item per line:
  # data is (batch size, # features), e.g. 25-by-20; labels are (batch size, 1)
  print('BATCH INFO:', dat.size(), labs.size(), ' ', sep='\n')

BATCH INFO:
torch.Size([25, 20])
torch.Size([25, 1])
 
BATCH INFO:
torch.Size([25, 20])
torch.Size([25, 1])
 
BATCH INFO:
torch.Size([25, 20])
torch.Size([25, 1])
 
BATCH INFO:
torch.Size([25, 20])
torch.Size([25, 1])
 


In [26]:
# inspect the labels in each batch (transposed so they print as a row)
for dat,labs in dataloader:
  print(labs.T, ' ', sep='\n')

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1.]])
 
tensor([[2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
         2., 2., 2., 2., 2., 2., 2.]])
 
tensor([[3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
         3., 3., 3., 3., 3., 3., 3.]])
 
tensor([[4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
         4., 4., 4., 4., 4., 4., 4.]])
 


In [28]:
# try again with shuffling: shuffle=True makes the dataloader draw a new
# random sample order every time it is iterated (e.g., on each for loop)
dataloader = DataLoader(dataset,batch_size=batchsize,shuffle=True)

for dat,labs in dataloader:
  # labels from all classes are now mixed within each batch
  print(labs.T)
  print(' ')

tensor([[1., 3., 2., 3., 1., 4., 3., 2., 4., 3., 1., 3., 4., 4., 4., 2., 1., 1.,
         2., 3., 2., 2., 3., 1., 2.]])
 
tensor([[3., 2., 4., 1., 3., 2., 3., 1., 2., 2., 4., 4., 3., 3., 2., 4., 1., 1.,
         4., 4., 2., 1., 4., 4., 2.]])
 
tensor([[3., 3., 2., 3., 1., 4., 1., 3., 3., 3., 2., 2., 2., 2., 2., 1., 1., 1.,
         4., 1., 1., 4., 1., 4., 4.]])
 
tensor([[2., 1., 3., 2., 3., 1., 4., 3., 1., 4., 1., 4., 3., 3., 4., 3., 1., 4.,
         2., 4., 4., 2., 2., 3., 1.]])
 


In [29]:
# To get only one batch (e.g., for testing):
# make an iterator from the dataloader and pull its first batch
batchIterator = iter(dataloader)
dat,labs = next(batchIterator)

labs

tensor([[1.],
        [3.],
        [4.],
        [1.],
        [2.],
        [4.],
        [1.],
        [4.],
        [1.],
        [3.],
        [3.],
        [4.],
        [3.],
        [3.],
        [1.],
        [2.],
        [1.],
        [2.],
        [2.],
        [2.],
        [3.],
        [1.],
        [1.],
        [1.],
        [1.]])