# COURSE: A deep understanding of deep learning
## SECTION: Data matrices and loaders
### LECTURE: Anatomy of a torch dataset and dataloader
#### TEACHER: Mike X Cohen, sincxpress.com
##### COURSE URL: udemy.com/course/deeplearning_x/?couponCode=202401

In [1]:
# import libraries
import numpy as np
import torch
from torch.utils.data import DataLoader,TensorDataset

# Datasets

In [2]:
# simulate a numpy data matrix of normally distributed random numbers
# (rows are observations, columns are features)

nObservations,nFeatures = 100,20

data = np.random.standard_normal( size=(nObservations,nFeatures) )

In [3]:
# numpy array -> pytorch tensor
dataT = torch.tensor( data )

# summary of the numpy array (numpy reports its dimensions via .shape)
print('Numpy data:', type(data), data.shape, data.dtype, ' ', sep='\n')

# summary of the tensor (torch reports its dimensions via .size())
print('Tensor data:', type(dataT), dataT.size(), dataT.dtype, ' ', sep='\n')

Numpy data:
<class 'numpy.ndarray'>
(100, 20)
float64
 
Tensor data:
<class 'torch.Tensor'>
torch.Size([100, 20])
torch.float64
 


In [4]:
# The data type of a tensor sometimes has to be changed

# single-precision floating point
dataT2 = torch.tensor( data ).to(torch.float32)
print(dataT2.dtype)

# 64-bit integers (what torch calls "long")
dataT3 = torch.tensor( data ).to(torch.int64)
print(dataT3.dtype)


torch.float32
torch.int64


In [6]:
# show the integer-cast tensor; the fractional part of each value has been
# discarded (truncation toward zero, e.g. 2.4158 -> 2 and -1.0568 -> -1)
dataT3

tensor([[ 0,  0,  0,  ...,  0,  0,  1],
        [ 0,  0,  1,  ...,  0,  0,  0],
        [ 2,  0,  1,  ...,  0, -1,  0],
        ...,
        [ 0, -2,  1,  ...,  0,  0,  0],
        [ 0,  1,  0,  ...,  1, -1,  0],
        [ 0,  1,  3,  ...,  0,  0,  0]])

In [9]:
# Convert tensor into PyTorch Datasets

# dataset = TensorDataset(data) # not a tensor!
dataset = TensorDataset(dataT)

# dataset.tensors is a tuple of the tensors given to TensorDataset;
# only the data tensor was passed here (no labels yet), so the tuple
# has a single element and index 0 is the data matrix
dataset.tensors[0]

tensor([[-0.2601, -0.9713,  0.6210,  ...,  0.3944, -0.2781,  1.3086],
        [-0.3185, -0.5051,  1.8369,  ...,  0.3521, -0.5613, -0.0629],
        [ 2.4158,  0.3269,  1.0094,  ..., -0.3357, -1.0568, -0.2230],
        ...,
        [ 0.1154, -2.4342,  1.9601,  ..., -0.9821,  0.0314,  0.3541],
        [-0.2545,  1.3478, -0.9128,  ...,  1.5338, -1.5637, -0.9770],
        [-0.2386,  1.1177,  3.3067,  ..., -0.4467,  0.1703,  0.9257]],
       dtype=torch.float64)

In [10]:
# show the whole .tensors tuple; it has one element because the dataset
# was created from a single tensor
dataset.tensors

(tensor([[-0.2601, -0.9713,  0.6210,  ...,  0.3944, -0.2781,  1.3086],
         [-0.3185, -0.5051,  1.8369,  ...,  0.3521, -0.5613, -0.0629],
         [ 2.4158,  0.3269,  1.0094,  ..., -0.3357, -1.0568, -0.2230],
         ...,
         [ 0.1154, -2.4342,  1.9601,  ..., -0.9821,  0.0314,  0.3541],
         [-0.2545,  1.3478, -0.9128,  ...,  1.5338, -1.5637, -0.9770],
         [-0.2386,  1.1177,  3.3067,  ..., -0.4467,  0.1703,  0.9257]],
        dtype=torch.float64),)

In [11]:
# Second attempt: this time the dataset also gets labels
# (evenly spaced values from .01 to 4, rounded up to whole numbers)
labels = torch.ceil(torch.linspace(.01,4,nObservations))

# make the labels an explicit column vector (one row per observation)
labels = labels.reshape(-1,1)

# build the dataset from data and labels together
dataset = TensorDataset(dataT,labels)
print( dataset.tensors[0].size(), dataset.tensors[1].size(), sep='\n' )

# for comparison: a plain numpy vector has only one dimension
print( np.random.randint(5,size=nObservations).shape )

torch.Size([100, 20])
torch.Size([100, 1])
(100,)


In [20]:
# indexing the dataset returns a tuple for that observation:
# (row 5 of the data tensor, row 5 of the labels tensor)
dataset[5]

(tensor([ 0.7607, -1.0827, -1.0460, -1.4737,  0.1439,  0.0895,  0.0740,  0.8358,
          0.0662, -1.0842,  2.0084, -1.2590,  0.0740, -0.8535,  0.8497, -1.9365,
         -1.8634,  0.3323,  0.6412,  0.3717], dtype=torch.float64),
 tensor([1.]))

# DataLoaders

In [29]:
# wrap the dataset in a dataloader that serves shuffled mini-batches
batchsize = 25
dataloader = DataLoader(
    dataset,
    batch_size=batchsize,
    shuffle=True,
    drop_last=True,  # skip a final batch that has fewer than batchsize samples
)

# the complete dataset is still reachable through the dataloader
dataloader.dataset.tensors[0].size()

torch.Size([100, 20])

In [25]:
# the dataloader's .dataset attribute exposes the underlying dataset;
# tensors[0] is the full data matrix
dataloader.dataset.tensors[0]

tensor([[-0.2601, -0.9713,  0.6210,  ...,  0.3944, -0.2781,  1.3086],
        [-0.3185, -0.5051,  1.8369,  ...,  0.3521, -0.5613, -0.0629],
        [ 2.4158,  0.3269,  1.0094,  ..., -0.3357, -1.0568, -0.2230],
        ...,
        [ 0.1154, -2.4342,  1.9601,  ..., -0.9821,  0.0314,  0.3541],
        [-0.2545,  1.3478, -0.9128,  ...,  1.5338, -1.5637, -0.9770],
        [-0.2386,  1.1177,  3.3067,  ..., -0.4467,  0.1703,  0.9257]],
       dtype=torch.float64)

In [30]:
# loop over the mini-batches and report the size of the data and labels in each
for dat,labs in dataloader:
  print('BATCH INFO:', dat.size(), labs.size(), ' ', sep='\n')

BATCH INFO:
torch.Size([25, 20])
torch.Size([25, 1])
 
BATCH INFO:
torch.Size([25, 20])
torch.Size([25, 1])
 
BATCH INFO:
torch.Size([25, 20])
torch.Size([25, 1])
 
BATCH INFO:
torch.Size([25, 20])
torch.Size([25, 1])
 


In [37]:
# show the labels of every mini-batch (transposed to a row to save space)
for dat,labs in dataloader:
  print(labs.T, ' ', sep='\n')

tensor([[3., 3., 1., 3., 4., 1., 4., 2., 1., 4., 4., 2., 1., 3., 1., 4., 4., 2.,
         4., 2., 2., 2., 3., 1., 4.]])
 
tensor([[3., 1., 2., 2., 4., 3., 4., 3., 2., 4., 3., 1., 3., 2., 3., 4., 1., 4.,
         1., 3., 4., 2., 3., 1., 2.]])
 
tensor([[2., 2., 3., 4., 4., 3., 4., 2., 2., 2., 1., 1., 2., 3., 3., 1., 4., 3.,
         3., 1., 3., 1., 1., 4., 4.]])
 
tensor([[3., 4., 1., 2., 1., 4., 4., 1., 1., 3., 2., 1., 4., 3., 4., 2., 2., 3.,
         3., 1., 2., 1., 2., 1., 2.]])
 


In [40]:
# iterate once more: the dataloader created earlier already uses shuffle=True,
# and shuffling happens each time a new iteration starts, so the label order
# differs from the previous pass without re-creating the dataloader
# (note: the commented-out line below would also omit drop_last=True)
# dataloader = DataLoader(dataset,batch_size=batchsize,shuffle=True)

# print the labels of each batch as a row vector
for dat,labs in dataloader:
  print(labs.T)
  print(' ')

tensor([[4., 2., 3., 3., 4., 2., 1., 2., 2., 3., 4., 3., 4., 4., 1., 3., 1., 1.,
         4., 3., 4., 3., 4., 1., 1.]])
 
tensor([[4., 1., 1., 2., 3., 1., 4., 3., 1., 1., 2., 3., 3., 3., 2., 1., 1., 3.,
         4., 4., 4., 4., 2., 4., 2.]])
 
tensor([[3., 2., 4., 2., 1., 4., 3., 1., 2., 3., 3., 4., 3., 3., 2., 2., 3., 2.,
         1., 2., 2., 1., 3., 2., 3.]])
 
tensor([[2., 1., 4., 4., 1., 4., 1., 3., 2., 4., 1., 1., 2., 1., 4., 2., 2., 2.,
         1., 4., 1., 3., 4., 3., 2.]])
 


In [50]:
# To get only one batch (e.g., for testing)

# iter() starts a fresh pass over the dataloader and next() takes its first
# batch, which is unpacked into the data and the labels
dat,labs = next(iter(dataloader))

# display the labels of that batch (a column vector with batchsize rows)
labs

tensor([[2.],
        [3.],
        [4.],
        [3.],
        [2.],
        [1.],
        [2.],
        [1.],
        [3.],
        [3.],
        [3.],
        [1.],
        [2.],
        [4.],
        [1.],
        [4.],
        [3.],
        [1.],
        [1.],
        [1.],
        [3.],
        [4.],
        [4.],
        [4.],
        [3.]])

In [56]:
# without unpacking, a batch is displayed as one list of tensors
# (the data tensor comes first)
next(iter(dataloader))

[tensor([[-1.2176e+00, -1.0600e+00,  1.3773e+00, -2.0692e-01,  3.5014e-01,
          -5.5920e-01, -9.3903e-01,  1.7570e+00, -6.3880e-01, -1.2483e-01,
           9.8132e-01, -6.8762e-01, -5.0401e-01,  6.3474e-01,  2.0590e+00,
          -1.0898e+00, -1.1040e+00,  2.2441e-01, -4.2010e-01,  1.5521e+00],
         [-3.1848e-01, -5.0513e-01,  1.8369e+00,  8.9383e-01, -4.5666e-01,
          -1.3548e+00,  1.2022e+00, -1.2216e+00,  1.2792e+00,  5.7821e-01,
          -7.4716e-01, -5.0253e-01,  3.0975e-01, -2.0060e+00,  1.4655e+00,
          -1.0543e+00,  1.2481e+00,  3.5207e-01, -5.6132e-01, -6.2854e-02],
         [ 6.5771e-01,  2.7950e-01, -1.1058e+00,  1.8530e+00, -7.2894e-01,
          -1.1508e-01, -5.6146e-01,  1.9489e+00, -1.5967e-01,  9.4240e-02,
           4.0581e-01,  4.5450e-02,  1.0266e+00, -6.7366e-01,  1.4480e+00,
           6.1255e-01,  1.1715e-01, -1.8482e+00,  2.0969e+00, -2.2161e+00],
         [ 1.2532e-01, -8.9717e-01,  5.7965e-02, -7.5585e-01, -2.9874e-01,
          -3.5239e-01,