# COURSE: A deep understanding of deep learning
## SECTION: Data matrices and loaders
### LECTURE: Anatomy of a torch dataset and dataloader
#### TEACHER: Mike X Cohen, sincxpress.com
##### COURSE URL: udemy.com/course/dudl/?couponCode=202108

In [1]:
# import libraries
import numpy as np
import torch
from torch.utils.data import DataLoader,TensorDataset

# Datasets

In [2]:
# simulate a numpy data matrix: one row per observation, one column per feature

nObservations,nFeatures = 100,20

# entries are drawn from a standard normal distribution
data = np.random.randn( nObservations,nFeatures )

In [3]:
# numpy array -> pytorch tensor
dataT = torch.tensor( data )

# summarize both containers: type, dimensions, dtype
# (numpy exposes dimensions via .shape; the tensor is queried with .size())
print('Numpy data:', type(data), data.shape, data.dtype, ' ', sep='\n')
print('Tensor data:', type(dataT), dataT.size(), dataT.dtype, ' ', sep='\n')

Numpy data:
<class 'numpy.ndarray'>
(100, 20)
float64
 
Tensor data:
<class 'torch.Tensor'>
torch.Size([100, 20])
torch.float64
 


In [4]:
# The tensor takes over numpy's dtype, so cast when another one is needed

# 32-bit floating point
dataT2 = torch.tensor( data ).to(torch.float32)
print(dataT2.dtype)

# 64-bit integers ("long")
dataT3 = torch.tensor( data ).to(torch.int64)
print(dataT3.dtype)


torch.float32
torch.int64


In [5]:
# Convert tensor into PyTorch Datasets

# TensorDataset expects torch tensors, so the numpy array `data` cannot be
# passed in directly:
# dataset = TensorDataset(data) # not a tensor!
dataset = TensorDataset(dataT)

# .tensors is a tuple of the tensors given to TensorDataset; only the data
# matrix was passed here (no labels yet), so index 0 is the data
dataset.tensors[0]

tensor([[-0.9322,  1.5456,  0.5342,  ..., -1.4724, -1.3023, -1.2730],
        [-1.0022, -0.0612, -1.3442,  ...,  0.6396, -0.9947,  0.3481],
        [ 0.3182,  0.7951,  1.0954,  ..., -0.3776, -1.4407,  2.0893],
        ...,
        [-0.8597, -0.6311, -0.9383,  ...,  1.0672, -0.6199,  0.9440],
        [-1.2928, -0.1159,  0.2660,  ...,  0.2714,  0.1890, -0.1886],
        [ 1.1745,  1.1916,  0.3675,  ...,  0.3102, -0.3955,  1.3038]],
       dtype=torch.float64)

In [6]:
# Rebuild the dataset, this time pairing the data with labels

# nObservations evenly spaced values from .01 to 4, rounded up to whole numbers
labels = torch.linspace(.01,4,nObservations).ceil()

# turn the 1D vector into an nObservations-by-1 matrix (column vector)
labels = labels.reshape( -1,1 )
# print( labels )

# the dataset now carries two tensors: data first, labels second
dataset = TensorDataset(dataT,labels)
print( dataset.tensors[0].shape )
print( dataset.tensors[1].shape )

# for comparison: a plain 1D numpy vector has only one dimension
print( np.random.randint(5,size=nObservations).shape )

torch.Size([100, 20])
torch.Size([100, 1])
(100,)


# DataLoaders

In [12]:
# create a dataloader object that serves the dataset in batches of `batchsize` samples
batchsize = 25
# shuffle and drop_last are left commented out, so DataLoader's defaults apply
dataloader = DataLoader(dataset,batch_size=batchsize)#,shuffle=True,drop_last=True)

# the wrapped dataset stays reachable through .dataset; tensors[0] is the data matrix
dataloader.dataset.tensors[0]

tensor([[-0.9322,  1.5456,  0.5342,  ..., -1.4724, -1.3023, -1.2730],
        [-1.0022, -0.0612, -1.3442,  ...,  0.6396, -0.9947,  0.3481],
        [ 0.3182,  0.7951,  1.0954,  ..., -0.3776, -1.4407,  2.0893],
        ...,
        [-0.8597, -0.6311, -0.9383,  ...,  1.0672, -0.6199,  0.9440],
        [-1.2928, -0.1159,  0.2660,  ...,  0.2714,  0.1890, -0.1886],
        [ 1.1745,  1.1916,  0.3675,  ...,  0.3102, -0.3955,  1.3038]],
       dtype=torch.float64)

In [13]:
# display the data matrix held by the dataset that the loader wraps
dataloader.dataset.tensors[0]

tensor([[-0.9322,  1.5456,  0.5342,  ..., -1.4724, -1.3023, -1.2730],
        [-1.0022, -0.0612, -1.3442,  ...,  0.6396, -0.9947,  0.3481],
        [ 0.3182,  0.7951,  1.0954,  ..., -0.3776, -1.4407,  2.0893],
        ...,
        [-0.8597, -0.6311, -0.9383,  ...,  1.0672, -0.6199,  0.9440],
        [-1.2928, -0.1159,  0.2660,  ...,  0.2714,  0.1890, -0.1886],
        [ 1.1745,  1.1916,  0.3675,  ...,  0.3102, -0.3955,  1.3038]],
       dtype=torch.float64)

In [8]:
# report the dimensions of the data and of the labels in every batch
for dat,labs in dataloader:
  print('BATCH INFO:', dat.size(), labs.size(), ' ', sep='\n')

BATCH INFO:
torch.Size([25, 20])
torch.Size([25, 1])
 
BATCH INFO:
torch.Size([25, 20])
torch.Size([25, 1])
 
BATCH INFO:
torch.Size([25, 20])
torch.Size([25, 1])
 
BATCH INFO:
torch.Size([25, 20])
torch.Size([25, 1])
 


In [17]:
# for every batch: show its labels (transposed into a row for compact printing),
# then the complete tuple of tensors stored in the loader's dataset
for dat,labs in dataloader:
  print(labs.T, ' ', dataloader.dataset.tensors, ' ', sep='\n')

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1.]])
 
(tensor([[-0.9322,  1.5456,  0.5342,  ..., -1.4724, -1.3023, -1.2730],
        [-1.0022, -0.0612, -1.3442,  ...,  0.6396, -0.9947,  0.3481],
        [ 0.3182,  0.7951,  1.0954,  ..., -0.3776, -1.4407,  2.0893],
        ...,
        [-0.8597, -0.6311, -0.9383,  ...,  1.0672, -0.6199,  0.9440],
        [-1.2928, -0.1159,  0.2660,  ...,  0.2714,  0.1890, -0.1886],
        [ 1.1745,  1.1916,  0.3675,  ...,  0.3102, -0.3955,  1.3038]],
       dtype=torch.float64), tensor([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [2.],
        [2.],
        [2.],
        [2.],
        

In [10]:
# iterate over the batches again and show the labels of each one
# NOTE: the shuffled loader below is commented out, so this loop reuses the
# dataloader defined earlier; uncomment the line to try it with shuffling
# (shuffling happens during iterations)
# dataloader = DataLoader(dataset,batch_size=batchsize,shuffle=True)

for dat,labs in dataloader:
  print(labs.T, ' ', sep='\n')

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1.]])
 
tensor([[2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
         2., 2., 2., 2., 2., 2., 2.]])
 
tensor([[3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
         3., 3., 3., 3., 3., 3., 3.]])
 
tensor([[4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
         4., 4., 4., 4., 4., 4., 4.]])
 


In [11]:
# To get only one batch (e.g., for testing)

# iter() creates an iterator over the loader and next() pulls its first batch,
# which unpacks into the data and the labels
dat,labs = next(iter(dataloader))

# display the labels of that batch
labs

tensor([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.]])

In [40]:
# 100-by-100 matrix of random integers drawn from [10,100)
x = torch.randint(10,100,(100,100))
print(x)

# 10-by-10 matrix of random integers drawn from [0,10), flattened into a single column;
# -1 lets reshape infer the number of rows from the element count, so this
# works for any matrix shape (not only square ones)
z = torch.randint(0,10,(10,10))
z = torch.reshape(z,(-1,1))

tensor([[92, 93, 27,  ..., 86, 80, 79],
        [61, 24, 29,  ..., 90, 12, 26],
        [33, 24, 50,  ..., 16, 77, 57],
        ...,
        [76, 49, 94,  ..., 50, 92, 59],
        [62, 93, 35,  ..., 14, 76, 32],
        [56, 60, 16,  ..., 96, 33, 18]])


In [53]:
# bundle several tensors into one dataset; x (100x100), z (100x1), and dataT (100x20)
# all have 100 rows, i.e. the same number of samples
dataxes = TensorDataset(x,z,dataT,dataT,dataT)

In [60]:
# NOTE(review): the batch size (1000) is larger than the 100 samples in dataxes, and
# drop_last=True discards an incomplete final batch -- so iterating this loader
# should produce no batches at all; confirm whether that is intended.
# The loader is not assigned to a variable, so the cell only displays it.
DataLoader(dataxes, 1000,drop_last= True)

<torch.utils.data.dataloader.DataLoader at 0x78a9d275a6b0>