In [1]:
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# create some reproducible data in numpy

nObservations = 100
nFeatures = 20

# Draw from a seeded local generator so that Restart & Run All
# produces the same (nObservations x nFeatures) matrix every time.
rng = np.random.default_rng(seed = 0)
data = rng.standard_normal((nObservations, nFeatures))

In [4]:
# Build a PyTorch tensor from the numpy array
dataT = torch.tensor(data)

# Report container type, shape and dtype of both versions.
# sep="\n" puts every item on its own line; the trailing " " is the spacer line.
print("Numpy data:", type(data), data.shape, data.dtype, " ", sep="\n")
print("Tensor data:", type(dataT), dataT.size(), dataT.dtype, " ", sep="\n")

Numpy data:
<class 'numpy.ndarray'>
(100, 20)
float64
 
Tensor data:
<class 'torch.Tensor'>
torch.Size([100, 20])
torch.float64
 


In [6]:
# Casting to a different dtype is sometimes necessary
dataT2 = torch.tensor(data).to(torch.float32)
print(dataT2.dtype)

# int64 ("long") is the integer type
dataT3 = torch.tensor(data).to(torch.int64)
print(dataT3.dtype)

torch.float32
torch.int64


In [9]:
# Wrap the tensor in a PyTorch TensorDataset

# dataset = TensorDataset(data) # would not work: `data` is a numpy array, not a tensor!
dataset = TensorDataset(dataT)

# `.tensors` is a tuple holding the tensors passed to the constructor;
# only the data was passed, so it has a single element here
dataset.tensors # no labels yet

(tensor([[ 0.2200,  1.0642,  0.2388,  ...,  1.0292,  0.2233,  1.0167],
         [-0.7413,  1.7400, -1.4753,  ..., -0.7594, -0.1031, -1.1339],
         [-0.4953,  1.9421, -1.7871,  ...,  0.5850, -1.2120, -0.3330],
         ...,
         [-0.4359, -0.2080, -0.7729,  ...,  0.8956, -0.9071, -0.0758],
         [ 0.0303, -0.3487, -1.1710,  ...,  1.0197, -1.4157,  1.0277],
         [-0.7216,  1.8037,  0.4829,  ...,  2.0950,  0.3064,  0.6370]],
        dtype=torch.float64),)

In [10]:
# second attempt: this time the dataset gets labels as well
# ceil of evenly spaced values in [.01, 4], reshaped straight into a column vector
labels = torch.linspace(.01, 4, nObservations).ceil().reshape(-1, 1)
print(labels)

# rebuild the dataset from the (data, labels) pair
dataset = TensorDataset(dataT, labels)
print(*(t.size() for t in dataset.tensors), sep="\n")

# for comparison: shape of a plain 1-D numpy vector of the same length
print(np.random.randint(5, size = nObservations).shape)

tensor([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
      

In [12]:
# build a DataLoader that serves the dataset in mini-batches
batchsize = 25
dataloader = DataLoader(dataset = dataset, batch_size = batchsize)
# (other options worth knowing: shuffle = True, drop_last = True)

# the loader still exposes the full underlying dataset
dataloader.dataset.tensors[0].shape

torch.Size([100, 20])

In [13]:
# report the shapes of the data and labels in every batch
for dat, labs in dataloader:
    # one print call; sep="\n" keeps each item on its own line
    print("Batch Info:", dat.shape, labs.shape, " ", sep="\n")

Batch Info:
torch.Size([25, 20])
torch.Size([25, 1])
 
Batch Info:
torch.Size([25, 20])
torch.Size([25, 1])
 
Batch Info:
torch.Size([25, 20])
torch.Size([25, 1])
 
Batch Info:
torch.Size([25, 20])
torch.Size([25, 1])
 


In [14]:
# look at the labels of each batch (transposed to a row so they print compactly)
for dat, labs in dataloader:
    print(labs.t(), " ", sep="\n")

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1.]])
 
tensor([[2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
         2., 2., 2., 2., 2., 2., 2.]])
 
tensor([[3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
         3., 3., 3., 3., 3., 3., 3.]])
 
tensor([[4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
         4., 4., 4., 4., 4., 4., 4.]])
 


In [15]:
# same batching as before, but with shuffling switched on
dataloader = DataLoader(dataset = dataset, batch_size = batchsize, shuffle = True)

# the labels inside each batch are now mixed
for dat, labs in dataloader:
    print(labs.t(), " ", sep="\n")

tensor([[1., 3., 4., 4., 2., 2., 3., 2., 4., 3., 1., 2., 1., 3., 3., 3., 2., 4.,
         4., 2., 3., 4., 3., 1., 4.]])
 
tensor([[1., 2., 4., 3., 3., 1., 1., 3., 2., 3., 3., 2., 4., 4., 4., 1., 3., 1.,
         1., 4., 4., 2., 2., 2., 1.]])
 
tensor([[4., 4., 3., 4., 1., 3., 3., 3., 1., 2., 2., 4., 1., 2., 4., 2., 4., 2.,
         4., 2., 2., 1., 4., 1., 3.]])
 
tensor([[1., 2., 4., 2., 3., 4., 3., 1., 1., 4., 2., 1., 1., 1., 3., 4., 3., 1.,
         2., 2., 1., 3., 3., 2., 1.]])
 


In [16]:
# Pull a single batch without writing a loop (e.g. for quick testing):
# iter() builds a new iterator over the loader and next() takes its first batch.
dat, labs = next(iter(dataloader))
# bare last expression so the notebook displays the labels of that batch
labs

tensor([[3.],
        [4.],
        [2.],
        [4.],
        [2.],
        [1.],
        [4.],
        [4.],
        [3.],
        [1.],
        [1.],
        [1.],
        [2.],
        [4.],
        [2.],
        [3.],
        [3.],
        [2.],
        [4.],
        [2.],
        [3.],
        [3.],
        [3.],
        [1.],
        [3.]])