In [1]:
import numpy as np
import torch
from torch.utils.data import DataLoader,TensorDataset

# DataSets

In [2]:
# Create some synthetic data in numpy: one row per observation, one column per feature

nObservations = 100
nFeatures = 20

# Seeded generator so that "Restart & Run All" reproduces the same matrix every time
SEED = 42
rng = np.random.default_rng(SEED)

# standard-normal draws with shape (nObservations, nFeatures)
data = rng.standard_normal((nObservations, nFeatures))

In [3]:
# Wrap the numpy array in a PyTorch tensor
dataT = torch.tensor(data)

# Report container type, shape and dtype, first for the numpy array ...
print("Numpy data:", type(data), data.shape, data.dtype, ' ', sep="\n")

# ... then for the tensor (dataT.shape would work in place of dataT.size())
print("Tensor data:", type(dataT), dataT.size(), dataT.dtype, ' ', sep="\n")

Numpy data:
<class 'numpy.ndarray'>
(100, 20)
float64
 
Tensor data:
<class 'torch.Tensor'>
torch.Size([100, 20])
torch.float64
 


In [5]:
# Casting to other data types
# "float" in PyTorch means 32-bit floating point
dataT2 = torch.tensor(data).to(torch.float32)

# "long" is the 64-bit integer type
dataT3 = torch.tensor(data).to(torch.int64)

# show the resulting dtypes, float version first
for converted in (dataT2, dataT3):
    print(converted.dtype)

torch.float32
torch.int64


In [7]:
# Wrap the tensor in a PyTorch Dataset

# dataset = TensorDataset(data) # would fail: `data` is a numpy array, not a tensor
dataset = TensorDataset(dataT)

# .tensors is a tuple holding every tensor handed to the constructor;
# only the data were passed here, so it has a single element (no labels yet)
dataset.tensors

(tensor([[ 0.3114, -1.1392, -0.2701,  ...,  0.2587,  0.7307,  0.0051],
         [-1.3686,  0.3959, -0.2209,  ..., -0.9541,  0.3131, -1.1727],
         [-0.2768, -0.0041,  0.4392,  ..., -0.3512, -0.3465, -0.4454],
         ...,
         [-0.3442,  0.2780,  1.3481,  ..., -0.3066,  0.4597, -0.7031],
         [ 1.1471, -0.6799, -0.8186,  ..., -0.0042,  0.0171,  2.0671],
         [-0.5900, -0.6610,  1.5104,  ...,  0.8764,  0.4374,  0.2344]],
        dtype=torch.float64),)

In [9]:
# Second attempt, this time with labels: four classes (1..4), 25 observations each
label_vector = torch.ceil(torch.linspace(.01, 4, nObservations))

# turn the flat vector into an actual column matrix: (nObservations,) -> (nObservations, 1)
labels = label_vector.reshape(-1, 1)
print(labels)

# a dataset that pairs every observation with its label
dataset = TensorDataset(dataT, labels)
for member in dataset.tensors:
    print(member.size())

# for comparison: a 1-D numpy array has no row/column orientation
print(np.random.randint(5, size=nObservations).shape)

tensor([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
      

# DataLoaders

In [10]:
# Serve the dataset in mini-batches of 25 samples
batchSize = 25
data_loader = DataLoader(dataset=dataset, batch_size=batchSize)  # optional extras: shuffle=True, drop_last=True

# the full dataset stays reachable through the loader; show the size of its data tensor
data_loader.dataset.tensors[0].shape

torch.Size([100, 20])

In [11]:
# Report the size of the data and label tensors in every batch
for dat, labs in data_loader:
    print("Batch info:", dat.size(), labs.size(), ' ', sep="\n")

Batch info:
torch.Size([25, 20])
torch.Size([25, 1])
 
Batch info:
torch.Size([25, 20])
torch.Size([25, 1])
 
Batch info:
torch.Size([25, 20])
torch.Size([25, 1])
 
Batch info:
torch.Size([25, 20])
torch.Size([25, 1])
 


In [12]:
# Inspect the labels of each batch (transposed so they print as one row)
for dat, labs in data_loader:
    print(labs.T)
    print()

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1.]])

tensor([[2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
         2., 2., 2., 2., 2., 2., 2.]])

tensor([[3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
         3., 3., 3., 3., 3., 3., 3.]])

tensor([[4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
         4., 4., 4., 4., 4., 4., 4.]])



In [13]:
# Same dataset and batch size, now with shuffling switched on
dataloader = DataLoader(dataset=dataset, batch_size=batchSize, shuffle=True)

# the sample order is reshuffled every time the loader is iterated
for dat, labs in dataloader:
    print(labs.T)
    print()

tensor([[3., 3., 2., 2., 1., 4., 1., 4., 2., 3., 4., 2., 3., 1., 4., 1., 4., 3.,
         1., 3., 3., 4., 1., 4., 1.]])

tensor([[3., 3., 2., 4., 2., 2., 4., 2., 2., 3., 1., 3., 1., 3., 2., 3., 4., 1.,
         2., 1., 1., 4., 2., 1., 1.]])

tensor([[2., 3., 4., 4., 1., 4., 4., 4., 3., 1., 3., 2., 2., 4., 3., 3., 2., 4.,
         1., 2., 2., 4., 1., 2., 1.]])

tensor([[2., 3., 4., 4., 2., 1., 1., 4., 3., 1., 3., 3., 1., 3., 2., 4., 1., 3.,
         2., 4., 2., 3., 4., 1., 2.]])



In [14]:
# Pull a single batch without writing a loop

batch_iterator = iter(dataloader)
dat, labs = next(batch_iterator)

labs

tensor([[1.],
        [1.],
        [4.],
        [2.],
        [4.],
        [2.],
        [2.],
        [1.],
        [3.],
        [2.],
        [3.],
        [1.],
        [3.],
        [2.],
        [1.],
        [1.],
        [2.],
        [2.],
        [1.],
        [4.],
        [3.],
        [2.],
        [2.],
        [3.],
        [3.]])