# 数据

## 1. Pytorch中的张量
pytorch中数据都是以张量(Tensor)的形式存在的，可以在GPU中被加速运算。

### 1.1 张量的构建

In [1]:
import torch
x = torch.empty(5, 3, 1)  # uninitialized tensor: the printed values are arbitrary
print(x)

tensor([[[1.1065e+36],
         [4.5835e-41],
         [1.1065e+36]],

        [[4.5835e-41],
         [       nan],
         [0.0000e+00]],

        [[7.6194e+31],
         [1.5564e+28],
         [1.8484e+31]],

        [[1.8370e+25],
         [1.4603e-19],
         [2.7517e+12]],

        [[7.5338e+28],
         [3.0313e+32],
         [6.3828e+28]]])


In [2]:
x = torch.rand(5, 3, 1)  # randomly initialized tensor (uniform samples in [0, 1))
print(x)

tensor([[[0.9492],
         [0.3743],
         [0.0843]],

        [[0.1654],
         [0.4311],
         [0.4783]],

        [[0.4646],
         [0.4374],
         [0.7879]],

        [[0.1969],
         [0.0027],
         [0.3977]],

        [[0.4488],
         [0.9870],
         [0.0769]]])


In [3]:
x = torch.zeros(5, 3, 1, dtype=torch.long)  # all-zeros tensor with an explicit integer dtype
print(x)

tensor([[[0],
         [0],
         [0]],

        [[0],
         [0],
         [0]],

        [[0],
         [0],
         [0]],

        [[0],
         [0],
         [0]],

        [[0],
         [0],
         [0]]])


In [4]:
x = torch.ones(5, 3, 1, dtype=torch.float)  # all-ones tensor with an explicit floating-point dtype
print(x)

tensor([[[1.],
         [1.],
         [1.]],

        [[1.],
         [1.],
         [1.]],

        [[1.],
         [1.],
         [1.]],

        [[1.],
         [1.],
         [1.]],

        [[1.],
         [1.],
         [1.]]])


In [5]:
x = torch.tensor([5.5, 3])  # build a tensor directly from the given values
print(x)

tensor([5.5000, 3.0000])


In [6]:
# all-ones tensor of size (5, 3) created from an existing tensor, with the dtype overridden to double
x = x.new_ones(5, 3, dtype=torch.double)
print(x)

# random tensor with the same shape as x, with the dtype overridden to float
x = torch.randn_like(x, dtype=torch.float)
print(x)                                      

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]], dtype=torch.float64)
tensor([[ 0.4675,  0.2680,  0.9838],
        [-0.1511,  0.6528,  0.4041],
        [-0.0873,  0.3797,  0.0462],
        [ 0.4091,  0.7814, -2.4635],
        [ 2.7739,  1.8931,  0.2729]])


In [7]:
# size() and the shape attribute report the same torch.Size
print(x.size())
print(x.shape)

torch.Size([5, 3])
torch.Size([5, 3])


### 1.2 张量的常用操作

- 加法的三种操作：

In [8]:
# addition, form 1: the + operator
x = torch.rand(5, 3)
y = torch.rand(5, 3)
print(x + y)

tensor([[1.2984, 0.8573, 0.9113],
        [1.5727, 0.9564, 1.0015],
        [0.8358, 1.1201, 0.9199],
        [1.6553, 1.3855, 1.1116],
        [1.5401, 1.2140, 1.1371]])


In [9]:
# addition, form 2: the torch.add function
x = torch.rand(5, 3)
y = torch.rand(5, 3)
print(torch.add(x, y))

tensor([[1.1379, 1.7125, 1.6423],
        [1.1837, 0.9561, 0.9103],
        [0.8028, 0.8807, 0.7998],
        [1.0932, 1.5097, 0.7958],
        [0.7308, 0.8268, 0.7817]])


In [10]:
# addition, form 3: the in-place add_ method
x = torch.ones(5, 3)
y = torch.ones(5, 3)
y.add_(x)  # in place: the sum is stored in y
print(y)

tensor([[2., 2., 2.],
        [2., 2., 2.],
        [2., 2., 2.],
        [2., 2., 2.],
        [2., 2., 2.]])


- 逐元素相乘（对应位置的元素相乘）用mul:

In [11]:
# mul multiplies element by element, so the two operands have the same shape here
x = torch.randn(2, 3)
y = torch.randn(2, 3)
z = x.mul(y)
print(x)
print(y)
print(z)

tensor([[ 0.7398, -0.2220,  0.0118],
        [-0.0872, -0.7426,  0.8106]])
tensor([[-0.1157, -1.2632,  1.6489],
        [ 0.8958, -1.1349,  2.1087]])
tensor([[-0.0856,  0.2804,  0.0195],
        [-0.0781,  0.8427,  1.7093]])


- 矩阵相乘用mm:

In [12]:
# mm is matrix multiplication: (2, 3) times (3, 1) gives (2, 1)
x = torch.randn(2, 3)
y = torch.randn(3, 1)
z = x.mm(y)
print(x)
print(y)
print(z)

tensor([[ 3.1653,  0.3104, -1.3886],
        [-0.7404, -0.0398,  1.4493]])
tensor([[ 0.0834],
        [-0.3810],
        [-0.2269]])
tensor([[ 0.4607],
        [-0.3754]])


- 切片：

In [13]:
# slicing: keep the first 1, 2 and 3 entries along dims 0, 1 and 2 respectively
x = torch.randn(4, 4, 4)
print(x)
print(x[:1, :2, :3])

tensor([[[-0.6036, -0.2512, -1.0602, -1.2882],
         [-0.4644, -0.0300, -1.4665, -2.5153],
         [ 0.1905,  0.6340,  2.1740, -0.4201],
         [ 0.7100, -0.6776,  0.6291,  0.5813]],

        [[-0.7039,  0.9065,  2.4795, -0.6853],
         [-0.7674, -0.7433, -0.5101,  1.5018],
         [ 0.4127,  1.7331,  0.2184,  0.2135],
         [ 0.7710,  0.7265, -1.9051,  2.5707]],

        [[-0.2509,  1.9189, -0.7235, -1.0685],
         [-0.3304,  0.1506,  1.0501, -0.9906],
         [-0.6847,  1.1520, -0.7548,  1.0213],
         [-0.3221,  0.1954, -0.2828, -0.7224]],

        [[ 2.1759,  0.6578, -0.9254, -1.1232],
         [-0.4219, -1.0864,  1.1685,  0.6516],
         [-1.5549,  1.0785, -0.9203, -0.3653],
         [ 0.9027,  0.3860,  0.8551,  0.1665]]])
tensor([[[-0.6036, -0.2512, -1.0602],
         [-0.4644, -0.0300, -1.4665]]])


- 改变尺寸：

In [14]:
# change the shape with view; the number of elements (16) stays the same
x = torch.randn(4, 4)
y = x.view(16)
z = x.view(-1, 8)  # the size -1 is inferred from other dimensions
print(x.size(), y.size(), z.size())

torch.Size([4, 4]) torch.Size([16]) torch.Size([2, 8])


- 拼接用cat：

In [15]:
# concatenate along the existing dim 0: (2, 3) and (1, 3) give (3, 3)
x = torch.randn(2,3)
y = torch.randn(1,3)
print(x)
print(y)
z = torch.cat((x,y),0)
print(z)

tensor([[ 0.3607,  0.0036, -0.4574],
        [ 0.7501, -0.7826, -1.5577]])
tensor([[-0.5751, -0.6576, -2.4835]])
tensor([[ 0.3607,  0.0036, -0.4574],
        [ 0.7501, -0.7826, -1.5577],
        [-0.5751, -0.6576, -2.4835]])


- 堆叠用stack：

In [16]:
# stack inserts a new dimension (dim 0 here) and stacks the tensors along it
a=torch.rand((1,2))
b=torch.rand((1,2))
c=torch.stack((a,b),0)
print(a)
print(b)
print(c)
print(c.shape)

tensor([[0.6878, 0.9567]])
tensor([[0.5822, 0.9759]])
tensor([[[0.6878, 0.9567]],

        [[0.5822, 0.9759]]])
torch.Size([2, 1, 2])


In [17]:
# stacking on dim 1: a new dimension is inserted at position 1 and the tensors are stacked along it
a=torch.rand((1,2))
b=torch.rand((1,2))
c=torch.stack((a,b),1)
print(a)
print(b)
print(c)
print(c.shape)

tensor([[0.2314, 0.7093]])
tensor([[0.6609, 0.7514]])
tensor([[[0.2314, 0.7093],
         [0.6609, 0.7514]]])
torch.Size([1, 2, 2])


- 交换维度用transpose：

In [18]:
# swap two dimensions: (2, 3) becomes (3, 2)
x = torch.randn(2,3)
print(x)
print(x.transpose(0, 1))

tensor([[ 1.1919, -1.0047,  0.3610],
        [-0.3882, -0.1698,  0.1947]])
tensor([[ 1.1919, -0.3882],
        [-1.0047, -0.1698],
        [ 0.3610,  0.1947]])


- 多维度交换用permute：

In [19]:
# reorder several dimensions at once
x = torch.randn(2,3,4)
print(x.size())
x_p = x.permute(1,0,2)  # old dim 1 becomes dim 0, old dim 0 becomes dim 1, dim 2 stays in place
print(x_p.size())

torch.Size([2, 3, 4])
torch.Size([3, 2, 4])


- 压缩维度用squeeze：

In [20]:
# squeeze: drop dimensions of size 1
x = torch.Tensor(2, 1)  # tensor of shape (2, 1); its values are not set explicitly
print(x.shape)
# called without an argument, squeeze removes every dimension whose size is 1
y = x.squeeze()
print(y.shape)

torch.Size([2, 1])
torch.Size([2])


- 增加维度用unsqueeze：

In [21]:
# unsqueeze: add a dimension of size 1
x = torch.Tensor(2)  # tensor of shape (2,); its values are not set explicitly
print(x.shape)
# the argument is the position of the new dimension: 0 turns (2,) into (1, 2)
y = x.unsqueeze(0)
print(y.shape)
# position 1 turns (2,) into (2, 1)
z = x.unsqueeze(1)
print(z.shape)

torch.Size([2])
torch.Size([1, 2])
torch.Size([2, 1])


### 1.3  GPU计算

In [22]:
# the cuda() method: move the tensor to the GPU only when one is available
if torch.cuda.is_available():
    x = x.cuda()
print(x)

tensor([1.1065e+36, 4.5835e-41], device='cuda:0')


In [23]:
# the to() method: choose the device once, falling back to the CPU when no GPU is available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
x = x.to(device)
print(x)

tensor([1.1065e+36, 4.5835e-41], device='cuda:0')


### 1.4 Numpy Array 与 Torch Tensor相互转换

- Numpy Array 转 Torch Tensor：

In [24]:
import numpy as np
a = np.ones(5)
b = torch.from_numpy(a)  # note: the tensor follows later changes to the numpy array
np.add(a, 1, out=a)  # modify the array in place; b shows the new values as well
print(a)
print(b)

[2. 2. 2. 2. 2.]
tensor([2., 2., 2., 2., 2.], dtype=torch.float64)


- Torch Tensor 转 Numpy Array：

In [25]:
a = torch.ones(5)
b = a.numpy() 
a.add_(1)  # note: the numpy array follows in-place changes to the tensor
print(a)
print(b)

tensor([2., 2., 2., 2., 2.])
[2. 2. 2. 2. 2.]


In [26]:
# a tensor that lives on the GPU must be moved back to the CPU before it can become a numpy array
a = torch.ones(5).cuda()  # requires a CUDA device
b = a.cpu().numpy() 
a.add_(1)  # note: unlike the CPU case, b does NOT follow a here; b was made from the CPU copy and keeps its original values
print(a)
print(b)

tensor([2., 2., 2., 2., 2.], device='cuda:0')
[1. 1. 1. 1. 1.]


## 2. Pytorch数据集加载范例

本小节以oxflowers17数据集为例讲解pytorch如何加载数据集。oxflowers17数据集的文件夹如下：

![flowers17.png](attachment:flowers17.png)

### 2.1  构建小数据集
如果数据集较小，直接将实例和标签读入内存，再实例化为TensorDataset的对象。

 - 引用包及数据集路径：

In [27]:
import os
from PIL import Image  
import numpy as np  
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torchvision import transforms, utils

dataset_path = './flower_dataset/'

- 将数据集中所有实例和标签读入内存中：

In [28]:
# Read every image and its label into memory.
instances = []
labels = []
# oxflowers17 has 17 classes stored in the folders 0..16, so all 17 are visited
# (range(16) would silently drop the last class)
for k in range(17):
    sub_folder = dataset_path+'{}/'.format(k)
    for f_name in os.listdir(sub_folder):
        f_path = os.path.join(sub_folder, f_name)
        if not os.path.isfile(f_path):
            continue
        # resize returns a new image, so the opened file is closed by the context
        # manager as soon as the resized copy exists
        with Image.open(f_path) as img:
            resized = img.resize((224, 224))
        instances.append(np.array(resized)/255.0)  # scale pixel values to [0, 1]
        labels.append(k)
# stack into a single ndarray first: building a tensor from a list of ndarrays is very slow
t_instances = torch.tensor(np.array(instances))
t_labels = torch.tensor(labels)

- 将数据实例化为TensorDataset的对象:

In [29]:
# pair the instance tensor with the label tensor in a single dataset object
flower_dataset = TensorDataset(t_instances, t_labels)

### 2.2  构建大数据集
如果数据集的数据量比较大，无法一次性加载到内存，需要继承Dataset类，来自定义数据集的类。

#### 2.2.1  自定义数据集类（继承Dataset类）

自定义数据集的类，一般需要重写三个函数：构造函数\_\_init\_\_()；获取单个样本\_\_getitem\_\_()；获取长度信息\_\_len\_\_()。大致框架如下：

In [30]:
from torch.utils.data import Dataset, DataLoader 
class MyDataset(Dataset):
    """Skeleton of a custom dataset.

    A custom dataset usually overrides three methods: the constructor
    ``__init__``, ``__getitem__`` (fetch one sample) and ``__len__``
    (number of samples). This class only shows the outline.
    """

    def __init__(self, filepath, transform=None, keys=None, target_transform=None):
        """Set up the dataset (intentionally empty in this skeleton).

        Args:
            filepath: Path to the dataset.
            transform: Transformation applied to the source data (features).
            keys: Keys used to look up the data. In the author's data the
                whole set is a dict whose values are ndarrays, so the values
                are indexed by key.
            target_transform: Transformation applied to the target data
                (labels).
        """

    def __getitem__(self, index):
        """Return the sample at position ``index``; must be overridden."""
        # fail loudly instead of silently handing back None
        raise NotImplementedError

    def __len__(self):
        """Return the number of samples; must be overridden."""
        # returning None here would make len() fail with an unrelated TypeError
        raise NotImplementedError

获取单个样本的流程如下图所示：

![dataloader.jpg](attachment:dataloader.jpg)

In [None]:
import random
from PIL import Image  
import numpy as np  
from torch.utils.data import Dataset,DataLoader 


class OXFlowerDataset(Dataset):
    """OX Flower 17 dataset that serves randomly drawn samples.

    The images are expected in sub-folders of ``root_dir`` named ``0`` to
    ``16`` (one per class), next to a ``files.txt`` index whose line count
    is used as the dataset length.
    """

    def __init__(self, root_dir, transform=None):
        """
        Args:
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.root_dir = root_dir
        self.transform = transform
        # count the index lines and close the file deterministically
        with open(os.path.join(root_dir, 'files.txt'), 'r') as index_file:
            self.len = sum(1 for _ in index_file)

    def __len__(self):
        """Return the number of lines found in ``files.txt``."""
        return self.len

    def __getitem__(self, idx):
        """Return a random ``(image_tensor, label)`` pair.

        ``idx`` is not used: a label is drawn uniformly from 0..16 and an
        image is then drawn uniformly from that label's folder.
        """
        label = random.randint(0, 16)
        imgs_folder = os.path.join(self.root_dir, '{}'.format(label))
        img_name = random.choice(os.listdir(imgs_folder))
        img_path = os.path.join(imgs_folder, img_name)
        # resize returns a new image, so the opened file can be closed right away
        with Image.open(img_path) as source_img:
            img = source_img.resize((128, 128))

        if self.transform:
            t_img = self.transform(img)
        else:
            # scale to [0, 1] with 255 (the maximum 8-bit value), the same
            # scaling torchvision's ToTensor applies
            np_img = np.array(img) / 255.0
            # move the channel axis first: (H, W, C) -> (C, H, W)
            np_img = np_img.transpose((2, 0, 1))
            t_img = torch.tensor(np_img, dtype=torch.float)
        return (t_img, label)


#### 2.2.2 调用datasets.ImageFolder

将数据按标签分开，一个标签一个文件夹，每个文件夹里放对应标签的数据。可以直接用ImageFolder构建数据集：

In [1]:
from torchvision import datasets

# root folder that holds one sub-folder per label
data_dir = './flower_dataset'
# build the dataset directly from that folder layout
ox_flower_dataset=datasets.ImageFolder(data_dir)

### 2.3  预处理

torchvision.transforms是pytorch中的图像预处理包，包含许多预处理与数据增广的操作，比如裁剪、翻转、调整对比度与饱和度等。一般用Compose把多个步骤整合到一起：

In [None]:
# Compose chains the preprocessing steps in the order listed
transform=transforms.Compose([
    transforms.RandomHorizontalFlip(),  # data augmentation: random left-right flip
    transforms.ToTensor(),  # convert the image to a tensor, scaling 8-bit pixel values to [0, 1] (i.e. dividing by 255)
    transforms.Normalize(mean=[.5,.5,.5],std=[.5,.5,.5])  # per channel (x - 0.5) / 0.5, mapping [0, 1] to [-1, 1]
])
# the custom dataset applies this pipeline to every sample it returns
ox_flower_dataset = OXFlowerDataset(root_dir='./flower_dataset', 
                                    transform=transform)

### 2.4 Dataloader的使用

在训练模型时，通常采用批训练，对一个batch的数据进行操作，同时还需要对数据进行shuffle和并行加载数据等。对此，PyTorch提供了DataLoader帮助我们实现这些功能。

In [None]:
# batches of 64 samples, shuffled, loaded in parallel by 8 workers
dataloader = DataLoader(ox_flower_dataset, 
                        batch_size=64,
                        shuffle=True, 
                        num_workers=8)

# each iteration yields one batch of images together with their labels
for idx, (imgs, labels) in enumerate(dataloader):
    print(imgs, labels)