# 数据工具

## 数据加载
PyTorch 在 `torch.utils.data` 中提供了抽象的数据集类 `Dataset`。实现自定义数据集时需要继承 `Dataset`，并实现两个 Python 魔法方法：
* \_\_getitem\_\_：返回一条数据，或一个样本。obj[index]等价于obj.\_\_getitem\_\_(index)
* \_\_len\_\_：返回样本的数量。len(obj)等价于obj.\_\_len\_\_()

下面是一个猫狗分类样例：


In [34]:
# List the entries (file names) found in the dog/cat image directory
filelist = os.listdir('/content/drive/MyDrive/pytorch/utilities/data/dogcat')
filelist

['cat.12485.jpg',
 'dog.12496.jpg',
 'dog.12497.jpg',
 'cat.12484.jpg',
 'dog.12498.jpg',
 'cat.12487.jpg',
 'cat.12486.jpg',
 'dog.12499.jpg']

In [35]:
# Join each file name onto the dataset directory to form an absolute path
filelist = [
    os.path.join('/content/drive/MyDrive/pytorch/utilities/data/dogcat', name)
    for name in filelist
]
filelist

['/content/drive/MyDrive/pytorch/utilities/data/dogcat/cat.12485.jpg',
 '/content/drive/MyDrive/pytorch/utilities/data/dogcat/dog.12496.jpg',
 '/content/drive/MyDrive/pytorch/utilities/data/dogcat/dog.12497.jpg',
 '/content/drive/MyDrive/pytorch/utilities/data/dogcat/cat.12484.jpg',
 '/content/drive/MyDrive/pytorch/utilities/data/dogcat/dog.12498.jpg',
 '/content/drive/MyDrive/pytorch/utilities/data/dogcat/cat.12487.jpg',
 '/content/drive/MyDrive/pytorch/utilities/data/dogcat/cat.12486.jpg',
 '/content/drive/MyDrive/pytorch/utilities/data/dogcat/dog.12499.jpg']

In [29]:
# Break every path into its '/'-separated components;
# filelist becomes a list of lists of strings.
filelist = [path.split('/') for path in filelist]
filelist

[['',
  'content',
  'drive',
  'MyDrive',
  'pytorch',
  'utilities',
  'data',
  'dogcat',
  'cat.12485.jpg'],
 ['',
  'content',
  'drive',
  'MyDrive',
  'pytorch',
  'utilities',
  'data',
  'dogcat',
  'dog.12496.jpg'],
 ['',
  'content',
  'drive',
  'MyDrive',
  'pytorch',
  'utilities',
  'data',
  'dogcat',
  'dog.12497.jpg'],
 ['',
  'content',
  'drive',
  'MyDrive',
  'pytorch',
  'utilities',
  'data',
  'dogcat',
  'cat.12484.jpg'],
 ['',
  'content',
  'drive',
  'MyDrive',
  'pytorch',
  'utilities',
  'data',
  'dogcat',
  'dog.12498.jpg'],
 ['',
  'content',
  'drive',
  'MyDrive',
  'pytorch',
  'utilities',
  'data',
  'dogcat',
  'cat.12487.jpg'],
 ['',
  'content',
  'drive',
  'MyDrive',
  'pytorch',
  'utilities',
  'data',
  'dogcat',
  'cat.12486.jpg'],
 ['',
  'content',
  'drive',
  'MyDrive',
  'pytorch',
  'utilities',
  'data',
  'dogcat',
  'dog.12499.jpg']]

In [43]:
import torch
from torch.utils import data
import os
from PIL import  Image
from torchvision.transforms import ToTensor
import numpy as np

# A custom dataset must inherit from Dataset
class Dogcatset(data.Dataset):
    """Dog-vs-cat image dataset read from a single flat directory.

    Every entry found directly under ``root`` is treated as one sample.
    The label is derived from the file name: 1 when the name contains
    ``'dog'``, otherwise 0.
    """

    def __init__(self, root):
        """Collect the path of every entry listed under ``root``."""
        self.imgs = [os.path.join(root, name) for name in os.listdir(root)]

    @staticmethod
    def _label_from_path(path):
        """Return 1 if the file name of ``path`` contains 'dog', else 0."""
        # Look at the file name only, and split it with os.path so the
        # separator matches the one os.path.join inserted. Splitting on a
        # hard-coded '/' can leave a parent directory such as 'dogcat' in
        # the "file name" and label every sample as a dog.
        return 1 if 'dog' in os.path.basename(path) else 0

    def __getitem__(self, index):
        """Return ``(image_tensor, label)`` for the sample at ``index``."""
        sample = self.imgs[index]
        label = self._label_from_path(sample)
        # Image.open keeps the underlying file open until the image is
        # closed; convert inside the with-block so the handle is released
        # as soon as the pixel data has been read.
        with Image.open(sample) as pil_img:
            # ToTensor is a class: instantiate it first, then call the instance.
            # The result is not named `data`, to avoid shadowing the imported
            # torch.utils.data module.
            tensor = ToTensor()(pil_img)
        return tensor, label

    def __len__(self):
        """Return the number of samples."""
        return len(self.imgs)
# Build the dataset from the dog/cat image directory
dataset = Dogcatset("/content/drive/MyDrive/pytorch/utilities/data/dogcat")
# Indexing invokes Dogcatset.__getitem__(1); the notebook then displays the
# returned (image tensor, label) pair
dataset[1]

(tensor([[[0.1333, 0.1333, 0.1294,  ..., 0.7490, 0.7490, 0.7490],
          [0.1333, 0.1373, 0.1373,  ..., 0.7529, 0.7529, 0.7529],
          [0.1373, 0.1451, 0.1490,  ..., 0.7529, 0.7529, 0.7529],
          ...,
          [0.8000, 0.8235, 0.8196,  ..., 0.2471, 0.2510, 0.2510],
          [0.8000, 0.8353, 0.8314,  ..., 0.2549, 0.2549, 0.2549],
          [0.7961, 0.8353, 0.8392,  ..., 0.2588, 0.2588, 0.2588]],
 
         [[0.1333, 0.1333, 0.1294,  ..., 0.9882, 0.9882, 0.9882],
          [0.1333, 0.1373, 0.1373,  ..., 0.9922, 0.9922, 0.9922],
          [0.1333, 0.1412, 0.1451,  ..., 0.9961, 0.9961, 0.9961],
          ...,
          [0.8118, 0.8353, 0.8314,  ..., 0.2000, 0.2000, 0.2000],
          [0.8118, 0.8471, 0.8431,  ..., 0.2000, 0.2039, 0.2039],
          [0.8078, 0.8471, 0.8510,  ..., 0.2039, 0.2078, 0.2078]],
 
         [[0.1255, 0.1255, 0.1216,  ..., 0.9922, 0.9922, 0.9922],
          [0.1255, 0.1294, 0.1294,  ..., 0.9961, 0.9961, 0.9961],
          [0.1255, 0.1333, 0.1373,  ...,