## 02-01 DataLoader and Dataset

In [24]:
import os
import random
import shutil
from pathlib import Path

import torch
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import Dataset, DataLoader

random.seed(1)
torch.manual_seed(0)
torch.__version__

'1.3.0'

### 1.将人民币原始数据划分为训练集、验证集、测试集

设置需要处理的各个文件路径

In [6]:
rmb_data_path = Path('/media/bnu/file/datasets/pytorch-tutorials/rmb_data')
raw_path = rmb_data_path / 'raw_data'
split_path = rmb_data_path / 'split_data'
train_path = split_path / 'train'
valid_path = split_path / 'valid'
test_path = split_path / 'test'

print('raw_path:', raw_path)
print('train_path:', train_path)
print('valid_path:', valid_path)
print('test_path:', test_path)


raw_path: /media/bnu/file/datasets/pytorch-tutorials/rmb_data/raw_data
train_path: /media/bnu/file/datasets/pytorch-tutorials/rmb_data/split_data/train
valid_path: /media/bnu/file/datasets/pytorch-tutorials/rmb_data/split_data/valid
test_path: /media/bnu/file/datasets/pytorch-tutorials/rmb_data/split_data/test


设定训练集、验证集、测试集比例并进行划分

In [12]:
train_pct, valid_pct, test_pct = 0.8, 0.1, 0.1

for root, dirs, files in os.walk(raw_path):
    for sub_dir in dirs:
        # 获取目录中所有图片名称，并进行排序
        image_file_list = os.listdir(raw_path / sub_dir)
        image_file_list = list(filter(lambda x: x.endswith('.jpg'), image_file_list))
        random.shuffle(image_file_list)
        
        # 计算划分区间
        image_count = len(image_file_list)
        train_point = int(image_count * train_pct)
        valid_point = int(image_count * (train_pct + valid_pct))

        # 根据数据集划分将图片保存到对应目录
        for i in range(image_count):
            if i < train_point:
                out_dir = train_path / sub_dir
            elif i < valid_point:
                out_dir = valid_path / sub_dir
            else:
                out_dir = test_path / sub_dir
            
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)

            target_path = out_dir / image_file_list[i]
            source_path = raw_path / sub_dir / image_file_list[i]
            shutil.copy(source_path, target_path)
        
        print(f'Class: {sub_dir}, Train: {train_point}, Valid: {valid_point - train_point}, Test: {image_count - valid_point}')


Class: 100, Train: 80, Valid: 10, Test: 10
Class: 1, Train: 80, Valid: 10, Test: 10


### 2.定义数据集的Dataset

In [23]:
class RMBDataset(Dataset):
    
    def __init__(self, image_path, transform=None):
        self.label_dict = {'1': 0, '100': 1}
        self.transform = transform

        self.image_data = []
        for root, dirs, files in os.walk(image_path):
            for sub_dir in dirs:
                # 获取目录下所有图片列表
                image_file_list = os.listdir(image_path / sub_dir)
                image_file_list = list(filter(lambda x: x.endswith('.jpg'), image_file_list))

                # 保存每个图片的路径和标签
                for i in range(len(image_file_list)):
                    image_name = image_file_list[i]
                    file_path = image_path / sub_dir / image_name
                    label = self.label_dict[sub_dir]
                    self.image_data.append((file_path, label))

    def __getitem__(self, index):
        file_path, label = self.image_data[index]
        image = Image.open(file_path).convert('RGB')  # 数据范围0-255
        
        # 对图片进行transform
        if self.transform is not None:
            image = self.transform(image)
        
        return image, label

    def __len__(self):
        return len(self.image_data)


temp_transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor()
])
temp_dataset = RMBDataset(train_path, transform=temp_transform)
temp_dataset[0]

(tensor([[[0.7647, 0.7373, 0.7216,  ..., 0.7608, 0.7765, 0.8157],
          [0.7451, 0.7176, 0.7137,  ..., 0.7255, 0.7490, 0.8000],
          [0.7255, 0.7725, 0.8275,  ..., 0.7333, 0.7529, 0.7922],
          ...,
          [0.8392, 0.8745, 0.9137,  ..., 0.8784, 0.8902, 0.9059],
          [0.8824, 0.8824, 0.8824,  ..., 0.9137, 0.9216, 0.9333],
          [0.8980, 0.8941, 0.8941,  ..., 0.9333, 0.9373, 0.9412]],
 
         [[0.7804, 0.7529, 0.7373,  ..., 0.7843, 0.8000, 0.8314],
          [0.7490, 0.7255, 0.7294,  ..., 0.7451, 0.7647, 0.8118],
          [0.7255, 0.7686, 0.8235,  ..., 0.7490, 0.7647, 0.7961],
          ...,
          [0.8471, 0.8706, 0.9137,  ..., 0.8980, 0.9020, 0.9176],
          [0.8941, 0.8941, 0.8941,  ..., 0.9294, 0.9373, 0.9451],
          [0.9059, 0.9059, 0.9059,  ..., 0.9451, 0.9490, 0.9529]],
 
         [[0.7961, 0.7686, 0.7529,  ..., 0.8078, 0.8196, 0.8510],
          [0.7765, 0.7412, 0.7255,  ..., 0.8000, 0.8157, 0.8510],
          [0.7922, 0.8078, 0.8078,  ...,

### 3.定义数据集的DataLoader

In [28]:
# 标准化三通道的均值和标准差
norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]

# 定义训练集和验证集的transforms
train_transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std)
])
valid_transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std)
])

# 构建Dataset实例
train_dataset = RMBDataset(image_path=train_path, transform=train_transform)
valid_dataset = RMBDataset(image_path=valid_path, transform=valid_transform)

# 构建DataLoader
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(dataset =valid_dataset, batch_size=16)

for i, (inputs, labels) in enumerate(train_loader):
    print('Batch Shape: ', inputs.shape, 'Labels Shape:', labels.shape)
    break

Batch Shape:  torch.Size([16, 3, 32, 32]) Labels Shape: torch.Size([16])
