# <center>一 、由csv文件和img文件夹组合而成的数据集读取方法
# <center>(以Kaggle的classify leaves为例)

## 1、获取图像的所有标签

In [44]:
import pandas as pd 
from tqdm import tqdm

In [45]:
train_path = r'dataset/classify-leaves/train.csv'
label_df = pd.read_csv(train_path)
label_df.head() 

Unnamed: 0,image,label
0,images/0.jpg,maclura_pomifera
1,images/1.jpg,maclura_pomifera
2,images/2.jpg,maclura_pomifera
3,images/3.jpg,maclura_pomifera
4,images/4.jpg,maclura_pomifera


In [46]:
leave_label = sorted(set(list(label_df['label'])))
n_class = len(leave_label)
n_class

176

## 2、构建class 2 num 和 num 2 class 的映射

In [47]:
class_to_num = dict(zip(leave_label,range(n_class)))
class_to_num

{'abies_concolor': 0,
 'abies_nordmanniana': 1,
 'acer_campestre': 2,
 'acer_ginnala': 3,
 'acer_griseum': 4,
 'acer_negundo': 5,
 'acer_palmatum': 6,
 'acer_pensylvanicum': 7,
 'acer_platanoides': 8,
 'acer_pseudoplatanus': 9,
 'acer_rubrum': 10,
 'acer_saccharinum': 11,
 'acer_saccharum': 12,
 'aesculus_flava': 13,
 'aesculus_glabra': 14,
 'aesculus_hippocastamon': 15,
 'aesculus_pavi': 16,
 'ailanthus_altissima': 17,
 'albizia_julibrissin': 18,
 'amelanchier_arborea': 19,
 'amelanchier_canadensis': 20,
 'amelanchier_laevis': 21,
 'asimina_triloba': 22,
 'betula_alleghaniensis': 23,
 'betula_jacqemontii': 24,
 'betula_lenta': 25,
 'betula_nigra': 26,
 'betula_populifolia': 27,
 'broussonettia_papyrifera': 28,
 'carpinus_betulus': 29,
 'carpinus_caroliniana': 30,
 'carya_cordiformis': 31,
 'carya_glabra': 32,
 'carya_ovata': 33,
 'carya_tomentosa': 34,
 'castanea_dentata': 35,
 'catalpa_bignonioides': 36,
 'catalpa_speciosa': 37,
 'cedrus_atlantica': 38,
 'cedrus_deodara': 39,
 'cedru

In [48]:
num_to_class = dict(zip(class_to_num.values(),class_to_num.keys()))
num_to_class

{0: 'abies_concolor',
 1: 'abies_nordmanniana',
 2: 'acer_campestre',
 3: 'acer_ginnala',
 4: 'acer_griseum',
 5: 'acer_negundo',
 6: 'acer_palmatum',
 7: 'acer_pensylvanicum',
 8: 'acer_platanoides',
 9: 'acer_pseudoplatanus',
 10: 'acer_rubrum',
 11: 'acer_saccharinum',
 12: 'acer_saccharum',
 13: 'aesculus_flava',
 14: 'aesculus_glabra',
 15: 'aesculus_hippocastamon',
 16: 'aesculus_pavi',
 17: 'ailanthus_altissima',
 18: 'albizia_julibrissin',
 19: 'amelanchier_arborea',
 20: 'amelanchier_canadensis',
 21: 'amelanchier_laevis',
 22: 'asimina_triloba',
 23: 'betula_alleghaniensis',
 24: 'betula_jacqemontii',
 25: 'betula_lenta',
 26: 'betula_nigra',
 27: 'betula_populifolia',
 28: 'broussonettia_papyrifera',
 29: 'carpinus_betulus',
 30: 'carpinus_caroliniana',
 31: 'carya_cordiformis',
 32: 'carya_glabra',
 33: 'carya_ovata',
 34: 'carya_tomentosa',
 35: 'castanea_dentata',
 36: 'catalpa_bignonioides',
 37: 'catalpa_speciosa',
 38: 'cedrus_atlantica',
 39: 'cedrus_deodara',
 40: 'c

# 3、定义dataset （从训练集中拆部分出来作验证集）

In [49]:
import torch
from torch.utils.data import Dataset,DataLoader
from torchvision import transforms
from torch import nn
from PIL import Image
import os
import matplotlib.pyplot as plt
import numpy as np
import torchvision.models as models

In [50]:
class LeavesData(Dataset):
    def __init__(self , csv_path , img_path , mode = 'train' , valid_ratio = 0.2 , resize_height = 256 , resize_weight = 256):
        '''
        Args:
            csv_path(string): csv文件路径 
            img_path(string): 图像文件夹所在路径
            mode(string): 训练模式，测试模式
            valid_ratio(float) : 验证集比例
        '''

        #对原图片尺寸进行统一
        self.resize_height = resize_height
        self.resize_weight = resize_weight
        
        #图像文件路径
        self.file_path = img_path
        
        #设置使用时的模式：train ， valid ， test
        self.mode = mode

        #csv文件路径，包含图像的路径和标签
        self.data_info = pd.read_csv(csv_path,header=None) # 不要读取表头
        
        
        #计算len
        self.data_len = len(self.data_info.index)-1
        
        #train_len
        self.train_len = int(self.data_len*(1-valid_ratio))

        if mode == 'train':
            #train图像的名称,label
            self.train_image = np.asarray(self.data_info.iloc[1:self.train_len,0])
            self.train_label = np.asarray(self.data_info.iloc[1:self.train_len,1])
            self.image_arr = self.train_image
            self.labe_arr = self.train_label

        elif mode == 'valid':
            #valid图像的名称,label
            self.valid_image = np.asarray(self.data_info.iloc[self.train_len:,0])
            self.valid_label = np.asarray(self.data_info.iloc[self.train_len:,1])
            self.image_arr = self.valid_image
            self.labe_arr = self.valid_label

        elif mode == 'test':
            #test是没有label的，label需要模型预测
            self.test_image = np.asarray(self.data_info.iloc[1:,0])
            self.image_arr = self.test_image

        self.real_len = len(self.image_arr)

        print('Finished reading the {} set of Leaves Dataset {} samples found'.format(mode,self.real_len))
        
    def __getitem__(self,index):
        single_image_name  = self.image_arr[index]
        
        # 之前读取的是文件名，现在读取图像文件
        img_as_img = Image.open(self.file_path + single_image_name)
        
        if self.mode == 'train':
            transform = transforms.Compose([
                transforms.Resize((224,224)),
                
                transforms.RandomHorizontalFlip(p=0.5),
                transforms.RandomVerticalFlip(p=0.5),
                transforms.ToTensor()
            ])
        
        else:
            transform = transforms.Compose([
                transforms.Resize((224,224)),
                transforms.ToTensor()
            ])

        img_as_img = transform(img_as_img)
        
        if self.mode == 'test':
            return img_as_img
        
        else:
            #返回图像label
            label = self.labe_arr[index]
            number_label = class_to_num[label]
        
            return img_as_img , number_label
    
    def __len__(self):
        return self.real_len    

In [51]:
train_csv_path = r'dataset/classify-leaves/train.csv'
test_path = r'dataset/classify-leaves/test.csv'
img_path = r'dataset/classify-leaves/'

In [52]:
train_dataset = LeavesData(train_path , img_path , mode = 'train',valid_ratio = 0.3)
val_dataset = LeavesData(train_path , img_path, mode = 'valid',valid_ratio = 0.3)
test_dataset = LeavesData(test_path , img_path , mode = 'test') 

Finished reading the train set of Leaves Dataset 12846 samples found
Finished reading the valid set of Leaves Dataset 5507 samples found
Finished reading the test set of Leaves Dataset 8800 samples found


# 4、 定义datasetloader

In [53]:
train_loader = torch.utils.data.DataLoader(
    dataset = train_dataset,
    batch_size = 16,
    shuffle = True
)

val_loader = torch.utils.data.DataLoader(
    dataset = val_dataset,
    batch_size = 16,
    shuffle = True
)

test_loader = torch.utils.data.DataLoader(
    dataset = test_dataset,
    batch_size = 16,
    shuffle = True
)

# <center>二 、文件夹分类组成格式
![image.png](attachment:image.png)

## 1、使用 torchvision.datasets.ImageFolder()

In [57]:
import torchvision
train_img_folder = r'dataset/example'
transforms_train = transforms.Compose (
    [
        transforms.Resize ([112, 112]),
        transforms.ToTensor ()
    ])

train_dataset = torchvision.datasets.ImageFolder (root = train_img_folder, transform=transforms_train)
train_loader = DataLoader (dataset=train_dataset, batch_size=3, shuffle=True)

In [63]:
train_dataset.class_to_idx

{'保持架故障': 0, '内圈故障': 1, '外圈故障': 2, '混合故障': 3}