In [None]:
from nvidia.dali.pipeline import pipeline_def
import nvidia.dali.types as types
import nvidia.dali.fn as fn
from nvidia.dali.plugin.pytorch import DALIGenericIterator
import os
import sys
import pickle
import numpy as np

In [None]:
#### Load the CIFAR-10 dataset ####
def load_cifar10(batch_size, train=True, root='/data/cifar10'):
    """Load the CIFAR-10 python batches from disk into NumPy arrays.

    The result is intended to match what ``torchvision.datasets.CIFAR10``
    exposes as ``self.data`` and ``self.targets``, except that the labels are
    returned as a column vector.

    Args:
        batch_size: Unused. Accepted only so existing callers keep working.
        train: If True, read the five training batches; otherwise read the
            single test batch.
        root: Directory that contains the ``cifar-10-batches-py`` folder.

    Returns:
        A ``(data, targets)`` tuple where ``data`` has shape (N, 32, 32, 3)
        (channel-last) and ``targets`` has shape (N, 1).

    Raises:
        OSError: If one of the batch files cannot be opened.
    """
    base_folder = 'cifar-10-batches-py'
    # (file name, md5 checksum) pairs. The checksums are listed for reference
    # only; this function does not verify them.
    train_list = [
        ['data_batch_1', 'c99cafc152244af753f735de768cd75f'],
        ['data_batch_2', 'd4bba439e000b95fd0a9bffe97cbabec'],
        ['data_batch_3', '54ebc095f3ab1f0389bbae665268c751'],
        ['data_batch_4', '634d18415352ddfa80567beed471001a'],
        ['data_batch_5', '482c414d41f54cd18b22e5b47cb7c3cb'],
    ]

    test_list = [
        ['test_batch', '40351d587109b95175f43aff81a1287e'],
    ]

    downloaded_list = train_list if train else test_list

    data = []
    targets = []
    for file_name, _checksum in downloaded_list:
        file_path = os.path.join(root, base_folder, file_name)
        with open(file_path, 'rb') as f:
            # SECURITY: unpickling executes arbitrary code. Only point `root`
            # at batch files obtained from a trusted source.
            # The batches were pickled under Python 2, hence encoding='latin1'.
            entry = pickle.load(f, encoding='latin1')
        data.append(entry['data'])
        # CIFAR-10 batches store 'labels'; fall back to 'fine_labels' otherwise.
        if 'labels' in entry:
            targets.extend(entry['labels'])
        else:
            targets.extend(entry['fine_labels'])

    # Each row is a flat channel-first image: restore (N, 3, 32, 32) ...
    data = np.vstack(data).reshape(-1, 3, 32, 32)
    # ... then move the channel axis last (N, 32, 32, 3), i.e. HWC per image.
    data = data.transpose((0, 2, 3, 1))
    # Column vector of labels, shape (N, 1).
    targets = np.asarray(targets).reshape(-1, 1)
    return data, targets

In [None]:
image_dir = "../data/"
batch_size = 256
# Load the whole CIFAR-10 training split into memory. The loader defined in
# this notebook is `load_cifar10`; the previous call to `load_cifar` raised a
# NameError on a fresh kernel.
data, targets = load_cifar10(batch_size, train=True, root=image_dir)

# Alternative file-reader based pipeline, kept for reference only:
# @pipeline_def
# def simple_pipeline():
#     jpegs, labels = fn.readers.file(file_root=image_dir)
#     images = fn.decoders.image(jpegs, device='cpu')

#     return images, labels

In [None]:
# Report the shapes of the loaded arrays. For the CIFAR-10 training split the
# expected values are (50000, 32, 32, 3) for data and (50000, 1) for targets.
print(
    f"data的size为:{data.shape}",
    f"targets的size为:{targets.shape}",
    sep="\n",
)

In [None]:
def partition_data(training_data, labels, num_client, num_class, partition = 'noniid', beta=0.4):
    """Split a dataset across clients, either uniformly or by a Dirichlet label split.

    Args:
        training_data: NumPy array of samples, indexed along its first axis.
        labels: NumPy array of labels; ``labels.shape[0]`` is the sample count.
        num_client: Number of clients to split the data across.
        num_class: Number of classes; class ids ``0 .. num_class - 1`` are
            distributed in the non-IID mode.
        partition: ``"homo"``/``"iid"`` for a uniform random split, or
            ``"noniid"``/``"noniid-labeldir"`` for a per-class Dirichlet split.
        beta: Concentration parameter of the Dirichlet distribution.

    Returns:
        A tuple ``(training_data_subset_list, training_label_subset_list,
        traindata_cls_counts)``: the per-client sample arrays, the per-client
        label arrays, and the per-client class statistics returned by
        ``record_net_data_stats``.

    Raises:
        ValueError: If ``partition`` is not a supported scheme, or if the
            non-IID split is requested with too few samples to give every
            client the required minimum.
    """
    N = labels.shape[0]  # total number of samples; needed by both schemes

    if partition == "homo" or partition == "iid":
        # Random permutation of all sample indices, cut into num_client chunks.
        idxs = np.random.permutation(N)
        batch_idxs = np.array_split(idxs, num_client)
        net_dataidx_map = {i: batch_idxs[i] for i in range(num_client)}

    elif partition == "noniid-labeldir" or partition == "noniid":
        min_size = 0
        min_require_size = 10   # every client must end up with at least 10 samples
        K = num_class

        # The smallest client can never exceed the average N / num_client, so
        # without this check the retry loop below would never terminate.
        if N < min_require_size * num_client:
            raise ValueError(
                f"need at least {min_require_size * num_client} samples to give "
                f"each of {num_client} clients {min_require_size} samples, got {N}"
            )

        net_dataidx_map = {}  # client id -> list of sample indices owned by it

        # Redraw the whole assignment until the smallest client holds at
        # least min_require_size samples.
        while min_size < min_require_size:
            idx_batch = [[] for _ in range(num_client)]  # per-client sample indices
            for k in range(K):  # distribute every class according to a Dirichlet draw
                idx_k = np.where(labels == k)[0]  # indices of all samples with label k
                np.random.shuffle(idx_k)
                proportions = np.random.dirichlet(np.repeat(beta, num_client))
                # Zero the share of every client that already holds at least
                # its fair share (N / num_client) of the samples.
                proportions = np.array([p * (len(idx_j) < N / num_client) for p, idx_j in zip(proportions, idx_batch)])
                # Renormalise over the clients that are still eligible.
                proportions = proportions / proportions.sum()
                # Turn the shares into split positions inside idx_k.
                proportions = (np.cumsum(proportions) * len(idx_k)).astype(int)[:-1]
                idx_batch = [idx_j + idx.tolist() for idx_j, idx in zip(idx_batch, np.split(idx_k, proportions))]
                min_size = min([len(idx_j) for idx_j in idx_batch])

        for j in range(num_client):
            # Indices were appended class by class, so shuffle them per client.
            np.random.shuffle(idx_batch[j])
            net_dataidx_map[j] = idx_batch[j]

    else:
        raise ValueError(f"unsupported partition scheme: {partition!r}")

    # Materialise the per-client subsets for whichever scheme was used.
    training_data_subset_list = [training_data[net_dataidx_map[j]] for j in range(num_client)]
    training_label_subset_list = [labels[net_dataidx_map[j]] for j in range(num_client)]

    # Per-client class distribution (class id -> number of samples).
    traindata_cls_counts = record_net_data_stats(labels, net_dataidx_map)

    return training_data_subset_list, training_label_subset_list, traindata_cls_counts

def record_net_data_stats(y_train, net_dataidx_map):
    """Summarise the label distribution held by every client.

    Args:
        y_train: NumPy array of labels for the whole training set.
        net_dataidx_map: Mapping from client id to the sample indices that
            client owns.

    Returns:
        A dict mapping each client id to a ``{class id: sample count}`` dict.

    Also prints the mean and standard deviation of the per-client sample
    counts as a rough indicator of how uneven the split is.
    """
    net_cls_counts = {}
    for client_id, sample_idx in net_dataidx_map.items():
        # Distinct classes owned by this client and how often each occurs.
        classes, counts = np.unique(y_train[sample_idx], return_counts=True)
        net_cls_counts[client_id] = dict(zip(classes, counts))

    # Total number of samples per client, in client order.
    samples_per_client = [
        sum(per_class.values()) for per_class in net_cls_counts.values()
    ]
    print('mean:', np.mean(samples_per_client))
    print('std:', np.std(samples_per_client))

    return net_cls_counts


In [None]:
# Split the loaded arrays across 10 clients with a Dirichlet (beta=0.4)
# non-IID label distribution.
training_data_list, training_label_list, traindata_cls_counts = partition_data(
    data,
    targets,
    num_client=10,
    num_class=10,
    partition='noniid',
    beta=0.4,
)

In [None]:
# Sanity check: shape of the image array assigned to the first client.
print(training_data_list[0].shape) # expected (item_num, 32, 32, 3); the list holds one such array per client
# Optional extra checks on the matching label array, expected (item_num, 1):
# print(training_label_list[0].shape) # [(item_num, 1)*10]
# print(training_data_list[0].shape==training_label_list[0].shape)

In [None]:
# Pipeline factory: call it once per client to get that client's pipeline.
# The data-augmentation steps are defined inside the pipeline.

# Per-channel mean / std scaled to the 0-255 pixel range. They are not applied
# by the pipeline below yet (see the TODO at the end of get_dali_pipeline).
CIFAR10_MEAN=[0.49139968 * 255., 0.48215827 * 255., 0.44653124 * 255.]
CIFAR10_STD=[0.24703233 * 255., 0.24348505 * 255., 0.26158768 * 255.]
@pipeline_def(num_threads=4, device_id=0)
def get_dali_pipeline(images, labels):
    """DALI pipeline that augments in-memory images for one client.

    Args:
        images: Indexable collection of already-decoded images; each
            ``images[i]`` is expected to be an HWC array (assumption based on
            how the notebook builds its client subsets -- confirm for other
            callers).
        labels: Indexable collection of labels aligned with ``images``.

    Returns:
        ``(images, labels)`` DALI data nodes: randomly cropped, resized to
        224x224 and randomly mirrored images on the GPU, and their labels.

    The inputs are raw pixel arrays, not encoded files, so they are fed
    through ``fn.external_source`` instead of an image decoder. Passing the
    arrays directly to an operator would embed them as graph constants rather
    than per-sample inputs.
    """
    num_samples = len(images)

    def fetch_sample(sample_info):
        """Return the (image, label) pair for one position in the epoch."""
        idx = sample_info.idx_in_epoch
        if idx >= num_samples:
            # Signals the end of the epoch to DALI. A trailing batch that
            # cannot be filled completely is expected to be dropped.
            raise StopIteration
        # Hand DALI contiguous buffers; fancy-indexed or transposed arrays
        # are not guaranteed to be contiguous.
        image = np.ascontiguousarray(images[idx])
        label = np.ascontiguousarray(np.atleast_1d(labels[idx]))
        return image, label

    # Distinct names on purpose: fetch_sample closes over `images`/`labels`,
    # so those names must keep referring to the input arrays.
    image_batch, label_batch = fn.external_source(
        source=fetch_sample, num_outputs=2, batch=False, layout=["HWC"])

    # Random crop covering 14%-100% of the image area, resized to 224x224 on
    # the GPU (replaces the decoder-based random crop + separate resize).
    image_batch = fn.random_resized_crop(
        image_batch.gpu(), size=[224, 224], random_area=[0.14, 1.0])

    # Horizontal flip with probability 0.5. The coin flip yields the per-sample
    # decision; it must not replace the image tensor itself.
    mirror = fn.random.coin_flip(probability=0.5)
    image_batch = fn.flip(image_batch, horizontal=mirror)

    # TODO: colour augmentation and normalisation with CIFAR10_MEAN /
    # CIFAR10_STD (e.g. via fn.crop_mirror_normalize) are not applied yet.
    return image_batch, label_batch

In [None]:
# Define the data loader: wrap the first client's pipeline in a PyTorch-facing
# DALI iterator whose two outputs are exposed as 'data' and 'label'.
train_data = DALIGenericIterator(
    pipelines=[
        get_dali_pipeline(
            training_data_list[0],
            training_label_list[0],
            batch_size=256,
        )
    ],
    output_map=['data', 'label'],
)

In [None]:
# Walk over the batches produced by the DALI iterator and report their sizes.
# The loop variable is named `batch` (not `data`) so it does not overwrite the
# CIFAR-10 array loaded earlier, which would break re-running earlier cells.
for batch in train_data:
    # The iterator yields one dict per pipeline; only one pipeline is used here.
    x, y = batch[0]['data'], batch[0]['label']
    print(f"size of x is :{x.size()}")
    print(f"size of y is: {y.size()}")
#     pred = model(x)
#     loss = loss_func(pred, y)
#     backward(loss, model)