# Simulate Unbalanced Distribution

In [1]:
import imp
import random
import CustomDataset
import CustomModels
import os
import torch
import pickle
import numpy as np
import matplotlib.pyplot as plt
from torchvision import datasets, transforms
from CustomDataset import dataset_partition
from setting import *

In [2]:
# Training splits of both datasets, read from ./data with images converted to tensors.
CIFAR10 = datasets.CIFAR10(
    root='./data',
    train=True,
    transform=transforms.ToTensor(),
)
MNIST = datasets.MNIST(
    root='./data',
    train=True,
    transform=transforms.ToTensor(),
)

# Per-dataset index dictionaries built from the labels
# (presumably label -> positions of its samples; see CustomDataset.produce_idx_dic).
MNIST_idx_dic = CustomDataset.produce_idx_dic(MNIST.targets)
CIFAR10_idx_dic = CustomDataset.produce_idx_dic(CIFAR10.targets)

In [3]:
# create directories
def create(path):
    """Create the directory ``path`` unless it already exists.

    Missing parent directories are created too, and an existing directory is
    left untouched.

    Raises:
        FileExistsError: if ``path`` exists but is not a directory.
    """
    # exist_ok=True replaces the racy "isdir() then mkdir()" check.
    os.makedirs(path, exist_ok=True)
        
# eg. Indexs/MNIST/p_?/mu_?/
#     Indexs/CIFAR10/p_?/linear/

# Root folder first, then one folder per dataset (parents before children).
for top_level in ("Indexs", "Indexs/MNIST", "Indexs/CIFAR10"):
    create(top_level)

# Both datasets share the same layout; only the p / mu grids differ.
for path, p_values, mu_values in (
    ("Indexs/MNIST", p_MNIST, mu_MNIST),
    ("Indexs/CIFAR10", p_CIFAR10, mu_CIFAR10),
):
    # First pass: the p_? folder with its linear / exponential sub-folders.
    for p in p_values:
        p_dir = os.path.join(path, 'p_' + str(p))
        create(p_dir)
        for profile in (EXPONENTIAL, LINEAR):
            create(os.path.join(p_dir, profile))

    # Second pass: one mu_? sub-folder per step setting.
    for p in p_values:
        for mu in mu_values:
            create(os.path.join(path, 'p_' + str(p), 'mu_' + str(mu)))


In [4]:
# Create the base indices: separate the original train set into a train part
# and a held-out evaluation (validation) part.

# NOTE: np.random.seed() returns None, so the former
# RandomState(np.random.seed(12345)) built a generator seeded from OS entropy
# and the split changed on every run.  Seed the generator explicitly instead.
np.random.seed(12345)  # kept so NumPy's global RNG is still seeded as before
randomState = np.random.RandomState(12345)

# Held-out positions, requested with a per-class size list of 1000
# (presumably 1000 samples per class; see CustomDataset.random_select).
MNIST_eval_idx = CustomDataset.random_select(MNIST_idx_dic, [1000]*10, randomState)
CIFAR10_eval_idx = CustomDataset.random_select(CIFAR10_idx_dic, [1000]*10, randomState)
np.save("Indexs/MNIST_eval", MNIST_eval_idx)
np.save("Indexs/CIFAR10_eval", CIFAR10_eval_idx)

# Everything that was not held out becomes the training pool.  The *_all_idx
# arrays are plain aranges, so deleting by position removes exactly those values.
MNIST_all_idx = np.arange(len(MNIST.targets))
MNIST_train_idx = np.delete(MNIST_all_idx, MNIST_eval_idx)
CIFAR10_all_idx = np.arange(len(CIFAR10.targets))
CIFAR10_train_idx = np.delete(CIFAR10_all_idx, CIFAR10_eval_idx)
np.save("Indexs/MNIST_train", MNIST_train_idx)
np.save("Indexs/CIFAR10_train", CIFAR10_train_idx)

# Index dictionaries relative to the train / eval subsets.  CIFAR10.targets is
# wrapped in np.array so it can be indexed with an index array.
MNIST_train_idx_dic = CustomDataset.produce_idx_dic(MNIST.targets[MNIST_train_idx])
CIFAR10_train_idx_dic = CustomDataset.produce_idx_dic(np.array(CIFAR10.targets)[CIFAR10_train_idx])
MNIST_eval_idx_dic = CustomDataset.produce_idx_dic(MNIST.targets[MNIST_eval_idx])
CIFAR10_eval_idx_dic = CustomDataset.produce_idx_dic(np.array(CIFAR10.targets)[CIFAR10_eval_idx])

In [5]:
# create indexs for each distribution and permutation
def generate_distribution(max_size, num_classes, p, mu, unbalanced_type):
    """Return the per-class sample counts of one imbalance profile.

    Args:
        max_size: sample count of the largest class.
        num_classes: number of classes, i.e. the length of the result.
        p: imbalance ratio; the smallest class gets about ``max_size / p``.
        mu: only used (and required) by the step profile; roughly the
            fraction of classes that receive the small size.
        unbalanced_type: ``LINEAR`` or ``EXPONENTIAL``; any other value
            selects the step profile.

    Returns:
        Integer NumPy array of length ``num_classes``.  The computed
        distribution is also printed.
    """
    distribution = np.zeros(num_classes).astype(int)
    # Number of steps between the first and the last class.  This used to be
    # the literal 9 (only right for 10 classes); max() avoids a division by
    # zero when there is a single class.
    steps = max(num_classes - 1, 1)

    if unbalanced_type == LINEAR:
        # Counts fall by a constant amount from max_size down to ~max_size/p.
        diff = max_size - int(max_size / p)
        for i in range(num_classes):
            distribution[i] = int(np.ceil(max_size - (diff * i / steps)))

    elif unbalanced_type == EXPONENTIAL:
        # Counts shrink by a constant factor; the last class gets max_size/p.
        ratio = 1.0 / p
        for i in range(num_classes):
            distribution[i] = int(np.ceil(max_size * np.power(ratio, i / steps)))

    else:
        # STEP unbalanced: leading classes get max_size, the rest min_size.
        assert mu is not None, "mu is required for the step profile"
        min_size = int(np.ceil(max_size / p))
        # NOTE(review): (num_classes + 1) instead of num_classes presumably
        # guards against float truncation (10 * (1 - 0.8) is just below 2).
        majority_count = int((num_classes + 1) * (1 - mu))
        for i in range(num_classes):
            distribution[i] = max_size if i < majority_count else min_size

    print("({},p={},mu={}): {}".format(
        unbalanced_type, p, mu, distribution))
    return distribution
        

def _save_unbalanced_indexs(path, p_list, mu_list, permutations,
                            idx_dic, base_idx, max_size, suffix, num_classes):
    """Sample and save one index file per (p, permutation, profile).

    For every ``p`` and every permutation (numbered from 1) the profiles are
    processed in the order: step for each ``mu``, linear, exponential.  The
    order matters because all draws share the module-level ``randomState``.
    Files are written to ``<path>/p_<p>/<profile folder>/<num><suffix>.npy``.
    """
    for p in p_list:
        p_dir = os.path.join(path, "p_" + str(p))
        for num, permutation in enumerate(permutations, start=1):
            class_order = np.array(permutation)
            # (target folder, mu, profile type) in the original sampling order.
            profiles = [("mu_" + str(mu), mu, STEP) for mu in mu_list]
            profiles.append((LINEAR, None, LINEAR))
            profiles.append((EXPONENTIAL, None, EXPONENTIAL))

            for folder, mu, unbalanced_type in profiles:
                distribution = generate_distribution(
                    max_size, num_classes, p, mu, unbalanced_type)
                # Re-assign the class sizes according to the permutation.
                distribution = distribution[class_order]
                idx = CustomDataset.random_select(idx_dic, distribution, randomState)
                file_name = os.path.join(p_dir, folder, str(num) + suffix)
                # idx is relative to the subset, so map it back through
                # base_idx before saving.
                np.save(file_name, base_idx[idx])


def generate_indexs(dataset_name, p_list, mu_list, permutations,
                    train_idx_dic, train_idx, eval_idx_dic, eval_idx,
                    num_classes=10):
    """Generate and save the imbalanced train and evaluation index files.

    Uses the module-level ``randomState``, ``train_max_size`` and
    ``evaluation_max_size``.  All train files are produced first, then all
    evaluation files (suffix ``_eval``), which keeps the sequence of random
    draws identical to the previous copy-pasted implementation.

    Args:
        dataset_name: sub-folder of ``Indexs`` to write into.
        p_list: imbalance ratios to generate.
        mu_list: step-profile settings to generate.
        permutations: class permutations; each one yields a numbered file.
        train_idx_dic / eval_idx_dic: index dictionaries of the train / eval
            subsets, passed to ``CustomDataset.random_select``.
        train_idx / eval_idx: arrays mapping subset positions to dataset
            indices.
        num_classes: number of classes per distribution (default 10, the
            previously hard-coded value).
    """
    path = os.path.join("Indexs", dataset_name)

    # generate unbalanced train set index
    _save_unbalanced_indexs(path, p_list, mu_list, permutations,
                            train_idx_dic, train_idx, train_max_size, "",
                            num_classes)

    # generate unbalanced evaluation set index
    _save_unbalanced_indexs(path, p_list, mu_list, permutations,
                            eval_idx_dic, eval_idx, evaluation_max_size, "_eval",
                            num_classes)

In [6]:
generate_indexs('MNIST', p_MNIST, mu_MNIST, permutations, 
                MNIST_train_idx_dic, MNIST_train_idx,
                MNIST_eval_idx_dic, MNIST_eval_idx)

(step,p=10,mu=0.2): [4000 4000 4000 4000 4000 4000 4000 4000  400  400]
(step,p=10,mu=0.5): [4000 4000 4000 4000 4000  400  400  400  400  400]
(step,p=10,mu=0.8): [4000 4000  400  400  400  400  400  400  400  400]
(linear,p=10,mu=None): [4000 3600 3200 2800 2400 2000 1600 1200  800  400]
(exponential,p=10,mu=None): [4000 3098 2398 1857 1438 1114  862  668  517  400]
(step,p=10,mu=0.2): [4000 4000 4000 4000 4000 4000 4000 4000  400  400]
(step,p=10,mu=0.5): [4000 4000 4000 4000 4000  400  400  400  400  400]
(step,p=10,mu=0.8): [4000 4000  400  400  400  400  400  400  400  400]
(linear,p=10,mu=None): [4000 3600 3200 2800 2400 2000 1600 1200  800  400]
(exponential,p=10,mu=None): [4000 3098 2398 1857 1438 1114  862  668  517  400]
(step,p=10,mu=0.2): [4000 4000 4000 4000 4000 4000 4000 4000  400  400]
(step,p=10,mu=0.5): [4000 4000 4000 4000 4000  400  400  400  400  400]
(step,p=10,mu=0.8): [4000 4000  400  400  400  400  400  400  400  400]
(linear,p=10,mu=None): [4000 3600 3200 280

(step,p=250,mu=0.8): [4000 4000   16   16   16   16   16   16   16   16]
(linear,p=250,mu=None): [4000 3558 3115 2672 2230 1787 1344  902  459   16]
(exponential,p=250,mu=None): [4000 2166 1173  635  344  187  101   55   30   16]
(step,p=250,mu=0.2): [4000 4000 4000 4000 4000 4000 4000 4000   16   16]
(step,p=250,mu=0.5): [4000 4000 4000 4000 4000   16   16   16   16   16]
(step,p=250,mu=0.8): [4000 4000   16   16   16   16   16   16   16   16]
(linear,p=250,mu=None): [4000 3558 3115 2672 2230 1787 1344  902  459   16]
(exponential,p=250,mu=None): [4000 2166 1173  635  344  187  101   55   30   16]
(step,p=500,mu=0.2): [4000 4000 4000 4000 4000 4000 4000 4000    8    8]
(step,p=500,mu=0.5): [4000 4000 4000 4000 4000    8    8    8    8    8]
(step,p=500,mu=0.8): [4000 4000    8    8    8    8    8    8    8    8]
(linear,p=500,mu=None): [4000 3557 3113 2670 2226 1783 1339  896  452    8]
(exponential,p=500,mu=None): [4000 2006 1006  504  253  127   64   32   16    8]
(step,p=500,mu=0.2

(exponential,p=10,mu=None): [1000  775  600  465  360  279  216  167  130  100]
(step,p=10,mu=0.2): [1000 1000 1000 1000 1000 1000 1000 1000  100  100]
(step,p=10,mu=0.5): [1000 1000 1000 1000 1000  100  100  100  100  100]
(step,p=10,mu=0.8): [1000 1000  100  100  100  100  100  100  100  100]
(linear,p=10,mu=None): [1000  900  800  700  600  500  400  300  200  100]
(exponential,p=10,mu=None): [1000  775  600  465  360  279  216  167  130  100]
(step,p=25,mu=0.2): [1000 1000 1000 1000 1000 1000 1000 1000   40   40]
(step,p=25,mu=0.5): [1000 1000 1000 1000 1000   40   40   40   40   40]
(step,p=25,mu=0.8): [1000 1000   40   40   40   40   40   40   40   40]
(linear,p=25,mu=None): [1000  894  787  680  574  467  360  254  147   40]
(exponential,p=25,mu=None): [1000  700  490  342  240  168  117   82   58   40]
(step,p=25,mu=0.2): [1000 1000 1000 1000 1000 1000 1000 1000   40   40]
(step,p=25,mu=0.5): [1000 1000 1000 1000 1000   40   40   40   40   40]
(step,p=25,mu=0.8): [1000 1000   4

(step,p=500,mu=0.5): [1000 1000 1000 1000 1000    2    2    2    2    2]
(step,p=500,mu=0.8): [1000 1000    2    2    2    2    2    2    2    2]
(linear,p=500,mu=None): [1000  890  779  668  557  446  335  224  113    2]
(exponential,p=500,mu=None): [1000  502  252  126   64   32   16    8    4    2]
(step,p=1000,mu=0.2): [1000 1000 1000 1000 1000 1000 1000 1000    1    1]
(step,p=1000,mu=0.5): [1000 1000 1000 1000 1000    1    1    1    1    1]
(step,p=1000,mu=0.8): [1000 1000    1    1    1    1    1    1    1    1]
(linear,p=1000,mu=None): [1000  889  778  667  556  445  334  223  112    1]
(exponential,p=1000,mu=None): [1000  465  216  101   47   22   11    5    3    1]
(step,p=1000,mu=0.2): [1000 1000 1000 1000 1000 1000 1000 1000    1    1]
(step,p=1000,mu=0.5): [1000 1000 1000 1000 1000    1    1    1    1    1]
(step,p=1000,mu=0.8): [1000 1000    1    1    1    1    1    1    1    1]
(linear,p=1000,mu=None): [1000  889  778  667  556  445  334  223  112    1]
(exponential,p=10

In [7]:
generate_indexs('CIFAR10', p_CIFAR10, mu_CIFAR10, permutations,
                CIFAR10_train_idx_dic, CIFAR10_train_idx,
                CIFAR10_eval_idx_dic, CIFAR10_eval_idx)

(step,p=2,mu=0.2): [4000 4000 4000 4000 4000 4000 4000 4000 2000 2000]
(step,p=2,mu=0.5): [4000 4000 4000 4000 4000 2000 2000 2000 2000 2000]
(step,p=2,mu=0.8): [4000 4000 2000 2000 2000 2000 2000 2000 2000 2000]
(linear,p=2,mu=None): [4000 3778 3556 3334 3112 2889 2667 2445 2223 2000]
(exponential,p=2,mu=None): [4000 3704 3429 3175 2940 2722 2520 2334 2161 2000]
(step,p=2,mu=0.2): [4000 4000 4000 4000 4000 4000 4000 4000 2000 2000]
(step,p=2,mu=0.5): [4000 4000 4000 4000 4000 2000 2000 2000 2000 2000]
(step,p=2,mu=0.8): [4000 4000 2000 2000 2000 2000 2000 2000 2000 2000]
(linear,p=2,mu=None): [4000 3778 3556 3334 3112 2889 2667 2445 2223 2000]
(exponential,p=2,mu=None): [4000 3704 3429 3175 2940 2722 2520 2334 2161 2000]
(step,p=2,mu=0.2): [4000 4000 4000 4000 4000 4000 4000 4000 2000 2000]
(step,p=2,mu=0.5): [4000 4000 4000 4000 4000 2000 2000 2000 2000 2000]
(step,p=2,mu=0.8): [4000 4000 2000 2000 2000 2000 2000 2000 2000 2000]
(linear,p=2,mu=None): [4000 3778 3556 3334 3112 2889 26

(linear,p=2,mu=None): [1000  945  889  834  778  723  667  612  556  500]
(exponential,p=2,mu=None): [1000  926  858  794  735  681  630  584  541  500]
(step,p=2,mu=0.2): [1000 1000 1000 1000 1000 1000 1000 1000  500  500]
(step,p=2,mu=0.5): [1000 1000 1000 1000 1000  500  500  500  500  500]
(step,p=2,mu=0.8): [1000 1000  500  500  500  500  500  500  500  500]
(linear,p=2,mu=None): [1000  945  889  834  778  723  667  612  556  500]
(exponential,p=2,mu=None): [1000  926  858  794  735  681  630  584  541  500]
(step,p=2,mu=0.2): [1000 1000 1000 1000 1000 1000 1000 1000  500  500]
(step,p=2,mu=0.5): [1000 1000 1000 1000 1000  500  500  500  500  500]
(step,p=2,mu=0.8): [1000 1000  500  500  500  500  500  500  500  500]
(linear,p=2,mu=None): [1000  945  889  834  778  723  667  612  556  500]
(exponential,p=2,mu=None): [1000  926  858  794  735  681  630  584  541  500]
(step,p=10,mu=0.2): [1000 1000 1000 1000 1000 1000 1000 1000  100  100]
(step,p=10,mu=0.5): [1000 1000 1000 1000 10

In [8]:
# create ROS indexs
def generate_ros_indexs(dataset_name, dataset_targets, p_list, mu_list, permutations, randomState,
                        target_size=4000):
    """Create a ``<num>_ros.npy`` index file next to every imbalanced train file.

    For each ``p``, each permutation number (starting at 1) and each profile
    folder (``mu_<mu>`` for every mu, then linear, then exponential) the saved
    imbalanced indices are loaded, expanded through
    ``CustomDataset.duplicate_idxs`` and saved again with the ``_ros`` suffix.

    Args:
        dataset_name: sub-folder of ``Indexs`` holding the index files.
        dataset_targets: labels of the full dataset; must support indexing
            with an index array.
        p_list, mu_list: the grids used when the index files were generated.
        permutations: only its length is used, to number the files.
        randomState: random generator forwarded to ``duplicate_idxs``.
        target_size: second argument of ``duplicate_idxs`` (presumably the
            per-class size after over-sampling).  Defaults to 4000, the value
            that used to be hard-coded.
    """
    path = os.path.join("Indexs", dataset_name)

    for p in p_list:
        p_dir = os.path.join(path, "p_" + str(p))
        for num, _ in enumerate(permutations, start=1):
            # Same processing order as before: STEP (per mu), LINEAR, EXPONENTIAL.
            folders = ["mu_" + str(mu) for mu in mu_list] + [LINEAR, EXPONENTIAL]
            for folder in folders:
                prefix = os.path.join(p_dir, folder, str(num))
                imbalanced_idx = np.load(prefix + ".npy")
                imbalanced_idx_dic = CustomDataset.produce_idx_dic(dataset_targets[imbalanced_idx])
                ROS_idx = CustomDataset.duplicate_idxs(
                    imbalanced_idx_dic, target_size, randomState=randomState)
                # ROS_idx is relative to the imbalanced subset; map it back
                # to dataset indices before saving.
                np.save(prefix + "_ros", imbalanced_idx[ROS_idx])


In [9]:
generate_ros_indexs("MNIST", MNIST.targets, p_MNIST, mu_MNIST, permutations, randomState)
generate_ros_indexs("CIFAR10", np.array(CIFAR10.targets), p_CIFAR10, mu_CIFAR10, permutations, randomState)

In [10]:
# create the dataset partition for an imbalanced distribution
def generate_class_partition(imbalance_index_dic):
    """Build the hierarchical class partition for one imbalanced dataset.

    The classes are ordered from largest to smallest (ties: larger key
    first) and handed to ``recursive_partition``.  The nested structure it
    reports is printed, followed by an empty line.

    Args:
        imbalance_index_dic: mapping ``class label -> collection of sample
            indices``; only the length of each collection is used.

    Returns:
        The partition object returned by ``recursive_partition``.

    Raises:
        ValueError: if the dictionary is empty (this used to recurse forever).
    """
    if len(imbalance_index_dic) == 0:
        raise ValueError("imbalance_index_dic must contain at least one class")

    dtype = [('key', int), ('size', int)]
    # Build the table from the items themselves rather than writing to
    # sizes[k], which silently required the labels to be exactly 0..n-1.
    sizes = np.array(
        [(int(k), len(v)) for k, v in imbalance_index_dic.items()], dtype)
    total_size = int(sizes['size'].sum())

    # Ascending by (size, key), then reversed: the same descending order the
    # former in-place `sizes[::-1].sort(order='size')` trick produced.
    sizes = np.sort(sizes, order=['size', 'key'])[::-1]

    partition, structure = recursive_partition(sizes, total_size)
    print(structure)
    print("")
    return partition

def recursive_partition(sizes, total_size, num_classes=10):
    """Recursively split the classes into two groups of similar total size.

    Args:
        sizes: sequence of ``(key, size)`` records (dtype
            ``[('key', int), ('size', int)]``), expected in descending size
            order.
        total_size: sum of the sizes in ``sizes``.
        num_classes: length of the ``classes`` lookup array stored in every
            inner node (default 10, the previously hard-coded value).

    Returns:
        ``(partition, structure)``: a ``dataset_partition`` tree and a nested
        list mirroring it, whose leaves are the ``(key, size)`` records.

    Raises:
        ValueError: if ``sizes`` is empty.
    """
    if len(sizes) == 0:
        raise ValueError("sizes must contain at least one class")
    if len(sizes) == 1:
        return dataset_partition(old_class_number=sizes[0][0]), sizes[0]

    # Move leading classes into group 0 until it holds at least half of the
    # samples.  At least one class goes to group 0 and at least one stays for
    # group 1; without these bounds a degenerate input (all sizes zero, or
    # unsorted sizes) left one child equal to the input and recursed forever.
    class_zero_size = 0
    partition_point = 0
    last_split = len(sizes) - 1
    while partition_point < last_split and (
            partition_point == 0 or class_zero_size < total_size / 2):
        class_zero_size = class_zero_size + sizes[partition_point][1]
        partition_point = partition_point + 1

    # classes[k] is 0 or 1 for classes under this node and -1 for all others.
    classes = np.full(num_classes, -1)
    for entry in sizes[:partition_point]:
        classes[entry[0]] = 0
    for entry in sizes[partition_point:]:
        classes[entry[0]] = 1

    child_1, structure_1 = recursive_partition(
        sizes[0:partition_point], class_zero_size, num_classes)
    child_2, structure_2 = recursive_partition(
        sizes[partition_point:], total_size - class_zero_size, num_classes)

    children = [child_1, child_2]

    return dataset_partition(classes=classes, children=children), [structure_1, structure_2]

def check_dataset_partition(partition):
    """Print a breadth-first summary of the nodes below ``partition``.

    A leaf prints its ``old_class_number``; an inner node prints the class
    numbers it covers, i.e. the positions where ``classes != -1``.  The root
    itself is not printed.
    """
    from collections import deque

    # deque gives O(1) pops from the left (list.pop(0) is O(n)).
    frontier = deque(partition.children)
    while frontier:
        item = frontier.popleft()
        if not item.has_children:
            print(item.old_class_number)
            continue
        frontier.extend(item.children)
        # Same values as indexing arange(10) with the mask, but without
        # hard-coding the number of classes.
        print(np.flatnonzero(item.classes != -1))

def check_idx_dic(idx_dic):
    """Print every ``(key, number of entries)`` pair of ``idx_dic`` on one line."""
    # Each pair is followed by a single space; the line ends with a newline.
    summary = "".join(
        str((key, len(members))) + ' ' for key, members in idx_dic.items()
    )
    print(summary)

def generate_hierarchical_partition(dataset_name, dataset_targets, p_list, mu_list, permutations):
    """Pickle a class partition next to every imbalanced train index file.

    For each ``p``, each permutation number (starting at 1) and each profile
    folder (``mu_<mu>`` for every mu, then linear, then exponential) the saved
    indices ``<num>.npy`` are loaded, turned into a partition with
    ``generate_class_partition`` and pickled to ``<num>_h.txt``.

    Args:
        dataset_name: sub-folder of ``Indexs`` holding the index files.
        dataset_targets: labels of the full dataset; must support indexing
            with an index array.
        p_list, mu_list: the grids used when the index files were generated.
        permutations: only its length is used, to number the files.
    """
    path = os.path.join("Indexs", dataset_name)

    for p in p_list:
        p_dir = os.path.join(path, "p_" + str(p))
        for num, _ in enumerate(permutations, start=1):
            # Same processing order as before: STEP (per mu), LINEAR, EXPONENTIAL.
            folders = ["mu_" + str(mu) for mu in mu_list] + [LINEAR, EXPONENTIAL]
            for folder in folders:
                prefix = os.path.join(p_dir, folder, str(num))
                imbalanced_idx = np.load(prefix + ".npy")
                imbalanced_idx_dic = CustomDataset.produce_idx_dic(dataset_targets[imbalanced_idx])
                partition = generate_class_partition(imbalanced_idx_dic)
                # Binary pickle despite the .txt extension (name kept for
                # compatibility with existing readers).
                with open(prefix + "_h.txt", 'wb') as f:
                    pickle.dump(partition, f)

In [11]:
generate_hierarchical_partition("MNIST", MNIST.targets, p_MNIST, mu_MNIST, permutations)

[[[[(7, 4000), (6, 4000)], (5, 4000)], [(4, 4000), (3, 4000)]], [[(2, 4000), (1, 4000)], [(0, 4000), [(9, 400), (8, 400)]]]]

[[[(4, 4000), (3, 4000)], (2, 4000)], [[(1, 4000), (0, 4000)], [[[(9, 400), (8, 400)], (7, 400)], [(6, 400), (5, 400)]]]]

[[(1, 4000), (0, 4000)], [[[(9, 400), (8, 400)], [(7, 400), (6, 400)]], [[(5, 400), (4, 400)], [(3, 400), (2, 400)]]]]

[[[(0, 4000), (1, 3600)], [(2, 3200), (3, 2800)]], [[(4, 2400), (5, 2000)], [[(6, 1600), (7, 1200)], [(8, 800), (9, 400)]]]]

[[[(0, 4000), (1, 3098)], (2, 2398)], [[[(3, 1857), (4, 1438)], (5, 1114)], [[(6, 862), (7, 668)], [(8, 517), (9, 400)]]]]

[[[[(9, 4000), (8, 4000)], (7, 4000)], [(6, 4000), (4, 4000)]], [[(3, 4000), (2, 4000)], [(1, 4000), [(5, 400), (0, 400)]]]]

[[[(9, 4000), (6, 4000)], (3, 4000)], [[(2, 4000), (1, 4000)], [[[(8, 400), (7, 400)], (5, 400)], [(4, 400), (0, 400)]]]]

[[(9, 4000), (6, 4000)], [[[(8, 400), (7, 400)], [(5, 400), (4, 400)]], [[(3, 400), (2, 400)], [(1, 400), (0, 400)]]]]

[[[(6, 4000)

[[[(6, 4000), (8, 3565)], (2, 3129)], [[(9, 2694), (7, 2258)], [[(0, 1823), (5, 1387)], [(3, 952), [(4, 516), (1, 80)]]]]]

[[(6, 4000), (8, 2590)], [[(2, 1677), (9, 1086)], [[(7, 704), (0, 456)], [[(5, 295), (3, 191)], [(4, 124), (1, 80)]]]]]

[[[[(9, 4000), (8, 4000)], (6, 4000)], [(4, 4000), (3, 4000)]], [[(2, 4000), (1, 4000)], [(0, 4000), [(7, 80), (5, 80)]]]]

[[[(4, 4000), (3, 4000)], (2, 4000)], [[(1, 4000), (0, 4000)], [[[(9, 80), (8, 80)], (7, 80)], [(6, 80), (5, 80)]]]]

[[(4, 4000), (2, 4000)], [[[(9, 80), (8, 80)], [(7, 80), (6, 80)]], [[(5, 80), (3, 80)], [(1, 80), (0, 80)]]]]

[[[(2, 4000), (4, 3565)], (1, 3129)], [[(3, 2694), (0, 2258)], [[(9, 1823), (8, 1387)], [(6, 952), [(7, 516), (5, 80)]]]]]

[[(2, 4000), (4, 2590)], [[(1, 1677), (3, 1086)], [[(0, 704), (9, 456)], [[(8, 295), (6, 191)], [(7, 124), (5, 80)]]]]]

[[[[(7, 4000), (6, 4000)], (5, 4000)], [(4, 4000), (3, 4000)]], [[(2, 4000), (1, 4000)], [(0, 4000), [(9, 40), (8, 40)]]]]

[[[(4, 4000), (3, 4000)], (2, 40

[[[(4, 4000), (0, 3557)], (9, 3113)], [[(5, 2670), (8, 2226)], [[(7, 1783), (2, 1339)], [(6, 896), [(1, 452), (3, 8)]]]]]

[[(4, 4000), (0, 2006)], [(9, 1006), [(5, 504), [(8, 253), [(7, 127), [(2, 64), [(6, 32), [(1, 16), (3, 8)]]]]]]]]

[[[[(9, 4000), (8, 4000)], (7, 4000)], [(6, 4000), (5, 4000)]], [[(3, 4000), (2, 4000)], [(0, 4000), [(4, 8), (1, 8)]]]]

[[[(9, 4000), (8, 4000)], (7, 4000)], [[(6, 4000), (2, 4000)], [[[(5, 8), (4, 8)], (3, 8)], [(1, 8), (0, 8)]]]]

[[(8, 4000), (6, 4000)], [[[(9, 8), (7, 8)], [(5, 8), (4, 8)]], [[(3, 8), (2, 8)], [(1, 8), (0, 8)]]]]

[[[(6, 4000), (8, 3557)], (2, 3113)], [[(9, 2670), (7, 2226)], [[(0, 1783), (5, 1339)], [(3, 896), [(4, 452), (1, 8)]]]]]

[[(6, 4000), (8, 2006)], [(2, 1006), [(9, 504), [(7, 253), [(0, 127), [(5, 64), [(3, 32), [(4, 16), (1, 8)]]]]]]]]

[[[[(9, 4000), (8, 4000)], (6, 4000)], [(4, 4000), (3, 4000)]], [[(2, 4000), (1, 4000)], [(0, 4000), [(7, 8), (5, 8)]]]]

[[[(4, 4000), (3, 4000)], (2, 4000)], [[(1, 4000), (0, 4000)]

[(6, 4000), [(8, 1592), [(2, 634), [(9, 252), [(7, 101), [(0, 40), [(5, 16), [(3, 7), [(4, 3), (1, 1)]]]]]]]]]

[[[[(9, 4000), (8, 4000)], (6, 4000)], [(4, 4000), (3, 4000)]], [[(2, 4000), (1, 4000)], [(0, 4000), [(7, 1), (5, 1)]]]]

[[[(4, 4000), (3, 4000)], (2, 4000)], [[(1, 4000), (0, 4000)], [[[(9, 1), (8, 1)], (7, 1)], [(6, 1), (5, 1)]]]]

[[(4, 4000), (2, 4000)], [[[(9, 1), (8, 1)], [(7, 1), (6, 1)]], [[(5, 1), (3, 1)], [(1, 1), (0, 1)]]]]

[[[(2, 4000), (4, 3556)], (1, 3112)], [[(3, 2667), (0, 2223)], [[(9, 1779), (8, 1334)], [(6, 890), [(7, 446), (5, 1)]]]]]

[(2, 4000), [(4, 1592), [(1, 634), [(3, 252), [(0, 101), [(9, 40), [(8, 16), [(6, 7), [(7, 3), (5, 1)]]]]]]]]]



In [34]:
os.path.join("a","b","c")

'a/b/c'

In [32]:
np.power(np.power(1.0/2, 0), 9)

1.0

In [None]:
# Produce the target transform and index transform for the datasets used by the hierarchical method

In [5]:
# target dataset MNIST_expo_unbalance_2560to5
Dataset = 'MNIST'
Distribution = 'expo_unbalance_2560to5'
# NOTE: np.random.seed() returns None, so RandomState(np.random.seed(12345))
# was seeded from OS entropy.  Seed the generator explicitly instead.
np.random.seed(12345)  # kept so NumPy's global RNG is still seeded as before
randomState = np.random.RandomState(12345)
target_dataset, _, _ = CustomDataset.load_dataset(dataset_name=Dataset, distribution_name=Distribution, transform = None, data_folder="./data")
target_idx_dic = CustomDataset.produce_idx_dic(target_dataset.targets)
# Saved index file for this distribution (presumably positions in the
# original MNIST train set; verify against CustomDataset.load_dataset).
origin_indexs = np.load("./data/"+Dataset+"_"+Distribution +".npy")

897


In [6]:
origin_indexs

array([   13,    14,    19, ..., 59983, 59986, 59998])

In [7]:
target_idx_dic[0]

[414, 1748, 2611, 3507, 3853]

In [8]:
# using binary splits on the targets
dtype = [('key', int), ('size', int)]
sizes = np.array([(0,0)]*len(target_idx_dic.keys()), dtype)
total_size = 0

for (k,v) in target_idx_dic.items():
    sizes[k] = (k,len(v))
    total_size = total_size + len(v)

sizes[::-1].sort(order='size')

def recursive_partition(sizes, total_size):
    """Recursively split ``sizes`` into two groups of similar total size.

    Args:
        sizes: sequence of ``(key, size)`` records, expected in descending
            size order.
        total_size: sum of the sizes in ``sizes``.

    Returns:
        The single record when only one is left, otherwise a two-element
        list ``[first group, second group]`` built recursively.

    Raises:
        ValueError: if ``sizes`` is empty.
    """
    if len(sizes) == 0:
        raise ValueError("sizes must contain at least one class")
    if len(sizes) == 1:
        return sizes[0]

    # Take leading records until the first group holds at least half of the
    # total.  At least one record goes to each side; without these bounds a
    # degenerate input (all sizes zero, or unsorted sizes) left one side equal
    # to the input and recursed forever.
    class_one_size = 0
    index = 0
    last_split = len(sizes) - 1
    while index < last_split and (index == 0 or class_one_size < total_size / 2):
        class_one_size = class_one_size + sizes[index][1]
        index = index + 1

    return [
        recursive_partition(sizes[0:index], class_one_size),
        recursive_partition(sizes[index:], total_size - class_one_size)
    ]

In [9]:
class_partition = recursive_partition(sizes, total_size)

In [10]:
isinstance(class_partition, list)

True

In [11]:
class_partition

[(6, 2560),
 [(9, 1280),
  [(1, 640),
   [(2, 320),
    [(3, 160), [(5, 80), [(7, 40), [(4, 20), [(8, 10), (0, 5)]]]]]]]]]

In [12]:
# TODO: build levels of classifiers and datasets based on the given hierarchical structure

In [19]:
def build_datasets_partition(target_idx_dic, class_partition, origin_indexs):
    """Turn a nested class partition into a tree of ``dataset_partition`` nodes.

    dataset_partition : (idxs, classes, children) | (idxs, old_class_number)
    children : [dataset_partition, dataset_partition]

    ``class_partition`` is a list whose entries are either nested lists
    (sub-partitions) or ``(class key, size)`` records (single classes).  The
    returned node is the root built from ``class_partition``; its ``classes``
    array maps every original class to the position of the child containing
    it and is -1 for classes that are not covered.
    """
    classes = np.full(10, -1)
    children = []
    for class_number, entry in enumerate(class_partition):
        if isinstance(entry, list):
            # Nested group: build its subtree; every class inside it is
            # merged into one class of the current node.
            child = build_datasets_partition(target_idx_dic, entry, origin_indexs)
            classes[child.classes != -1] = class_number
        else:
            # Single class: a leaf holding that class's original indices.
            old_class_number = entry[0]
            child = dataset_partition(
                idxs=origin_indexs[target_idx_dic[old_class_number]],
                old_class_number=old_class_number,
            )
            classes[old_class_number] = class_number
        children.append(child)

    return dataset_partition(classes=classes, children=children)

def check_dataset_partition(target_idx_dic, partition):
    """Print a breadth-first summary of the nodes below ``partition``.

    A leaf prints its ``old_class_number``; an inner node prints the class
    numbers it covers, i.e. the positions where ``classes != -1``.  The root
    itself is not printed.  ``target_idx_dic`` is accepted for interface
    compatibility but not used.
    """
    from collections import deque

    # deque gives O(1) pops from the left (list.pop(0) is O(n)).
    frontier = deque(partition.children)
    while frontier:
        item = frontier.popleft()
        if not item.has_children:
            print(item.old_class_number)
            continue
        frontier.extend(item.children)
        # Same values as indexing arange(10) with the mask, but without
        # hard-coding the number of classes.
        print(np.flatnonzero(item.classes != -1))
    
    

In [20]:
a = build_datasets_partition(target_idx_dic, class_partition, origin_indexs)

In [37]:
import pickle
# Persist the partition tree.  The context manager closes the file, so the
# pickle is fully flushed to disk before the next cell reads it back.
with open('./data/partition_object_test.txt', 'wb') as filehandler:
    pickle.dump(a, filehandler)

In [46]:
import pickle
# Read the partition tree back; the context manager closes the handle.
# NOTE: pickle.load must only be used on files this notebook wrote itself.
with open('./data/partition_object_test.txt', 'rb') as filehandler:
    a_2 = pickle.load(filehandler)

In [52]:
d=CustomDataset.load_partition_dataset("MNIST", a_2.children[1], None, train=True, data_folder="./data")

In [53]:
dic = CustomDataset.produce_idx_dic(d.targets)

In [54]:
for (k,v) in dic.items():
    print(len(v))

1280
1275


In [51]:
check_dataset_partition(target_idx_dic, a_2)

6
Number: 2560
[0 1 2 3 4 5 7 8 9]
Number: 2555
9
Number: 1280
[0 1 2 3 4 5 7 8]
Number: 1275
1
Number: 640
[0 2 3 4 5 7 8]
Number: 635
2
Number: 320
[0 3 4 5 7 8]
Number: 315
3
Number: 160
[0 4 5 7 8]
Number: 155
5
Number: 80
[0 4 7 8]
Number: 75
7
Number: 40
[0 4 8]
Number: 35
4
Number: 20
[0 8]
Number: 15
8
Number: 10
0
Number: 5


In [12]:
# finished

In [6]:
np.array([-1,1,1])[True]

array([[-1,  1,  1]])

In [1]:
"\n".join(str(item) for item in [1,1,1,1])

'1\n1\n1\n1'

In [45]:
m = CustomModels.load_model('Custom_05', 1,  2)
target_dataset = datasets.MNIST('./data', train=False, download=True,
                                transform=transforms.Compose([
                            transforms.Pad(2, fill=0, padding_mode='constant'),
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ]))

In [46]:
loader= torch.utils.data.DataLoader(target_dataset, batch_size=10, shuffle=True, num_workers=4)

In [32]:
pic = (target_dataset[0][0])
tensor = transforms.ToTensor()(pic)
tensor.unsqueeze_(0)

tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]]])

In [48]:
# Fetch one batch.  Iterators expose __next__ (used via the next() builtin);
# the Python 2 style .next() method is not part of the iterator protocol.
a = next(iter(loader))

In [72]:
a[0].shape[0]

10

In [33]:
tensor.shape

torch.Size([1, 1, 32, 32])

In [58]:
result = m(a[0])

In [74]:
result.data[:,0]

tensor([-0.7331, -0.6521, -0.4208, -0.5569, -0.6298, -1.0289, -1.0347, -0.4753,
        -0.6981, -0.7299])

In [73]:
np.exp(result.data.numpy())

array([[0.48041195, 0.519588  ],
       [0.52097166, 0.47902843],
       [0.6565223 , 0.34347776],
       [0.5729933 , 0.42700675],
       [0.53271914, 0.46728086],
       [0.357414  , 0.642586  ],
       [0.35531732, 0.64468265],
       [0.62169015, 0.37830985],
       [0.49755234, 0.50244766],
       [0.48194927, 0.51805073]], dtype=float32)

10

In [34]:
m = torch.tensor([[1,2],
             [1,2],
                 [2,1]])
n = torch.argmax(m,axis=1)

In [35]:
np.log(0.99)

-0.01005033585350145

In [36]:
n

tensor([1, 1, 0])

In [44]:
m[np.arange(len(m)),n]
m[:,:] = 1

In [45]:
m

tensor([[1, 1],
        [1, 1],
        [1, 1]])

In [66]:
(np.array([1,2,3,4]) == 3) and (np.array([1,0,1,0])==1)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [77]:
a = 1
a += 1

In [78]:
a

2

In [91]:
a = np.zeros((10,10))
a[:, np.array([1,1,1,0,0,0,1,1,1,0]) == 1] += result.data[:,0].numpy().reshape((10,1))

In [92]:
a

array([[-0.73311132, -0.73311132, -0.73311132,  0.        ,  0.        ,
         0.        , -0.73311132, -0.73311132, -0.73311132,  0.        ],
       [-0.65205967, -0.65205967, -0.65205967,  0.        ,  0.        ,
         0.        , -0.65205967, -0.65205967, -0.65205967,  0.        ],
       [-0.42079869, -0.42079869, -0.42079869,  0.        ,  0.        ,
         0.        , -0.42079869, -0.42079869, -0.42079869,  0.        ],
       [-0.55688125, -0.55688125, -0.55688125,  0.        ,  0.        ,
         0.        , -0.55688125, -0.55688125, -0.55688125,  0.        ],
       [-0.62976092, -0.62976092, -0.62976092,  0.        ,  0.        ,
         0.        , -0.62976092, -0.62976092, -0.62976092,  0.        ],
       [-1.02886045, -1.02886045, -1.02886045,  0.        ,  0.        ,
         0.        , -1.02886045, -1.02886045, -1.02886045,  0.        ],
       [-1.03474402, -1.03474402, -1.03474402,  0.        ,  0.        ,
         0.        , -1.03474402, -1.03474402

In [87]:
result.data[:,0].reshape((10,1))

tensor([[-0.7331],
        [-0.6521],
        [-0.4208],
        [-0.5569],
        [-0.6298],
        [-1.0289],
        [-1.0347],
        [-0.4753],
        [-0.6981],
        [-0.7299]])

In [25]:
cm = np.load("MNIST_hierarchical_test_whole_system_test/result_outputs/confusion_matrix.npy").astype(int)
cm

array([[ 832,    0,   56,    0,    2,    3,   40,   10,   15,   22],
       [   0, 1128,    4,    0,    0,    1,    1,    0,    0,    1],
       [   1,    1, 1013,    3,    1,    0,    3,    6,    4,    0],
       [   3,    0,    5,  948,    2,   30,    0,   11,   10,    1],
       [   0,    6,   15,    0,  822,    0,    8,    5,    9,  117],
       [   1,   17,    3,    8,    3,  838,    8,    4,    5,    5],
       [   0,    2,    2,    0,    1,    1,  952,    0,    0,    0],
       [   3,   18,   35,    3,    7,    1,    0,  891,    4,   66],
       [  13,    3,   18,   38,   21,   60,   24,    7,  691,   99],
       [   4,    1,    1,    0,    4,    1,    1,    8,    3,  986]])

In [27]:
np.average(np.sum(np.eye(10)* cm,1) / np.sum(cm,1))

0.9086663501138753

In [9]:
np.exp(1)

2.718281828459045

In [12]:
np.sum(cm,0)[9]

2440.0

In [13]:
np.sum(cm)

10000.0