In [2]:
import os
from shutil import copyfile, rmtree
from random import shuffle, seed
from json import dump as savejson

In [3]:
def _copy_seller_images(filenames, source_dir, target_dir):
    """Create ``target_dir`` and copy each of ``filenames`` from ``source_dir`` into it."""
    os.mkdir(target_dir)
    for filename in filenames:
        copyfile(os.path.join(source_dir, filename),
                 os.path.join(target_dir, filename))


def prepare_training_validation(setname, threshold, root_dir='/image/data/folder/',
                                random_seed=2017, step=1):
    """Split the per-seller image folders of one dataset into train/test folders.

    Reads ``<root_dir>/imgs/<setname>/<seller>/`` and rebuilds
    ``<root_dir>/train_test_data/<step_name>/<setname>/<threshold>/`` from
    scratch (any previous content of that folder is deleted), where
    ``<step_name>`` is ``'pseudo_pairing'`` for ``step=1`` and ``'roc'`` for
    ``step=2``.

    Sellers are handled according to how many entries their folder holds:

    * fewer than ``threshold``: skipped;
    * at least ``2 * threshold``: the images are shuffled, the first half is
      copied to ``train/<seller>`` and the rest to ``test/<seller>``
      (recorded as ``pseudo_seller``);
    * otherwise ("distractor"): all images are copied to a single side.
      Every ``step``-th distractor (starting with the first) goes to
      ``train``, the others to ``test``; with ``step=1`` all of them go to
      ``train``.

    The seller names of each group are written to ``seller_name.json`` in the
    target folder and a short summary is printed.

    Args:
        setname: name of the dataset folder below ``<root_dir>/imgs``.
        threshold: minimum number of images a seller needs to be used.
        root_dir: folder containing ``imgs`` and receiving ``train_test_data``.
        random_seed: seed passed to ``random.seed`` before shuffling.
        step: 1 or 2; selects the output sub-folder and the distractor split.

    Returns:
        ``[target_root, train_path, test_path]``.

    Raises:
        ValueError: if ``step`` is not 1 or 2.
        OSError: if the source folder is missing or the target folders cannot
            be removed/created.
    """
    step_name = ['', 'pseudo_pairing', 'roc']
    # Reject bad steps before touching the filesystem: step=0 would otherwise
    # fail with ZeroDivisionError only after the old output was deleted.
    if not 1 <= step < len(step_name):
        raise ValueError("step must be 1 or 2, got %r" % (step,))

    seed(random_seed)

    origin_path = os.path.join(root_dir, 'imgs', setname)
    dataset_path = os.path.join(root_dir, 'train_test_data', step_name[step], setname)
    target_root = os.path.join(dataset_path, '%d' % threshold)
    target_path1 = os.path.join(target_root, 'train')
    target_path2 = os.path.join(target_root, 'test')

    # Start from an empty target folder. makedirs also creates missing
    # parents such as train_test_data/<step_name>, and real failures
    # (e.g. permissions) surface here instead of at the first seller copy.
    if os.path.isdir(target_root):
        rmtree(target_root)
    os.makedirs(target_path1)
    os.makedirs(target_path2)

    # os.listdir returns entries in arbitrary order; sort first so that the
    # same seed yields the same shuffle on every machine/filesystem.
    sellernames = sorted(os.listdir(origin_path))

    print("set name: %s" % setname)
    print("total seller: %d" % len(sellernames))
    print("threshold: %d images" % threshold)

    shuffle(sellernames)
    distractor_ct = 0
    sellername_dict = {'training_seller': [],
                       'validation_seller': [],
                       'pseudo_seller': [],
                       'training_distractor': [],
                       'validation_distractor': []}
    for seller in sellernames:
        seller_path = os.path.join(origin_path, seller)
        imagefiles = sorted(os.listdir(seller_path))
        seller_img_ct = len(imagefiles)
        if seller_img_ct < threshold:
            continue

        train_dir = os.path.join(target_path1, seller)
        test_dir = os.path.join(target_path2, seller)
        if seller_img_ct >= 2 * threshold:
            # Enough images to appear on both sides: random half/half split.
            shuffle(imagefiles)
            halfimagenum = seller_img_ct // 2
            _copy_seller_images(imagefiles[:halfimagenum], seller_path, train_dir)
            _copy_seller_images(imagefiles[halfimagenum:], seller_path, test_dir)
            sellername_dict['training_seller'].append(seller)
            sellername_dict['validation_seller'].append(seller)
            sellername_dict['pseudo_seller'].append(seller)
        else:
            # Distractor: the whole seller goes to exactly one side.
            if distractor_ct % step == 0:
                _copy_seller_images(imagefiles, seller_path, train_dir)
                sellername_dict['training_seller'].append(seller)
                sellername_dict['training_distractor'].append(seller)
            else:
                _copy_seller_images(imagefiles, seller_path, test_dir)
                sellername_dict['validation_seller'].append(seller)
                sellername_dict['validation_distractor'].append(seller)
            distractor_ct += 1

    training_distractor_ct = len(sellername_dict['training_distractor'])
    validation_distractor_ct = len(sellername_dict['validation_distractor'])
    pseudo_seller_ct = len(sellername_dict['pseudo_seller'])
    print("valid seller: %d" % (training_distractor_ct +
                                validation_distractor_ct +
                                pseudo_seller_ct))
    print("training count: %d, validation count: %d" % (len(sellername_dict['training_seller']),
                                                        len(sellername_dict['validation_seller'])))
    print("training distractor: %d, validation distractor: %d" % (training_distractor_ct,
                                                                  validation_distractor_ct))
    print("pseudo seller: %d" % pseudo_seller_ct)
    with open(os.path.join(target_root, 'seller_name.json'), 'w') as fp:
        savejson(sellername_dict, fp)
    return [target_root, target_path1, target_path2]


def prepare_train_val_label(setname, tar_path, random_seed=201710):
    """Write label files for the train/test folders of a prepared split.

    Creates ``<tar_path[0]>/labels`` from scratch (an existing folder is
    deleted) containing:

    * ``train.txt``: one ``"<image path> <class index>"`` line per image in
      the train folder; class indices follow the sorted seller folder names.
    * ``test.txt``: the same for the test folder. A seller that also exists
      in the train folder reuses its train index; sellers only present in
      the test folder get new indices after the last train index.
    * ``class_name.json``: mapping of class index to seller name.

    Args:
        setname: dataset name; not used by this function.
        tar_path: ``[target_root, train_path, test_path]``.
        random_seed: passed to ``random.seed``; no random call is made in
            this function afterwards.

    Raises:
        OSError: if the train/test folders cannot be read or the labels
            folder cannot be removed/created.
    """
    seed(random_seed)

    train_root, test_root = tar_path[1], tar_path[2]
    data_path = os.path.join(tar_path[0], 'labels')
    # Rebuild the labels folder; failures are reported instead of swallowed.
    if os.path.isdir(data_path):
        rmtree(data_path)
    os.makedirs(data_path)

    train_sellers = sorted(os.listdir(train_root))
    train_class_ct = len(train_sellers)
    # seller name -> class index, for O(1) lookups when labelling the test set
    train_index = {}
    class_name = {}
    with open(os.path.join(data_path, 'train.txt'), 'w') as fp_tr:
        for index, seller in enumerate(train_sellers):
            train_index[seller] = index
            # Use the folder name directly rather than splitting the joined
            # path on '/', which is wrong where os.sep is not '/'.
            class_name[index] = seller
            class_dir = os.path.join(train_root, seller)
            for filename in sorted(os.listdir(class_dir)):
                fp_tr.write("%s %d\n" % (os.path.join(class_dir, filename), index))

    test_sellers = sorted(os.listdir(test_root))
    test_cl_index = []
    exist_test_class_ct, new_test_class_ct = 0, 0
    for seller in test_sellers:
        if seller in train_index:
            exist_test_class_ct += 1
            test_cl_index.append(train_index[seller])
        else:
            # Unseen seller: allocate the next index after the train classes.
            new_index = train_class_ct + new_test_class_ct
            new_test_class_ct += 1
            class_name[new_index] = seller
            test_cl_index.append(new_index)

    print("train class: %d, test class: %d" % (train_class_ct,
                                               exist_test_class_ct + new_test_class_ct))
    print("exist test class: %d, new test class: %d" % (exist_test_class_ct,
                                                        new_test_class_ct))
    with open(os.path.join(data_path, 'class_name.json'), 'w') as fp:
        savejson(class_name, fp)

    with open(os.path.join(data_path, 'test.txt'), 'w') as fp:
        for seller, index in zip(test_sellers, test_cl_index):
            class_dir = os.path.join(test_root, seller)
            # Sorted like train.txt so the file content is deterministic.
            for filename in sorted(os.listdir(class_dir)):
                fp.write("%s %d\n" % (os.path.join(class_dir, filename), index))
    print('Data preparation finished.')


def prepare_data(setname, threshold, step=1, random_seed=None):
    """Build the train/test split for ``setname`` and then its label files.

    Runs ``prepare_training_validation`` followed by
    ``prepare_train_val_label`` on the paths it returns, printing a separator
    line after each stage.

    When ``random_seed`` is ``None`` each stage falls back to its own default
    seed; otherwise the given seed is forwarded to both stages.
    """
    # Only forward the seed when the caller supplied one, so that the
    # per-function defaults stay in effect otherwise.
    seed_kwargs = {} if random_seed is None else {'random_seed': random_seed}

    tar_path = prepare_training_validation(setname, threshold, step=step, **seed_kwargs)
    print('----')
    prepare_train_val_label(setname, tar_path, **seed_kwargs)
    print('---------------')

In [27]:
prepare_data('Agora', 10)

set name: Agora
total seller: 2836
threshold: 10 images
valid seller: 1617
training count: 1617, validation count: 1020
training distractor: 597, validation distractor: 0
pseudo seller: 1020
----
train class: 1617, test class: 1020
exist test class: 1020, new test class: 0
Data preparation finished.
---------------


In [28]:
prepare_data('Agora', 20)

set name: Agora
total seller: 2836
threshold: 20 images
valid seller: 1020
training count: 1020, validation count: 480
training distractor: 540, validation distractor: 0
pseudo seller: 480
----
train class: 1020, test class: 480
exist test class: 480, new test class: 0
Data preparation finished.
---------------


In [29]:
prepare_data('Agora', 40)

set name: Agora
total seller: 2836
threshold: 40 images
valid seller: 480
training count: 480, validation count: 161
training distractor: 319, validation distractor: 0
pseudo seller: 161
----
train class: 480, test class: 161
exist test class: 161, new test class: 0
Data preparation finished.
---------------


In [30]:
prepare_data('Evolution', 10)

set name: Evolution
total seller: 3635
threshold: 10 images
valid seller: 1773
training count: 1773, validation count: 1093
training distractor: 680, validation distractor: 0
pseudo seller: 1093
----
train class: 1773, test class: 1093
exist test class: 1093, new test class: 0
Data preparation finished.
---------------


In [31]:
prepare_data('Evolution', 20)

set name: Evolution
total seller: 3635
threshold: 20 images
valid seller: 1093
training count: 1093, validation count: 519
training distractor: 574, validation distractor: 0
pseudo seller: 519
----
train class: 1093, test class: 519
exist test class: 519, new test class: 0
Data preparation finished.
---------------


In [32]:
prepare_data('Evolution', 40)

set name: Evolution
total seller: 3635
threshold: 40 images
valid seller: 519
training count: 519, validation count: 197
training distractor: 322, validation distractor: 0
pseudo seller: 197
----
train class: 519, test class: 197
exist test class: 197, new test class: 0
Data preparation finished.
---------------


In [63]:
prepare_data('SilkRoad2', 10)

set name: SilkRoad2
total seller: 1172
threshold: 10 images
valid seller: 663
training count: 663, validation count: 415
training distractor: 248, validation distractor: 0
pseudo seller: 415
----
train class: 663, test class: 415
exist test class: 415, new test class: 0
Data preparation finished.
---------------


In [64]:
prepare_data('SilkRoad2', 20)

set name: SilkRoad2
total seller: 1172
threshold: 20 images
valid seller: 415
training count: 415, validation count: 211
training distractor: 204, validation distractor: 0
pseudo seller: 211
----
train class: 415, test class: 211
exist test class: 211, new test class: 0
Data preparation finished.
---------------


In [65]:
prepare_data('SilkRoad2', 40)

set name: SilkRoad2
total seller: 1172
threshold: 40 images
valid seller: 211
training count: 211, validation count: 76
training distractor: 135, validation distractor: 0
pseudo seller: 76
----
train class: 211, test class: 76
exist test class: 76, new test class: 0
Data preparation finished.
---------------


In [36]:
prepare_data('Agora', 10, step=2)

set name: Agora
total seller: 2836
threshold: 10 images
valid seller: 1617
training count: 1319, validation count: 1318
training distractor: 299, validation distractor: 298
pseudo seller: 1020
----
train class: 1319, test class: 1318
exist test class: 1020, new test class: 298
Data preparation finished.
---------------


In [37]:
prepare_data('Agora', 20, step=2)

set name: Agora
total seller: 2836
threshold: 20 images
valid seller: 1020
training count: 750, validation count: 750
training distractor: 270, validation distractor: 270
pseudo seller: 480
----
train class: 750, test class: 750
exist test class: 480, new test class: 270
Data preparation finished.
---------------


In [38]:
prepare_data('Agora', 40, step=2)

set name: Agora
total seller: 2836
threshold: 40 images
valid seller: 480
training count: 321, validation count: 320
training distractor: 160, validation distractor: 159
pseudo seller: 161
----
train class: 321, test class: 320
exist test class: 161, new test class: 159
Data preparation finished.
---------------


In [39]:
prepare_data('Evolution', 10, step=2)

set name: Evolution
total seller: 3635
threshold: 10 images
valid seller: 1773
training count: 1433, validation count: 1433
training distractor: 340, validation distractor: 340
pseudo seller: 1093
----
train class: 1433, test class: 1433
exist test class: 1093, new test class: 340
Data preparation finished.
---------------


In [40]:
prepare_data('Evolution', 20, step=2)

set name: Evolution
total seller: 3635
threshold: 20 images
valid seller: 1093
training count: 806, validation count: 806
training distractor: 287, validation distractor: 287
pseudo seller: 519
----
train class: 806, test class: 806
exist test class: 519, new test class: 287
Data preparation finished.
---------------


In [41]:
prepare_data('Evolution', 40, step=2)

set name: Evolution
total seller: 3635
threshold: 40 images
valid seller: 519
training count: 358, validation count: 358
training distractor: 161, validation distractor: 161
pseudo seller: 197
----
train class: 358, test class: 358
exist test class: 197, new test class: 161
Data preparation finished.
---------------


In [66]:
prepare_data('SilkRoad2', 10, step=2)

set name: SilkRoad2
total seller: 1172
threshold: 10 images
valid seller: 663
training count: 539, validation count: 539
training distractor: 124, validation distractor: 124
pseudo seller: 415
----
train class: 539, test class: 539
exist test class: 415, new test class: 124
Data preparation finished.
---------------


In [67]:
prepare_data('SilkRoad2', 20, step=2)

set name: SilkRoad2
total seller: 1172
threshold: 20 images
valid seller: 415
training count: 313, validation count: 313
training distractor: 102, validation distractor: 102
pseudo seller: 211
----
train class: 313, test class: 313
exist test class: 211, new test class: 102
Data preparation finished.
---------------


In [68]:
prepare_data('SilkRoad2', 40, step=2)

set name: SilkRoad2
total seller: 1172
threshold: 40 images
valid seller: 211
training count: 144, validation count: 143
training distractor: 68, validation distractor: 67
pseudo seller: 76
----
train class: 144, test class: 143
exist test class: 76, new test class: 67
Data preparation finished.
---------------


In [69]:
prepare_data('Agora_dedup', 10)

set name: Agora_dedup
total seller: 2836
threshold: 10 images
valid seller: 926
training count: 926, validation count: 408
training distractor: 518, validation distractor: 0
pseudo seller: 408
----
train class: 926, test class: 408
exist test class: 408, new test class: 0
Data preparation finished.
---------------


In [70]:
prepare_data('Agora_dedup', 20)

set name: Agora_dedup
total seller: 2836
threshold: 20 images
valid seller: 408
training count: 408, validation count: 137
training distractor: 271, validation distractor: 0
pseudo seller: 137
----
train class: 408, test class: 137
exist test class: 137, new test class: 0
Data preparation finished.
---------------


In [11]:
prepare_data('Agora_dedup', 40, random_seed=2017121605)

set name: Agora_dedup
total seller: 2836
threshold: 40 images
valid seller: 137
training count: 137, validation count: 45
training distractor: 92, validation distractor: 0
pseudo seller: 45
----
train class: 137, test class: 45
exist test class: 45, new test class: 0
Data preparation finished.
---------------


In [5]:
prepare_data('Evolution_dedup', 10, random_seed=2017121605)

set name: Evolution_dedup
total seller: 3635
threshold: 10 images
valid seller: 989
training count: 989, validation count: 443
training distractor: 546, validation distractor: 0
pseudo seller: 443
----
train class: 989, test class: 443
exist test class: 443, new test class: 0
Data preparation finished.
---------------


In [73]:
prepare_data('Evolution_dedup', 20)

set name: Evolution_dedup
total seller: 3635
threshold: 20 images
valid seller: 443
training count: 443, validation count: 155
training distractor: 288, validation distractor: 0
pseudo seller: 155
----
train class: 443, test class: 155
exist test class: 155, new test class: 0
Data preparation finished.
---------------


In [74]:
prepare_data('Evolution_dedup', 40)

set name: Evolution_dedup
total seller: 3635
threshold: 40 images
valid seller: 155
training count: 155, validation count: 47
training distractor: 108, validation distractor: 0
pseudo seller: 47
----
train class: 155, test class: 47
exist test class: 47, new test class: 0
Data preparation finished.
---------------


In [75]:
prepare_data('SilkRoad2_dedup', 10)

set name: SilkRoad2_dedup
total seller: 1172
threshold: 10 images
valid seller: 414
training count: 414, validation count: 181
training distractor: 233, validation distractor: 0
pseudo seller: 181
----
train class: 414, test class: 181
exist test class: 181, new test class: 0
Data preparation finished.
---------------


In [76]:
prepare_data('SilkRoad2_dedup', 20)

set name: SilkRoad2_dedup
total seller: 1172
threshold: 20 images
valid seller: 181
training count: 181, validation count: 59
training distractor: 122, validation distractor: 0
pseudo seller: 59
----
train class: 181, test class: 59
exist test class: 59, new test class: 0
Data preparation finished.
---------------


In [77]:
prepare_data('SilkRoad2_dedup', 40)

set name: SilkRoad2_dedup
total seller: 1172
threshold: 40 images
valid seller: 59
training count: 59, validation count: 24
training distractor: 35, validation distractor: 0
pseudo seller: 24
----
train class: 59, test class: 24
exist test class: 24, new test class: 0
Data preparation finished.
---------------


In [78]:
prepare_data('Agora_dedup', 10, step=2)

set name: Agora_dedup
total seller: 2836
threshold: 10 images
valid seller: 926
training count: 667, validation count: 667
training distractor: 259, validation distractor: 259
pseudo seller: 408
----
train class: 667, test class: 667
exist test class: 408, new test class: 259
Data preparation finished.
---------------


In [79]:
prepare_data('Agora_dedup', 20, step=2)

set name: Agora_dedup
total seller: 2836
threshold: 20 images
valid seller: 408
training count: 273, validation count: 272
training distractor: 136, validation distractor: 135
pseudo seller: 137
----
train class: 273, test class: 272
exist test class: 137, new test class: 135
Data preparation finished.
---------------


In [80]:
prepare_data('Agora_dedup', 40, step=2)

set name: Agora_dedup
total seller: 2836
threshold: 40 images
valid seller: 137
training count: 91, validation count: 91
training distractor: 46, validation distractor: 46
pseudo seller: 45
----
train class: 91, test class: 91
exist test class: 45, new test class: 46
Data preparation finished.
---------------


In [81]:
prepare_data('Evolution_dedup', 10, step=2)

set name: Evolution_dedup
total seller: 3635
threshold: 10 images
valid seller: 989
training count: 716, validation count: 716
training distractor: 273, validation distractor: 273
pseudo seller: 443
----
train class: 716, test class: 716
exist test class: 443, new test class: 273
Data preparation finished.
---------------


In [82]:
prepare_data('Evolution_dedup', 20, step=2)

set name: Evolution_dedup
total seller: 3635
threshold: 20 images
valid seller: 443
training count: 299, validation count: 299
training distractor: 144, validation distractor: 144
pseudo seller: 155
----
train class: 299, test class: 299
exist test class: 155, new test class: 144
Data preparation finished.
---------------


In [83]:
prepare_data('Evolution_dedup', 40, step=2)

set name: Evolution_dedup
total seller: 3635
threshold: 40 images
valid seller: 155
training count: 101, validation count: 101
training distractor: 54, validation distractor: 54
pseudo seller: 47
----
train class: 101, test class: 101
exist test class: 47, new test class: 54
Data preparation finished.
---------------


In [84]:
prepare_data('SilkRoad2_dedup', 10, step=2)

set name: SilkRoad2_dedup
total seller: 1172
threshold: 10 images
valid seller: 414
training count: 298, validation count: 297
training distractor: 117, validation distractor: 116
pseudo seller: 181
----
train class: 298, test class: 297
exist test class: 181, new test class: 116
Data preparation finished.
---------------


In [85]:
prepare_data('SilkRoad2_dedup', 20, step=2)

set name: SilkRoad2_dedup
total seller: 1172
threshold: 20 images
valid seller: 181
training count: 120, validation count: 120
training distractor: 61, validation distractor: 61
pseudo seller: 59
----
train class: 120, test class: 120
exist test class: 59, new test class: 61
Data preparation finished.
---------------


In [86]:
prepare_data('SilkRoad2_dedup', 40, step=2)

set name: SilkRoad2_dedup
total seller: 1172
threshold: 40 images
valid seller: 59
training count: 42, validation count: 41
training distractor: 18, validation distractor: 17
pseudo seller: 24
----
train class: 42, test class: 41
exist test class: 24, new test class: 17
Data preparation finished.
---------------
