In [127]:
import numpy as np
import random
import os
import scipy.io as io
import shutil

def get_selected_samples_idx(labels_path):
    """Load every label file in a directory and pick the single-labeled samples.

    Each regular ``*.txt`` file directly inside ``labels_path`` is read with
    ``np.loadtxt`` and becomes one column of the label matrix. Columns follow
    the sorted order of the file names.

    Parameters
    ----------
    labels_path : str
        Directory holding the label files (the notebook passes ``AllLabels/``).

    Returns
    -------
    selectedSamplesIdx : list of int
        Row indices whose label values sum to exactly 1 (i.e. samples carrying
        a single label, assuming the files hold 0/1 indicators).
    all_labels : numpy.ndarray
        Label matrix with one row per sample and one column per label file.

    Raises
    ------
    ValueError
        If ``labels_path`` contains no ``*.txt`` file.
    """
    # Only regular *.txt files are label files. A bare os.listdir() also
    # returns sub-directories and stray files (for example the
    # .ipynb_checkpoints folder Jupyter creates), which np.loadtxt cannot read.
    filenames = sorted(
        name for name in os.listdir(labels_path)
        if name.lower().endswith('.txt')
        and os.path.isfile(os.path.join(labels_path, name))
    )
    if not filenames:
        raise ValueError('no *.txt label files found in %r' % (labels_path,))

    # One 1-D array per file -> (n_files, n_samples) -> (n_samples, n_files)
    all_labels = np.transpose(
        [np.loadtxt(os.path.join(labels_path, name)) for name in filenames]
    )

    # Optional restriction to the 21 concepts that contain the most images,
    # as done in "Hashing with Graphs" and similar papers (left disabled):
    # sizeOfEachLabel = all_labels.sum(axis = 0)
    # labels21_idx = np.argsort(-sizeOfEachLabel)[:21]
    # labels21 = all_labels[:,labels21_idx]

    # Keep the rows whose label count is exactly one and return plain ints
    labelCountOfEachImage = all_labels.sum(axis=1)
    selectedSamplesIdx = np.nonzero(labelCountOfEachImage == 1)[0].tolist()

    return selectedSamplesIdx, all_labels

def split_trainset_and_testset_idx(selectedSampleIdx,trainset_ratio, savepath, seed=None):
    """Randomly split sample indices into a train set and a test set.

    ``int(len(selectedSampleIdx) * trainset_ratio)`` indices are drawn without
    replacement for the train set; the remaining ones form the test set. Both
    index lists are also written to ``trainSampleIdx.txt`` and
    ``testSampleIdx.txt`` inside ``savepath``, one integer per line.

    Parameters
    ----------
    selectedSampleIdx : iterable of int
        Candidate sample indices.
    trainset_ratio : float
        Fraction of the candidates that goes to the train set.
    savepath : str
        Existing directory that receives the two index files.
    seed : optional
        When given, the split is drawn from a private ``random.Random(seed)``
        and is therefore reproducible. When ``None`` (default) the global
        ``random`` module state is used, as before.

    Returns
    -------
    trainSampleIdx : list
        Train indices in the (random) order they were drawn.
    testSampleIdx : numpy.ndarray
        Sorted, unique test indices (output of ``np.setdiff1d``).
    """
    # random.sample needs a real sequence; list() also accepts ndarrays
    population = list(selectedSampleIdx)
    trainsetSize = int(len(population) * trainset_ratio)

    rng = random if seed is None else random.Random(seed)
    trainSampleIdx = rng.sample(population, trainsetSize)
    testSampleIdx = np.setdiff1d(population, trainSampleIdx)

    # fmt='%d': indices are integers; the savetxt default ('%.18e') would
    # store them as floating-point text.
    np.savetxt(os.path.join(savepath,'trainSampleIdx.txt'),trainSampleIdx,fmt='%d')
    np.savetxt(os.path.join(savepath,'testSampleIdx.txt'),testSampleIdx,fmt='%d')
    return trainSampleIdx,testSampleIdx

def prepare_image_feature_mat(trainSampleIdx,testSampleIdx,CM55Path,savepath):
    """Write the train/test image features to ``nus_img_raw.mat``.

    The text matrix at ``CM55Path`` is read with ``np.loadtxt``; its rows are
    selected by the two index lists and stored in ``savepath`` under the keys
    ``raw_img_test`` and ``raw_img_train``. Nothing is returned.
    """
    feature_table = np.loadtxt(CM55Path)

    # Row selection keeps the order of the index lists
    mat_contents = {
        'raw_img_test': feature_table[testSampleIdx, :],
        'raw_img_train': feature_table[trainSampleIdx, :],
    }
    io.savemat(os.path.join(savepath, 'nus_img_raw.mat'), mat_contents)

def prepare_testset_groundtruth(testSampleIdx,allLabels,savepath):
    """Derive 1-based label ids for the test samples and save them.

    The rows ``testSampleIdx`` of ``allLabels`` are scanned in row-major order
    and the column index of every nonzero entry, plus one, is collected. For
    rows with exactly one nonzero entry this gives one label id per test
    sample, in the order of ``testSampleIdx``.

    Parameters
    ----------
    testSampleIdx : sequence of int
        Row indices of the test samples.
    allLabels : array-like, 2-D
        Label matrix with one row per sample and one column per label.
    savepath : str
        Directory that receives ``nus_test_groundtruth.mat`` (key
        ``ground_truth``).

    Returns
    -------
    numpy.ndarray
        1-D integer array of 1-based column indices.
    """
    # np.mat was removed in NumPy 2.0. A plain 2-D ndarray gives the same
    # (row, column) nonzero coordinates; atleast_2d keeps a single selected
    # row two-dimensional, as the matrix wrapper used to.
    testsetLabels = np.atleast_2d(np.asarray(allLabels)[testSampleIdx, :])
    groundTruth = np.nonzero(testsetLabels)[1] + 1
    io.savemat(os.path.join(savepath,'nus_test_groundtruth.mat'),{'ground_truth':groundTruth})
    return groundTruth
    
def prepare_text_feature_mat(trainSampleIdx,testSampleIdx,tags1kPath,savepath):
    """Write the train/test text features to ``nus_text.mat``.

    The text matrix at ``tags1kPath`` is read with ``np.loadtxt``; its rows are
    selected by the two index lists and stored in ``savepath`` under the keys
    ``text_test`` and ``text_train``. Nothing is returned.
    """
    tag_table = np.loadtxt(tags1kPath)

    # Row selection keeps the order of the index lists
    train_rows = tag_table[trainSampleIdx, :]
    test_rows = tag_table[testSampleIdx, :]

    target_file = os.path.join(savepath, 'nus_text.mat')
    io.savemat(target_file, {'text_test': test_rows, 'text_train': train_rows})

def _copy_renamed_images(imagePaths, imageSourcePath, targetDir):
    """Copy ``imagePaths`` into ``targetDir`` as 000000.jpg, 000001.jpg, ..."""
    # shutil.copyfile does not create missing directories
    os.makedirs(targetDir, exist_ok=True)
    for idx, filename in enumerate(imagePaths):
        # List entries may use Windows separators; normalise them to '/'
        sourceFilePath = os.path.join(imageSourcePath,filename.strip()).replace('\\','/')
        # Same names as idx_filename_prefix(idx) + str(idx): the running
        # number zero-padded to six digits.
        newFilename = str(idx).zfill(6) + '.jpg'
        shutil.copyfile(sourceFilePath, os.path.join(targetDir, newFilename))


def prepare_raw_image_directory(trainSampleIdx,testSampleIdx,imagelistPath,imageSourcePath,imageTargetPath):
    """Copy the train/test images into ``trainset/`` and ``testset/`` folders.

    ``imagelistPath`` is a text file with one relative image path per line;
    line ``i`` belongs to sample ``i``. The images selected by the two index
    lists are copied from ``imageSourcePath`` and renamed to their position in
    the respective list (``000000.jpg``, ``000001.jpg``, ...), so file ``k`` in
    ``trainset`` corresponds to ``trainSampleIdx[k]`` (likewise for the test
    set).

    Parameters
    ----------
    trainSampleIdx, testSampleIdx : iterable of int
        Line indices into the image list.
    imagelistPath : str
        Path of the image list file.
    imageSourcePath : str
        Directory the listed relative paths are resolved against.
    imageTargetPath : str
        Directory that receives the ``trainset`` and ``testset`` folders; they
        are created when missing.
    """
    # Context manager: the original left the list file open
    with open(imagelistPath) as imageListFile:
        allImages = imageListFile.readlines()

    trainImages = [allImages[idx] for idx in trainSampleIdx]
    testImages = [allImages[idx] for idx in testSampleIdx]

    _copy_renamed_images(trainImages, imageSourcePath,
                         os.path.join(imageTargetPath, 'trainset'))
    _copy_renamed_images(testImages, imageSourcePath,
                         os.path.join(imageTargetPath, 'testset'))
        
def idx_filename_prefix(idx):
    """Return the string of zeros to put in front of ``str(idx)``.

    For ``0 <= idx < 100000`` the prefix pads the number to six digits
    (``7 -> '00000'``, ``123 -> '000'``). Any value below 10 gets five zeros
    and values of 100000 or more get an empty prefix.
    """
    # (exclusive upper bound, prefix) pairs, checked in ascending order
    thresholds = (
        (10, '00000'),
        (100, '0000'),
        (1000, '000'),
        (10000, '00'),
        (100000, '0'),
    )
    for upper_bound, zeros in thresholds:
        if idx < upper_bound:
            return zeros
    return ''

In [77]:
# Seed the global RNG that split_trainset_and_testset_idx draws from, so that
# Restart & Run All reproduces the same train/test split.
SPLIT_SEED = 0
random.seed(SPLIT_SEED)

# Select the single-labeled samples for the experiment (79216 per the original note)
selectedSampleIdx,allLabels = get_selected_samples_idx('raw_image_nuswide/AllLabels/')
# Split the selected samples 2:1 into trainset (52810) and testset (26406)
trainSampleIdx, testSampleIdx = split_trainset_and_testset_idx(selectedSampleIdx, 2/3,'raw_image_nuswide')
# Export image features, test ground truth and text features as .mat files
prepare_image_feature_mat(trainSampleIdx,testSampleIdx,'raw_image_nuswide/Normalized_CM55.txt','raw_image_nuswide/')
prepare_testset_groundtruth(testSampleIdx,allLabels,'raw_image_nuswide')
prepare_text_feature_mat(trainSampleIdx,testSampleIdx,'raw_image_nuswide/AllTags1k.txt','raw_image_nuswide')

In [128]:
# Copy the selected images into raw_image_nuswide/trainset and
# raw_image_nuswide/testset, renamed to their running number.
prepare_raw_image_directory(
    trainSampleIdx=trainSampleIdx,
    testSampleIdx=testSampleIdx,
    imagelistPath='raw_image_nuswide/Imagelist.txt',
    imageSourcePath='raw_image_nuswide/NUSWIDE_IMAGES/Flickr',
    imageTargetPath='raw_image_nuswide/',
)

In [124]:
# Scratch check of the separator normalisation that prepare_raw_image_directory
# applies to source paths: every backslash is replaced by a forward slash.
a = 'raw_image_nuswide/NUSWIDE_IMAGES/Flickr\\sss'
b = a.replace('\\','/')
print(b)

raw_image_nuswide/NUSWIDE_IMAGES/Flickr/sss
