## dataset.ipynb: preprocess the dataset 

In [1]:
import numpy as np
from sklearn.decomposition import PCA
import scipy.io as sio
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn import preprocessing
import os
import random
import tensorflow as tf
# from skimage.transform import rotate
import scipy.ndimage

In [2]:
# List the GPU devices TensorFlow reports and turn on memory growth for each
# of them via set_memory_growth(gpu, True).
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
# Number of principal components requested from applyPCA.
numComponents = 30
# Side length of the square patches built by createPatches.
window_size = 5
# Value passed as test_size to the stratified split in splitTrainTestSet.
testRatio = 0.8
# savePreprocessedData only writes files when this flag is truthy.
savedata = True

In [4]:
# Min-max normalization of the data
def max_min(x):
    """Rescale ``x`` linearly using its global minimum and maximum.

    The smallest value of ``x`` maps to 0 and the largest to 1. The extrema
    are taken over the whole array, not per axis.

    Args:
        x: array-like of numbers.

    Returns:
        Array with the same shape as ``x`` holding ``(x - min) / (max - min)``.
        When every element of ``x`` is identical the range is zero; an
        all-zero float64 array is returned in that case instead of dividing
        by zero.
    """
    lowest = np.min(x)
    value_range = np.max(x) - lowest
    shifted = x - lowest
    if value_range == 0:
        # Constant input: 0/0 would produce NaN everywhere (plus a
        # RuntimeWarning), so return explicit zeros instead.
        return np.zeros(np.shape(shifted), dtype=np.float64)
    return shifted / value_range

In [5]:
#  Load the Indian Pines dataset, which is stored in MATLAB .mat format
def loadData(data_path=r"E:\Eric_HSI\hyperspectral_datasets\Indian_pines_corrected.mat",
             gt_path=r"E:\Eric_HSI\hyperspectral_datasets\Indian_pines_gt.mat"):
    """Read the data array and its ground-truth labels from two .mat files.

    Args:
        data_path: path of the .mat file holding the data array. Defaults to
            the location this notebook was originally run against.
        gt_path: path of the .mat file holding the ground-truth label array.

    Returns:
        (data, data_gt): ``data`` is the min-max normalized array cast to
        float32, ``data_gt`` is the label array cast to int32.

    Side effects:
        Prints the variable name, the data shape and the largest label value.
    """
    data_dict = sio.loadmat(data_path)
    data_gt_dict = sio.loadmat(gt_path)
    # Keys starting with "__" (double underscore) are skipped; the first
    # remaining key is taken as the stored array.
    data_name = [t for t in list(data_dict.keys()) if not t.startswith('__')][0]
    data_gt_name = [t for t in list(data_gt_dict.keys()) if not t.startswith('__')][0]
    data = data_dict[data_name]
    data_gt = data_gt_dict[data_gt_name].astype(np.int32)
    # Normalize over the whole array.
    data = max_min(data).astype(np.float32)
    # The largest label value is reported as the number of classes.
    class_num = np.max(data_gt)
    print('DataSet %s shape is %s class_num is %s'%(data_name,data.shape,class_num))
    return data, data_gt

In [6]:
def applyPCA(X, numComponents=75):
    """Reduce the last axis of a 3-D array with whitened PCA.

    Every position along the first two axes is treated as one sample whose
    features are the values along the third axis.

    Args:
        X: 3-D array; the third axis is the one that gets reduced.
        numComponents: forwarded unchanged as ``n_components`` to ``PCA``.

    Returns:
        (newX, pca): ``newX`` keeps the first two dimensions of ``X`` and has
        as many channels as the fitted PCA produced; ``pca`` is the fitted
        estimator.
    """
    flat = np.reshape(X, (-1, X.shape[2]))
    pca = PCA(n_components=numComponents, whiten=True)
    reduced = pca.fit_transform(flat)
    # Use the width PCA actually returned rather than the numComponents
    # argument, so the reshape also works when numComponents is not a plain
    # integer count.
    newX = np.reshape(reduced, (X.shape[0], X.shape[1], reduced.shape[1]))
    return newX, pca

In [7]:
#  Surround the data with a border of zeros
def padWithZeros(X, margin=2):
    """Return a copy of ``X`` with ``margin`` zeros added around the first two axes.

    Args:
        X: 3-D array.
        margin: number of zero rows/columns added on every side.

    Returns:
        A new array created with ``np.zeros`` (default dtype) of shape
        ``(X.shape[0] + 2*margin, X.shape[1] + 2*margin, X.shape[2])`` with
        ``X`` copied into its centre.
    """
    rows, cols, bands = X.shape[0], X.shape[1], X.shape[2]
    padded = np.zeros((rows + 2 * margin, cols + 2 * margin, bands))
    # The original data starts `margin` cells in from the top-left corner.
    padded[margin:margin + rows, margin:margin + cols, :] = X
    return padded

In [8]:
#  Create patches for the dataset: the whole image is cut into one small patch
#  per pixel, giving (total_patches, width, height, channels).
#  Example from this notebook: X is (145, 145, 30) and window_size is 5; after
#  the background is removed the result is (10249, 5, 5, 30) and (10249,).

def createPatches(X, y, window_size=5, removeZeroLabels = True):
    """Build one window_size x window_size patch centred on every pixel of ``X``.

    Args:
        X: 3-D data array (rows, cols, channels).
        y: 2-D label array indexed as ``y[row, col]``.
        window_size: side length of each square patch.
        removeZeroLabels: when true, patches whose label is not greater than 0
            are dropped and the remaining labels are decreased by one.

    Returns:
        (patchesData, patchesLabels): the patches in row-major pixel order and
        their labels, both allocated with ``np.zeros`` (default dtype).
    """
    half = int((window_size - 1) / 2)
    padded = padWithZeros(X, margin=half)
    height, width, bands = X.shape[0], X.shape[1], X.shape[2]

    patchesData = np.zeros((height * width, window_size, window_size, bands))
    patchesLabels = np.zeros((height * width))

    # Pixel (row, col) of X sits at (row + half, col + half) in the padded
    # array, so its window starts at (row, col) there.
    for row in range(height):
        for col in range(width):
            flat_index = row * width + col
            patchesData[flat_index, :, :, :] = padded[row:row + 2 * half + 1,
                                                      col:col + 2 * half + 1]
            patchesLabels[flat_index] = y[row, col]

    if not removeZeroLabels:
        return patchesData, patchesLabels

    # Keep only patches with a positive label and shift those labels down by one.
    keep = patchesLabels > 0
    return patchesData[keep, :, :, :], patchesLabels[keep] - 1

In [9]:
#  Split the data into a training set and a test set
#  Returns: train, test, train_label, test_label
#  Example shapes noted by the author: (200, 5, 5, 30) (10049, 5, 5, 30) (200,) (10049,)

def splitTrainTestSet(X, y, testRatio):
    """Stratified random split of ``X``/``y`` into train and test parts.

    A ``StratifiedShuffleSplit`` with ``n_splits`` equal to the largest label
    value and ``random_state=0`` is generated and the LAST split it yields is
    the one returned.

    Args:
        X: sample array, indexed along axis 0.
        y: label array aligned with ``X``.
        testRatio: forwarded as ``test_size``.

    Returns:
        (train, test, train_label, test_label).

    Raises:
        ValueError: if the splitter yields no split at all.

    Side effects:
        Prints the largest label value.
    """
    # (An earlier variant used train_test_split(X, y, test_size=testRatio,
    # random_state=345, stratify=y).)
    class_num = np.max(y).astype(np.int32)
    print(class_num)
    ss=StratifiedShuffleSplit(n_splits=class_num, test_size=testRatio, random_state=0)

    # Only the indices of the final split are needed, so walk the generator
    # without copying X and y on every iteration and index once afterwards.
    train_index = test_index = None
    for train_index, test_index in ss.split(X, y):
        pass
    if train_index is None:
        raise ValueError("StratifiedShuffleSplit produced no split")

    return X[train_index], X[test_index], y[train_index], y[test_index]

In [10]:
#  Oversampling
#  Samples of the less frequent classes are repeated several times and stacked
#  with the rest of the data.
#  Example noted by the author: X_train (200, 5, 5, 30), y_train (200,)
#  becomes X_train (730, 5, 5, 30), y_train (730,)

def oversampleWeakClasses(X, y):
    """Repeat the samples of rarer classes, then shuffle deterministically.

    Each class is repeated ``round(max_count / class_count)`` times, where
    ``max_count`` is the size of the most frequent class.

    Args:
        X: sample array, indexed along axis 0.
        y: 1-D label array aligned with ``X``.

    Returns:
        (newX, newY): the oversampled samples and labels, permuted with a
        fixed seed of 42.
    """
    uniqueLabels, labelCounts = np.unique(y, return_counts=True)
    maxCount = np.max(labelCounts)

    # Collect the repeated chunk of every class and concatenate once at the end.
    repeated_X, repeated_y = [], []
    for label, count in zip(uniqueLabels, labelCounts):
        # int() keeps the repeat count an integer even if round() hands back
        # a NumPy float scalar.
        times = int(round(maxCount / count))
        members = (y == label)
        repeated_X.append(X[members].repeat(times, axis=0))
        repeated_y.append(y[members].repeat(times, axis=0))
    newX = np.concatenate(repeated_X)
    newY = np.concatenate(repeated_y)

    # A private RandomState seeded with 42 yields the same permutation as
    # seeding the global generator, without resetting the global NumPy
    # random state for the rest of the notebook.
    rand_perm = np.random.RandomState(42).permutation(newY.shape[0])
    return newX[rand_perm], newY[rand_perm]

In [11]:
##  standardize -- do not use this kind of standardization ##
# def standartizeData(X):
#     newX = np.reshape(X, (-1, X.shape[3]))
#     scaler = preprocessing.StandardScaler().fit(newX)  
#     newX = scaler.transform(newX)
#     newX = np.reshape(newX, (X.shape[0], X.shape[1], X.shape[2], X.shape[3]))
#     return newX, scaler

In [12]:
#  Augment Data
def AugmentData(X_train):
    """Randomly flip or rotate the first half of the patches in ``X_train``.

    For each of the first ``X_train.shape[0] // 2`` patches one of three
    transforms is drawn with ``random.randint(0, 2)``: vertical flip,
    horizontal flip, or a rotation by a multiple of 30 degrees drawn from
    ``random.randrange(-180, 180, 30)``. The transformed patch replaces the
    original one.

    Args:
        X_train: 4-D array (samples, rows, cols, channels). It is modified
            in place.

    Returns:
        The same ``X_train`` array object, after modification. With fewer
        than two samples nothing is changed.
    """
    for i in range(int(X_train.shape[0]/2)):
        patch = X_train[i,:,:,:]
        num = random.randint(0,2)

        if num == 0:
            # .copy() detaches the flipped view from X_train before it is
            # written back over the same memory.
            augmented_patch = np.flipud(patch).copy()
        elif num == 1:
            augmented_patch = np.fliplr(patch).copy()
        else:
            angle = random.randrange(-180,180,30)
            # scipy.ndimage.rotate is the public name of the function; the
            # scipy.ndimage.interpolation namespace is deprecated.
            augmented_patch = scipy.ndimage.rotate(patch,
                            angle,axes=(1, 0), reshape=False, output=None,
                            order=3, mode='constant', cval=0.0, prefilter=False)

        # Write back inside the loop so every visited patch is replaced,
        # not only the last one.
        X_train[i,:,:,:] = augmented_patch

    return X_train

In [13]:
def savePreprocessedData(path, data, data_label, train, train_label, test, test_label,
                         data_remove_0=None, data_label_remove_0=None):
    """Write the preprocessed arrays as .npy files into ``<cwd>/<path>``.

    Nothing is written unless the module-level ``savedata`` flag is truthy.

    Args:
        path: output directory, joined onto the current working directory.
        data, data_label: arrays saved as ``data.npy`` / ``data_label.npy``.
        train, train_label: arrays saved as ``train.npy`` / ``train_label.npy``.
        test, test_label: arrays saved as ``test.npy`` / ``test_label.npy``.
        data_remove_0, data_label_remove_0: optional arrays saved as
            ``data_remove_0.npy`` / ``data_label_remove_0.npy``. When left as
            ``None`` the module-level variables of the same names are used if
            they exist (this is how earlier callers supplied them); if neither
            is available the file is skipped.

    Side effects:
        Prints the resolved output directory and creates it if it is missing.

    Note:
        ``data_remove_0.npy`` used to be written with a leading space in its
        file name; the name is now written without it.
    """
    data_path = os.path.join(os.getcwd(), path)
    print(data_path)

    if not savedata:
        return

    # Existing callers pass only the first seven arguments and rely on
    # notebook-level variables for the "remove_0" arrays.
    if data_remove_0 is None:
        data_remove_0 = globals().get('data_remove_0')
    if data_label_remove_0 is None:
        data_label_remove_0 = globals().get('data_label_remove_0')

    os.makedirs(data_path, exist_ok=True)

    outputs = [
        ('data.npy', data),
        ('data_label.npy', data_label),
        ('data_remove_0.npy', data_remove_0),
        ('data_label_remove_0.npy', data_label_remove_0),
        ('train.npy', train),
        ('train_label.npy', train_label),
        ('test.npy', test),
        ('test_label.npy', test_label),
    ]
    for filename, array in outputs:
        if array is None:
            continue
        with open(os.path.join(data_path, filename), 'wb') as outfile:
            np.save(outfile, array)

# Call the functions

In [14]:
# Load the data array and its labels, then reduce the last axis of the data
# to numComponents channels with applyPCA. X is rebound to the reduced array.
X, y = loadData()
X, pca = applyPCA(X, numComponents=numComponents)
print(X.shape)

DataSet data shape is (145, 145, 200) class_num is 16
(145, 145, 30)


In [15]:
# Preprocess Data
# Build the patches twice: once for every pixel (removeZeroLabels=False) and
# once with label-0 pixels dropped and the remaining labels shifted down by one.
data_all, data_label_all = createPatches(X, y, window_size=window_size, removeZeroLabels = False) 
data_remove_0, data_label_remove_0 = createPatches(X, y, window_size=window_size, removeZeroLabels = True) 
print("data_all %s, data_label_all %s"%(data_all.shape, data_label_all.shape))
# NOTE(review): the split is taken from data_all, so label-0 patches are part
# of train/test and the labels are not shifted; data_remove_0 is only saved,
# never split. Confirm this is intended.
train, test, train_label, test_label = splitTrainTestSet(data_all, data_label_all, testRatio)
print(train.shape, test.shape, train_label.shape, test_label.shape)

# Repeat samples of the less frequent classes in the training split only.
train, train_label = oversampleWeakClasses(train, train_label)
print("train %s, train_label %s"%(train.shape, train_label.shape))

data_all (21025, 5, 5, 30), data_label_all (21025,)
16
(4205, 5, 5, 30) (16820, 5, 5, 30) (4205,) (16820,)
train (36635, 5, 5, 30), train_label (36635,)


In [16]:
# Run AugmentData on the training patches; it writes into the array it is
# given and returns that same array.
train = AugmentData(train)
print(train.shape)

(36635, 5, 5, 30)


In [17]:
# Build the output directory name once, create the directory when it does not
# exist yet, then write all arrays into it.
output_dir = 'Indian_pines_w_size_' + str(window_size) + '_num_0.2_for_2D'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
savePreprocessedData(output_dir, data_all, data_label_all, train, train_label, test, test_label)

e:\Eric_HSI\hyper_data_preprocess\Indian_pines_w_size_5_num_0.2_for_2D
