# 构造用于三维卷积神经网络的数据集
- 用每个像素周围 19×19 邻域（window_size = 19）的值去预测一个值，貌似太大了
- 将得到的数据集储存起来，让卷积模块调用

In [1]:
import os
import random

import numpy as np
import scipy.io as sio
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [2]:
# List the GPUs TensorFlow can see and switch on memory growth for each one.
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
for gpu in gpus:
    # Applied per device; the loop body is skipped when no GPU is listed.
    tf.config.experimental.set_memory_growth(gpu, True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# 定义常量

In [3]:
window_size = 19   # side length of the spatial window built around each pixel
# Batch size handed to Dataset.batch() for both pipelines.
batch_size = 32
# Passed to Dataset.shuffle() as the shuffle buffer size.
train_num = 200
# Default random_state of split_train_test.
seed = 666
# savePreprocessedData only writes files when this flag is truthy.
savedata = True

# 导入数据

In [4]:
# Min-max normalisation
def max_min(x):
    """Rescale ``x`` linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        x: Array-like of numbers; the extrema are taken over all elements.

    Returns:
        ``(x - min) / (max - min)`` with the shape of ``x``. When every
        element is equal the range is zero, and an all-zero array is
        returned instead of the NaNs a 0/0 division would produce.
    """
    lowest = np.min(x)
    span = np.max(x) - lowest
    if span == 0:
        # Constant input: ``x - lowest`` is already all zeros, so dividing by
        # 1 keeps the usual result dtype without evaluating 0/0.
        span = 1
    return (x - lowest) / span

In [5]:
def loadData(data_path=r"E:\Eric_HSI\hyperspectral_datasets\Indian_pines_corrected.mat",
             gt_path=r"E:\Eric_HSI\hyperspectral_datasets\Indian_pines_gt.mat"):
    """Load an image cube and its ground-truth map from two .mat files.

    Args:
        data_path: Path of the .mat file holding the image data. Defaults to
            the Indian Pines location this notebook was written against.
        gt_path: Path of the .mat file holding the ground-truth labels.

    Returns:
        ``(data, data_gt)``: the image data scaled by ``max_min`` and cast to
        float32, and the ground-truth array cast to int32.

    Raises:
        IndexError: If a file contains no variable whose name does not start
            with ``__``.
    """
    data_dict = sio.loadmat(data_path)
    data_gt_dict = sio.loadmat(gt_path)
    # Skip the keys that start with "__" and take the first remaining
    # variable of each file as the payload.
    data_name = [key for key in data_dict if not key.startswith('__')][0]
    data_gt_name = [key for key in data_gt_dict if not key.startswith('__')][0]
    data = data_dict[data_name]
    data_gt = data_gt_dict[data_gt_name].astype(np.int32)
    # Normalise over the whole array (one global min/max, not per band).
    data = max_min(data).astype(np.float32)
    # Largest label value; reported only, not returned.
    class_num = np.max(data_gt)
    print('DataSet %s shape is %s class_num is %s'%(data_name,data.shape,class_num))
    return data, data_gt

In [6]:
# data, data_gt = loadData()

In [7]:
def neighbor_add():
    """Fill ``data_all`` with one window_size x window_size window per pixel.

    Reads the module-level ``data`` (indexed as rows, cols, ...) and
    ``window_size``, and writes into the module-level ``data_all`` in place.
    With ``half = window_size // 2``, entry ``[i * cols + j, p, q]`` receives
    ``data[(i + p - half) % rows, (j + q - half) % cols]``.

    The modulo means indices wrap around: a window that sticks out over one
    border is completed with pixels from the opposite side of the image.

    Returns:
        The same ``data_all`` array that was filled.
    """
    rows, cols = data.shape[0], data.shape[1]
    half = window_size // 2
    flat_shape = (rows * cols,) + data.shape[2:]

    # One vectorised copy per window offset instead of one Python-level
    # assignment per (pixel, offset) pair.
    for p in range(window_size):
        for q in range(window_size):
            # np.roll(a, s)[i] == a[(i - s) % n], so a shift of (half - p)
            # places data[(i + p - half) % rows] at row i (same for columns).
            shifted = np.roll(data, shift=(half - p, half - q), axis=(0, 1))
            data_all[:rows * cols, p, q] = shifted.reshape(flat_shape)
    return data_all

In [8]:
# data_all = neighbor_add()

In [9]:
# Remove background pixels and re-index the remaining labels from 0
def split_background():
    """Drop background samples and map the remaining labels to 0..K-1.

    Reads the module-level ``data_gt`` and ``data_all``. Entries whose label
    is 0 are treated as background and removed from both the flattened label
    vector and, along axis 0, from ``data_all``. The global ``data_all`` is
    rebound to the filtered array.

    Returns:
        ``(data_all, data_label_all)``: the filtered samples and a 1-D label
        vector with the dtype of the flattened ``data_gt``, where each label
        is its rank among the distinct non-background label values.
    """
    global data_all

    labels = np.reshape(data_gt, (-1))
    background = np.flatnonzero(labels == 0)
    labels = np.delete(labels, background, axis=0)

    # Build the class table only after the background is gone. If 0 were
    # still in the table, every remaining label would keep its 1-based value.
    classes = np.unique(labels)
    # classes is sorted and contains every label, so searchsorted returns the
    # position of each label in the class table.
    remapped = np.searchsorted(classes, labels).astype(labels.dtype)

    data_all = np.delete(data_all, background, axis=0)
    return data_all, remapped

In [10]:
# data_all, data_label_all = split_background()

In [11]:
# Split the data
def split_train_test(test_size=0.8, random_state=seed):
    """Split the module-level ``data_all`` / ``data_label_all`` into train and test.

    Args:
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.
            The default is the value ``seed`` had when this function was
            defined.

    Returns:
        ``(train, test, train_label, test_label)``.
    """
    # Pass the caller's random_state through; the global seed is only the default.
    train, test, train_label, test_label = train_test_split(
        data_all, data_label_all, test_size=test_size, random_state=random_state)
    return train, test, train_label, test_label

In [20]:
def savePreprocessedData(path, data, data_label, train, train_label, test, test_label):
    """Write the full data set and its train/test split as .npy files.

    Args:
        path: Output directory, joined onto the current working directory
            (``os.path.join`` leaves an absolute ``path`` unchanged). The
            directory is not created here.
        data: Array saved as ``data.npy``.
        data_label: Array saved as ``data_label.npy``.
        train: Array saved as ``train.npy``.
        train_label: Array saved as ``train_label.npy``.
        test: Array saved as ``test.npy``.
        test_label: Array saved as ``test_label.npy``.

    The resolved directory is always printed. Files are written only when the
    module-level ``savedata`` flag is truthy.
    """
    data_path = os.path.join(os.getcwd(), path)
    print(data_path)

    if not savedata:
        return

    # Save the arrays that were passed in, not whatever the globals
    # data_all / data_label_all happen to hold at call time.
    arrays = {
        'data.npy': data,
        'data_label.npy': data_label,
        'train.npy': train,
        'train_label.npy': train_label,
        'test.npy': test,
        'test_label.npy': test_label,
    }
    for file_name, array in arrays.items():
        with open(os.path.join(data_path, file_name), 'wb') as outfile:
            np.save(outfile, array)

# 调用函数

In [13]:
# loadData is defined in an earlier cell; this binds the module-level
# `data` and `data_gt` that the later helper functions read.
data, data_gt = loadData()

DataSet data shape is (145, 145, 200) class_num is 16


In [14]:
# Allocate a (21025, 19, 19, 200) array to hold the data: one
# window_size x window_size window, with data.shape[2] values per position,
# for every entry of data_gt.
# At float32 that shape is roughly 6 GB, all held in memory at once.
data_all = np.zeros((np.reshape(data_gt, (-1, 1)).shape[0], window_size, window_size, data.shape[2]), dtype='float32')
data_all.shape

(21025, 19, 19, 200)

In [15]:
# Fill the pre-allocated array with the window around every pixel.
data_all = neighbor_add()
# Drop the samples whose ground-truth label is 0 and get the label vector.
data_all, data_label_all = split_background()

In [16]:
# Exchange axes 1 and 3: (N, 19, 19, 200) -> (N, 200, 19, 19) for the shapes
# printed below. Because it is a swap, the two window axes also trade places,
# i.e. each window ends up spatially transposed.
data_all = data_all.swapaxes(1, 3)
print('dataset_shape: ', data_all.shape)

# Append a trailing axis of length 1.
data_all = data_all[:, :, :, :, np.newaxis]
print('dataset_shape: ', data_all.shape)

dataset_shape:  (10249, 200, 19, 19)
dataset_shape:  (10249, 200, 19, 19, 1)


In [17]:
# test_size=0.8 holds out 80% of the samples as the test set.
train, test, train_label, test_label = split_train_test(test_size=0.8, random_state=seed)

In [18]:
# Notebook display: shapes of the four arrays returned by the split.
train.shape, test.shape, train_label.shape, test_label.shape

((2049, 200, 19, 19, 1), (8200, 200, 19, 19, 1), (2049,), (8200,))

# 保存数据

In [22]:
# Name the output directory once, create it if it is missing, then store the
# full data set together with its train/test split inside it.
save_dir = 'Indian_pines_w_size_19_num_0.2_for_3D'
# exist_ok=True makes a separate existence check unnecessary.
os.makedirs(save_dir, exist_ok=True)
savePreprocessedData(save_dir, data_all, data_label_all, train, train_label, test, test_label)

e:\Eric_HSI\hyper_data_preprocess\Indian_pines_w_size_19_num_0.2


# 创建dataset

In [25]:
# Wrap the (samples, labels) pairs as tf.data datasets.
db_train = tf.data.Dataset.from_tensor_slices((train, train_label))
db_test = tf.data.Dataset.from_tensor_slices((test, test_label))

# A custom training loop is used, so no repeat() is needed.
# NOTE(review): the shuffle buffer is train_num (200) while the split above
# reports 2049 training samples, so the shuffle only mixes within a sliding
# buffer - confirm that this is intended.
db_train = db_train.shuffle(train_num).batch(batch_size=batch_size)
db_test = db_test.batch(batch_size=batch_size)

In [26]:
# Notebook display: element shapes and dtypes of both batched datasets.
db_train, db_test

(<BatchDataset shapes: ((None, 200, 19, 19, 1), (None,)), types: (tf.float32, tf.int32)>,
 <BatchDataset shapes: ((None, 200, 19, 19, 1), (None,)), types: (tf.float32, tf.int32)>)