In [1]:
import arcpy
from arcpy import env
import pandas as pd
import numpy as np
import os

In [2]:
# Array reshaping helpers
def _pad_to_shape(arr, target_shape, fill_value):
    """Pad a 2-D array at the bottom/right with ``fill_value`` up to ``target_shape``.

    The array is returned untouched (same object) when it already has at
    least ``target_shape`` rows and columns.
    """
    missing_rows = target_shape[0] - arr.shape[0]
    if missing_rows > 0:
        # Extra rows are appended below the existing data.
        filler = np.full((missing_rows, arr.shape[1]), fill_value)
        arr = np.vstack((arr, filler))

    missing_cols = target_shape[1] - arr.shape[1]
    if missing_cols > 0:
        # Extra columns are appended to the right of the existing data.
        arr = np.pad(arr, ((0, 0), (0, missing_cols)),
                     mode='constant', constant_values=fill_value)

    return arr


def resize_arrays(A, B, fill_value=0):
    """Pad two 2-D arrays so that both end up with the same shape.

    The common shape is the element-wise maximum of the two input shapes, so
    neither array is ever truncated; the smaller one is only extended.

    Parameters
    ----------
    A, B : numpy.ndarray
        Input arrays. Both are indexed as ``shape[0]`` / ``shape[1]``, so they
        are expected to be 2-D.
    fill_value : scalar, optional
        Value written into the added rows and columns (default 0).

    Returns
    -------
    tuple of numpy.ndarray
        ``(A, B)`` padded to the common shape. An input that already has the
        common shape is returned as the same object, not a copy.
    """
    new_shape = (max(A.shape[0], B.shape[0]), max(A.shape[1], B.shape[1]))
    return (_pad_to_shape(A, new_shape, fill_value),
            _pad_to_shape(B, new_shape, fill_value))


In [2]:
# Configure the working environment: use the feature geodatabase as the arcpy
# workspace, then list the rasters it contains (shown as the cell output).
arcpy.env.workspace = r"D:\ArcGISProjects\workspace\shbyq\features_data_dy.gdb"
arcpy.ListRasters()

['DEM',
 'AnalyticalHillshading',
 'Aspect',
 'ChannelNetworkBaseLevel',
 'ChannelNetworkDistance',
 'ClosedDepressions',
 'ConvergenceIndex',
 'LSFactor',
 'PlanCurvature',
 'ProfileCurvature',
 'RelativeSlopePosition',
 'Slope',
 'TopographicWetnessIndex',
 'TotalCatchmentArea',
 'ValleyDepth',
 'Contrast',
 'Correlation',
 'Dissimilarity',
 'Entropy',
 'Homogeneity',
 'Mean',
 'ndvi',
 'PCA_0',
 'PCA_1',
 'SecondMoment',
 'Variance',
 'PRE',
 'SRA',
 'TMP',
 'VAP',
 'WIND',
 'BIO',
 'LON',
 'LAT',
 'PH',
 'DL',
 'DZ']

In [4]:
# Print the raster names of the current workspace on a single line.
available_rasters = arcpy.ListRasters()
print(available_rasters)

['DEM', 'AnalyticalHillshading', 'Aspect', 'ChannelNetworkBaseLevel', 'ChannelNetworkDistance', 'ClosedDepressions', 'ConvergenceIndex', 'LSFactor', 'PlanCurvature', 'ProfileCurvature', 'RelativeSlopePosition', 'Slope', 'TopographicWetnessIndex', 'TotalCatchmentArea', 'ValleyDepth', 'Contrast', 'Correlation', 'Dissimilarity', 'Entropy', 'Homogeneity', 'Mean', 'ndvi', 'PCA_0', 'PCA_1', 'SecondMoment', 'Variance', 'PRE', 'SRA', 'TMP', 'VAP', 'WIND', 'BIO', 'LON', 'LAT', 'PH']


In [3]:
# Earlier feature selections, kept for reference:
# feature_list = ['BIO', 'PRE', 'SRA', 'TMP', 'VAP', 'WIN', 'NDVI', 'TDQS', 'LIGHT', 'AnalyticalHillshading', 'Aspect', 'ChannelNetworkBaseLevel', 'ChannelNetworkDistance', 'ClosedDepressions', 'ConvergenceIndex', 'LSFactor', 'PlanCurvature', 'ProfileCurvature', 'RelativeSlopePosition', 'Slope', 'TopographicWetnessIndex', 'TotalCatchmentArea', 'ValleyDepth', 'DEM', 'LON', 'LAT']
# feature_list = ['DEM', 'AnalyticalHillshading', 'ChannelNetworkDistance', 'PlanCurvature', 'ProfileCurvature', 'RelativeSlopePosition', 'Slope', 'ValleyDepth', 'PCA_0', 'PCA_1', 'PRE', 'SRA', 'BIO', 'LON', 'LAT']

# Rasters currently used as features; the order here defines the column order
# of every table built from them.
feature_list = [
    'DEM', 'Aspect', 'ChannelNetworkBaseLevel', 'PlanCurvature',
    'ProfileCurvature', 'RelativeSlopePosition', 'Slope',
    'TopographicWetnessIndex', 'TotalCatchmentArea',
    'Mean', 'ndvi', 'PCA_0', 'PRE', 'SRA', 'TMP', 'VAP', 'WIND',
    'BIO', 'LON', 'LAT', 'PH',
]

# Read each selected raster from the workspace into a NumPy array, keyed by name.
feature_numpyarray_dict = {
    raster_name: arcpy.RasterToNumPyArray(raster_name)
    for raster_name in feature_list
}

In [4]:
# Check that all raster arrays share one shape: prints the number of distinct
# shapes, which is 1 when they all match.
distinct_shapes = {arr.shape for arr in feature_numpyarray_dict.values()}
print(len(distinct_shapes))

1


In [5]:
# Assemble the feature table: each raster is flattened into one column, in the
# insertion order of the dictionary.
features_table = np.column_stack(
    [arr.flatten() for arr in feature_numpyarray_dict.values()]
)

In [7]:
# Number of cells in the first raster, i.e. the row count of the flattened table.
next(iter(feature_numpyarray_dict.values())).size

171871183

In [6]:
# One flattened (1-D) copy per raster, in the insertion order of the dictionary.
flatten_list = [arr.flatten() for arr in feature_numpyarray_dict.values()]

In [8]:
out_path = r"F:\cache_data\pre_soiltype_table\dy\cart\features_csv"
# Create the export folder if it is missing, so the first to_csv call cannot
# fail on a non-existent directory after all the data has been prepared.
os.makedirs(out_path, exist_ok=True)

# Number of rows written to each CSV file
chunk_size = 500000

# Number of features (one column per raster)
num_features = len(feature_numpyarray_dict)

# Total number of rows, taken from the first raster (all rasters are assumed to
# share one shape). `.size` gives the cell count without building a flattened
# copy of the whole array just to measure its length.
num_rows = next(iter(feature_numpyarray_dict.values())).size

# Number of chunks, rounded up so a trailing partial chunk is included
num_chunks = num_rows // chunk_size + (num_rows % chunk_size > 0)
print(num_chunks)

# Process and export the table chunk by chunk
for i in range(num_chunks):
    start_idx = i * chunk_size
    end_idx = min((i + 1) * chunk_size, num_rows)

    # Take the same row range from every flattened raster and stack them as columns
    chunk_data = np.column_stack([_[start_idx:end_idx] for _ in flatten_list])

    # Convert to a pandas DataFrame with the feature names as column headers
    data = pd.DataFrame(chunk_data, columns=feature_list)
    # NOTE(review): no index=False here, so each file also gets an index column
    # that restarts at 0 in every chunk — confirm downstream readers expect it.
    data.to_csv(os.path.join(out_path, f'data_chunk_{str(i).zfill(3)}.csv'))

344


In [6]:
# Total number of elements in the feature table (rows x columns).
np.size(features_table)

3609294843

In [7]:
# (rows, columns) of the feature table.
np.shape(features_table)

(171871183, 21)

In [8]:
# Wrap the feature table in a DataFrame, labelling the columns with the feature names.
data = pd.DataFrame(data=features_table, columns=feature_list)

In [9]:
# Save the feature table as CSV files, chunk_size rows per file.
# (`os` comes from the import cell at the top of the notebook.)
out_path = r"F:\cache_data\pred_feature_table\dy\feature_table\select"
# Create the export folder if it is missing so to_csv cannot fail on a
# non-existent directory.
os.makedirs(out_path, exist_ok=True)
chunk_size = 500000
total_rows = data.shape[0]
for i in range(0, total_rows, chunk_size):
    # i is the offset of the chunk's first row; it is also used in the file name.
    start = i
    end = min(i + chunk_size, total_rows)
    filename = os.path.join(out_path, f'data_chunk_{i}.csv')  # adjust the naming pattern as needed
    df_chunk = data.iloc[start:end]
    # index=False: only the feature columns are written, no row index.
    df_chunk.to_csv(filename, index=False)
    print(i)

0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
4500000
5000000
5500000
6000000
6500000
7000000
7500000
8000000
8500000
9000000
9500000
10000000
10500000
11000000
11500000
12000000
12500000
13000000
13500000
14000000
14500000
15000000
15500000
16000000
16500000
17000000
17500000
18000000
18500000
19000000
19500000
20000000
20500000
21000000
21500000
22000000
22500000
23000000
23500000
24000000
24500000
25000000
25500000
26000000
26500000
27000000
27500000
28000000
28500000
29000000
29500000
30000000
30500000
31000000
31500000
32000000
32500000
33000000
33500000
34000000
34500000
35000000
35500000
36000000
36500000
37000000
37500000
38000000
38500000
39000000
39500000
40000000
40500000
41000000
41500000
42000000
42500000
43000000
43500000
44000000
44500000
45000000
45500000
46000000
46500000
47000000
47500000
48000000
48500000
49000000
49500000
50000000
50500000
51000000
51500000
52000000
52500000
53000000
53500000
54000000
54500000
55000000
55500000
56000000
56500000
5