In [1]:
import os
import numpy as np
import rasterio
from autogluon.tabular import TabularPredictor
import pandas as pd
from tqdm import tqdm
from rasterio.windows import Window
from autogluon.tabular import TabularPredictor
from concurrent.futures import ProcessPoolExecutor, as_completed
import psutil
import time

In [2]:

class TIFPredictor:
    """Predict a raster chunk by chunk with a loaded AutoGluon ``TabularPredictor``.

    For every feature name reported by the predictor's ``feature_metadata_in``,
    band 1 of ``<input_folder>/<feature>.tif`` is read one window at a time.
    The windows are flattened into DataFrame columns, the selected model
    predicts on them, and the result is written as float32 into the matching
    window of the output file.
    """

    def __init__(self, model_path, model_name, input_folder, chunk_size=(100, 100)):
        """Load the predictor and remember where the feature rasters live.

        Args:
            model_path: Path passed to ``TabularPredictor.load``.
            model_name: Name forwarded as ``model=`` to every ``predict`` call.
            input_folder: Folder holding one ``<feature>.tif`` per feature.
            chunk_size: ``(rows, cols)`` of one processing window.
        """
        self.model = TabularPredictor.load(model_path)
        self.select_model = model_name
        self.input_folder = input_folder
        self.feature_columns = self.model.feature_metadata_in.get_features()
        self.chunk_size = chunk_size

    def read_tif_info(self, file_path):
        """Return ``(profile, shape)`` of the raster at ``file_path``."""
        with rasterio.open(file_path) as src:
            return src.profile, src.shape

    def read_tif_chunk(self, file_path, row_start, row_end, col_start, col_end):
        """Read band 1 of ``file_path`` for rows/cols ``[start, end)``."""
        with rasterio.open(file_path) as src:
            return src.read(1, window=((row_start, row_end), (col_start, col_end)))

    def generate_chunks(self, shape):
        """Yield ``(row_start, row_end, col_start, col_end)`` tiles covering ``shape``.

        Tiles are ``self.chunk_size`` large, clipped at the bottom/right edges,
        and produced in row-major order.
        """
        rows, cols = shape
        for row in range(0, rows, self.chunk_size[0]):
            for col in range(0, cols, self.chunk_size[1]):
                yield (row, min(row + self.chunk_size[0], rows),
                       col, min(col + self.chunk_size[1], cols))

    def predict(self, output_path):
        """Predict every chunk and write the result to ``output_path``.

        The output profile is copied from the first feature raster and then
        switched to a single float32 band with LZW compression.

        Args:
            output_path: Destination raster path, opened in ``'w'`` mode.
        """
        # Get metadata from the first TIF file
        first_tif = os.path.join(self.input_folder, f"{self.feature_columns[0]}.tif")
        profile, shape = self.read_tif_info(first_tif)

        # Prepare the output file
        profile.update(dtype=rasterio.float32, count=1, compress='lzw')

        # Materialise the chunk list once: it gives the progress-bar total and
        # is then iterated, instead of running the generator twice.
        chunks = list(self.generate_chunks(shape))

        with rasterio.open(output_path, 'w', **profile) as dst:
            with tqdm(total=len(chunks), desc="处理进度") as pbar:
                for row_start, row_end, col_start, col_end in chunks:
                    chunk_data = {}
                    for feature in self.feature_columns:
                        tif_path = os.path.join(self.input_folder, f"{feature}.tif")
                        chunk = self.read_tif_chunk(tif_path, row_start, row_end, col_start, col_end)
                        chunk_data[feature] = chunk.flatten()
                    print("完成区块特征数据拾取")

                    # Prepare input data for the model
                    X = pd.DataFrame(chunk_data)

                    # Make predictions
                    predictions = self.model.predict(X, model=self.select_model)
                    print("完成区块预测")

                    # Reshape predictions to match the chunk size
                    predictions = predictions.values.reshape((row_end - row_start, col_end - col_start))

                    # Write the predictions to the output file
                    dst.write(predictions.astype(rasterio.float32), 1,
                              window=((row_start, row_end), (col_start, col_end)))

                    # Advance once per chunk. This call must stay inside the
                    # loop; outside it the bar only moves once for the whole run.
                    pbar.update(1)
        print(f"Predictions saved to {output_path}")



In [3]:
class ProgressTrackingTIFPredictor:
    """Multi-process raster prediction that assembles the whole output in memory.

    Feature rasters (``<input_folder>/<feature>.tif``, band 1) are preloaded
    into ``self.tif_data`` up to a memory budget; features that do not fit are
    read from disk per chunk. Chunks are predicted through a
    ``ProcessPoolExecutor`` and copied into one float32 array that is written
    to disk at the end.

    NOTE(review): ``executor.submit(self.process_chunk, ...)`` submits a bound
    method, so this instance (the loaded model and every preloaded array) is
    pickled for the worker processes. Confirm that cost is acceptable, and
    that the class is importable by workers when run from a notebook.
    """

    def __init__(self, model_path, model_name, input_folder, num_workers=None):
        """Load the predictor and pick a worker count.

        Args:
            model_path: Path passed to ``TabularPredictor.load``.
            model_name: Name forwarded as ``model=`` to every ``predict`` call.
            input_folder: Folder holding one ``<feature>.tif`` per feature.
            num_workers: Number of worker processes. When falsy, defaults to
                the physical core count minus one, with a minimum of 1.
        """
        self.model = TabularPredictor.load(model_path)
        self.select_model = model_name
        self.input_folder = input_folder
        self.feature_columns = self.model.feature_metadata_in.get_features()
        # psutil.cpu_count(logical=False) returns None when the physical core
        # count cannot be determined; fall back instead of computing None - 1.
        physical_cores = psutil.cpu_count(logical=False) or os.cpu_count() or 1
        self.num_workers = num_workers or max(1, physical_cores - 1)
        self.tif_data = {}
        self.shape = None
        self.profile = None

    def preload_tif_data(self):
        """Load band 1 of the feature rasters into ``self.tif_data``.

        Loading stops at the first feature whose band would push the
        cumulative size past 80% of total physical memory; that feature and
        all later ones are left for ``read_chunk`` to read from disk. Also
        records ``self.shape`` and ``self.profile`` from the first raster.
        """
        total_memory = psutil.virtual_memory().total
        memory_limit = 0.8 * total_memory  # Budget: 80% of total physical memory

        cumulative_size = 0
        for feature in tqdm(self.feature_columns, desc="Preloading TIF data"):
            tif_path = os.path.join(self.input_folder, f"{feature}.tif")
            with rasterio.open(tif_path) as src:
                if self.shape is None:
                    self.shape = src.shape
                    self.profile = src.profile
                # Size the band from its metadata so an array that would
                # exceed the budget is never actually allocated.
                band_bytes = src.height * src.width * np.dtype(src.dtypes[0]).itemsize
                if cumulative_size + band_bytes > memory_limit:
                    print(f"Warning: Memory limit reached. Some data will be read on-demand.")
                    break
                self.tif_data[feature] = src.read(1)
                cumulative_size += band_bytes

        print("TIF data preloading completed")

    def read_chunk(self, feature, window):
        """Return the ``window`` of ``feature``, from memory if preloaded, else from disk."""
        if feature in self.tif_data:
            return self.tif_data[feature][window.row_off:window.row_off+window.height,
                                          window.col_off:window.col_off+window.width]
        else:
            tif_path = os.path.join(self.input_folder, f"{feature}.tif")
            with rasterio.open(tif_path) as src:
                return src.read(1, window=window)

    def process_chunk(self, chunk_info):
        """Predict one chunk.

        Args:
            chunk_info: ``(row_start, row_end, col_start, col_end)``.

        Returns:
            ``(predictions, chunk_info)`` where ``predictions`` is a 2-D array
            with the chunk's shape.
        """
        row_start, row_end, col_start, col_end = chunk_info
        window = Window(col_start, row_start, col_end - col_start, row_end - row_start)
        chunk_data = {}
        for feature in self.feature_columns:
            chunk = self.read_chunk(feature, window)
            chunk_data[feature] = chunk.flatten()

        X = pd.DataFrame(chunk_data)
        predictions = self.model.predict(X, model=self.select_model)
        return predictions.values.reshape((row_end - row_start, col_end - col_start)), chunk_info

    @staticmethod
    def _estimate_remaining(elapsed, completed, total):
        """Extrapolate the remaining seconds from the average time per finished chunk."""
        if completed <= 0:
            return float('inf')
        return elapsed / completed * (total - completed)

    def predict(self, output_path):
        """Predict the full raster and write it to ``output_path`` in one call.

        Args:
            output_path: Destination raster path, opened in ``'w'`` mode.
        """
        start_time = time.time()
        self.preload_tif_data()

        if self.shape is None or self.profile is None:
            first_tif = os.path.join(self.input_folder, f"{self.feature_columns[0]}.tif")
            with rasterio.open(first_tif) as src:
                self.shape = src.shape
                self.profile = src.profile

        self.profile.update(dtype=rasterio.float32, count=1, compress='lzw')

        # Chunk sizing: target roughly 10 chunks per worker, but never go below
        # 1,000,000 pixels per chunk (so small rasters yield fewer chunks).
        total_pixels = self.shape[0] * self.shape[1]
        chunk_pixels = max(1000000, total_pixels // (self.num_workers * 10))
        chunk_side = int(np.sqrt(chunk_pixels))
        chunks = list(self.generate_chunks(self.shape, (chunk_side, chunk_side)))

        # Pre-allocate the entire output array in memory
        output_data = np.zeros(self.shape, dtype=np.float32)

        total_chunks = len(chunks)
        processed_chunks = 0
        # Wall-clock reference for the ETA. Timing must start before the work
        # is submitted; a timer started after a future has completed would
        # only measure the array copy below.
        processing_start = time.time()

        with ProcessPoolExecutor(max_workers=self.num_workers) as executor:
            futures = {executor.submit(self.process_chunk, chunk): chunk for chunk in chunks}

            with tqdm(total=total_chunks, desc="Processing chunks") as pbar:
                for future in as_completed(futures):
                    predictions, (row_start, row_end, col_start, col_end) = future.result()
                    output_data[row_start:row_end, col_start:col_end] = predictions

                    processed_chunks += 1

                    # Update progress bar
                    pbar.update(1)

                    # Update progress bar description with time estimate
                    remaining_time = self._estimate_remaining(
                        time.time() - processing_start, processed_chunks, total_chunks)
                    pbar.set_description(f"Processing chunks - ETA: {remaining_time:.2f}s")

        # Write the entire output at once
        with rasterio.open(output_path, 'w', **self.profile) as dst:
            dst.write(output_data, 1)

        end_time = time.time()
        total_runtime = end_time - start_time
        print(f"Predictions saved to {output_path}")
        print(f"Total runtime: {total_runtime:.2f} seconds")

    @staticmethod
    def generate_chunks(shape, chunk_size):
        """Yield ``(row_start, row_end, col_start, col_end)`` tiles covering ``shape``.

        Tiles are ``chunk_size`` (rows, cols) large, clipped at the
        bottom/right edges, in row-major order.
        """
        rows, cols = shape
        for row in range(0, rows, chunk_size[0]):
            for col in range(0, cols, chunk_size[1]):
                yield (row, min(row + chunk_size[0], rows),
                       col, min(col + chunk_size[1], cols))

In [4]:
class LargeScaleTIFPredictor:
    """Multi-process raster prediction that streams each chunk to the output file.

    Each chunk reads band 1 of every ``<input_folder>/<feature>.tif`` from
    disk, is predicted in a ``ProcessPoolExecutor`` worker, and is written to
    its window of the output raster as soon as its future completes, so the
    full output is never held in memory.

    NOTE(review): ``executor.submit(self.process_chunk, ...)`` submits a bound
    method, so this instance (including the loaded model) is pickled for the
    worker processes. Confirm the class is importable by workers when run
    from a notebook.
    """

    def __init__(self, model_path, model_name, input_folder, num_workers=None, chunk_size=(1000, 1000)):
        """Load the predictor and store the processing configuration.

        Args:
            model_path: Path passed to ``TabularPredictor.load``.
            model_name: Name forwarded as ``model=`` to every ``predict`` call.
            input_folder: Folder holding one ``<feature>.tif`` per feature.
            num_workers: Number of worker processes. When falsy, defaults to
                the physical core count minus one, with a minimum of 1.
            chunk_size: ``(rows, cols)`` of one processing window.
        """
        self.model = TabularPredictor.load(model_path)
        self.select_model = model_name
        self.input_folder = input_folder
        self.feature_columns = self.model.feature_metadata_in.get_features()
        # psutil.cpu_count(logical=False) returns None when the physical core
        # count cannot be determined; fall back instead of computing None - 1.
        physical_cores = psutil.cpu_count(logical=False) or os.cpu_count() or 1
        self.num_workers = num_workers or max(1, physical_cores - 1)
        self.chunk_size = chunk_size
        self.shape = None
        self.profile = None

    def get_tif_info(self):
        """Record ``self.shape`` and ``self.profile`` from the first feature raster."""
        first_tif = os.path.join(self.input_folder, f"{self.feature_columns[0]}.tif")
        with rasterio.open(first_tif) as src:
            self.shape = src.shape
            self.profile = src.profile
        print(f"TIF dimensions: {self.shape[0]} x {self.shape[1]} pixels")

    def read_chunk(self, feature, window):
        """Read band 1 of ``<feature>.tif`` for the given ``window``."""
        tif_path = os.path.join(self.input_folder, f"{feature}.tif")
        with rasterio.open(tif_path) as src:
            return src.read(1, window=window)

    def process_chunk(self, chunk_info):
        """Predict one chunk.

        Args:
            chunk_info: ``(row_start, row_end, col_start, col_end)``.

        Returns:
            ``(predictions, chunk_info)`` where ``predictions`` is a 2-D array
            with the chunk's shape.
        """
        row_start, row_end, col_start, col_end = chunk_info
        window = Window(col_start, row_start, col_end - col_start, row_end - row_start)
        chunk_data = {}
        for feature in self.feature_columns:
            chunk = self.read_chunk(feature, window)
            chunk_data[feature] = chunk.flatten()

        X = pd.DataFrame(chunk_data)
        predictions = self.model.predict(X, model=self.select_model)
        return predictions.values.reshape((row_end - row_start, col_end - col_start)), chunk_info

    @staticmethod
    def _estimate_remaining(elapsed, completed, total):
        """Extrapolate the remaining seconds from the average time per finished chunk."""
        if completed <= 0:
            return float('inf')
        return elapsed / completed * (total - completed)

    def predict(self, output_path):
        """Predict every chunk and write each one to ``output_path`` as it finishes.

        Args:
            output_path: Destination raster path, opened in ``'w'`` mode.
        """
        start_time = time.time()
        self.get_tif_info()

        self.profile.update(dtype=rasterio.float32, count=1, compress='lzw')

        chunks = list(self.generate_chunks(self.shape, self.chunk_size))
        total_chunks = len(chunks)

        print(f"Total number of chunks to process: {total_chunks}")
        print(f"Estimated memory usage per chunk: {self.estimate_memory_usage_per_chunk()} MB")

        processed_chunks = 0
        # Wall-clock reference for the ETA. Timing must start before the work
        # is submitted; a timer started after a future has completed would
        # only measure the write below.
        processing_start = time.time()

        with rasterio.open(output_path, 'w', **self.profile) as dst:
            with ProcessPoolExecutor(max_workers=self.num_workers) as executor:
                futures = {executor.submit(self.process_chunk, chunk): chunk for chunk in chunks}

                with tqdm(total=total_chunks, desc="Processing chunks") as pbar:
                    for future in as_completed(futures):
                        predictions, (row_start, row_end, col_start, col_end) = future.result()
                        # Cast explicitly so the array dtype matches the
                        # float32 dtype declared in the output profile.
                        dst.write(np.asarray(predictions, dtype=np.float32), 1,
                                  window=Window(col_start, row_start, col_end - col_start, row_end - row_start))

                        # Drop our reference to the finished future so its
                        # result array is not kept alive for the whole run.
                        futures.pop(future, None)

                        processed_chunks += 1

                        # Update progress bar
                        pbar.update(1)

                        # Update progress bar description with time estimate
                        remaining_time = self._estimate_remaining(
                            time.time() - processing_start, processed_chunks, total_chunks)
                        pbar.set_description(f"Processing chunks - ETA: {remaining_time/60:.2f} minutes")

        end_time = time.time()
        total_runtime = end_time - start_time
        print(f"Predictions saved to {output_path}")
        print(f"Total runtime: {total_runtime/60:.2f} minutes")

    @staticmethod
    def generate_chunks(shape, chunk_size):
        """Yield ``(row_start, row_end, col_start, col_end)`` tiles covering ``shape``.

        Tiles are ``chunk_size`` (rows, cols) large, clipped at the
        bottom/right edges, in row-major order.
        """
        rows, cols = shape
        for row in range(0, rows, chunk_size[0]):
            for col in range(0, cols, chunk_size[1]):
                yield (row, min(row + chunk_size[0], rows),
                       col, min(col + chunk_size[1], cols))

    def estimate_memory_usage_per_chunk(self):
        """Return the estimated input size of one full chunk in MB.

        Computed as ``chunk pixels * number of features * 4 bytes``; the
        4 bytes per pixel is a float32 assumption, not read from the rasters.
        """
        num_features = len(self.feature_columns)
        chunk_pixels = self.chunk_size[0] * self.chunk_size[1]
        bytes_per_pixel = 4  # Assuming float32
        return (chunk_pixels * num_features * bytes_per_pixel) / (1024 * 1024)

In [5]:
import os
import numpy as np
import pandas as pd
import rasterio
from rasterio.features import geometry_mask
from rasterio.windows import Window
from autogluon.tabular import TabularPredictor
from tqdm import tqdm
import time
from shapely.geometry import shape, box
import fiona
import logging
import warnings
from sklearn.utils.validation import check_is_fitted
from sklearn.exceptions import DataConversionWarning

# Raise the 'sklearn' logger threshold to WARNING so its INFO/DEBUG records are dropped
logging.getLogger('sklearn').setLevel(logging.WARNING)

# Suppress two warnings: the DataConversionWarning category, and any warning
# whose message starts with 'X does not have valid feature names'
warnings.filterwarnings('ignore', category=DataConversionWarning)
warnings.filterwarnings('ignore', message='X does not have valid feature names')

# Root logger: INFO level with "timestamp - level - message" lines.
# Note: basicConfig does nothing if the root logger already has handlers.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class OptimizedSingleProcessVectorMaskedTIFPredictor:
    """Single-process raster prediction restricted to the area of a vector file.

    The vector geometries are clipped to the raster bounds and rasterised into
    a boolean mask (``True`` = pixel to predict). Only chunks containing at
    least one masked pixel are processed, and within a chunk only masked
    pixels are sent to the model; all other output pixels are written as 0.
    """

    def __init__(self, model_path, model_name, input_folder, vector_path, chunk_size=(1000, 1000)):
        """Load the predictor and store the processing configuration.

        Args:
            model_path: Path passed to ``TabularPredictor.load``.
            model_name: Name forwarded as ``model=`` to every ``predict`` call.
            input_folder: Folder holding one ``<feature>.tif`` per feature.
            vector_path: Vector file opened with ``fiona`` to define the area.
            chunk_size: ``(rows, cols)`` of one processing window.
        """
        self.model = TabularPredictor.load(model_path)
        self.select_model = model_name
        self.input_folder = input_folder
        self.vector_path = vector_path
        self.chunk_size = chunk_size
        self.feature_columns = self.model.feature_metadata_in.get_features()
        self.shape = None
        self.profile = None
        self.vector_mask = None
        self.valid_windows = []

    def _mask_for_window(self, window):
        """Return the slice of ``self.vector_mask`` covered by ``window``."""
        return self.vector_mask[window.row_off:window.row_off + window.height,
                                window.col_off:window.col_off + window.width]

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the parent directory of ``path`` if it names one.

        A bare filename has an empty dirname, and ``os.makedirs('')`` raises
        ``FileNotFoundError``, so that case is skipped.
        """
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    def get_tif_info_and_create_mask(self):
        """Read raster metadata, build the vector mask and list the chunks to process.

        Sets ``self.shape``, ``self.profile``, ``self.vector_mask`` and
        ``self.valid_windows``, and logs area statistics.

        Raises:
            ValueError: If no geometry intersects the raster bounds.

        NOTE(review): geometries are used in their own coordinates; nothing
        here reprojects them to the raster CRS. Confirm both share a CRS.
        """
        first_tif = os.path.join(self.input_folder, f"{self.feature_columns[0]}.tif")
        with rasterio.open(first_tif) as src:
            self.shape = src.shape
            self.profile = src.profile

            with fiona.open(self.vector_path, "r") as vector_file:
                # Loop variables are named `geom` below so they do not shadow
                # the imported `shape` function used here.
                geometries = [shape(feature['geometry']) for feature in vector_file]

            raster_bounds = box(*src.bounds)
            total_vector_area = sum(geom.area for geom in geometries)
            clipped_shapes = [geom.intersection(raster_bounds) for geom in geometries]
            clipped_shapes = [geom for geom in clipped_shapes if not geom.is_empty]

            if not clipped_shapes:
                raise ValueError("The vector area does not intersect with the raster extent.")

            clipped_area = sum(geom.area for geom in clipped_shapes)
            # invert=True: the rest of this class treats True as "inside the
            # geometries", i.e. pixels to predict.
            self.vector_mask = geometry_mask(clipped_shapes, out_shape=self.shape,
                                             transform=src.transform, invert=True)

        logging.info(f"TIF dimensions: {self.shape[0]} x {self.shape[1]} pixels")
        logging.info(f"Total raster area: {raster_bounds.area}")
        logging.info(f"Total vector area: {total_vector_area}")
        logging.info(f"Intersection area: {clipped_area}")
        if total_vector_area > 0:
            logging.info(f"Percentage of vector area within raster: {(clipped_area / total_vector_area) * 100:.2f}%")
        else:
            # Zero-area input (e.g. only points or lines) must not abort the
            # run with a ZeroDivisionError in a purely informational log line.
            logging.info("Percentage of vector area within raster: n/a (vector area is 0)")
        logging.info(f"Area to process: {np.sum(self.vector_mask)} pixels")

        self.valid_windows = self.generate_valid_windows()
        logging.info(f"Number of chunks to process: {len(self.valid_windows)}")

    def generate_valid_windows(self):
        """Return the chunk windows that contain at least one masked pixel."""
        return [window
                for window in self.generate_windows(self.shape, self.chunk_size)
                if np.any(self._mask_for_window(window))]

    def read_masked_chunk(self, feature, window):
        """Read band 1 of ``<feature>.tif`` for ``window`` and return only masked pixels (1-D)."""
        tif_path = os.path.join(self.input_folder, f"{feature}.tif")
        with rasterio.open(tif_path) as src:
            chunk = src.read(1, window=window)
            return chunk[self._mask_for_window(window)]

    def process_chunk(self, window):
        """Predict the masked pixels of one window.

        Returns:
            ``(full_chunk, window)`` where ``full_chunk`` is a float32 array of
            the window's shape holding predictions at masked pixels and 0
            elsewhere, or ``None`` if processing raised (the error is logged).
        """
        try:
            # read_masked_chunk already returns a 1-D array (boolean-mask
            # indexing), so no extra flatten is needed.
            chunk_data = {feature: self.read_masked_chunk(feature, window)
                          for feature in self.feature_columns}

            X = pd.DataFrame(chunk_data)
            predictions = self.model.predict(X, model=self.select_model)

            full_chunk = np.zeros((window.height, window.width), dtype=np.float32)
            full_chunk[self._mask_for_window(window)] = np.asarray(predictions, dtype=np.float32)

            return full_chunk, window
        except Exception as e:
            logging.error(f"Error processing chunk {window}: {str(e)}")
            return None

    def predict(self, output_path):
        """Predict all valid chunks and write them to ``output_path``.

        If mask creation fails, the error is logged and the method returns
        without writing anything. Chunks whose processing fails are skipped.

        Args:
            output_path: Destination raster path, opened in ``'w'`` mode; its
                parent directory is created when the path names one.
        """
        start_time = time.time()

        try:
            self.get_tif_info_and_create_mask()
        except Exception as e:
            logging.error(f"Error in get_tif_info_and_create_mask: {str(e)}")
            return

        self.profile.update(dtype=rasterio.float32, count=1, compress='lzw')

        # Ensure output directory exists
        self._ensure_parent_dir(output_path)

        total_chunks = len(self.valid_windows)
        processed_chunks = 0

        with rasterio.open(output_path, 'w', **self.profile) as dst:
            # tqdm reports its own rate and ETA, so no manual ETA bookkeeping
            # is kept here.
            for window in tqdm(self.valid_windows, desc="Processing chunks"):
                result = self.process_chunk(window)
                if result is None:
                    continue
                predictions, result_window = result
                dst.write(predictions, 1, window=result_window)
                processed_chunks += 1

        end_time = time.time()
        total_runtime = end_time - start_time
        logging.info(f"Predictions saved to {output_path}")
        logging.info(f"Total runtime: {total_runtime/60:.2f} minutes")
        logging.info(f"Processed {processed_chunks} out of {total_chunks} chunks")

    @staticmethod
    def generate_windows(shape, chunk_size):
        """Yield ``Window`` objects tiling ``shape`` in row-major order.

        Windows are ``chunk_size`` (rows, cols) large, clipped at the
        bottom/right edges.
        """
        rows, cols = shape
        for row in range(0, rows, chunk_size[0]):
            for col in range(0, cols, chunk_size[1]):
                yield Window(col, row,
                             min(chunk_size[1], cols - col),
                             min(chunk_size[0], rows - row))

In [6]:
# Usage example: paths and model name shared by the prediction cells below
model_path = r'F:\cache_data\model_path\sb\soil_type\autogluon\autogluon_20240902'
model_name = 'WeightedEnsemble_L2'  # e.g. 'WeightedEnsemble_L2'
input_folder = r'F:\tif_features\county_feature\sb'
output_path = r'C:\Users\Runker\Desktop\test\CSC\prediction_wl2.tif'
shp_extent_path = r'C:\Users\Runker\Desktop\test\SHp\SB_EXTENT.shp'

In [7]:
# Prediction clipped to the vector (shapefile) extent
predictor = OptimizedSingleProcessVectorMaskedTIFPredictor(model_path, model_name, input_folder, shp_extent_path, chunk_size=(1000, 1000))
predictor.predict(output_path)

2024-09-04 13:55:43,494 - INFO - TIF dimensions: 12573 x 12150 pixels
2024-09-04 13:55:43,494 - INFO - Total raster area: 3819048750.0
2024-09-04 13:55:43,494 - INFO - Total vector area: 1697098094.7289455
2024-09-04 13:55:43,494 - INFO - Intersection area: 1695684300.7598128
2024-09-04 13:55:43,494 - INFO - Percentage of vector area within raster: 99.92%
2024-09-04 13:55:43,566 - INFO - Area to process: 67827322 pixels
2024-09-04 13:55:43,577 - INFO - Number of chunks to process: 95
Processing chunks:   2%|▏         | 2/95 [04:00<3:20:29, 129.35s/it]

In [None]:
# Multi-process, streamed prediction over the full raster using 20 x 20 pixel chunks.
# Writes to the same output_path as the other prediction cells.
predictor = LargeScaleTIFPredictor(model_path, model_name, input_folder, chunk_size=(20, 20))
predictor.predict(output_path)

In [None]:
# Multi-process prediction that preloads features and assembles the output in memory.
# Writes to the same output_path as the other prediction cells.
predictor = ProgressTrackingTIFPredictor(model_path, model_name, input_folder)
predictor.predict(output_path)

In [None]:
# Single-process prediction over the full raster with the default 100 x 100 chunk size.
# Writes to the same output_path as the other prediction cells.
predictor = TIFPredictor(model_path, model_name, input_folder)
predictor.predict(output_path)