## **NDVI Data Processing by Region**

### **Overview**
This notebook processes NDVI data for multiple regions and years by merging TIFF files, extracting bands corresponding to specific days of the year (DOY), and saving the results as individual TIFF files. The workflow involves the following steps:

1. **Identify regions and years**: Treat each sub-folder of the input directory as a region and group its TIFF files by the four-digit year found in each filename.
2. **Merge TIFF files**: Merge multiple TIFF files for the same region and year.
3. **Extract DOY from bands**: Read the DOY from each band's description in the merged data and save each band with a recognizable DOY as a separate single-band TIFF file named with the region, year, and DOY.
4. **Parallel processing**: Use parallel processing to handle multiple regions simultaneously for faster execution.

In [None]:
import os
import re
import rasterio
from rasterio.merge import merge
from joblib import Parallel, delayed

In [None]:
# Root folder that holds one sub-folder of input TIFF files per region
base_input_dir = "/mnt/raid5/1114/original/"
# Root folder under which the per-band output TIFF files are written
base_output_dir = "/mnt/raid5/1114/preprocessed/"

# Coordinate reference system used for outputs when no other CRS is passed
DEFAULT_CRS = "EPSG:4326"

def extract_doy_from_band_name(band_name):
    """
    Function to extract DOY (Day of Year) from the band name.

    Args:
        band_name: Band description string. The DOY is taken from the digits
            that immediately follow ``Syn_VI_fitted``. May be ``None`` or
            empty when the band carries no description.

    Returns:
        The DOY as an int, or ``None`` when the name is missing or does not
        contain the expected pattern.
    """
    # A band without a description is reported as None; re.search would raise
    # TypeError on it, so treat missing names as "no DOY" instead of crashing.
    if not band_name:
        return None

    match = re.search(r'Syn_VI_fitted(\d+)', band_name)
    return int(match.group(1)) if match else None

def extract_year_from_filename(filename):
    """
    Pull the year out of a filename.

    The year is the first run of exactly four digits that follows an
    underscore and is itself followed by ``.``, ``-``, ``_`` or the end of the
    string. Returns the year as an int, or ``None`` when no such run exists.
    """
    year_match = re.search(r'_(\d{4})(?:[\.\-_]|$)', filename)
    return None if year_match is None else int(year_match.group(1))

def process_and_save_bands(mosaic, out_meta, band_descriptions, output_dir, year, region_name, crs=DEFAULT_CRS):
    """
    Function to extract bands from the merged TIFF file and save them with year and DOY information in the filename.

    Args:
        mosaic: Merged raster data indexed as (band, row, column).
        out_meta: Metadata mapping used as the base for every output file.
            It is copied, so the caller's mapping is left unchanged.
        band_descriptions: Sequence of band descriptions, one per band of
            ``mosaic``; the DOY is parsed from each entry.
        output_dir: Directory the output files are written to.
        year: Year placed in each output filename.
        region_name: Region name placed in each output filename.
        crs: Coordinate reference system written to each output file.

    Bands whose description yields no DOY are reported and skipped.
    """
    # Every band shares the same single-band profile, so build it once on a
    # copy instead of rewriting the caller's dict on each iteration.
    band_meta = dict(out_meta)
    band_meta.update({
        "driver": "GTiff",
        "height": mosaic.shape[1],
        "width": mosaic.shape[2],
        "count": 1,
        "dtype": 'int16',
        "crs": crs
    })

    for band_idx, band_data in enumerate(mosaic):
        band_description = band_descriptions[band_idx]
        # A missing description cannot carry a DOY; skip parsing so the band
        # is reported below rather than raising inside the regex call.
        doy = extract_doy_from_band_name(band_description) if band_description else None
        if doy is None:
            print(f"DOY value could not be extracted from band {band_idx + 1}. {band_description}")
            continue

        file_name = f"NDVI_{region_name}_{year}_{doy:03d}.tif"
        output_file = os.path.join(output_dir, file_name)

        with rasterio.open(output_file, "w", **band_meta) as dest:
            dest.write(band_data, 1)

        print(f"Saved: {output_file}")

def merge_tiff_files(tiff_files):
    """
    Function to merge the specified list of TIFF files and return the merged data.

    Args:
        tiff_files: Non-empty list of paths to the TIFF files to merge.

    Returns:
        A tuple ``(mosaic, out_meta, band_descriptions)``: the merged array,
        a copy of the first file's metadata updated with the mosaic's
        transform, height and width, and the first file's band descriptions.

    Raises:
        ValueError: If ``tiff_files`` is empty.
    """
    if not tiff_files:
        raise ValueError("No TIFF files to merge.")

    src_files_to_mosaic = []
    try:
        for tif_file in tiff_files:
            src_files_to_mosaic.append(rasterio.open(tif_file))

        first_src = src_files_to_mosaic[0]
        band_descriptions = first_src.descriptions

        mosaic, out_trans = merge(src_files_to_mosaic)
        out_meta = first_src.meta.copy()
    finally:
        # Close whatever was opened even if a later open() or merge() fails,
        # so no dataset handle is left dangling.
        for src in src_files_to_mosaic:
            src.close()

    # The first file's metadata describes a single tile; make the size and
    # transform describe the merged mosaic instead.
    out_meta.update({
        "transform": out_trans,
        "height": mosaic.shape[1],
        "width": mosaic.shape[2]
    })

    return mosaic, out_meta, band_descriptions

def process_ndvi_for_region(region_name, year, tiff_files, output_dir, crs='EPSG:4326'):
    """
    Merge one region/year worth of TIFF files and write out its bands.

    The files in ``tiff_files`` are merged, the merged metadata is stamped
    with ``crs`` and an ``int16`` dtype, and the bands are then saved into
    ``output_dir`` with ``region_name`` and ``year`` in their filenames.
    Progress is reported with print statements before and after.
    """
    file_total = len(tiff_files)
    print(f"Merging {file_total} TIF files for {year} in region: {region_name}")

    merged = merge_tiff_files(tiff_files)
    mosaic, out_meta, band_descriptions = merged

    # Stamp the requested CRS and output dtype onto the merged metadata
    out_meta["crs"] = crs
    out_meta["dtype"] = 'int16'

    process_and_save_bands(
        mosaic,
        out_meta,
        band_descriptions,
        output_dir,
        year,
        region_name,
        crs,
    )
    print(f"Completed processing for {year} in region: {region_name}")

def process_all_regions_for_ndvi(base_input_dir, base_output_dir, target_regions, n_jobs=5):
    """
    Function to process NDVI data for a list of specified regions in parallel.

    Args:
        base_input_dir: Directory containing one sub-folder per region.
        base_output_dir: Directory under which results are written, as
            ``<region>/preprocessed/<year>/``.
        target_regions: Names of the region sub-folders to process.
        n_jobs: Number of parallel jobs passed to joblib.
    """
    def process_region(region_name):
        """Group one region's ``.tif`` files by year and process each year."""
        region_path = os.path.join(base_input_dir, region_name)
        if not os.path.isdir(region_path):
            return

        print(f"Processing region: {region_name}")
        tiff_files_by_year = {}

        # os.listdir returns entries in arbitrary order; sort them so the
        # order in which files are handed to the merge step is the same on
        # every run and every filesystem.
        for filename in sorted(os.listdir(region_path)):
            if not filename.endswith('.tif'):
                continue
            year = extract_year_from_filename(filename)
            if not year:
                # Files without a recognizable year are left out
                continue
            tiff_files_by_year.setdefault(year, []).append(os.path.join(region_path, filename))

        for year in sorted(tiff_files_by_year):
            tiff_files = tiff_files_by_year[year]
            print(f"Processing {year} for region {region_name} with {len(tiff_files)} files.")
            output_dir = os.path.join(base_output_dir, region_name, 'preprocessed', str(year))
            os.makedirs(output_dir, exist_ok=True)
            process_ndvi_for_region(region_name, year, tiff_files, output_dir)

    # Process each region in parallel
    Parallel(n_jobs=n_jobs)(delayed(process_region)(region) for region in target_regions)

# Every directory directly under base_input_dir is treated as a region
target_regions = [
    entry
    for entry in os.listdir(base_input_dir)
    if os.path.isdir(os.path.join(base_input_dir, entry))
]

# Run the NDVI processing for all collected regions (in parallel)
process_all_regions_for_ndvi(base_input_dir, base_output_dir, target_regions)