# Data Preparation for Machine Learning: ESA CCI Soil Moisture and Auxiliary Data
This Jupyter notebook prepares a dataset for machine learning by cropping the ESA CCI global soil moisture dataset to the study area, using an NDVI raster as the spatial reference. Additionally, auxiliary datasets (e.g., LST, NDVI, and TVDI) are upscaled to match the resolution of the ESA CCI soil moisture data (25 km). The final output is a dataframe for use in modeling.

## Libraries and Dependencies
We will start by importing the necessary libraries, including `gdal` for geospatial operations, `pandas` for data manipulation, and `skimage.transform` for image resizing.


In [None]:
import os
import numpy as np
from osgeo import gdal
from skimage.transform import resize
import pandas as pd
from glob import glob


## Functions for Geospatial Operations
Here, we define functions to handle tasks like cropping rasters based on a reference dataset, loading and saving raster data, and resampling auxiliary datasets to match the resolution of ESA CCI data.


In [None]:
# Function to crop the input raster based on the reference extent
def crop_raster(input_file, output_file, reference_extent):
    """Crop ``input_file`` to its overlap with ``reference_extent``.

    The cropped raster is written to ``output_file`` as a Float32 GeoTIFF
    that keeps the input's pixel size, band count and projection.

    Args:
        input_file: Path of the raster to crop.
        output_file: Path of the GeoTIFF to create.
        reference_extent: Sequence ordered ``[ulx, uly, lrx, lry]`` (the
            same ordering ``get_extent`` returns).

    Raises:
        ValueError: If the input cannot be opened, the two extents do not
            overlap, the output cannot be created, or the warp fails.
    """
    input_ds = gdal.Open(input_file)
    if input_ds is None:
        raise ValueError(f"Failed to open input raster file: {input_file}")

    output_ds = None
    try:
        # NOTE(review): assumes a north-up raster (xres > 0, yres < 0) with
        # no rotation terms - confirm for the datasets in use.
        ulx, xres, _, uly, _, yres = input_ds.GetGeoTransform()
        lrx = ulx + (input_ds.RasterXSize * xres)
        lry = uly + (input_ds.RasterYSize * yres)

        # Intersection of the input extent with the reference extent.
        min_x = max(ulx, reference_extent[0])
        max_x = min(lrx, reference_extent[2])
        max_y = min(uly, reference_extent[1])
        min_y = max(lry, reference_extent[3])

        if max_x <= min_x or max_y <= min_y:
            raise ValueError(
                f"Input raster {input_file} does not overlap the reference extent."
            )

        # Round instead of truncating so floating-point error in the extent
        # arithmetic cannot silently drop a column or row.
        new_width = max(1, int(round((max_x - min_x) / abs(xres))))
        new_height = max(1, int(round((max_y - min_y) / abs(yres))))

        # Create a new GeoTIFF file for the cropped raster
        driver = gdal.GetDriverByName('GTiff')
        output_ds = driver.Create(output_file, new_width, new_height,
                                  input_ds.RasterCount, gdal.GDT_Float32)
        if output_ds is None:
            raise ValueError(f"Failed to create output dataset: {output_file}")

        # North-up geotransform: the origin is the upper-left corner
        # (min_x, max_y) and the row step keeps the input's (negative) sign.
        output_ds.SetGeoTransform((min_x, xres, 0, max_y, 0, yres))
        output_ds.SetProjection(input_ds.GetProjection())

        # The destination dataset already defines the target grid, so no
        # outputBounds are passed (the previous call supplied them in the
        # wrong order for GDAL's (minX, minY, maxX, maxY) convention).
        if not gdal.Warp(output_ds, input_ds):
            raise ValueError(f"Failed to warp {input_file} into {output_file}")
    finally:
        # Dropping the references flushes and closes the datasets, also when
        # an error is raised part-way through.
        output_ds = None
        input_ds = None

# Get extent of a raster dataset
def get_extent(dataset):
    """Return the corner coordinates of ``dataset`` as ``[ulx, uly, lrx, lry]``.

    The upper-left corner comes straight from the geotransform origin; the
    lower-right corner is the origin plus the raster size multiplied by the
    pixel size along each axis. Rotation terms are ignored.
    """
    origin_x, pixel_w, _, origin_y, _, pixel_h = dataset.GetGeoTransform()
    cols = dataset.RasterXSize
    rows = dataset.RasterYSize
    return [
        origin_x,
        origin_y,
        origin_x + (cols * pixel_w),
        origin_y + (rows * pixel_h),
    ]


### Resampling Auxiliary Datasets
The function `resample_raster` adjusts the resolution of auxiliary datasets like LST, NDVI, etc., to match the ESA CCI soil moisture dataset (25 km resolution).


In [None]:
# Resample a raster to match the ESA CCI resolution
def resample_raster(input_file, output_file, esacci_file, is_categorical=False):
    """Resample ``input_file`` onto the grid of ``esacci_file``.

    The output GeoTIFF takes its size, geotransform and projection from the
    ESA CCI raster, so arrays read from the two files have the same shape.
    Previously the size was derived from the input raster's dimensions while
    the geotransform was copied from the ESA CCI raster, so the output grid
    did not match the reference whenever the two extents differed.

    Args:
        input_file: Path of the auxiliary raster to resample.
        output_file: Path of the GeoTIFF to create.
        esacci_file: Path of the ESA CCI raster defining the target grid.
        is_categorical: When true the output is stored as Int32 instead of
            Float32.

    Raises:
        ValueError: If a raster cannot be opened, the output cannot be
            created, or the reprojection reports an error.
    """
    input_ds = gdal.Open(input_file)
    esacci_ds = gdal.Open(esacci_file)

    if input_ds is None or esacci_ds is None:
        raise ValueError("Failed to open input or ESACCI raster file.")

    output_ds = None
    try:
        data_type = gdal.GDT_Int32 if is_categorical else gdal.GDT_Float32

        driver = gdal.GetDriverByName('GTiff')
        # The target grid is the reference grid: same columns and rows.
        output_ds = driver.Create(output_file, esacci_ds.RasterXSize,
                                  esacci_ds.RasterYSize, input_ds.RasterCount,
                                  data_type)
        if output_ds is None:
            raise ValueError(f"Failed to create output dataset: {output_file}")

        output_ds.SetGeoTransform(esacci_ds.GetGeoTransform())
        output_ds.SetProjection(esacci_ds.GetProjection())

        # NOTE(review): nearest-neighbour picks a single source pixel per
        # output cell; an aggregating method (average for continuous data,
        # mode for categorical data) may suit coarsening better - confirm
        # before changing, since it alters the resulting values.
        status = gdal.ReprojectImage(input_ds, output_ds,
                                     input_ds.GetProjection(),
                                     esacci_ds.GetProjection(),
                                     gdal.GRA_NearestNeighbour)
        if status:
            raise ValueError(f"Failed to resample {input_file} into {output_file}")
    finally:
        # Dropping the references flushes and closes the datasets, also when
        # an error is raised part-way through.
        output_ds = None
        input_ds = None
        esacci_ds = None


### Creating the Dataframe
This function processes ESA CCI soil moisture and auxiliary datasets, extracting pixel values and creating a structured dataframe for machine learning.


In [None]:
def _read_raster_array(path):
    """Open ``path`` and return ``(pixel_array, geotransform)``."""
    dataset = gdal.Open(path)
    if dataset is None:
        raise ValueError(f"Failed to open raster file: {path}")
    # The dataset reference is dropped on return, which closes the file
    # before the caller deletes it.
    return dataset.ReadAsArray(), dataset.GetGeoTransform()


# Function to extract pixel values and create a dataframe
def extract_pixel_values(ESACCI_folder, output_dir, esaccci_resolution):
    """Build a per-pixel dataframe from the ESA CCI rasters in a folder.

    For every ``.tif`` in ``ESACCI_folder`` the matching NDVI raster is
    located, the ESA CCI raster is cropped to the NDVI extent, the NDVI
    raster is resampled onto the cropped grid, and one row per pixel is
    collected.

    The auxiliary folders (``ndvi_dir``, ``lst_dir``, ``TVDI_folder``,
    ``precip_folder``, ``skin_temp_folder``, ``Covariates_folder``) and
    ``temp_folder`` are read as module-level names.

    Args:
        ESACCI_folder: Folder holding the ESA CCI GeoTIFFs; the file name up
            to the first ``.`` is used as the date key.
        output_dir: Currently unused; kept for interface compatibility.
        esaccci_resolution: Currently unused; kept for interface
            compatibility.

    Returns:
        A dataframe with the full set of model columns. Columns for datasets
        that are not extracted yet are present but filled with NaN. Rows with
        a negative ESACCI value, or with NaN in an extracted column, are
        removed.

    Raises:
        ValueError: If a raster that is expected to exist cannot be opened.
    """
    columns = ['Date', 'Lon', 'Lat', 'LST', 'NDVI', 'TVDI', 'precipitation', 'soil_text', 'Landcover', 'Elevation', 'Slope', 'Soil_Bulkdensity','Skin_temperature', 'TWI', 'ESACCI']
    frames = []

    # Sorted so the row order does not depend on the filesystem listing order.
    for esacci_files in sorted(os.listdir(ESACCI_folder)):
        if not esacci_files.endswith(".tif"):
            continue
        date = esacci_files.split(".")[0]

        ndvi_file = os.path.join(ndvi_dir, f"NDVI_{date}.tif")
        # Paths for the auxiliary datasets that still have to be wired into
        # the resampling and extraction steps below.
        lst_file = os.path.join(lst_dir, f"LST_{date}.tif")
        tvdi_file = os.path.join(TVDI_folder, f"TVDI_{date}.tif")
        precipitation_file = os.path.join(precip_folder, f"precipitation_{date}.tif")
        skin_temp_file = os.path.join(skin_temp_folder, f"skin_temperature_{date}.tif")
        soil_text = os.path.join(Covariates_folder, "soil_texture.tif")
        lulc = os.path.join(Covariates_folder, f"LULC_{date[0:4]}.tif")
        elevation = os.path.join(Covariates_folder, "Elevation.tif")
        twi = os.path.join(Covariates_folder, "TWI.tif")
        bulkdensity = os.path.join(Covariates_folder, "soil_bulkdensity.tif")
        slope = os.path.join(Covariates_folder, "Slope.tif")
        esacci_file = os.path.join(ESACCI_folder, esacci_files)

        # Skipping files if not found
        if not os.path.exists(ndvi_file):
            print(f"NDVI file {ndvi_file} does not exist. Skipping to the next file.")
            continue

        cropped_esacci_file = os.path.join(temp_folder, f"temp_cropped_esaccci_{date}.tif")
        resampled_ndvi_file = os.path.join(temp_folder, f"temp_resampled_ndvi_{date}.tif")

        try:
            # The reference extent comes from the NDVI raster, whose existence
            # was checked above (the LST raster was opened here before).
            ndvi_ds = gdal.Open(ndvi_file)
            if ndvi_ds is None:
                raise ValueError(f"Failed to open NDVI raster file: {ndvi_file}")
            ndvi_extent = get_extent(ndvi_ds)
            ndvi_ds = None
            crop_raster(esacci_file, cropped_esacci_file, reference_extent=ndvi_extent)

            # Resample all auxiliary datasets to ESA CCI resolution
            resample_raster(ndvi_file, resampled_ndvi_file, cropped_esacci_file)

            # Continue this for other datasets...

            # Extracting data and constructing the dataframe
            # NOTE(review): the shape indexing below assumes single-band
            # rasters (2-D arrays) - confirm for the inputs in use.
            ndvi_data, _ = _read_raster_array(resampled_ndvi_file)
            esacci_data, geo_transform = _read_raster_array(cropped_esacci_file)

            # Pixel-centre coordinates: origin + index * step + half a step.
            lon = geo_transform[0] + np.arange(esacci_data.shape[1]) * geo_transform[1] + geo_transform[1] / 2
            lat = geo_transform[3] + np.arange(esacci_data.shape[0]) * geo_transform[5] + geo_transform[5] / 2
            lon_grid, lat_grid = np.meshgrid(lon, lat)

            temp_df = pd.DataFrame({
                'Date': date,
                'Lon': lon_grid.flatten(),
                'Lat': lat_grid.flatten(),
                'NDVI': ndvi_data.flatten(),
                'ESACCI': esacci_data.flatten()
            })

            # Drop invalid soil-moisture pixels, then rows with NaN in the
            # columns that were actually extracted. Calling dropna() on the
            # full column set would discard every row, because the columns
            # that are not populated yet are entirely NaN.
            temp_df = temp_df[temp_df['ESACCI'] >= 0].dropna()
            frames.append(temp_df)
        finally:
            # Remove temporary files, also when a step above failed
            for temp_path in (cropped_esacci_file, resampled_ndvi_file):
                if os.path.exists(temp_path):
                    os.remove(temp_path)
            # Continue this for other temporary files...

    if not frames:
        return pd.DataFrame(columns=columns)

    # Concatenate once and expose the full column set expected by callers.
    return pd.concat(frames, ignore_index=True).reindex(columns=columns)


## Final Remarks
The code above prepares the ESA CCI soil moisture and auxiliary
