### Note well: this Notebook works for a single date [to be updated]

## Pre-processing of model datasets

* Predictors and Landsat LST band: resampling to 10 m with NN
* Predictors and Landsat LST band: raster grid alignment

In [3]:
import os
import numpy as np
import matplotlib.pyplot as plt
import rasterio
from rasterio.merge import merge
from rasterio.plot import show
from rasterio.plot import show_hist
from rasterio import mask
from rasterio.warp import reproject, Resampling, calculate_default_transform
from rasterio.enums import Resampling
from rasterio.mask import mask
from rasterio.features import rasterize
from rasterio.transform import xy
import ipywidgets as widgets
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon, Point
from collections import Counter
import sys
from rasterio.crs import CRS

### 1. Select the Sentinel-2 image date: the code retrieves the corresponding Landsat 8 date

#### Option 1: manual image selection

In [None]:
# date_s2_w = widgets.Dropdown(
#     options=['2022-08-09', '2023-02-15', '2023-03-22', '2023-06-25', '2023-11-17', '2024-02-05', '2024-05-10', '2024-07-14', '2024-07-29'], ##here, put all availables S2 dates
#     value='2023-03-22',
#     description='Sentinel-2 date:',
#     disabled=False,
#     layout={'width': 'max-content'},
#     style = {'description_width': 'initial'}
# )
# date_s2_w

In [None]:
#date_s2 = date_s2_w.value

#### Option 2: Tagged parameters cell with example values that will be overwritten by an external file
Use this method if you intend to run the notebook for more than a single date.

In [None]:
# The following parameters are just an example and will be rewritten by driver.py.
# Keep them as plain assignments so they can be overridden from outside.
date_s2_w = '2017-02-16'  # Sentinel-2 date, formatted YYYY-MM-DD (parsed later with '%Y-%m-%d')
season = "Summer"  # used in the averaged LST file name and in the output file names
total_samples = 100  # number of sample points distributed across the LCZ classes
combined = 1000  # suffix appended to the sampled-points GeoPackage file names

In [None]:
# Hand the injected parameter over to the name used by the rest of the notebook.
date_s2 = date_s2_w
print("Selected Date:", date_s2)

#### Rest of the code

In [6]:
# Lookup table pairing each Sentinel-2 date with a Landsat date (semicolon-separated).
df = pd.read_csv('LST_L89/L8_corresponding_to_S2.csv', delimiter=';')

# Fail with an explicit message when the selected date is missing from the table,
# instead of an opaque IndexError raised by `.values[0]` on an empty selection.
_rows_for_date = df.loc[df['Sentinel_date'] == date_s2]
if _rows_for_date.empty:
    raise ValueError(
        f"Sentinel-2 date {date_s2!r} not found in 'LST_L89/L8_corresponding_to_S2.csv'"
    )

# Content of the 'note' column for the first row matching the selected date.
check = _rows_for_date['note'].values[0]

In [None]:
# Landsat date paired with the selected Sentinel-2 date (first matching row).
is_selected_date = df['Sentinel_date'] == date_s2
date_l8 = df.loc[is_selected_date, 'Landsat_date'].values[0]
print(date_l8)

# The LST input is the seasonal average map, so its name depends on `season` only.
file_l8 = "Average_LST_map_{}.tif".format(season)
print(file_l8)

### 2. Open predictors and response variable files

In [8]:
# Raw string: the Windows path contains backslashes followed by letters (\T, \M, \I, \L),
# which are invalid escape sequences in a normal string literal. The resulting value is
# unchanged; only the escape-sequence warning goes away.
lst_file_path = os.path.join(r'E:\TESI\MODEL\Input\LST_L89\Mediated_LST/', file_l8)
print(lst_file_path)

E:\TESI\MODEL\Input\LST_L89\Mediated_LST/Average_LST_map_Summer.tif


In [9]:
# Predictor rasters stored together in the 20 m UCP folder.
_ucp_dir = 'UCP/UCP_20m'
BH_file_path = f'{_ucp_dir}/BH.tif'
BSF_file_path = f'{_ucp_dir}/BSF.tif'
IMD_file_path = f'{_ucp_dir}/IMD.tif'
SVF_file_path = f'{_ucp_dir}/SVF.tif'
TCH_file_path = f'{_ucp_dir}/TCH.tif'

In [10]:
# Class-fraction raster for the selected Sentinel-2 date (date kept with its dashes).
fractions = f'Fractions/Final_Class_Fraction_Layer_Masked_{date_s2}.tif'

### 3. Resample the predictors and response variable to 10 m and save to file

In [11]:
# Rasters to resample: the LST response first, then the UCP predictors, then the fractions.
input_predictors = [lst_file_path]
input_predictors += [
    BH_file_path,
    BSF_file_path,
    IMD_file_path,
    SVF_file_path,
    TCH_file_path,
]
input_predictors.append(fractions)

In [None]:
# Resample every input raster to a 10 m grid with nearest-neighbour resampling and
# write the result next to the source as '<name>_10m.tif'. Outputs that already exist
# are left untouched.
dst_resolution = 10  # target pixel size, in units of the raster CRS

for input_path in input_predictors:

    # '[:-4]' strips the '.tif' extension; later cells rebuild the name the same way.
    output_path = input_path[:-4] + '_10m.tif'

    if os.path.exists(output_path):
        print(f"Skipped (already exists): {output_path}")
        continue

    with rasterio.open(input_path) as src:
        # Fall back to EPSG:32632 when the file carries no CRS, and use that same
        # value everywhere below (grid computation, output profile, reprojection).
        src_crs = src.crs or CRS.from_epsg(32632)

        # Same CRS on both sides: only the pixel size changes.
        # Dedicated names avoid overwriting the notebook-level `transform`,
        # `width` and `height` used by later cells.
        dst_transform, dst_width, dst_height = calculate_default_transform(
            src_crs, src_crs, src.width, src.height, *src.bounds, resolution=dst_resolution
        )

        # The pixel size is carried by the transform; there is no 'res' profile key,
        # so it is not passed to the writer.
        dst_profile = src.meta.copy()
        dst_profile.update({
            'crs': src_crs,
            'transform': dst_transform,
            'width': dst_width,
            'height': dst_height,
            'compress': 'lzw'
        })

        with rasterio.open(output_path, 'w', **dst_profile) as dst:
            # Resample band by band so multi-band inputs keep all their bands.
            for band_index in range(1, src.count + 1):
                reproject(
                    source=rasterio.band(src, band_index),
                    destination=rasterio.band(dst, band_index),
                    src_transform=src.transform,
                    src_crs=src_crs,
                    dst_transform=dst_transform,
                    dst_crs=src_crs,
                    resampling=Resampling.nearest
                )
    print(f"Processed: {output_path}")

### 4. Align predictors and response variable (10 m rasters): reference is Fractions

In [17]:
# The 10 m fractions raster defines the grid all other rasters are aligned to.
reference = f"{fractions[:-4]}_10m.tif"

In [18]:
# 10 m versions of the rasters that must be aligned to the reference grid
# (UCP predictors first, LST last).
input_rasters = [
    source_path[:-4] + '_10m.tif'
    for source_path in (
        BH_file_path,
        BSF_file_path,
        IMD_file_path,
        SVF_file_path,
        TCH_file_path,
        lst_file_path,
    )
]

In [None]:
# Read the target grid (CRS, transform, size) from the reference raster.
with rasterio.open(reference) as ref:
    ref_crs = ref.crs or CRS.from_epsg(32632)
    ref_transform = ref.transform
    ref_width = ref.width
    ref_height = ref.height

# Align band 1 of every raster to the reference grid and overwrite the file in place.
for raster_path in input_rasters:
    with rasterio.open(raster_path) as src:
        profile = src.profile
        profile.update({
            'crs': ref_crs,
            'transform': ref_transform,
            'width': ref_width,
            'height': ref_height
        })

        # The source is passed as a plain array, which carries no nodata information,
        # so the dataset's nodata value is handed to reproject explicitly. This lets
        # cells of the reference grid that the source does not cover be filled with
        # nodata instead of a value that looks like valid data.
        src_nodata = src.nodata

        # Destination buffer on the reference grid (zero-initialised so its content
        # is deterministic before reproject fills it).
        aligned = np.zeros((ref_height, ref_width), dtype=src.dtypes[0])

        # Nearest neighbour keeps the original pixel values.
        reproject(
            source=src.read(1),  # only band 1 is aligned
            destination=aligned,
            src_transform=src.transform,
            # Same fallback idea as for the reference: a raster without CRS is
            # assumed to already be in the reference CRS.
            src_crs=src.crs or ref_crs,
            src_nodata=src_nodata,
            dst_transform=ref_transform,
            dst_crs=ref_crs,
            dst_nodata=src_nodata,
            resampling=Resampling.nearest
        )

    # The source is closed at this point, so the same path can be rewritten.
    with rasterio.open(raster_path, 'w', **profile) as dst:
        dst.write(aligned, 1)

    print(f'Aligned and overwritten: {raster_path}')

## Extraction of sample points for model training

Constraints:
1. points must not be placed outside the area of interest (AOI)
2. points must not be placed in areas with NoData (both in predictors and LST)
3. points must not be placed in areas with land consumption variations between 2015-2023

(For the first three constraints, create a validity mask)

4. stratified extraction of N points across LCZs (note: merge *LCZs Bare Soil or Sand* and *Low Plants*)

### 1. Create a validity mask to extract sample points: the mask refers to Sentinel-2 date

In [None]:
# --- Inputs ---
raster_paths = [
    lst_file_path[:-4] + '_10m.tif',
    BH_file_path[:-4] + '_10m.tif',
    BSF_file_path[:-4] + '_10m.tif',
    IMD_file_path[:-4] + '_10m.tif',
    SVF_file_path[:-4] + '_10m.tif',
    TCH_file_path[:-4] + '_10m.tif',
    fractions[:-4] + '_10m.tif'
]

aoi_path = 'AOI/AOI.shp'
validations_path = 'validation/validation_areas/change_area_CMM_dissolved_buffer30m_no_holes.gpkg'
output_path = season + '_binary_mask_' + date_s2.replace('-', '') + '.tif'

In [38]:
# Polygons to exclude from sampling. Same value as the `validations_path`
# assigned in the inputs cell, so this assignment is redundant.
validations_path = 'validation/validation_areas/change_area_CMM_dissolved_buffer30m_no_holes.gpkg'

In [39]:
# --- Use the first raster as the reference ---
with rasterio.open(raster_paths[0]) as ref:
    meta = ref.meta.copy()
    meta.update(dtype='uint8', count=1, nodata=0)
    transform = ref.transform
    crs = ref.crs
    width = ref.width
    height = ref.height
    shape = (height, width)

In [None]:
# --- Step 1: Identify NoData pixels across all rasters ---
nodata_mask = np.zeros(shape, dtype=bool)

# Path to the 10 m LST file so we can recognise it inside the loop
for path in raster_paths:
    with rasterio.open(path) as src:
        data = src.read(1, masked=True)  # returns a MaskedArray
        # Masked pixels are treated as NoData
        nodata_mask |= data.mask
        # 2b. Extra rule: in the LST layer, value 149 means "invalid" - LST from Landsat 8/9 in kelvin set 149 as NoData value
        if path == lst_file_path[:-4] + '_10m.tif':
            nodata_mask |= (data.data == 149) | np.isnan(data.data)

In [41]:
# --- Step 2: Rasterize AOI (inside = 1, outside = 0) ---
aoi = gpd.read_file(aoi_path).to_crs(crs)
aoi_mask = rasterize(
    [(geom, 1) for geom in aoi.geometry],
    out_shape=shape,
    transform=transform,
    fill=0,
    all_touched=False,
    dtype='uint8'
)

In [42]:
# --- Step 3: Rasterize validations (mask = 1 inside polygons) ---
validations = gpd.read_file(validations_path).to_crs(crs)
validation_mask = rasterize(
    [(geom, 1) for geom in validations.geometry],
    out_shape=shape,
    transform=transform,
    fill=0,
    all_touched=False,
    dtype='uint8'
)

In [43]:
# --- Step 4: Combine all masks ---
# Start with full ones (valid area), then mask out with zeros
final_mask = np.ones(shape, dtype='uint8')
final_mask[nodata_mask] = 0
final_mask[aoi_mask == 0] = 0
final_mask[validation_mask == 1] = 0

In [None]:
# --- Step 5: Save result ---
with rasterio.open('masks/' + output_path, 'w', **meta) as dst:
    dst.write(final_mask, 1)

print(f"Binary mask saved to {output_path}")

### 2. Open the LCZ map, resample it to 10 m and align it to the other rasters (if not done already)

In [None]:
from datetime import datetime

# Month of the selected Sentinel-2 date decides which LCZ map is used.
dt = datetime.strptime(date_s2, '%Y-%m-%d')
month = dt.month

# December-March -> March map, June-September -> June map,
# every other month -> November map.
_lcz_date_by_month = {m: '2023-03-22' for m in (12, 1, 2, 3)}
_lcz_date_by_month.update({m: '2023-06-25' for m in (6, 7, 8, 9)})
lcz_date = _lcz_date_by_month.get(month, '2023-11-17')

print(lcz_date)

In [46]:
# Open LCZ raster
with rasterio.open('LCZ/classified_Random Forest_' + lcz_date.replace('-', '') + '_20m.tif') as lcz_src:
    lcz_data = lcz_src.read(1).astype('float32')  # To allow np.nan
    lcz_crs = lcz_src.crs
    lcz_transform = lcz_src.transform

In [47]:
# Open final_mask raster to get 10m alignment
with rasterio.open('masks/' + output_path) as mask_src:
    target_shape = (mask_src.height, mask_src.width)
    target_transform = mask_src.transform
    target_crs = mask_src.crs
    target_profile = mask_src.profile.copy()
    target_profile.update({
        'dtype': 'float32',
        'nodata': np.nan,
        'count': 1
    })

In [48]:
# Prepare output array and metadata
resampled_lcz = np.full(target_shape, np.nan, dtype='float32')

In [None]:
# Reproject (resample + align)
reproject(
    source=lcz_data,
    destination=resampled_lcz,
    src_transform=lcz_transform,
    src_crs=lcz_crs,
    dst_transform=target_transform,
    dst_crs=target_crs,
    resampling=Resampling.nearest,
    dst_nodata=np.nan  # Ensure nodata stays as NaN
)

In [None]:
# Write the 10 m LCZ raster unless it already exists.
# A dedicated variable is used so that `output_path` keeps holding the mask file
# name: the cell that opens 'masks/' + output_path stays re-runnable after this one.
lcz_10m_path = 'LCZ/classified_Random Forest_' + lcz_date.replace('-', '') + '_10m.tif'

if os.path.exists(lcz_10m_path):
    print(f"File already exists: {lcz_10m_path}")
else:
    with rasterio.open(lcz_10m_path, 'w', **target_profile) as dst:
        dst.write(resampled_lcz, 1)
    print(f"Resampled LCZ saved to: {lcz_10m_path}")

### 3. Open LCZ (10m) and validity mask and mask LCZ with nans, merge the classes 104 and 106

In [None]:
# --- Open LCZ and mask rasters ---
with rasterio.open('LCZ/classified_Random Forest_' + lcz_date.replace('-', '') + '_10m.tif') as lcz_src:
    lcz = lcz_src.read(1).astype('float32')  # To support np.nan

with rasterio.open('masks/' + season + '_binary_mask_' + date_s2.replace('-', '') + '.tif') as mask_src:
    mask = mask_src.read(1)

In [52]:
# --- Step 1: Set LCZ pixels to NaN where mask is 0 ---
lcz[mask == 0] = np.nan

In [53]:
# --- Step 2: Replace LCZ values 106 and 104 with 1046 ---
lcz[np.isin(lcz, [104, 106])] = 1046

### 4. Extract N points within the validity mask and in each LCZ class (stratified random sampling) [NB: we could directly divide here into training/testing based on LCZs]

In [54]:
# --- Mask to valid pixels (non-NaN) ---
valid_mask = ~np.isnan(lcz)
valid_indices = np.argwhere(valid_mask)
valid_values = lcz[valid_mask]

In [55]:
# --- Count number of pixels per LCZ class ---
counts = Counter(valid_values.astype(int))
total_valid_pixels = sum(counts.values())

In [56]:
# --- Determine number of samples per class, proportional to pixel count ---
samples_per_class = {
    val: int(round(total_samples * count / total_valid_pixels))
    for val, count in counts.items()
}

In [57]:
# --- Sample coordinates for each class ---
sampled_points = []

for lcz_value, n_samples in samples_per_class.items():
    # Get indices for this class
    class_indices = valid_indices[valid_values == lcz_value]
    
    # Randomly sample n indices
    if len(class_indices) >= n_samples:
        sampled = class_indices[np.random.choice(len(class_indices), n_samples, replace=False)]
    else:
        sampled = class_indices  # If fewer pixels than requested, take all

    for row, col in sampled:
        x, y = xy(transform, row, col)
        sampled_points.append({
            'geometry': Point(x, y),
            'LCZ': int(lcz_value)
        })

In [None]:
# --- Create GeoDataFrame ---
gdf = gpd.GeoDataFrame(sampled_points, crs=crs)
gdf.to_file(f'masks/{season}_sampled_points_' + date_s2.replace('-', '') + f'_{combined}.gpkg', driver="GPKG")

In [None]:
# Display the sampled points (geometry and LCZ class).
gdf

In [61]:
# Lookup table pairing each Sentinel-2 date with a Landsat date (semicolon-separated).
df = pd.read_csv('LST_L89/L8_corresponding_to_S2.csv', delimiter=';')

# Fail with an explicit message when the selected date is missing from the table,
# instead of an opaque IndexError raised by `.values[0]` on an empty selection.
_rows_for_date = df.loc[df['Sentinel_date'] == date_s2]
if _rows_for_date.empty:
    raise ValueError(
        f"Sentinel-2 date {date_s2!r} not found in 'LST_L89/L8_corresponding_to_S2.csv'"
    )

# Content of the 'note' column for the first row matching the selected date.
check = _rows_for_date['note'].values[0]

In [None]:
# Landsat date paired with the selected Sentinel-2 date (first matching row).
is_selected_date = df['Sentinel_date'] == date_s2
date_l8 = df.loc[is_selected_date, 'Landsat_date'].values[0]
print(date_l8)

# The LST input is the seasonal average map, so its name depends on `season` only.
file_l8 = "Average_LST_map_{}.tif".format(season)
print(file_l8)

In [None]:
# Reload the sampled points from the GeoPackage written earlier.
sampled_points_file = f"masks/{season}_sampled_points_{date_s2.replace('-', '')}_{combined}.gpkg"
gdf = gpd.read_file(sampled_points_file)
print(gdf)

In [64]:
# === Step 2: List of raster files to sample ===
rasters = {
    'LST': 'E:\TESI\MODEL\Input\LST_L89\Mediated_LST/' + file_l8[:-4] + '_10m.tif',
    'BH': 'UCP/UCP_20m/BH_10m.tif',
    'BSF': 'UCP/UCP_20m/BSF_10m.tif',
    'IMD': 'UCP/UCP_20m/IMD_10m.tif',
    'SVF': 'UCP/UCP_20m/SVF_10m.tif',
    'TCH': 'UCP/UCP_20m/TCH_10m.tif',
    'Fractions': ('Fractions/Final_Class_Fraction_Layer_Masked_' + date_s2 + '_10m.tif', 
                 ['F_S', 'F_M', 'F_AC', 'F_BS', 'F_TV', 'F_W', 'F_G'])
}

### 2. Sample predictors and LST at the point locations and save the values in a dataframe

In [65]:
# === Step 3: Prepare output DataFrame ===
df = pd.DataFrame()
df["geometry"] = gdf.geometry

In [None]:
from rasterstats import point_query
from shapely.geometry import mapping

# === Step 4: Sample rasters ===
# Adds one column per single-band raster and one column per named band of each
# multi-band raster to `df`.
for key, raster_info in rasters.items():
    if isinstance(raster_info, str):
        # Single-band raster.
        # NOTE(review): point_query runs with its default interpolation here, while
        # the multi-band branch reads the pixel value directly — confirm both are intended.
        df[key] = point_query(gdf, raster_info) #, nodata=-9999
        continue

    # Multi-band raster with custom band names
    raster_path, band_names = raster_info
    with rasterio.open(raster_path) as src:
        # Validate the band count before doing any sampling work
        if src.count < len(band_names):
            raise ValueError(f"Raster '{raster_path}' has fewer bands than band names provided.")

        coords = [mapping(geom)["coordinates"] for geom in gdf.geometry]
        samples = list(src.sample(coords))  # one sequence of band values per point

    # One column per band, in band order
    for i, band_name in enumerate(band_names):
        col_name = f"{key}_{band_name}"  # e.g., Fractions_F_S
        df[col_name] = [s[i] for s in samples]

In [67]:
# === Step 4: Optional: Combine with original attributes ===
df_full = pd.concat([gdf.reset_index(drop=True).drop(columns="geometry"), df], axis=1)

In [None]:
# Display the combined table of point attributes and sampled raster values.
df_full

In [None]:
# Combine attributes and geometry
df_full_gdf = gpd.GeoDataFrame(df_full, geometry=gdf.geometry, crs=gdf.crs)

# Construct output path
output_path = f'masks/{season}_sampled_points_' + date_s2.replace('-', '') + f'_{combined}_all_bands.gpkg'

# Ensure directory exists
#os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Save to GeoPackage
df_full_gdf.to_file(output_path, driver='GPKG')

print(f"Saved to: {output_path}")

In [None]:
# Display the final GeoDataFrame that was written to the GeoPackage.
df_full_gdf