# Merge LIDAR

In [None]:
import os
import glob
import subprocess

# Download the 6 Colorado tiles from
# https://langnico.github.io/globalcanopyheight/assets/tile_index.html

# Define input folder containing lidar tiles
tif_dir = r"C:\Users\OneDrive\Desktop\GLOBALPCL\lidarHAG"

# Define merged output location
output_tif  = r"C:\Users\OneDrive\Desktop\GLOBALPCL\lidarHAG\LIDAR_GEDI_CO.tif"

# Collect the downloaded canopy-height tiles. The list is sorted so that the
# order handed to gdal_merge is the same on every run.
tifs = sorted(glob.glob(os.path.join(tif_dir, "*Map*.tif")))

if not tifs:
    # Launching gdal_merge without any input file only produces a usage error,
    # so report the real problem instead.
    print(f"No input tiles matching '*Map*.tif' found in {tif_dir}; nothing to merge.")
else:
    # Prepare the gdal_merge command for HAG
    merge_command_hag = [
        "python",
        # Change this to the gdal_merge.py script of your local GDAL install
        "C:\\Users\\anaconda3\\envs\\lidarpods\\Scripts\\gdal_merge.py",
        "--config", "CHECK_DISK_FREE_SPACE", "FALSE",
        #"-ot", "Byte",
        "-o", output_tif,
        "-n", "255",
        "-a_nodata", "255",
        *tifs,
    ]

    # Run the gdal_merge command for HAG and capture stdout/stderr as text
    process_hag = subprocess.run(merge_command_hag, capture_output=True, text=True)

    # Check if the command for HAG was successful
    if process_hag.returncode != 0:
        # An error occurred, print the error
        print("Error occurred while merging TIFF files HAG:")
        print(process_hag.stderr)
    else:
        print("TIFF files merged successfully for HAG.")



# Tiling the PCL

In [None]:
import os
from itertools import product
import rasterio as rio
from rasterio import windows

# set path to input PCL
in_path = "C:\\Users\\OneDrive\\Desktop\\GLOBALPCL\\Input\\"
input_filename = "pcl_west_wgs_CO.tif"

# Folder that receives the tiles, and the file-name template for each tile.
# The two placeholders are filled with the tile's column and row offsets.
out_path = "C:\\Users\\OneDrive\\Desktop\\GLOBALPCL\\Tiles\\"
output_filename = "tile_{}-{}.tif"

# Tile size in pixels (width, height)
widthtile, heighttile = 200, 200

def get_tiles(ds, width=widthtile, height=heighttile):
    """Yield ``(window, transform)`` pairs that tile the raster *ds*.

    Tiles are generated column by column (all row offsets for the first
    column offset, then the next column offset, ...). Each window is
    intersected with the full raster extent, so tiles on the right and
    bottom edges may be smaller than ``width`` x ``height``.

    Parameters
    ----------
    ds : open rasterio dataset; its ``meta`` and ``transform`` are read.
    width, height : tile size in pixels.
    """
    n_cols = ds.meta['width']
    n_rows = ds.meta['height']
    full_extent = windows.Window(col_off=0, row_off=0, width=n_cols, height=n_rows)

    for col_off in range(0, n_cols, width):
        for row_off in range(0, n_rows, height):
            tile = windows.Window(col_off=col_off, row_off=row_off, width=width, height=height)
            # Clip the tile to the raster so edge tiles do not run off the image
            window = tile.intersection(full_extent)
            yield window, windows.transform(window, ds.transform)

# Cut the input raster into tiles and write each one to out_path. Tiles that
# contain nothing but NoData are skipped (not expected for land-locked CO).
os.makedirs(out_path, exist_ok=True)  # make sure the tile folder exists before writing

with rio.open(os.path.join(in_path, input_filename)) as inds:
    nodata = inds.nodata  # NoData value declared by the dataset, or None

    meta = inds.meta.copy()

    for window, transform in get_tiles(inds):
        print(window)
        data = inds.read(window=window)

        # Only an all-NoData tile is dropped. When the dataset declares no
        # NoData value there is nothing to test against, so the tile is kept
        # (previously that case silently wrote no tiles at all).
        if nodata is not None and (data == nodata).all():
            continue

        meta['transform'] = transform
        meta['width'], meta['height'] = window.width, window.height
        outpath = os.path.join(out_path, output_filename.format(int(window.col_off), int(window.row_off)))
        with rio.open(outpath, 'w', **meta) as outds:
            outds.write(data)


# Create a list of filepaths to the tiles of interest

In [None]:
import glob
import os

# Path to the directory
directory_path = 'C:\\Users\\OneDrive\\Desktop\\GLOBALPCL\\Tiles\\'

# Pattern to match all .tif files
pattern = '*.tif'

# Construct full pattern path
full_pattern = os.path.join(directory_path, pattern)

# List all .tif files in the directory (sorted so tiles are always processed
# in the same order)
tif_files = sorted(glob.glob(full_pattern))

# Tile names without directory or extension, index-aligned with tif_files;
# used later to name the per-tile CSV outputs.
filenames_without_extension = [
    os.path.splitext(os.path.basename(file_path))[0] for file_path in tif_files
]

# Show the first name as a sanity check; an empty folder is reported rather
# than raising IndexError.
if filenames_without_extension:
    print(filenames_without_extension[0])
else:
    print(f"No .tif tiles found in {directory_path}")
print(len(filenames_without_extension))

# Get the data from the Planetary Computer and export as CSV

In [None]:
#https://planetarycomputer.microsoft.com/dataset/io-lulc-9-class#Example-Notebook

import rasterio
from rasterio.warp import transform as warp_transform
import dask.array as da
import dask.distributed
from matplotlib.colors import ListedColormap
import pystac_client
import pyproj
import cartopy.crs as ccrs
import numpy as np
import pandas as pd
import planetary_computer
import rasterio
import rasterio.features
import stackstac
from pystac.extensions.eo import EOExtension as eo
import dask.dataframe as dd
import datetime
import odc.stac
import rioxarray
import time

# LIDAR HAG 
with rasterio.open(r"C:\Users\OneDrive\Desktop\GLOBALPCL\lidarHAG\LIDAR_GEDI_CO.tif") as src:
    # Band 1 of the merged lidar raster plus the georeferencing needed to
    # look pixels up by coordinate later on.
    arrayHAG = src.read(1)
    hag_transform = src.transform
    hag_crs = src.crs

# Process every PCL tile: w is the tile index (also used to name the output),
# tif_path the path of the tile's GeoTIFF.
for w, tif_path in enumerate(tif_files):
    #try:
    start_time = datetime.datetime.now()

    # Open the GeoTIFF file
    with rasterio.open(tif_path) as src:
        array = src.read()
        # Get the dimensions of the array
        bands, height, width = array.shape
        # Get the affine transformation for the GeoTIFF
        affine_transform = src.transform

        # Get the dimensions of the image
        width = src.width
        height = src.height

        # Map coordinates of every pixel centre, row by row, so the order
        # matches array.reshape(bands, -1) used for the PCL values.
        centres = [
            affine_transform * (col + 0.5, row + 0.5)
            for row in range(height)
            for col in range(width)
        ]
        xs = [x for x, _ in centres]
        ys = [y for _, y in centres]

        # Reproject all pixel centres to longitude/latitude (WGS84) in a
        # single call instead of one reprojection call per pixel.
        lons, lats = warp_transform(src.crs, 'EPSG:4326', xs, ys)

        # List of (lon, lat) tuples, one per pixel
        wgs_coords = list(zip(lons, lats))

    print(len(wgs_coords))

    # STAC client for the Planetary Computer; assets are signed on access
    catalog = pystac_client.Client.open(
        "https://planetarycomputer.microsoft.com/api/stac/v1",
        modifier=planetary_computer.sign_inplace,
    )

    # Mean longitude / latitude of all pixel centres of the tile
    n_points = len(wgs_coords)
    central_lon = sum(pt[0] for pt in wgs_coords) / n_points
    central_lat = sum(pt[1] for pt in wgs_coords) / n_points

    # Half-width / half-height of the search box in degrees
    offset_lon, offset_lat = 0.25, 0.15

    # Bounding box centred on the tile: [west, south, east, north]
    # NOTE(review): the box size is fixed and not derived from the tile
    # extent - confirm it always covers the whole tile.
    bbox_of_interest = [
        central_lon - offset_lon,
        central_lat - offset_lat,
        central_lon + offset_lon,
        central_lat + offset_lat,
    ]

    print("Bounding box: ", bbox_of_interest)

    #######################################################
    #######################################################
    # LULC Global 9 - Classes #############################
    #######################################################
    #######################################################

    # 0 - No Data
    # 1 - Water
    # 2 - Trees
    # 4 - Flooded vegetation
    # 5 - Crops
    # 7 - Built area
    # 8 - Bare ground
    # 9 - Snow/ice
    # 10 - Clouds
    # 11 - Rangeland

    search = catalog.search(collections=["io-lulc-9-class"], bbox=bbox_of_interest)

    items = search.item_collection()

    item = items[0]

    LULC_stack = (
        stackstac.stack(
            items,
            dtype=np.ubyte,
            fill_value=255,
            bounds_latlon=bbox_of_interest,
            sortby_date=False,
        )
        .assign_coords(
            time=pd.to_datetime([item.properties["start_datetime"] for item in items])
            .tz_convert(None)
            .to_numpy()
        )
        .sortby("time")
    )
    # Assuming the 'classes' attribute is accessible from 'item'
    print(item.properties)

    #######################################################
    #######################################################
    # SENTINEL 2 ##########################################
    #######################################################
    #######################################################
    
    # Sentinel adjust time window
    time_of_interest = "2021-04-01/2021-08-01"

    search = catalog.search(
        collections=["sentinel-2-l2a"],
        bbox=bbox_of_interest,
        datetime=time_of_interest,
        query={"eo:cloud_cover": {"lt": 10}},
    )

    items = search.item_collection()

    least_cloudy_item = min(items, key=lambda item: eo.ext(item).cloud_cover)

    utm_crs = pyproj.CRS.from_user_input(f'EPSG:326{(int((central_lon + 180) / 6) % 60) + 1}')

    # Create a stack from the least cloudy item
    if least_cloudy_item:
        sentinel_stack = stackstac.stack(
            [least_cloudy_item],
            bounds_latlon=bbox_of_interest,
            epsg=utm_crs.to_epsg(),
            resolution=10,
            assets=["B01", "B02", "B03", "B04", "B05", "B06", "B07", "B08", "B8A", "B09", "B11", "B12"],  
            fill_value=0
        )

        print("Stack created successfully from the least cloudy item")
    else:
        print("No least cloudy item found to create a stack")

    #######################################################
    #######################################################
    # Landsat #############################################
    #######################################################
    #######################################################

    # time_of_interest = "2021-04-01/2021-08-01"

    # search = catalog.search(
    #     collections=["landsat-c2-l2"],
    #     bbox=bbox_of_interest,
    #     datetime=time_of_interest,
    #     query={"eo:cloud_cover": {"lt": 10}},
    # )

    # items = search.item_collection()

    # selected_item = min(items, key=lambda item: eo.ext(item).cloud_cover)

    # utm_crs = pyproj.CRS.from_user_input(f'EPSG:326{(int((central_lon + 180) / 6) % 60) + 1}')

    # bands_of_interest = ["green", "nir08"]
    # Landsat_stack = odc.stac.stac_load(
    #     [selected_item], bands=bands_of_interest, bbox=bbox_of_interest
    # ).isel(time=0)


    #######################################################
    #######################################################
    #######################################################
    #######################################################
    #######################################################

    # items = list(catalog.get_collection("hgb").get_all_items())
    # item = items[0]

    # da = rioxarray.open_rasterio(
    #     item.assets["aboveground"].href, chunks=dict(x=2560, y=2560)
    # )

    # # Transform our data array to a dataset by selecting the only data variable ('band')
    # # renaming it to something useful ('biomass')
    # ds = da.to_dataset(dim="band").rename({1: "biomass"})
    # ds

    #######################################################
    #######################################################
    #######################################################
    #######################################################
    #######################################################



    # Define a list of geographic coordinates for the points of interest
    points = wgs_coords

    # Initialize the WGS 84 geographic coordinate system (EPSG:4326)
    geographic = pyproj.CRS('EPSG:4326')

    # PCL values flattened to one row per pixel. reshape is row-major, the
    # same order in which the pixel coordinates were generated.
    df_PCL = pd.DataFrame(array.reshape(bands, -1).T, columns=[f'Band_{i+1}' for i in range(bands)])
    df_PCL.columns = ["PCLVALUE"]  # assumes a single-band PCL raster

    df_list = []    # lazy LULC samples, one per point
    df2_list = []   # lazy Sentinel-2 samples, one per point
    #df3_list = []
    hag_values = []  # lidar HAG value (or NaN) per point

    lons = [lon for lon, _ in points]
    lats = [lat for _, lat in points]

    # Project all points once into utm_crs, the CRS the Sentinel stack was
    # built in. Deriving a zone per point could pick a different zone than
    # the stack's for tiles that straddle a zone boundary.
    # NOTE(review): LULC_stack is assumed to be in the same UTM zone - confirm.
    transformer = pyproj.Transformer.from_crs(geographic, utm_crs, always_xy=True)
    utm_xs, utm_ys = transformer.transform(lons, lats)

    # Project all points into the HAG raster's CRS in a single call
    hag_xs, hag_ys = warp_transform(geographic, hag_crs, lons, lats)
    inverse_hag_transform = ~hag_transform  # map coordinates -> pixel coordinates
    hag_rows, hag_cols = arrayHAG.shape

    for utm_x, utm_y, hag_x, hag_y in zip(utm_xs, utm_ys, hag_xs, hag_ys):
        # Extract values across years for the given point LULC
        values_across_years_LULC = LULC_stack.sel(x=utm_x, y=utm_y, method='nearest')#.values
        df_list.append(values_across_years_LULC)

        values_across_sentinalbands = sentinel_stack.sel(x=utm_x, y=utm_y, method='nearest')#.values
        df2_list.append(values_across_sentinalbands)

        #values_across_landsatbands = Landsat_stack.sel(x=utm_x, y=utm_y, method='nearest')#.values
        #df3_list.append(values_across_landsatbands)

        # Pixel indices in the HAG raster. floor (not int truncation) is used
        # so fractional indices in (-1, 0) stay negative and are rejected by
        # the bounds check instead of being mapped to pixel 0.
        col, row = inverse_hag_transform * (hag_x, hag_y)
        col, row = int(np.floor(col)), int(np.floor(row))

        # Extract the HAG value if the indices are within the raster dimensions
        if (0 <= col < hag_cols) and (0 <= row < hag_rows):
            hag_value = arrayHAG[row, col]
        else:
            hag_value = np.nan  # Use NaN for coordinates outside the raster

        hag_values.append(hag_value)

    

    # Evaluate all lazy LULC samples in one dask call, then build one row per
    # point with one column per year.
    df_COMPUTED = da.compute(*df_list)
    df = pd.concat([pd.DataFrame(row).T for row in df_COMPUTED], ignore_index=True)
    time_coords = LULC_stack.time.values
    time_columns = [f'LULC_{pd.to_datetime(str(stamp)).strftime("%Y")}' for stamp in time_coords]
    df.columns = time_columns
    print("LULC")

    # Same for the Sentinel-2 samples: one row per point, one column per band
    df2_COMPUTED = da.compute(*df2_list)
    df2 = pd.concat([pd.DataFrame(row) for row in df2_COMPUTED], ignore_index=True)
    df2.columns = ["B01", "B02", "B03", "B04", "B05", "B06", "B07", "B08", "B8A", "B09", "B11", "B12"]
    print("SENTINEL")

    #df3_COMPUTED = da.compute(*df3_list) 
    #df3 = pd.concat([pd.DataFrame(row) for row in df3_COMPUTED], ignore_index=True)
    # df3.columns = ["green"]
    # print("LANDSAT")

    # Side-by-side table: LULC years, Sentinel bands, PCL value, lidar HAG
    combined_df = pd.concat([df, df2, df_PCL], axis=1)
    combined_df['HAG'] = hag_values

    # Determine time (reported in seconds; printing the timedelta itself
    # would show h:mm:ss followed by the word "seconds")
    end_time = datetime.datetime.now()
    time_diff = end_time - start_time
    print(f"The operation took {time_diff.total_seconds():.1f} seconds.")

    # save dataframe of information
    combined_df.to_csv(f'C:\\Users\\OneDrive\\Desktop\\GLOBALPCL\\OutputCSV\\df_LULC_SENT_{filenames_without_extension[w]}.csv', index=False)
    # Pause between tiles before the next round of requests
    time.sleep(10)
    #except:
    #    continue


# Merge the data for modeling

In [None]:
import os
import pandas as pd

# Directory containing the CSV files
directory = "C:\\Users\\OneDrive\\Desktop\\GLOBALPCL\\OutputCSV"

# Read every per-tile CSV in the directory. Files are taken in sorted order so
# the row order of the merged table (and with it the seeded train/test split
# later on) is the same on every run.
csv_frames = [
    pd.read_csv(os.path.join(directory, filename))
    for filename in sorted(os.listdir(directory))
    if filename.endswith(".csv")
]

# A single concat avoids re-copying the accumulated table for every file.
# With no CSV files the result is an empty DataFrame.
combined_df = pd.concat(csv_frames, ignore_index=True) if csv_frames else pd.DataFrame()

# Now combined_df contains all the data from the CSV files in the directory
print(combined_df.shape)

# Filter out samples where all B01 to B12 columns are zeros
# ###! I NEED TO FIGURE OUT WHY SOME Sentinel information is all 0s !###
# NOTE(review): the Sentinel stack is built with fill_value=0 from a single
# scene, so pixels that scene does not cover would come back as 0 in every
# band - worth checking as the cause.
cols_to_check = ['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B8A', 'B09', 'B11', 'B12']
if not combined_df.empty:
    combined_df = combined_df.loc[~(combined_df[cols_to_check] == 0).all(axis=1)]


print(combined_df.shape)


# Data Balance test 1 - this is an idea, may be a better workaround for the 
# imbalanced dataset (consider weights, etc.)

In [None]:
import pandas as pd

# Assuming combined_df is your DataFrame
# First, count the rows where PCLVALUE > 80
# Rows with a high PCL value (> 80) and how many there are
high_mask = combined_df['PCLVALUE'] > 80
count_above_80 = high_mask.sum()

# Remaining rows to draw the counter-sample from.
# NOTE(review): the variable names (and the original comments) say "< 20" but
# the threshold in the code is 80; rows with PCLVALUE == 80 end up in neither
# group. Confirm which threshold is intended.
filtered_below_20 = combined_df[combined_df['PCLVALUE'] < 80]

# Randomly sample as many low rows as there are high rows. sample() raises
# when asked for more rows than exist, so cap the request at what is available.
n_to_sample = min(int(count_above_80), len(filtered_below_20))
subset_below_20 = filtered_below_20.sample(n=n_to_sample)

# Balanced table: all high rows plus the sampled low rows
final_subset = pd.concat([combined_df[high_mask], subset_below_20])

print(len(final_subset))

import matplotlib.pyplot as plt

# Distribution of PCLVALUE in the balanced subset
plt.figure(figsize=(10, 6))
plt.hist(final_subset['PCLVALUE'], bins=30, alpha=0.7, color='blue')
plt.title('Histogram of PCLVALUE')
plt.xlabel('PCLVALUE')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()



In [None]:
# import pandas as pd

# # Define a threshold to consider a PCLVALUE as high
# high_value_threshold = combined_df['PCLVALUE'].quantile(0.8)  # for example, take the 80th percentile

# # Filter the dataset for all high PCLVALUE rows
# high_value_df = combined_df[combined_df['PCLVALUE'] > high_value_threshold]

# # Sample with replacement to increase the number of high PCLVALUE rows
# # You can adjust the `n` to control how many times you want to replicate the high-end data
# oversampled_high_value_df = high_value_df.sample(n=2000000, replace=True)

# # Concatenate the original dataframe with the oversampled high PCLVALUE dataframe
# combined_df_oversampled = pd.concat([combined_df, oversampled_high_value_df])

# # Shuffle the dataframe if necessary
# combined_df_oversampled = combined_df_oversampled.sample(frac=1).reset_index(drop=True)
# combined_df_oversampled


# Model fit and evaluate 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras import layers, callbacks
import matplotlib.pyplot as plt
from tensorflow.keras.optimizers import Adam

# Predictors: the twelve Sentinel-2 bands plus lidar height above ground
feature_cols = ['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B8A', 'B09', 'B11', 'B12', 'HAG']

# Drop rows with a missing predictor or target. HAG is NaN for points that
# fall outside the lidar raster, and NaN inputs would pass through the scaler
# and make the network's loss NaN.
data = combined_df.dropna(subset=feature_cols + ['PCLVALUE'])
X = data[feature_cols]
y = data['PCLVALUE'] / 100  # Normalize (presumably PCLVALUE is a 0-100 value - confirm)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply it to the test split
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Building the model
# this is a simple nn model - need more data and training
# The sigmoid output keeps predictions in [0, 1], matching the scaled target.
model = tf.keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dropout(0.2),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.1),
    layers.Dense(1, activation='sigmoid')
])

# Early stopping monitor
# NOTE(review): with epochs=1 below, a patience of 10 can never trigger.
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

model.compile(optimizer=Adam(learning_rate=0.0002),
              loss='mean_squared_error',
              metrics=['mean_absolute_error'])

model.summary()

# Train the model with early stopping (20% of the training split is held out
# for validation)
model.fit(X_train_scaled, y_train, epochs=1, batch_size=256, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model
model.evaluate(X_test_scaled, y_test)

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# Plotting actual vs predicted values
plt.figure(figsize=(10, 10))
plt.scatter(y_test, y_pred, alpha=0.5, color='black', s=10)  # Adjusted marker size
plt.title('Modeled vs. Predicted PCLVALUE')
plt.xlabel('Modeled PCLVALUE')
plt.ylabel('Predicted PCLVALUE')
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--')  # Diagonal line
plt.grid(True)  # Added grid for better readability
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.show()

# Test learning rates - gif

In [None]:
import os
import glob
from PIL import Image

# Directory for the images
# Directory for the images
os.makedirs('plots', exist_ok=True)

# Learning rates from 0.00001 to 0.00034 in steps of 0.00001
learning_rates = [0.00001 * i for i in range(1, 35)]

# Frames written by this run, in learning-rate order. Using this list instead
# of globbing the folder keeps stale images from earlier runs out of the GIF.
image_paths = []

for lr in learning_rates:
    # A separate name is used for the sweep model and its predictions so the
    # trained `model` / `y_pred` from the model-fit cell are not overwritten.
    # NOTE(review): the output activation here is relu, while the main model
    # uses sigmoid - confirm this is intended.
    lr_model = tf.keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        layers.Dense(64, activation='relu'),
        layers.Dense(1, activation='relu')
    ])

    lr_model.compile(optimizer=Adam(learning_rate=lr),
                     loss='mean_squared_error',
                     metrics=['mean_absolute_error'])

    lr_model.fit(X_train_scaled, y_train, epochs=5, batch_size=256, validation_split=0.2)

    lr_pred = lr_model.predict(X_test_scaled)

    # Actual vs. predicted scatter for this learning rate
    plt.figure(figsize=(10, 10))
    plt.scatter(y_test, lr_pred, alpha=0.5, color='black', s=1)
    plt.title(f'Learning Rate: {lr:.5f}')
    plt.xlabel('Actual PCLVALUE')
    plt.ylabel('Predicted PCLVALUE')
    plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--')
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    frame_path = f'plots/lr_{lr:.5f}.png'
    plt.savefig(frame_path)
    plt.close()
    image_paths.append(frame_path)

# Create the GIF from the frames of this run and close the image files afterwards
images = [Image.open(filename) for filename in image_paths]
try:
    images[0].save('C:\\Users\\OneDrive\\Desktop\\GLOBALPCL\\gif\\learning_rate.gif',
                   save_all=True, append_images=images[1:], optimize=False, duration=400, loop=0)
finally:
    for image in images:
        image.close()

print("GIF saved as 'learning_rate.gif'")


# Model search - keras tuner

In [None]:
import keras_tuner as kt
import tensorflow as tf

def build_model(hp):
    """Build and compile a two-hidden-layer regression network for KerasTuner.

    Parameters
    ----------
    hp : hyperparameter container supplied by the tuner. An integer
        ``'units'`` (32-512, step 32) and a choice ``'learning_rate'``
        are requested from it.

    Returns
    -------
    A compiled ``tf.keras.Sequential`` model with MSE loss and MAE metric.
    """
    n_features = X_train_scaled.shape[1]

    # NOTE(review): both hidden layers request the hyperparameter under the
    # same name 'units'; presumably the tuner then gives both layers the same
    # width - confirm that is intended.
    first_hidden = tf.keras.layers.Dense(
        hp.Int('units', min_value=32, max_value=512, step=32),
        activation='relu',
        input_shape=(n_features,),
    )
    second_hidden = tf.keras.layers.Dense(
        hp.Int('units', min_value=32, max_value=512, step=32),
        activation='relu',
    )
    # Single output unit with no activation
    output = tf.keras.layers.Dense(1)

    model = tf.keras.Sequential([first_hidden, second_hidden, output])

    optimizer = tf.keras.optimizers.Adam(
        hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4, 1e-5])
    )
    model.compile(
        optimizer=optimizer,
        loss='mean_squared_error',
        metrics=['mean_absolute_error'],
    )
    return model

# Random search over the space defined in build_model, ranked by validation MAE
search_settings = dict(
    objective='val_mean_absolute_error',
    max_trials=20,
    executions_per_trial=1,
    project_name='PCL_model2',
)
tuner = kt.RandomSearch(build_model, **search_settings)

# Each trial trains for one epoch with 20% of the training data held out
tuner.search(X_train_scaled, y_train, epochs=1, validation_split=0.2)

# Pick the top-ranked model and the hyperparameters that produced it
best_models = tuner.get_best_models(num_models=1)
best_model = best_models[0]
best_hyperparameter_sets = tuner.get_best_hyperparameters(num_trials=1)
best_hyperparameters = best_hyperparameter_sets[0]

# Architecture of the best model
best_model.summary()

# Values of the best hyperparameters
print(best_hyperparameters.values)

# This code ingests a tile, downloads data, and fits the above model, then exports tif (a bit slow)

In [None]:
import rasterio
from rasterio.warp import transform as warp_transform
#https://planetarycomputer.microsoft.com/dataset/io-lulc-9-class#Example-Notebook
import dask.array as da
import dask.distributed
from matplotlib.colors import ListedColormap
import pystac_client
import pyproj
import cartopy.crs as ccrs
import numpy as np
import pandas as pd
import planetary_computer
import rasterio
import rasterio.features
import stackstac
from pystac.extensions.eo import EOExtension as eo
import dask.dataframe as dd
import datetime
import odc.stac
import rioxarray

# Tile to predict on, identified by its "<col_off>-<row_off>" suffix
tilenumber = "0-2200"

tif_files = [f"C:\\Users\\OneDrive\\Desktop\\GLOBALPCL\\Tiles\\tile_{tilenumber}.tif"]

with rasterio.open(r"C:\Users\OneDrive\Desktop\GLOBALPCL\lidarHAG\LIDAR_GEDI_CO.tif") as src:
    arrayHAG = src.read(1)  # Ensure you read only one band if that's all you need
    hag_transform = src.transform
    hag_crs = src.crs

# tif_path follows the loop (previously it was always tif_files[0], so every
# iteration would have processed the first tile).
for w, tif_path in enumerate(tif_files):
    #try:

    # Open the GeoTIFF file
    with rasterio.open(tif_path) as src:
        # Get the affine transformation for the GeoTIFF
        affine_transform = src.transform

        # Get the dimensions of the image
        width = src.width
        height = src.height

        # Map coordinates of every pixel centre, row by row
        centres = [
            affine_transform * (col + 0.5, row + 0.5)
            for row in range(height)
            for col in range(width)
        ]
        xs = [x for x, _ in centres]
        ys = [y for _, y in centres]

        # Reproject all pixel centres to longitude/latitude (WGS84) in a
        # single call instead of one reprojection call per pixel.
        lons, lats = warp_transform(src.crs, 'EPSG:4326', xs, ys)

        # List of (lon, lat) tuples, one per pixel
        wgs_coords = list(zip(lons, lats))

    print(len(wgs_coords))

    # Record the start time
    start_time = datetime.datetime.now()

    # STAC client for the Planetary Computer; assets are signed on access
    catalog = pystac_client.Client.open(
        "https://planetarycomputer.microsoft.com/api/stac/v1",
        modifier=planetary_computer.sign_inplace,
    )

    # Mean longitude / latitude of all pixel centres of the tile
    n_points = len(wgs_coords)
    central_lon = sum(pt[0] for pt in wgs_coords) / n_points
    central_lat = sum(pt[1] for pt in wgs_coords) / n_points

    #central_lon, central_lat = -105.556671,40.370342

    # Half-width / half-height of the search box in degrees; adjust as needed
    offset_lon, offset_lat = 0.25, 0.15

    # Bounding box centred on the tile: [west, south, east, north]
    bbox_of_interest = [
        central_lon - offset_lon,
        central_lat - offset_lat,
        central_lon + offset_lon,
        central_lat + offset_lat,
    ]

    print("Bounding box: ", bbox_of_interest)

    #######################################################
    #######################################################
    # LULC Global 9 - Classes #############################
    #######################################################
    #######################################################

    # 0 - No Data
    # 1 - Water
    # 2 - Trees
    # 4 - Flooded vegetation
    # 5 - Crops
    # 7 - Built area
    # 8 - Bare ground
    # 9 - Snow/ice
    # 10 - Clouds
    # 11 - Rangeland

    search = catalog.search(collections=["io-lulc-9-class"], bbox=bbox_of_interest)

    items = search.item_collection()

    item = items[0]

    LULC_stack = (
        stackstac.stack(
            items,
            dtype=np.ubyte,
            fill_value=255,
            bounds_latlon=bbox_of_interest,
            sortby_date=False,
        )
        .assign_coords(
            time=pd.to_datetime([item.properties["start_datetime"] for item in items])
            .tz_convert(None)
            .to_numpy()
        )
        .sortby("time")
    )
    # Assuming the 'classes' attribute is accessible from 'item'
    print(item.properties)

    #######################################################
    #######################################################
    # SENTINEL 2 ##########################################
    #######################################################
    #######################################################

    time_of_interest = "2021-04-01/2021-08-01"

    search = catalog.search(
        collections=["sentinel-2-l2a"],
        bbox=bbox_of_interest,
        datetime=time_of_interest,
        query={"eo:cloud_cover": {"lt": 50}},
    )

    items = search.item_collection()

    least_cloudy_item = min(items, key=lambda item: eo.ext(item).cloud_cover)

    utm_crs = pyproj.CRS.from_user_input(f'EPSG:326{(int((central_lon + 180) / 6) % 60) + 1}')

    # Create a stack from the least cloudy item
    if least_cloudy_item:
        sentinel_stack = stackstac.stack(
            [least_cloudy_item],
            bounds_latlon=bbox_of_interest,
            epsg=utm_crs.to_epsg(),
            resolution=10,
            assets=["B01", "B02", "B03", "B04", "B05", "B06", "B07", "B08", "B8A", "B09", "B11", "B12"],  
            fill_value=0
        )

        print("Stack created successfully from the least cloudy item")
    else:
        print("No least cloudy item found to create a stack")

    #######################################################
    #######################################################
    # Landsat #############################################
    #######################################################
    #######################################################

    # time_of_interest = "2021-04-01/2021-08-01"

    # search = catalog.search(
    #     collections=["landsat-c2-l2"],
    #     bbox=bbox_of_interest,
    #     datetime=time_of_interest,
    #     query={"eo:cloud_cover": {"lt": 10}},
    # )

    # items = search.item_collection()

    # selected_item = min(items, key=lambda item: eo.ext(item).cloud_cover)

    # utm_crs = pyproj.CRS.from_user_input(f'EPSG:326{(int((central_lon + 180) / 6) % 60) + 1}')

    # bands_of_interest = ["green", "nir08"]
    # Landsat_stack = odc.stac.stac_load(
    #     [selected_item], bands=bands_of_interest, bbox=bbox_of_interest
    # ).isel(time=0)


    #######################################################
    #######################################################
    #######################################################
    #######################################################
    #######################################################

    # items = list(catalog.get_collection("hgb").get_all_items())
    # item = items[0]

    # da = rioxarray.open_rasterio(
    #     item.assets["aboveground"].href, chunks=dict(x=2560, y=2560)
    # )

    # # Transform our data array to a dataset by selecting the only data variable ('band')
    # # renaming it to something useful ('biomass')
    # ds = da.to_dataset(dim="band").rename({1: "biomass"})
    # ds

    #######################################################
    #######################################################
    #######################################################
    #######################################################
    #######################################################

    # Define a list of geographic coordinates for the points of interest
    points = wgs_coords

    # Initialize the WGS 84 geographic coordinate system (EPSG:4326)
    geographic = pyproj.CRS('EPSG:4326')

    # Open the TIFF file
    with rasterio.open(tif_path) as src:
        # Read the raster data
        array = src.read()
        # Get the dimensions of the array
        bands, height, width = array.shape

    # PCL values flattened to one row per pixel. reshape is row-major, the
    # same order in which the pixel coordinates were generated.
    df_PCL = pd.DataFrame(array.reshape(bands, -1).T, columns=[f'Band_{i+1}' for i in range(bands)])
    df_PCL.columns = ["PCLVALUE"]  # assumes a single-band PCL raster

    df_list = []    # lazy LULC samples, one per point
    df2_list = []   # lazy Sentinel-2 samples, one per point
    df3_list = []
    hag_values = []  # lidar HAG value (or NaN) per point

    lons = [lon for lon, _ in points]
    lats = [lat for _, lat in points]

    # Project all points once into utm_crs, the CRS the Sentinel stack was
    # built in. Deriving a zone per point could pick a different zone than
    # the stack's for tiles that straddle a zone boundary.
    # NOTE(review): LULC_stack is assumed to be in the same UTM zone - confirm.
    transformer = pyproj.Transformer.from_crs(geographic, utm_crs, always_xy=True)
    utm_xs, utm_ys = transformer.transform(lons, lats)

    # Project all points into the HAG raster's CRS in a single call
    hag_xs, hag_ys = warp_transform(geographic, hag_crs, lons, lats)
    inverse_hag_transform = ~hag_transform  # map coordinates -> pixel coordinates
    hag_rows, hag_cols = arrayHAG.shape

    for utm_x, utm_y, hag_x, hag_y in zip(utm_xs, utm_ys, hag_xs, hag_ys):
        # Extract values across years for the given point LULC
        values_across_years_LULC = LULC_stack.sel(x=utm_x, y=utm_y, method='nearest')#.values
        df_list.append(values_across_years_LULC)

        values_across_sentinalbands = sentinel_stack.sel(x=utm_x, y=utm_y, method='nearest')#.values
        df2_list.append(values_across_sentinalbands)

        #values_across_landsatbands = Landsat_stack.sel(x=utm_x, y=utm_y, method='nearest')#.values
        #df3_list.append(values_across_landsatbands)

        # Pixel indices in the HAG raster. floor (not int truncation) is used
        # so fractional indices in (-1, 0) stay negative and are rejected by
        # the bounds check instead of being mapped to pixel 0.
        col, row = inverse_hag_transform * (hag_x, hag_y)
        col, row = int(np.floor(col)), int(np.floor(row))

        # Extract the HAG value if the indices are within the raster dimensions
        if (0 <= col < hag_cols) and (0 <= row < hag_rows):
            hag_value = arrayHAG[row, col]
        else:
            hag_value = np.nan  # Use NaN for coordinates outside the raster

        hag_values.append(hag_value)



    # Evaluate all lazy LULC samples in one dask call, then build one row per
    # point with one column per year.
    df_COMPUTED = da.compute(*df_list)
    df = pd.concat([pd.DataFrame(row).T for row in df_COMPUTED], ignore_index=True)
    time_coords = LULC_stack.time.values
    time_columns = [f'LULC_{pd.to_datetime(str(stamp)).strftime("%Y")}' for stamp in time_coords]
    df.columns = time_columns
    print("LULC")

    # Same for the Sentinel-2 samples: one row per point, one column per band
    df2_COMPUTED = da.compute(*df2_list)
    df2 = pd.concat([pd.DataFrame(row) for row in df2_COMPUTED], ignore_index=True)
    df2.columns = ["B01", "B02", "B03", "B04", "B05", "B06", "B07", "B08", "B8A", "B09", "B11", "B12"]
    print("SENTINEL")

    #df3_COMPUTED = da.compute(*df3_list) 
    #df3 = pd.concat([pd.DataFrame(row) for row in df3_COMPUTED], ignore_index=True)
    # df3.columns = ["green"]
    # print("LANDSAT")

    # Side-by-side table: LULC years, Sentinel bands, PCL value, lidar HAG
    combined_df_pred = pd.concat([df, df2, df_PCL], axis=1)
    combined_df_pred['HAG'] = hag_values

    # USE THE SAME VARIABLES AS MODEL FIT
    X_combined = combined_df_pred[['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B8A', 'B09', 'B11', 'B12', 'HAG']]

    # Scale with the MinMaxScaler fitted in the model-fit cell.
    # NOTE(review): the original author flagged this step as incorrect without
    # saying why - confirm. Also note that `scaler` and `model` are whatever
    # objects were last bound to those names by earlier cells.
    X_combined_scaled = scaler.transform(X_combined)

    combined_predictions = model.predict(X_combined_scaled)

    # Define output GeoTIFF file path
    output_tif_path = f"C:\\Users\\OneDrive\\Desktop\\GLOBALPCL\\OutputTIF\\prediction_tile_{tilenumber}_4.tif"

    # Start from the metadata of the tile being predicted (the copy already
    # carries the tile's transform, CRS and size)
    with rasterio.open(tif_path) as src:
        meta = src.meta.copy()

    # Update metadata for the output GeoTIFF
    meta.update(
        dtype=rasterio.float32,
        count=1,  # Number of bands
        nodata=None,  # Set nodata value if applicable
    )

    # One prediction per pixel, in row-major order -> back to image shape
    predictions_reshaped = combined_predictions.reshape((meta['height'], meta['width']))

    # Confirm that predictions_reshaped now has the correct shape
    print(predictions_reshaped.shape)

    # Write the reshaped predictions to the GeoTIFF
    with rasterio.open(output_tif_path, 'w', **meta) as dst:
        dst.write(predictions_reshaped.astype(rasterio.float32), 1)

    print(f"Predictions exported to {output_tif_path}")


    # Determine time (reported in seconds; printing the timedelta itself
    # would show h:mm:ss followed by the word "seconds")
    end_time = datetime.datetime.now()
    time_diff = end_time - start_time
    print(f"The operation took {time_diff.total_seconds():.1f} seconds.")
        
    #except:
    #    continue
print("done!!!!")

# End for now 