<a href="https://colab.research.google.com/github/TewabeTigp/MODIS_LST/blob/main/RF_MODIS_LST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install rasterio

In [None]:
%pip install rasterio numpy matplotlib


# **Import Required Libraries**

In [3]:
import numpy as np
import pandas as pd
import rasterio
from rasterio.plot import show
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score



# **Load Stacked GeoTIFF File**

In [None]:
# Location of the multi-band (stacked) GeoTIFF used as model input
input_tif = "path/to/stacked_data.tif"

# Read the pixel values together with the georeferencing that later cells
# need; the context manager closes the dataset once everything is in memory.
# NOTE(review): read() is called without masking, so a numeric nodata
# sentinel (if the file defines one) stays in `data` as an ordinary value —
# presumably missing pixels are stored as NaN; confirm against the file.
with rasterio.open(input_tif) as src:
    data = src.read()          # all bands, array shaped (bands, height, width)
    crs = src.crs              # coordinate reference system of the raster
    transform = src.transform  # affine mapping from pixel indices to map coordinates
    profile = src.profile      # creation metadata, reused when writing the output

# Summarise what was loaded
print(
    f"Raster Shape: {data.shape} (Bands, Height, Width)",
    f"CRS: {crs}",
    f"Transform: {transform}",
    sep="\n",
)


# **Reshape the Raster Data**

In [None]:
# Band order expected in the stacked raster: nine predictors followed by the
# target (LST) as the last band.
band_names = ["NDVI", "EVI", "NDWI", "LAI", "ALB", "ELV", "SLP", "Fa", "CSR", "LST"]

# Flatten the raster into a 2D table (pixels as rows, bands as columns).
# Row i corresponds to flat pixel index i = row * width + col.
bands, height, width = data.shape
if bands != len(band_names):
    raise ValueError(
        f"Expected a raster with {len(band_names)} bands ({', '.join(band_names)}), "
        f"but the file has {bands}."
    )
reshaped_data = data.reshape(bands, height * width).T  # shape (pixels, bands)

# Build the modelling table in a single expression so that re-running this
# cell always starts from `reshaped_data`. Pixels with a NaN in any band are
# dropped and the index is renumbered 0..n-1.
df = (
    pd.DataFrame(reshaped_data, columns=band_names)
    .dropna()
    .reset_index(drop=True)
)

# Dataset overview. DataFrame.info() prints its report itself and returns
# None, so it must not be wrapped in print() (that emits a stray "None").
df.info()
print(df.describe())


# **Define Features and Target**

In [None]:
# Predictor bands fed to the model (everything except the LST band)
features = ["NDVI", "EVI", "NDWI", "LAI", "ALB", "ELV", "SLP", "Fa", "CSR"]

# Feature matrix (one column per predictor) and the target vector (LST)
X = df.loc[:, features]
y = df.loc[:, "LST"]


# **Train-Test Split**

In [None]:
# Random 80/20 hold-out split; the fixed seed keeps the partition
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Report how many samples ended up in each partition
print(f"Training data: {X_train.shape}, Testing data: {X_test.shape}")


# **Train Random Forest Regression Model**

In [None]:
# Fit a 100-tree random forest on the training partition, using all
# available cores (fit() returns the fitted estimator itself).
rf_model = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Rank the predictors by the forest's importance score, highest first
feature_importances = pd.DataFrame(
    {"Feature": features, "Importance": rf_model.feature_importances_}
).sort_values(by="Importance", ascending=False)
print(feature_importances)


# **Make Predictions on Test Set**

In [None]:
# Score the held-out pixels with the fitted forest
y_pred = rf_model.predict(X_test)

# Error statistics; RMSE is derived from MSE so both stay consistent
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# One metric per output line
print(
    f"Mean Absolute Error (MAE): {mae}",
    f"Mean Squared Error (MSE): {mse}",
    f"Root Mean Squared Error (RMSE): {rmse}",
    f"R-squared (R2): {r2}",
    sep="\n",
)


# **Predict LST for Entire Image**

In [None]:
# Predict LST for every pixel that has a complete set of predictor values.
#
# The model was trained on a table from which NaN rows were dropped, so rows
# of `reshaped_data` that contain NaN predictors must not be passed to
# predict(): depending on the scikit-learn version that either raises or
# yields values that mean nothing. Those pixels are left as NaN instead.
feature_matrix = reshaped_data[:, :-1]  # all bands except the last one (LST)
valid_pixels = ~np.isnan(feature_matrix).any(axis=1)

predicted_lst = np.full(feature_matrix.shape[0], np.nan, dtype=np.float64)
if valid_pixels.any():
    # Wrap the values in a DataFrame with the training column names so the
    # input matches what the forest was fitted on (avoids the
    # "X does not have valid feature names" warning).
    predicted_lst[valid_pixels] = rf_model.predict(
        pd.DataFrame(feature_matrix[valid_pixels], columns=features)
    )

# Back to the raster layout (height, width); row-major order matches the
# earlier reshape of `data`.
predicted_lst_raster = predicted_lst.reshape(height, width)


# **Save the Predicted LST as a GeoTIFF**

In [None]:
# Destination of the single-band prediction raster
output_tif = "path/to/predicted_lst.tif"

# Reuse the source raster's metadata (size, CRS, transform, ...) but declare
# one float32 band instead of the original band stack.
profile.update(count=1, dtype=rasterio.float32)

# Cast once to the dtype announced in the profile, then write it as band 1
lst_band = predicted_lst_raster.astype(np.float32)
with rasterio.open(output_tif, "w", **profile) as dst:
    dst.write(lst_band, 1)

print(f"Predicted LST saved as {output_tif}")


# **Plot the Results**

In [None]:
# Observed vs. modelled LST, one panel each
fig, ax = plt.subplots(1, 2, figsize=(12, 5))

# (image, title) per panel; the observed LST is taken from the last band of
# the stack (the band order puts LST last).
panels = (
    (data[-1], "Original LST"),
    (predicted_lst_raster, "Predicted LST (RF Model)"),
)
for axis, (image, title) in zip(ax, panels):
    axis.imshow(image, cmap="jet")
    axis.set_title(title)
    axis.axis("off")  # pixel-index ticks carry no information here

plt.tight_layout()
plt.show()


# **Implementing Spatial Cross-Validation**

In [None]:
%pip install geopandas scikit-learn-extra


## Define Spatial Folds

In [None]:
import geopandas as gpd
from sklearn.model_selection import GroupKFold

# `df` only holds band values, so the pixel coordinates have to be derived
# from the raster geometry. Rows of `df` are the rows of `reshaped_data`
# without any NaN, in the same order, so the same mask recovers which flat
# pixel index each row of `df` came from.
valid_flat_idx = np.flatnonzero(~np.isnan(reshaped_data).any(axis=1))
if valid_flat_idx.size != len(df):
    raise ValueError(
        "Cannot align df with the raster grid: "
        f"{valid_flat_idx.size} NaN-free pixels vs {len(df)} rows in df."
    )
pixel_rows, pixel_cols = np.divmod(valid_flat_idx, width)

# Pixel-centre map coordinates from the affine transform coefficients
# (x = a*col + b*row + c, y = d*col + e*row + f; +0.5 moves to the centre).
# NOTE(review): these are longitude/latitude only if the raster CRS is
# geographic; for a projected CRS they are easting/northing — confirm.
longitude = transform.a * (pixel_cols + 0.5) + transform.b * (pixel_rows + 0.5) + transform.c
latitude = transform.d * (pixel_cols + 0.5) + transform.e * (pixel_rows + 0.5) + transform.f

# Point layer with one geometry per modelled pixel (df itself is not modified)
gdf = gpd.GeoDataFrame(
    df.assign(Longitude=longitude, Latitude=latitude),
    geometry=gpd.points_from_xy(longitude, latitude),
)

# Define number of spatial folds
num_folds = 5
group_kfold = GroupKFold(n_splits=num_folds)

# Assign groups as equal-count west-to-east strips, so each fold holds out a
# spatially contiguous block rather than randomly scattered pixels.
gdf["Region"] = pd.qcut(gdf["Longitude"], num_folds, labels=False)

# Perform spatial cross-validation. Fold-local names are used on purpose so
# the loop does not overwrite the global X_train/X_test/y_train/y_test split
# or the main `rf_model` that other cells rely on.
for fold, (train_idx, test_idx) in enumerate(group_kfold.split(gdf, groups=gdf["Region"])):
    X_fold_train, X_fold_test = X.iloc[train_idx], X.iloc[test_idx]
    y_fold_train, y_fold_test = y.iloc[train_idx], y.iloc[test_idx]

    # Train a fresh Random Forest on the training strips
    fold_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    fold_model.fit(X_fold_train, y_fold_train)

    # Predict the held-out strip
    fold_pred = fold_model.predict(X_fold_test)

    # Evaluate model performance for each fold
    print(f"Fold {fold + 1}:")
    print(f"  MAE: {mean_absolute_error(y_fold_test, fold_pred):.4f}")
    print(f"  RMSE: {np.sqrt(mean_squared_error(y_fold_test, fold_pred)):.4f}")
    print(f"  R²: {r2_score(y_fold_test, fold_pred):.4f}\n")


# **Estimating Prediction Uncertainty**

In [None]:
%pip install mapie


## Quantile Regression for Uncertainty

In [None]:
# scikit-learn does not ship a RandomForestQuantileRegressor, so the interval
# is estimated from the spread of the individual trees of a standard forest.
# NOTE(review): this is an approximation of a quantile regression forest
# (it uses one prediction per tree, not the full set of leaf observations).


def forest_quantiles(forest, X, quantiles, chunk_size=50_000):
    """Return per-sample quantiles of the individual tree predictions.

    Parameters
    ----------
    forest : fitted RandomForestRegressor
        Its ``estimators_`` (the fitted trees) are queried one by one.
    X : array-like of shape (n_samples, n_features)
        Samples to predict; converted to a float32 array.
    quantiles : sequence of float
        Quantile levels in [0, 1].
    chunk_size : int, default 50_000
        Number of samples processed at once, to bound the size of the
        (n_trees, chunk_size) intermediate array.

    Returns
    -------
    numpy.ndarray of shape (len(quantiles), n_samples)
    """
    # The trees were fitted on plain arrays, so give them a plain array too.
    X_arr = np.asarray(X, dtype=np.float32)
    n_samples = X_arr.shape[0]
    result = np.empty((len(quantiles), n_samples), dtype=np.float64)
    for start in range(0, n_samples, chunk_size):
        stop = min(start + chunk_size, n_samples)
        per_tree = np.stack([tree.predict(X_arr[start:stop]) for tree in forest.estimators_])
        result[:, start:stop] = np.quantile(per_tree, quantiles, axis=0)
    return result


# Train the forest used for the interval estimate
qrf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
qrf.fit(X_train, y_train)

# Lower (5th percentile), median (50th) and upper (95th percentile) bounds
y_pred_lower, y_pred_median, y_pred_upper = forest_quantiles(qrf, X_test, [0.05, 0.5, 0.95])

# Compute prediction interval width
interval_width = y_pred_upper - y_pred_lower

# fill_between joins the points in the order given, so everything is sorted
# by observed LST first; unsorted x values would draw a scrambled polygon.
observed = np.asarray(y_test)
order = np.argsort(observed)
observed_sorted = observed[order]

# Plot prediction intervals
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(observed, y_pred_median, label="Predicted LST", alpha=0.6)
ax.fill_between(
    observed_sorted,
    y_pred_lower[order],
    y_pred_upper[order],
    color="gray",
    alpha=0.3,
    label="5th-95th Percentile Interval (90%)",  # 5%..95% spans 90%, not 95%
)
ax.plot(
    [observed_sorted[0], observed_sorted[-1]],
    [observed_sorted[0], observed_sorted[-1]],
    "--r",
    label="Ideal Fit",
)
ax.set_xlabel("Observed LST")
ax.set_ylabel("Predicted LST")
ax.legend()
ax.set_title("LST Prediction with Uncertainty Intervals")
plt.show()


# **Bayesian Uncertainty Estimation**

In [None]:
%pip install scipy


## Implement Bayesian Ridge Regression

In [None]:
from sklearn.linear_model import BayesianRidge

# Train Bayesian Ridge Regression Model
bayesian_ridge = BayesianRidge()
bayesian_ridge.fit(X_train, y_train)

# Predict with uncertainty estimates: mean and standard deviation of the
# predictive distribution for each test sample
y_pred, y_std = bayesian_ridge.predict(X_test, return_std=True)

# 95% interval under a normal approximation (mean +/- 1.96 standard deviations)
y_pred_lower = y_pred - 1.96 * y_std
y_pred_upper = y_pred + 1.96 * y_std

# fill_between joins the points in the order given, so the band is drawn
# from values sorted by observed LST; unsorted x values would produce a
# scrambled polygon instead of a band.
observed = np.asarray(y_test)
order = np.argsort(observed)
observed_sorted = observed[order]

# Plot Bayesian uncertainty
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(observed, y_pred, label="Predicted LST", alpha=0.6)
ax.fill_between(
    observed_sorted,
    y_pred_lower[order],
    y_pred_upper[order],
    color="gray",
    alpha=0.3,
    label="95% Confidence Interval",
)
ax.plot(
    [observed_sorted[0], observed_sorted[-1]],
    [observed_sorted[0], observed_sorted[-1]],
    "--r",
    label="Ideal Fit",
)
ax.set_xlabel("Observed LST")
ax.set_ylabel("Predicted LST")
ax.legend()
ax.set_title("Bayesian Uncertainty in LST Prediction")
plt.show()
