In [25]:
import os
from datetime import datetime
from itertools import product
import rasterio
from rasterio import windows
from shapely.geometry import box
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.ticker as mticker
import numpy as np

# Functions

In [15]:
def get_labels(labelpath):
    """Collect the classification rasters produced by each labeling method.

    Looks inside the ``otsu``, ``kmeans``, ``gmm`` and ``majority``
    sub-directories of ``labelpath`` for ``.tif`` files and orders every list
    by the ``YYYY-MM-DD`` stamp that precedes the ``.tif`` extension.

    Parameters
    ----------
    labelpath : str
        Directory containing the four method sub-directories.

    Returns
    -------
    tuple of list of str
        ``(otsu_ims, kmeans_ims, gmm_ims, majority_ims)``, each sorted from
        the earliest to the latest date.

    Raises
    ------
    FileNotFoundError
        If one of the method sub-directories is missing.
    ValueError
        If a ``.tif`` file name does not end in ``YYYY-MM-DD.tif``.
    """
    def _dated_tifs(method):
        # One listing + sort shared by all methods instead of four copies.
        method_dir = os.path.join(labelpath, method)
        paths = [os.path.join(labelpath, f'{method}/{name}')
                 for name in os.listdir(method_dir)
                 if name.endswith('.tif')]
        # Characters [-14:-4] are the date stamp right before '.tif'.
        return sorted(paths, key=lambda path: datetime.strptime(path[-14:-4], '%Y-%m-%d'))

    otsu_ims, kmeans_ims, gmm_ims, majority_ims = (
        _dated_tifs(method) for method in ('otsu', 'kmeans', 'gmm', 'majority')
    )

    return otsu_ims, kmeans_ims, gmm_ims, majority_ims

def get_grd(grdpath):
    """Return the ``.tif`` files directly inside ``grdpath``, earliest date first.

    The ordering key is the ``YYYY-MM-DD`` stamp located immediately before
    the ``.tif`` extension of each path.
    """
    def _stamp(path):
        return datetime.strptime(path[-14:-4], '%Y-%m-%d')

    tif_paths = []
    for entry in os.listdir(grdpath):
        if entry.endswith('.tif'):
            tif_paths.append(os.path.join(grdpath, entry))

    tif_paths.sort(key=_stamp)
    return tif_paths

def get_glcm(glcmpath):
    """List the ``.tif`` rasters found in ``glcmpath`` in chronological order.

    Files are ordered by the ``YYYY-MM-DD`` date that ends each file name
    (the ten characters in front of ``.tif``).
    """
    tif_names = filter(lambda name: name.endswith('.tif'), os.listdir(glcmpath))
    glcm_paths = [os.path.join(glcmpath, name) for name in tif_names]

    return sorted(glcm_paths, key=lambda path: datetime.strptime(path[-14:-4], '%Y-%m-%d'))
def find_closest_dates(labels, backscatter_ims, glcm_ims, max_days=12):
    """Pair every label with the backscatter/GLCM rasters closest to it in time.

    ``backscatter_ims`` and ``glcm_ims`` are walked in lockstep, so element
    *i* of one is treated as the partner of element *i* of the other. Only
    the backscatter date is compared with the label date; GLCM file names are
    not parsed.

    Parameters
    ----------
    labels : iterable of str
        Label paths ending in ``YYYY-MM-DD`` plus a 4-character extension.
    backscatter_ims : iterable of str
        Backscatter paths using the same date-stamp convention.
    glcm_ims : iterable of str
        GLCM paths, index-aligned with ``backscatter_ims``.
    max_days : int, default 12
        Largest allowed gap, in days, between a label and its match.

    Returns
    -------
    list of tuple
        One ``(label, closest_backscatter, closest_glcm)`` per label. Both
        matches are ``None`` when no backscatter image lies within
        ``max_days``. On ties the earliest candidate in list order wins.
    """
    # Parse the candidate dates once instead of once per label.
    candidates = [
        (datetime.strptime(backscatter[-14:-4], '%Y-%m-%d'), backscatter, glcm)
        for backscatter, glcm in zip(backscatter_ims, glcm_ims)
    ]

    closest_dates = []  # To store the closest matches for each label

    for label in labels:
        label_date = datetime.strptime(label[-14:-4], '%Y-%m-%d')  # Extract date from label
        min_diff = max_days + 1  # Start above the threshold so any valid match wins
        closest_backscatter = None
        closest_glcm = None

        for backscatter_date, backscatter, glcm in candidates:
            # Absolute difference in whole days
            day_difference = abs((backscatter_date - label_date).days)

            # Keep the candidate only if it is within range and strictly closer
            if day_difference <= max_days and day_difference < min_diff:
                min_diff = day_difference
                closest_backscatter = backscatter
                closest_glcm = glcm

        closest_dates.append((label, closest_backscatter, closest_glcm))

    return closest_dates

# Collect Imagery for model training

In [16]:
###################### WSL #########################
# NOTE: this variant is stale - get_labels returns four lists, not a single `labels` value.
# labels = get_labels('/mnt/d/SabineRS/s2classifications')
# backscatter_ims = get_grd('/mnt/d/SabineRS/GRD/3_ratio')
# glcm_ims = get_glcm('/mnt/d/SabineRS/GRD/2_registered/glcm')

###################### Linux #########################
# Project root. Set the SABINERS_ROOT environment variable to run the notebook
# on another machine without editing this cell; the default keeps the original paths.
DATA_ROOT = os.environ.get('SABINERS_ROOT', '/home/wcc/Desktop/SabineRS')

otsu_ims, kmeans_ims, gmm_ims, majority_ims = get_labels(os.path.join(DATA_ROOT, 'MSI', 's2classifications'))
backscatter_ims = get_grd(os.path.join(DATA_ROOT, 'GRD', '3_ratio'))
glcm_ims = get_glcm(os.path.join(DATA_ROOT, 'GRD', '2_registered', 'glcm'))

In [44]:
# Match every majority-vote label with the Sentinel-1 backscatter and GLCM
# rasters that are closest to it in date
labeledPairs = find_closest_dates(majority_ims, backscatter_ims, glcm_ims)

# Labels without a sufficiently close Sentinel-1 image carry None placeholders;
# drop those and keep only the (label, backscatter) part of each match
filtered_data = [
    (label, backscatter)
    for label, backscatter, glcm in labeledPairs
    if None not in (label, backscatter, glcm)
]

In [58]:
# Sentinel-1 scenes that already have a label partner
s1matches = [pair[1] for pair in filtered_data]

# Every remaining scene is unlabeled S1 data for model training
s1_X = [scene for scene in backscatter_ims if scene not in s1matches]

# Data prep

In [48]:
def load_image_pair(s2_path, s1_path):
    """Read a label raster and the four-band raster that goes with it.

    Returns ``(s2_labels, s1_data)``: band 1 of ``s2_path`` cast to ``int32``,
    and bands 1-4 of ``s1_path`` cast to ``float32`` and stacked along a new
    last axis.
    """
    # Labels are assumed to live in the first band of the Sentinel-2 file
    with rasterio.open(s2_path) as label_src:
        s2_labels = label_src.read(1).astype(np.int32)

    # Collect the four Sentinel-1 bands one by one (rasterio bands are 1-based)
    band_arrays = []
    with rasterio.open(s1_path) as s1_src:
        for band_index in (1, 2, 3, 4):
            band_arrays.append(s1_src.read(band_index).astype(np.float32))

    s1_data = np.stack(band_arrays, axis=-1)
    return s2_labels, s1_data

def dynamic_tile_image(image, overlap=0.2):
    """Cut ``image`` into overlapping tiles half its height and half its width.

    Parameters
    ----------
    image : array-like with ``.shape``
        Axis 0 is treated as rows and axis 1 as columns; any trailing axes
        (e.g. bands) are carried along unchanged by the slicing.
    overlap : float, default 0.2
        Fraction of the tile size shared by neighbouring tiles.

    Returns
    -------
    list
        Tiles of shape ``(height // 2, width // 2, ...)`` in row-major order
        (all columns of the first tile row, then the next row, ...). Only
        full-size tiles are produced, so rows/columns past the last full
        stride are not covered.

    Raises
    ------
    ValueError
        If the stride implied by ``overlap`` (or by a 0/1-pixel image side)
        is not positive. A zero stride previously failed inside ``range()``
        with a less helpful message.
    """
    # Tile size is half the image along each spatial axis
    tile_height = image.shape[0] // 2
    tile_width = image.shape[1] // 2

    stride_y = tile_height - int(tile_height * overlap)
    stride_x = tile_width - int(tile_width * overlap)
    if stride_y < 1 or stride_x < 1:
        raise ValueError(
            f"overlap={overlap} gives a non-positive stride "
            f"(stride_y={stride_y}, stride_x={stride_x}) for image shape {image.shape[:2]}"
        )

    # Upper bounds guarantee every slice below is a full-size tile
    row_starts = range(0, image.shape[0] - tile_height + 1, stride_y)
    col_starts = range(0, image.shape[1] - tile_width + 1, stride_x)

    return [
        image[y:y + tile_height, x:x + tile_width]
        for y, x in product(row_starts, col_starts)
    ]


In [79]:
def _load_normalized_s1(path):
    """Read bands 1-4 of a raster and return them min-max scaled as (rows, cols, 4).

    Bands 1 and 2 (named VV and VH in this notebook) are converted from dB to
    linear scale before scaling; bands 3 and 4 (RVI and SDWI) are scaled as-is.
    Scaling is done per band and per image.
    """
    with rasterio.open(path) as src:
        vv = src.read(1).astype(np.float32)
        vh = src.read(2).astype(np.float32)
        rvi = src.read(3).astype(np.float32)
        sdwi = src.read(4).astype(np.float32)

    # Convert from dB to linear scale
    vv_linear = 10 ** (vv / 10)
    vh_linear = 10 ** (vh / 10)

    def _min_max(band):
        # NOTE(review): a constant band (max == min) divides by zero here and a
        # NaN pixel propagates through min()/max(); behaviour is unchanged from
        # the original inline code - confirm the inputs are free of both.
        return (band - band.min()) / (band.max() - band.min())

    return np.stack(
        [_min_max(vv_linear), _min_max(vh_linear), _min_max(rvi), _min_max(sdwi)],
        axis=-1,
    )


X_train = []
y_train = []
X_unlabeled = []

# Labeled scenes: each entry of filtered_data is (label_path, s1_path).
# The loop variable is deliberately not called `set`, which would rebind the
# builtin for every later cell.
for pair in filtered_data:
    label_path, s1_path = pair[0], pair[1]

    s1_data = _load_normalized_s1(s1_path)

    with rasterio.open(label_path) as src:
        s2_labels = src.read(1).astype(np.int32)

    # Tiling with dynamic_tile_image(..., overlap=0.2) is currently disabled;
    # whole scenes are appended instead.
    X_train.append(s1_data)
    y_train.append(s2_labels)

# Unlabeled scenes get exactly the same preprocessing
for im in s1_X:
    s1_unlab_data = _load_normalized_s1(im)
    X_unlabeled.append(s1_unlab_data)

In [89]:
# Flatten each stacked image and its labels for RF training
X_train_flattened = np.vstack([img.reshape(-1, 4) for img in X_train])  # Shape: (total_pixels, 4)
y_train_flattened = np.hstack([label.flatten() for label in y_train])  # Shape: (total_pixels,)
# Every reshaped image is (pixels, 4), so the images must be concatenated along
# the row axis. hstack would glue them along the band axis (giving
# (pixels, 4 * n_images)) and fail outright when scenes differ in size.
X_unlabeled_flattened = np.vstack([img.reshape(-1, 4) for img in X_unlabeled])  # Shape: (total_unlabeled_pixels, 4)

# Supervised classifiers on small dataset to test feasibility

In [None]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from sklearn.model_selection import train_test_split


def predict_label_maps(model, images):
    """Classify every pixel of each image and return one 2-D label map per image.

    ``images`` is an iterable of arrays whose last axis holds the features
    (the cells above build them as (rows, cols, 4)). Each image is flattened
    to (pixels, features) for ``model.predict`` and the result is reshaped
    back to (rows, cols). A list is returned because X_unlabeled is a Python
    list of scenes, which has no ``.reshape``/``.shape`` of its own.
    """
    return [
        model.predict(image.reshape(-1, image.shape[-1])).reshape(image.shape[:2])
        for image in images
    ]


# Split into training and validation sets (e.g., 80% train, 20% validation)
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_flattened, y_train_flattened, test_size=0.2, random_state=42
)

# Initialize models
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
# NOTE(review): XGBClassifier presumably needs class labels encoded as 0..n_classes-1;
# confirm the label rasters contain no other values (e.g. a nodata code).
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
mlp_model = MLPClassifier(hidden_layer_sizes=(64, 64), max_iter=100, random_state=42)

# Train and evaluate Random Forest model
rf_model.fit(X_train_split, y_train_split)
y_val_pred_rf = rf_model.predict(X_val_split)
print("Random Forest Results:")
print("Validation Accuracy:", accuracy_score(y_val_split, y_val_pred_rf))
print("Classification Report:\n", classification_report(y_val_split, y_val_pred_rf))

# Predict on unlabeled data using RF (one label map per unlabeled scene)
rf_unlabeled_pred = predict_label_maps(rf_model, X_unlabeled)
print("\nRandom Forest Predicted Labels for Unlabeled Data:", rf_unlabeled_pred)

# Train and evaluate XGBoost model
xgb_model.fit(X_train_split, y_train_split)
y_val_pred_xgb = xgb_model.predict(X_val_split)
print("\nXGBoost Results:")
print("Validation Accuracy:", accuracy_score(y_val_split, y_val_pred_xgb))
print("Classification Report:\n", classification_report(y_val_split, y_val_pred_xgb))

# Predict on unlabeled data using XGBoost (one label map per unlabeled scene)
xgb_unlabeled_pred = predict_label_maps(xgb_model, X_unlabeled)
print("\nXGBoost Predicted Labels for Unlabeled Data:", xgb_unlabeled_pred)

# Train and evaluate MLP model
mlp_model.fit(X_train_split, y_train_split)
y_val_pred_mlp = mlp_model.predict(X_val_split)
print("\nMLP Results:")
print("Validation Accuracy:", accuracy_score(y_val_split, y_val_pred_mlp))
print("Classification Report:\n", classification_report(y_val_split, y_val_pred_mlp))

# Predict on unlabeled data using MLP (one label map per unlabeled scene)
mlp_unlabeled_pred = predict_label_maps(mlp_model, X_unlabeled)
print("\nMLP Predicted Labels for Unlabeled Data:", mlp_unlabeled_pred)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Index 0 selects the first unlabeled scene's (height, width) label map from
# each classifier's predictions, so the three panels show the same scene.
predictions_to_show = [
    ("Random Forest Predictions", rf_unlabeled_pred[0]),
    ("XGBoost Predictions", xgb_unlabeled_pred[0]),
    ("MLP Predictions", mlp_unlabeled_pred[0]),
]

# Set up a single figure with three subplots in a row. constrained_layout
# reserves room for the shared colorbar; tight_layout() does not cope well
# with a colorbar attached to several axes.
fig, axes = plt.subplots(1, 3, figsize=(18, 6), constrained_layout=True)

for ax, (title, label_map) in zip(axes, predictions_to_show):
    # Adjust vmin/vmax based on your class labels
    mappable = ax.imshow(label_map, cmap='viridis', vmin=0, vmax=2)
    ax.set_title(title)
    ax.axis("off")

# Reuse the imshow mappable so the colorbar spans the same 0-2 class range as
# the images (a bare ScalarMappable would be drawn on a 0-1 scale).
cbar = fig.colorbar(mappable, ax=axes, orientation='vertical', shrink=0.6, aspect=10)
cbar.set_label('Class Label', rotation=270, labelpad=15)

# Show the plot
plt.show()

In [None]:
# morphological operators if needed
# NOTE(review): cv2, classification_methods and relabeled_images are not defined
# in the visible part of this notebook; they must exist before this cell runs.

cleaned_ims = {"otsu": [],
               "kmeans": [],
               "gmm": []
               }

# Define a square kernel once (it does not depend on the image); adjust the size as needed
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))

for method, entry in zip(classification_methods, [relabeled_images['otsu'], relabeled_images["kmeans"], relabeled_images['gmm']]):
    for im in entry:
        # Binary masks for each class so isolated pixels can be removed per class
        subaqueous = (im == 0).astype(np.uint8)
        subaerial = (im == 1).astype(np.uint8)

        # Apply opening to remove small isolated pixels
        subaerial_cleaned = cv2.morphologyEx(subaerial, cv2.MORPH_OPEN, kernel)
        subaqueous_cleaned = cv2.morphologyEx(subaqueous, cv2.MORPH_OPEN, kernel)

        # Apply closing to fill small holes
        subaerial_cleaned = cv2.morphologyEx(subaerial_cleaned, cv2.MORPH_CLOSE, kernel)
        subaqueous_cleaned = cv2.morphologyEx(subaqueous_cleaned, cv2.MORPH_CLOSE, kernel)

        # Reconstruct the classified image: each cleaned mask is weighted by its
        # own class value (0 = subaqueous, 1 = subaerial). The previous code
        # multiplied the subaqueous mask by the method index and added it to
        # itself, so the subaerial mask never reached the output.
        # Pixels that end up in neither cleaned mask fall back to class 0.
        cleaned_classified_image = (subaqueous_cleaned * 0 +
                                    subaerial_cleaned * 1)

        # Add the processed relabeled image to the dictionary
        cleaned_ims[method].append(cleaned_classified_image)

# 5. Train a NN on a larger, multi-site dataset if the feasibility results are good

In [None]:
from tensorflow.keras import layers, models

def build_model(input_shape=(128, 128, 4), num_classes=3):
    """Assemble an (uncompiled) small convolutional classifier.

    Three 3x3 ReLU convolutions (32, 64, 128 filters, 'same' padding) with 2x2
    max pooling after the first two, followed by Flatten, a 128-unit ReLU
    dense layer and a ``num_classes``-way softmax. The network therefore
    emits one class distribution per input image of shape ``input_shape``.
    """
    model = models.Sequential()

    # Convolutional feature extractor
    model.add(layers.Conv2D(32, (3, 3), activation='relu', padding='same', input_shape=input_shape))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))

    # Classification head
    model.add(layers.Flatten())
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dense(num_classes, activation='softmax'))

    return model

# Compile and train
# NOTE(review): earlier cells build X_train / y_train as Python lists of
# whole-scene arrays and per-pixel label rasters (tiling is commented out),
# while this model takes 128x128x4 inputs and emits one class per input.
# Presumably the data must be tiled and stacked into arrays with one label per
# tile before this cell can run - TODO confirm.
model = build_model(input_shape=(128, 128, 4), num_classes=3)
# sparse_categorical_crossentropy expects integer class labels (not one-hot)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# validation_split=0.2 holds out a fraction of the provided samples for validation
model.fit(X_train, y_train, epochs=30, batch_size=16, validation_split=0.2)


In [None]:
# Assuming `X_unlabeled` is your preprocessed and tiled unlabeled S1 data
# NOTE(review): the data-prep cell appends whole scenes to X_unlabeled (its
# tiling call is commented out), so this assumption does not hold yet - confirm.
predictions = model.predict(X_unlabeled)

# Convert predictions to class labels
# (argmax over the last axis picks the index of the highest softmax score)
predicted_labels = np.argmax(predictions, axis=-1)


In [None]:
# Same optimizer, loss and metrics as the compile call that follows build_model;
# NOTE(review): presumably a leftover duplicate - confirm whether it is still needed.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',  # Use if labels are integer-encoded
              metrics=['accuracy'])


In [None]:
# Assuming X_train and y_train are prepared (possibly by tiling the images)
# NOTE(review): X_val and y_val are not defined anywhere in the visible notebook;
# they must be created (e.g. by a train/validation split) before this cell runs.
# The returned History object is kept in `history`.
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=30, batch_size=16)


# 6. Evaluate the accuracy of the NN

# Ground truthing
- Obtain water-extent maps from several independent sources to serve as ground-truth data for validating the classification results:

1. https://global-surface-water.appspot.com/download
2. USGS LandCover
3. Copernicus Water and Wetness Product?
4. Chesapeake Conservancy High-Resolution Land Cover Dataset
5. RAMSAR Wetlands Sites
6. MODIS Land Cover Type Product (MCD12Q1)
7. Sentinel-2 Labeled Datasets for Wetland Classification
8. OSM