In [None]:
import numpy as np
import pandas as pd
from PIL import Image
import gc
import os
import sys
import cv2
from datasets import Dataset, load_from_disk
import rcsHandlingFunctions as rcs
import creationOfDataframe as cdf
import rasterio

In [None]:
import sys
# Put the parent directory on the import path (presumably where the `sam` and
# `dataprocessing` packages imported below live - confirm against the repo layout).
# The membership test keeps re-runs of this cell from appending the same entry again.
_parent_dir = os.path.abspath("../")
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)
from sam.modeling.PMD_features import PMD_features
from dataprocessing.slope import calculate_slope
import torch

In [None]:
# Directory handed to the path-collection helper (currently the empty string).
root_dir = ""

# One helper call per suffix, DEM first and RCS second, unpacked in that order.
dem_paths, rcs_paths = (
    cdf.get_all_image_paths(root_dir, [suffix]) for suffix in ("dem.tif", "rcs.tif")
)

In [None]:
# Report how many paths of each kind were collected, rcs first, then dem.
for _label, _paths in (("Number of rcs: ", rcs_paths), ("Number of dem: ", dem_paths)):
    print(_label, len(_paths))

In [None]:
from collections import Counter

# Tally how often every rcs path occurs.
rcs_counter = Counter(rcs_paths)

# Keep the paths whose tally is greater than one, in first-seen order.
duplicates = [path for path in rcs_counter if rcs_counter[path] > 1]

print("Duplicate rcs paths:", duplicates)


In [None]:

# Pair each DEM path with the RCS path at the same position. zip() stops at the
# shorter list without any warning, so refuse to continue when the counts differ
# instead of silently dropping or mispairing rasters.
if len(dem_paths) != len(rcs_paths):
    raise ValueError(
        f"Cannot pair DEM and RCS paths: {len(dem_paths)} DEM vs {len(rcs_paths)} RCS"
    )
dem_rcs_paths = list(zip(dem_paths, rcs_paths))

In [None]:
# Function to substitute DEM data into the image
def substitute_dem(image, dem_data, channel=2, max_elevation=4000.0):
    """
    Substitute one of the layers of the image with the scaled DEM data.

    Args:
        image (np.ndarray): The original image, indexed as [row, column, channel].
            It is left untouched; the result is a new array.
        dem_data (np.ndarray): The DEM data. It must be assignable to a single
            channel of ``image`` under NumPy broadcasting rules.
        channel (int): The channel to be replaced with DEM data.
        max_elevation (float): Divisor applied to ``dem_data`` before it is
            written into the image. The default of 4000.0 reproduces the
            previously hard-coded scaling.
    Returns:
        np.ndarray: A float32 copy of ``image`` whose ``channel`` layer holds
        ``dem_data / max_elevation``.
    """
    # np.array(..., dtype=...) copies and converts in one step, replacing the
    # former copy() followed by astype() (two full copies of the image).
    modified_image = np.array(image, dtype=np.float32, order="C")
    modified_image[:, :, channel] = np.asarray(dem_data) / max_elevation
    return modified_image

In [None]:
# Function to substitute Slope data into the image
def substitute_slope(image, dem_data, channel=2, max_elevation=4000.0):
    """
    Substitute one of the layers of the image with the Slope data.

    The slope is computed by ``calculate_slope`` on the DEM after it has been
    divided by ``max_elevation``.
    NOTE(review): another cell of this notebook computes
    ``calculate_slope(dem) / 90`` on the unscaled DEM - confirm which of the
    two scalings is the intended one.

    Args:
        image (np.ndarray): The original image, indexed as [row, column, channel].
            It is left untouched; the result is a new array.
        dem_data (np.ndarray): The DEM data.
        channel (int): The channel to be replaced with Slope data.
        max_elevation (float): Divisor applied to ``dem_data`` before the slope
            is computed. The default of 4000.0 reproduces the previously
            hard-coded scaling.
    Returns:
        np.ndarray: A float32 copy of ``image`` whose ``channel`` layer holds
        the slope.
    """
    slope = calculate_slope(np.asarray(dem_data) / max_elevation)
    # np.array(..., dtype=...) copies and converts in one step, replacing the
    # former copy() followed by astype() (two full copies of the image).
    modified_image = np.array(image, dtype=np.float32, order="C")
    modified_image[:, :, channel] = slope
    return modified_image

In [None]:
def extract_id(path):
    """
    Derive a sample id from a file path.

    Backslashes are treated as path separators, the directory part and the
    file extension are dropped, and a trailing "_rcs" is removed from what
    remains.
    """
    suffix = "_rcs"
    stem = os.path.splitext(os.path.basename(path.replace("\\", "/")))[0]
    return stem[:-len(suffix)] if stem.endswith(suffix) else stem

In [None]:
# Function to process data in batches
def process_in_batches(dem_rcs_paths, batch_size=100):
    """
    Build image/label samples from (DEM path, RCS path) pairs and save them batch by batch.

    For every pair the first DEM band is read with rasterio and the RCS raster
    is read and merged through the ``rcs`` helpers. The sample's "image" is the
    merged raster passed through ``substitute_dem``; its "label" is the
    ``PMD_features`` output for the same raster passed through
    ``substitute_slope``. Both are resized to 512x512 and "box" covers the whole
    image. Each batch is written to the directory 'datasetBoxes<i>', where <i>
    is the index of the batch's first pair.

    A later cell of this notebook defines another function with the same name;
    whichever of the two cells ran last is the one that gets called.

    Args:
        dem_rcs_paths (list): (dem_path, rcs_path) tuples.
        batch_size (int): Number of pairs per saved dataset; must be positive.
    Raises:
        ValueError: If ``batch_size`` is not positive. Previously 0 failed
            inside range() and a negative value silently processed nothing.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if len(dem_rcs_paths) == 0:
        return

    target_size = (512, 512)
    # One extractor for the whole run instead of two fresh instances per sample
    # (the second one was never used). Assumes PMD_features keeps no per-sample
    # state - TODO confirm.
    pmd = PMD_features([32, 64, 128, 256])

    for i in range(0, len(dem_rcs_paths), batch_size):

        # Get the batch
        batch_paths = dem_rcs_paths[i:i + batch_size]
        dataset_dict = {
            "image": [],
            "label": [],
            "box": []
        }

        for dem_path, rcs_path in batch_paths:

            with rasterio.open(dem_path) as src:
                dem = src.read(1)

            # Read and merge the RCS raster once; both the image and the label
            # start from it. The substitute_* helpers work on a copy, so the
            # merged raster itself is not altered between the two uses.
            image_ori = rcs._merge_to_rgb_all(*rcs.read_rcs_image(rcs_path))[0]

            image = cv2.resize(substitute_dem(image_ori, dem), target_size, interpolation=cv2.INTER_LINEAR)

            # Channels first plus a leading batch axis for the extractor.
            image_tensor = torch.from_numpy(np.array(image_ori)).permute(2, 0, 1).unsqueeze(0).float()
            output = pmd(image_tensor)
            # Drop size-1 axes and move channels last again
            # (assumes the output is (1, C, H, W) - TODO confirm).
            output_np = output.cpu().detach().numpy().squeeze().transpose(1, 2, 0)
            label = cv2.resize(substitute_slope(output_np, dem), target_size, interpolation=cv2.INTER_LINEAR)

            dataset_dict["image"].append(image)
            dataset_dict["label"].append(label)
            dataset_dict["box"].append([0, 0, 512, 512])  # The box is the entire image

        dataset = Dataset.from_dict(dataset_dict)
        dataset.save_to_disk('datasetBoxes' + str(i))
        # Clear memory
        del dataset_dict, dataset
        gc.collect()

In [None]:
# NOTE(review): this list is not referenced by any other cell visible in this notebook.
datasetTest = []

In [None]:
# Function to process data in batches
def process_in_batches(dem_rcs_paths, batch_size=100):
    """
    Build per-band samples from (DEM path, RCS path) pairs and save them batch by batch.

    For every pair the first DEM band is read with rasterio and the RCS raster
    is read and merged through the ``rcs`` helpers. Channels 0-3 of the merged
    raster are stored as "VH0", "VH1", "VV0" and "VV1"; "dem" is the DEM divided
    by 4000.0; "slope" is ``calculate_slope(dem) / 90``; "label" is an all-zero
    placeholder; "box" covers the whole image; "id" comes from ``extract_id``.
    All rasters are resized to 512x512. Each batch is written to the directory
    'datasetBoxes<i>', where <i> is the index of the batch's first pair.

    This definition replaces the function of the same name defined in an
    earlier cell of this notebook.

    Args:
        dem_rcs_paths (list): (dem_path, rcs_path) tuples.
        batch_size (int): Number of pairs per saved dataset; must be positive.
    Raises:
        ValueError: If ``batch_size`` is not positive. Previously 0 failed
            inside range() and a negative value silently processed nothing.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    target_size = (512, 512)
    # Column name for channel k of the merged raster, in channel order.
    band_names = ("VH0", "VH1", "VV0", "VV1")

    def _resize(array):
        # Shared resize settings for every raster stored in the dataset.
        return cv2.resize(array, target_size, interpolation=cv2.INTER_LINEAR)

    for i in range(0, len(dem_rcs_paths), batch_size):

        # Get the batch
        batch_paths = dem_rcs_paths[i:i + batch_size]
        # Key order is kept as before because it determines the column order.
        dataset_dict = {
            "VH0": [],
            "VH1": [],
            "VV0": [],
            "VV1": [],
            "dem": [],
            "label": [],
            "box": [],
            "slope": [],
            "id": []
        }

        for dem_path, rcs_path in batch_paths:

            with rasterio.open(dem_path) as src:
                dem = src.read(1)

            image = rcs._merge_all(*rcs.read_rcs_image(rcs_path))[0]

            for band_index, band_name in enumerate(band_names):
                dataset_dict[band_name].append(_resize(image[:, :, band_index]))
            dataset_dict["dem"].append(_resize(dem / 4000.0))
            dataset_dict["slope"].append(_resize(calculate_slope(dem) / 90))
            dataset_dict["box"].append([0, 0, 512, 512])  # The box is the entire image
            dataset_dict["id"].append(extract_id(rcs_path))
            # No real label exists at this stage, so a zero placeholder is stored.
            dataset_dict["label"].append(np.zeros((512, 512), dtype=np.float32))

        dataset = Dataset.from_dict(dataset_dict)
        # Save the dataset to disk
        dataset.save_to_disk('datasetBoxes' + str(i))

        # Clear memory
        del dataset_dict, dataset
        gc.collect()
        

In [None]:
# Divisor used to derive the batch size: each batch holds (number of paths) // divide pairs.
divide = 20

In [None]:
# Derive the batch size from the paired list that is actually being batched, which is
# also the list the dataset-name cell below iterates over. Clamp to 1 so that fewer
# than `divide` pairs does not produce a batch size of 0.
process_in_batches(dem_rcs_paths, batch_size=max(1, len(dem_rcs_paths) // divide))

In [None]:
# Show the first (DEM path, RCS path) pair as a quick check; raises IndexError if the list is empty.
print(dem_rcs_paths[0])

In [None]:
def is_dataset_loaded_and_not_empty(dataset):
    """
    Tell whether a dataset object exists and holds at least one item.

    Parameters:
    dataset (Dataset): The loaded dataset, or None.

    Returns:
    bool: False when dataset is None or len(dataset) is 0, True otherwise.
    """
    return dataset is not None and len(dataset) != 0

In [None]:
# Rebuild the directory names written by process_in_batches: one per batch start index.
# The step is clamped to 1 because range() rejects a step of 0, which is what
# len(dem_rcs_paths)//divide yields when there are fewer than `divide` pairs.
datasetsNames = ['datasetBoxes' + str(i) for i in range(0, len(dem_rcs_paths), max(1, len(dem_rcs_paths)//divide))]

In [None]:
# Keep only the second half of the dataset names.
# NOTE(review): the name is rebound from itself, so running this cell again halves the list again.
datasetsNames = datasetsNames[len(datasetsNames)//2:]

In [None]:
# Load every per-batch dataset from disk and keep the usable ones for merging.
datasetList = []
for datasetName in datasetsNames:
    dataset = load_from_disk(datasetName)
    if is_dataset_loaded_and_not_empty(dataset):
        print(f"Dataset '{datasetName}' loaded successfully.")
        datasetList.append(dataset)
    else:
        # Previously such datasets were appended without any message; they add
        # no rows, so report them and leave them out of the merge.
        print(f"Dataset '{datasetName}' is empty or was not loaded; skipping it.")

In [None]:
from datasets import concatenate_datasets
# Merge all loaded per-batch datasets into a single dataset.
merged_dataset = concatenate_datasets(datasetList)
# The save below is commented out, so the merged dataset is kept in memory only.
#merged_dataset.save_to_disk('datasetSelfSupDEMFloat')

In [None]:
# NOTE(review): the save_to_disk call for this directory is commented out in the previous cell,
# so this load relies on the directory already existing; the next cell rebinds `dataset` anyway.
dataset = load_from_disk('datasetSelfSupDEMFloat')

In [None]:
# Use the merged in-memory dataset for the cells that follow.
dataset = merged_dataset

In [None]:
# Print the shape of the dataset in use.
print(dataset.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test set: 90% of the rows stay in training, the remainder becomes test.
# NOTE(review): the sklearn function imported above is not used by this cell;
# the split is performed by the dataset's own train_test_split method.
train_test_split_ratio = 0.9
_train_test = dataset.train_test_split(test_size=1 - train_test_split_ratio, seed=20)
train_dataset, test_dataset = _train_test.values()

In [None]:
# Carve a validation set out of the training rows (90% training, 10% validation).
# NOTE(review): train_dataset is rebound from itself, so re-running this cell shrinks it again.
train_val_split_ratio = 0.9
_train_val = train_dataset.train_test_split(test_size=1 - train_val_split_ratio, seed=20)
train_dataset, val_dataset = _train_val.values()

In [None]:
# Shapes of the train, validation and test splits, one per line.
print(train_dataset.shape, val_dataset.shape, test_dataset.shape, sep="\n")

In [None]:

# Write the three splits to disk in train, validation, test order.
# NOTE(review): the train directory name has no "Slope" part while the other two do -
# confirm that the naming difference is intended.
for _split, _target in (
    (train_dataset, 'datasetTrainSelfSupFinal'),
    (val_dataset, 'datasetValSelfSupSlopeFinal'),
    (test_dataset, 'datasetTestSelfSupSlopeFinal'),
):
    _split.save_to_disk(_target)

In [None]:
# Preview the first few image/label pairs of the dataset in use.
# NOTE(review): this needs an 'image' column, which only the first
# process_in_batches variant in this notebook writes.

import matplotlib.pyplot as plt

# Set the number of samples you want to visualize
n_samples = 5

# Stop at the end of the dataset so fewer than n_samples rows does not raise IndexError.
for idx in range(min(n_samples, len(dataset))):
    sample = dataset[idx]
    image = sample['image']
    label = sample['label']

    # Create a figure with two subplots
    fig, axes = plt.subplots(1, 2, figsize=(12, 6))

    # Display the image as stored (no dtype conversion is applied here)
    axes[0].imshow(image)
    axes[0].set_title("Image")
    axes[0].axis('off')

    # Display the label as stored (no colormap is set here)
    axes[1].imshow(label)
    axes[1].set_title("Label(Denoising Kernels and Slope)")
    axes[1].axis('off')

    plt.show()
    # Release the figure so repeated previews do not accumulate open figures.
    plt.close(fig)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Preview the third channel of the first few image/label pairs of the dataset in use.
# NOTE(review): this needs an 'image' column, which only the first
# process_in_batches variant in this notebook writes.

# Set the number of samples you want to visualize
n_samples = 5

# Stop at the end of the dataset so fewer than n_samples rows does not raise IndexError.
for idx in range(min(n_samples, len(dataset))):
    sample = dataset[idx]
    image = sample['image']
    label = sample['label']

    # Samples may come back as nested lists; slicing below needs arrays.
    if isinstance(image, list):
        image = np.array(image)
    if isinstance(label, list):
        label = np.array(label)

    # Extract the third channel, assuming images are in HWC format
    image_third = image[:, :, 2]
    label_third = label[:, :, 2]

    # Create a figure with two subplots
    fig, axes = plt.subplots(1, 2, figsize=(12, 6))

    # Display the third channel from the image and label
    axes[0].imshow(image_third, cmap='gray')
    axes[0].set_title("Image - Third Channel")
    axes[0].axis('off')

    axes[1].imshow(label_third, cmap='gray')
    axes[1].set_title("Label - Third Channel")
    axes[1].axis('off')

    plt.show()
    # Release the figure so repeated previews do not accumulate open figures.
    plt.close(fig)