## Howto

- Each function below serves a different purpose.
- The first cell was written to check whether `tf.vectorized_map(mask_2_crop)` works, because that was where I got the first error.
- If corrupt files are found, use `delete_corrupt_files` to delete them from all directories.
- Another function checks whether all files have the same dimensions, e.g. 256*256 for the DEMs.
- `check_sum_inner_outer` checks whether the inner part of my mask (64*64) sums to 4096. If it does not, functions inside my model won't work.

## Best approach

- Checking the sum of the inner-outer mask is probably sufficient. Start with this check, then delete the flagged files.
- Also check for division by zero at the end!

In [None]:
import os
import cv2
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import logging
import time

# set up dataset
version = 'V1.6'

# Root folder of the selected dataset version. Every other folder is derived
# from it, so the paths cannot drift apart when `version` is changed.
base_path = f'/home/robin/Nextcloud_sn/Masterarbeit/DataSet/{version}/'

dem_path = os.path.join(base_path, 'DEMs')
inner_outer_path = os.path.join(base_path, 'Inner-Outer Mask')
intersection_path = os.path.join(base_path, 'Intersection Mask')
intersection_small_path = os.path.join(base_path, 'Intersection Mask Small')

# timestamp of this run (YYYYMMDD-HHMMSS), used further down to name a log file
timestr = time.strftime("%Y%m%d-%H%M%S")
# module-level list of corrupt file paths
corrupt_files = []

In [None]:
# check if the sum inside the inner-outer mask is equal to 4096

def check_sum_inner_outer(inner_outer_path, dim = (64,64)):
    """Collect the inner-outer masks whose element sum is not ``dim[0] * dim[1]``.

    Every directory entry of ``inner_outer_path`` is loaded with ``np.load``
    and the sum over all of its elements is compared with the expected total.

    Args:
        inner_outer_path (str): Directory containing the inner-outer mask files.
        dim (tuple): Two sizes whose product is the expected sum of a mask.

    Returns:
        list: Full paths, encoded to bytes, of all masks with a different sum.
    """
    expected_total = dim[0] * dim[1]
    flagged = []
    for entry in tqdm(os.listdir(inner_outer_path)):
        mask_file = os.path.join(inner_outer_path, entry)
        mask = np.load(mask_file)
        if np.sum(mask) == expected_total:
            continue
        flagged.append(mask_file.encode())
    return flagged



def check_min_max_values(folder_path, shape = (256,256)):
    """Find DEMs whose value range is zero after resizing.

    A DEM with ``max - min == 0`` cannot be min-max normalised without a
    division by zero, so such files are reported as corrupt.

    Args:
        folder_path (str): Directory containing the DEM files.
        shape (tuple): Target size passed to ``cv2.resize`` as ``dsize``
            before the value range is computed.

    Returns:
        list: Full paths, encoded to bytes, of the DEMs with a zero value
        range. A fresh list is returned on every call.
    """
    # Use a local list: appending to the module-level `corrupt_files` made the
    # result depend on earlier calls and accumulate entries across runs.
    flagged = []

    for file in tqdm(os.listdir(folder_path)):
        filepath = os.path.join(folder_path, file)
        data = cv2.imread(filepath, cv2.IMREAD_LOAD_GDAL)
        data = cv2.resize(data, dsize=shape, interpolation=cv2.INTER_AREA)
        if np.max(data) - np.min(data) == 0:
            flagged.append(filepath.encode())
    return flagged




def resize_dems(dem_path, shape = (256,256)):
    """Resize every file in ``dem_path`` and overwrite it in place.

    Args:
        dem_path (str): Directory containing the DEM files.
        shape (tuple): Target size passed to ``cv2.resize`` as ``dsize``.
    """
    for name in tqdm(os.listdir(dem_path)):
        target = os.path.join(dem_path, name)
        resized = cv2.resize(
            cv2.imread(target, cv2.IMREAD_LOAD_GDAL),
            dsize=shape,
            interpolation=cv2.INTER_AREA,
        )
        # the resized raster replaces the original file
        cv2.imwrite(target, resized)




def delete_corrupt_files(corrupt_files, dem_path, inner_outer_path, intersection_path, intersection_small_path, base_path):
    """Delete every flagged sample from all four dataset folders and log it.

    A sample is identified by its file stem. For each flagged entry the DEM
    (``<stem>.tif``) and the three masks (``<stem>.npy``) are removed. A sample
    that is flagged more than once (for example once through its ``.npy`` mask
    and once through its ``.tif`` DEM) is processed only once, and files that
    are already gone are skipped.

    Args:
        corrupt_files (list): Paths of corrupt files, as bytes (UTF-8) or str.
            Entries must end in ``.tif`` or ``.npy``.
        dem_path (str): Path to DEM files
        inner_outer_path (str): Path to Inner-Outer Mask files
        intersection_path (str): Path to Intersection Mask files
        intersection_small_path (str): Path to Intersection Mask small files
        base_path (str): Folder in which ``corrupt_files.txt`` is appended to.

    Raises:
        ValueError: If an entry ends in neither ``.tif`` nor ``.npy``. This is
            raised before anything is deleted.
    """
    # Resolve all targets first, so an unsupported entry aborts the run before
    # any file has been removed.
    samples = {}
    for entry in corrupt_files:
        flagged = entry.decode('UTF-8') if isinstance(entry, bytes) else entry
        stem, extension = os.path.splitext(os.path.basename(flagged))

        if extension == '.tif':
            dem_file = flagged
        elif extension == '.npy':
            dem_file = os.path.join(dem_path, stem + '.tif')
        else:
            raise ValueError(f'Unsupported corrupt file entry: {flagged}')

        if stem in samples:
            # same sample reported by another check; its files are handled once
            continue

        mask_name = stem + '.npy'
        samples[stem] = (
            flagged,
            (
                dem_file,
                os.path.join(inner_outer_path, mask_name),
                os.path.join(intersection_path, mask_name),
                os.path.join(intersection_small_path, mask_name),
            ),
        )

    if samples:
        # One log line per deleted sample, appended to earlier runs.
        with open(os.path.join(base_path, 'corrupt_files.txt'), 'a') as f:
            for flagged, targets in samples.values():
                for target in targets:
                    try:
                        os.remove(target)
                    except FileNotFoundError:
                        # Already removed or never created: nothing to clean up.
                        pass
                f.write(f'{flagged}\n')

    print(f'Deleted {len(samples)} corrupt files')

def norm_dems(dem_path):
    """Run the min-max normalisation to [-1, 1] over every file in ``dem_path``.

    The normalised array is only computed, not stored: nothing is written
    back to disk, so the function effectively verifies that every DEM has a
    non-zero value range.

    Args:
        dem_path (str): Directory containing the DEM files.

    Raises:
        ValueError: If the minimum and maximum of a DEM are equal.
    """
    for file in tqdm(os.listdir(dem_path)):
        dem = cv2.imread(os.path.join(dem_path, file), cv2.IMREAD_LOAD_GDAL)
        lowest, highest = np.min(dem), np.max(dem)
        value_range = highest - lowest

        # a flat DEM would lead to a division by zero below
        if value_range == 0:
            raise ValueError(f'Min and max values are equal for file {file}')

        # scale to [0, 1] first, then stretch to [-1, 1]; the result is discarded
        dem = ((dem - lowest) / value_range) * 2 - 1


In [None]:
# start from an empty module-level list of corrupt files
corrupt_files = []

# run both checks: mask sum of the inner-outer masks, value range of the DEMs
corrupt_sum = check_sum_inner_outer(inner_outer_path, dim=(64, 64))
corrupt_min_max = check_min_max_values(dem_path, shape=(256, 256))

# merge both results; dict.fromkeys drops repeated paths and keeps the order
corrupt_files = list(dict.fromkeys(corrupt_sum + corrupt_min_max))

# remove the flagged files from the dataset folders
delete_corrupt_files(
    corrupt_files,
    dem_path,
    inner_outer_path,
    intersection_path,
    intersection_small_path,
    base_path,
)

# finally resize the remaining DEMs and run the normalisation over them
resize_dems(dem_path, shape=(256, 256))
norm_dems(dem_path)




### Going further is probably not necessary. The cells above should ensure integrity of the dataset

In [None]:
# Iterates over the whole dataset and checks if the cropping algorithm works correctly

corrupt_files = []

# Write the error log into the folder of the dataset that is actually checked
# (base_path follows `version`) instead of a hard-coded other dataset version.
logging.basicConfig(filename=os.path.join(base_path, 'integrity_check' + timestr + '.log'), level=logging.ERROR)

# Define file patterns
dem_pattern = os.path.join(dem_path, "*.tif")  # Assuming your DEM files are in TIFF format
inner_outer_pattern = os.path.join(inner_outer_path, "*.npy")
intersection_small_pattern = os.path.join(intersection_small_path, "*.npy")

# Create datasets
dem_dataset = tf.data.Dataset.list_files(dem_pattern, shuffle=False)
inner_outer_dataset = tf.data.Dataset.list_files(inner_outer_pattern, shuffle=False)
intersection_small_dataset = tf.data.Dataset.list_files(intersection_small_pattern, shuffle=False)

# Convert to lists of file names (bytes)
dem_list = list(dem_dataset.as_numpy_iterator())
inner_outer_list = list(inner_outer_dataset.as_numpy_iterator())
intersection_small_list = list(intersection_small_dataset.as_numpy_iterator())


# Sort the lists, so that files align correctly
dem_list.sort()
inner_outer_list.sort()
intersection_small_list.sort()

# The three lists are paired by position, so they must have the same length;
# otherwise DEMs would be checked against masks of other samples.
if not (len(dem_list) == len(inner_outer_list) == len(intersection_small_list)):
    raise ValueError(
        f'File count mismatch: {len(dem_list)} DEMs, {len(inner_outer_list)} inner-outer masks, '
        f'{len(intersection_small_list)} small intersection masks'
    )


print('Starting to iterate over whole dataset')
for f_dem, f_inner_outer, f_intersection_small in tqdm(
    zip(dem_list, inner_outer_list, intersection_small_list), total=len(dem_list)
):

    # Load the DEM
    data_dem = cv2.imread(f_dem.decode('Utf-8'), cv2.IMREAD_LOAD_GDAL)
    data_dem = cv2.resize(data_dem, dsize=(256,256), interpolation=cv2.INTER_AREA)

    image = tf.convert_to_tensor(data_dem)
    image = tf.expand_dims(image, axis=-1)

    # Load the Inner-Outer Mask
    data_inner_outer = np.load(f_inner_outer)
    inner_outer = tf.convert_to_tensor(data_inner_outer)
    inner_outer = tf.expand_dims(inner_outer, axis=-1)
    mask_inner_outer = tf.image.resize(inner_outer, (256,256), method='nearest')

    # Load intersection_mask_small (not used by the crop below; loading it
    # still surfaces files that cannot be read or resized)
    data_intersection_small = np.load(f_intersection_small)
    intersection_small = tf.convert_to_tensor(data_intersection_small)
    intersection_small = tf.expand_dims(intersection_small, axis=-1)
    mask_intersection_small = tf.image.resize(intersection_small, (64,64), method='nearest')

    # Find indices where mask_inner_outer is 1
    if tf.reduce_any(mask_inner_outer == 1):

        indices = tf.where(mask_inner_outer == 1)
        # smallest index along every axis = corner of the region marked with 1
        top_left = tf.reduce_min(indices, axis=0)

        # axis 0 is passed as offset_height, axis 1 as offset_width
        x1 = tf.cast(top_left[0], tf.int32)
        y1 = tf.cast(top_left[1], tf.int32)

        shapex = tf.constant(64, tf.int32)
        shapey = tf.constant(64, tf.int32)

        try:
            crop = tf.image.crop_to_bounding_box(image, x1, y1, shapex, shapey)
        except (IOError, ValueError, tf.errors.OpError) as e:
            # a failing crop is logged and recorded, the scan continues
            logging.error(f"Error processing file {f_dem}: {e}")
            corrupt_files.append(f_dem)
            continue

        # test if crop has right shape
        if crop.shape != (64,64,1):
            raise ValueError(f'Error: {f_dem}')

    else:
        raise ValueError(f'No intersection found for {f_inner_outer}')

In [None]:
# check for file size mismatch  
import os
import cv2
import numpy as np


#dem_path = '/mnt/HDD/Masterarbeit_local/DEMs'
#inner_outer_path = '/mnt/HDD/Masterarbeit_local/Inner-Outer Mask'
#intersection_path = '//mnt/HDD/Masterarbeit_local/Intersection Mask'
#intersection_small_path = '/mnt/HDD/Masterarbeit_local/Intersection Mask small'

def check_image_dimension(folder_path, desired_shape = (128,128), extensions=(".jpg", ".jpeg", ".png", ".tif", ".npy")):
    """Return the files in ``folder_path`` that do not have ``desired_shape``.

    ``.npy`` files are loaded with ``np.load`` and compared by their full
    shape. ``.tif`` files are read with ``cv2.IMREAD_LOAD_GDAL`` and all other
    files with the default ``cv2.imread``; both are compared by their first two
    dimensions. The extension is matched case-insensitively.

    A file that cannot be loaded is reported on stdout and also listed as
    mismatched, because its shape could not be confirmed.

    Args:
        folder_path (str): Directory to scan.
        desired_shape (tuple): Expected shape.
        extensions (tuple): Accepted (lower-case) file extensions.

    Returns:
        list: File names (not full paths) with a wrong or unreadable shape.

    Raises:
        ValueError: If a file does not end in one of ``extensions``.
    """
    wanted_shape = tuple(desired_shape)
    mismatched_files = []

    for filename in tqdm(os.listdir(folder_path)):
        lowered = filename.lower()

        if not lowered.endswith(extensions):
            raise ValueError(f'File {filename} is not in a desired file format')

        filepath = os.path.join(folder_path, filename)

        try:
            if lowered.endswith('.npy'):
                found_shape = np.load(filepath).shape
            else:
                if lowered.endswith('.tif'):
                    data = cv2.imread(filepath, cv2.IMREAD_LOAD_GDAL)
                else:
                    data = cv2.imread(filepath)
                if data is None:
                    # cv2.imread signals an unreadable file by returning None
                    raise OSError(f'{filepath} could not be read')
                found_shape = data.shape[:2]
        except (OSError, ValueError, EOFError, cv2.error) as e:
            print(f'Error: {e}')
            mismatched_files.append(filename)
            continue

        if found_shape != wanted_shape:
            mismatched_files.append(filename)

    return mismatched_files

# DEMs and the two full-size masks are checked against 256x256,
# the small intersection mask against 64x64
mismatched_dem = check_image_dimension(dem_path, desired_shape=(256, 256))
mismatched_inner_outer = check_image_dimension(inner_outer_path, desired_shape=(256, 256))
mismatched_intersection = check_image_dimension(intersection_path, desired_shape=(256, 256))
mismatched_intersection_small = check_image_dimension(intersection_small_path, desired_shape=(64, 64))

# print the number of mismatched files per folder, in the order checked above
for mismatches in (
    mismatched_dem,
    mismatched_inner_outer,
    mismatched_intersection,
    mismatched_intersection_small,
):
    print(len(mismatches))
            

In [None]:
# Also remember to look out for dtype of numpy masks

In [None]:
# Probably not necessary
# corrupt_encode = [file.encode() for file in corrupt_files]
# corrupt_files = [file.decode('UTF-8') for file in corrupt_encode]