In [None]:
# !python -m pip install -U tifffile[all]
# !pip list
# Install dependencies
!python -m pip install --upgrade pip
!pip install tensorflow

In [1]:
import numpy as np
import tensorflow as tf 
from tensorflow.keras import layers, Model
from osgeo import gdal
import json
import matplotlib.pyplot as plt
import tifffile as tiff
import os
import pandas as pd
from skimage.transform import resize
from sklearn.model_selection import train_test_split
import re
from datetime import datetime

2024-04-24 11:54:16.858650: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-24 11:54:16.860228: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-24 11:54:16.892655: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-24 11:54:16.893335: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def load_json_data(json_file):
    with open(json_file, 'r') as file: 
        data = json.load(file)
    return data 

data_s1 = load_json_data('/home/pedro/Documents/JADS/DeepLearning/SEN12FLOOD/S1list.json')
data_s2 = load_json_data('/home/pedro/Documents/JADS/DeepLearning/SEN12FLOOD/S2list.json')

In [3]:
# ## load images as we saw on kaggle
# img = tiff.imread('/home/pedro/Documents/JADS/DeepLearning/SEN12FLOOD/0066/S2_2019-02-04_B01.tif')
# img_array = np.array(img)
# plt.imshow(img)
# print(img_array.shape)            

In [4]:
### This cell is for read the tiff images using the tiff python library
### Images, filenames and flooding labels are set to be stored in arrays while reading the images
### Function takes a particular directory (A Flloding scenario and fetch for each label in the flooding json files)
### It automatically fetch for the number of the folder being processed 
### And retrieve both the name of the file and the flooding label. e.g from the sequence:
### 
###
### {"0063": {"1": {"date": "2019-02-04", "FLOODING": false, "FULL-DATA-COVERAGE": true, "filename": "S2_2019-02-04"}, "count": 1, "folder": "0063", "geo": {"type": "Polygon", "coordinates": [[[28.29722, -15.382762], [28.297507, -15.429039], [28.345216, -15.428755], [28.344918, -15.382479], [28.29722, -15.382762]]]}},
### Based on the name of the folder "0063" it reads the images of the folder and it appends both filename and flooding label.
### To cross and validate whether there is flooding or not the folder "0200" is used on the next cell. 
### This is the second sequence in the S2 json data. Easy to validate

In [5]:
def load_images_from_directory(directory, s1_data, s2_data):
    images = []
    filenames = []
    labels = []
    
    folder_name = os.path.basename(directory)
    print(f"Processing folder: {folder_name}")

    for file_name in os.listdir(directory):
        if file_name.endswith('.tif') or file_name.endswith('.tiff'):
            json_data = s1_data if file_name.startswith('S1') else s2_data if file_name.startswith('S2') else None
            if json_data is None: 
                print(f"Skipping file (not S1 or S2): {file_name}")
                continue
            
            found = False
            for item in json_data.get(folder_name, {}).values():
                if isinstance(item, dict) and 'filename' in item and item['filename'] in file_name:
                    file_path = os.path.join(directory, file_name)
                    img = tiff.imread(file_path)
                    img_array = np.array(img)
                    images.append(img_array)
                    filenames.append(file_name)
                    labels.append(item.get('FLOODING', False))
                    found = True 
                    break
    return images, filenames, labels

In [6]:
### Definition to extract the date and order by date so we respect time-series nature for the model

In [7]:
def extract_date(filename):
    # Try to extract S2 date format first
    match = re.search(r'\d{4}-\d{2}-\d{2}', filename)
    if match:
        return datetime.strptime(match.group(), '%Y-%m-%d').date()
    else:
        # Try the S1 date format
        match = re.search(r'\d{8}T\d{6}', filename)
        if match:
            return datetime.strptime(match.group(), '%Y%m%dT%H%M%S').date()
    return None  # Important to handle cases where no date is found

In [8]:
### Following code is to get a list of all the flooding events (folders)
### present on the daataset. There is also code to select which folders are
### going to be selected to train and test the model

In [9]:
#### This code is to apply recursively the function to read the images to all 
### The folders present on the dataset. As the number of the sequence event is stored in a 
### dictionary everythins is stored on the same dictionary with sequence and flooding label

def process_folders(folder_list, main_directory, s1_data, s2_data):
    all_images = []
    all_labels = []
    all_filenames = []
    scenario_labels = []
    
    for folder in folder_list:
        directory = os.path.join(main_directory, folder)
        images, filenames, labels = load_images_from_directory(directory, s1_data, s2_data)

        if not images:
            print(f"No images found in folder {folder}. Continuing to next folder...")
            continue
        
        # Collect data and attempt to sort it
        temp_data = [(img, fname, lbl, extract_date(fname)) for img, fname, lbl in zip(images, filenames, labels)]
        # Print dates to verify correct extraction
        for _, fname, _, date in temp_data:
            print(f"Filename: {fname}, Extracted Date: {date}")

        # Filter out entries without a valid date
        temp_data = [data for data in temp_data if data[3] is not None]

        if not temp_data:
            print(f"No valid dates found in folder {folder}. Skipping sorting and adding...")
            continue
        
        # Sort by date
        temp_data.sort(key=lambda x: x[3])
        sorted_images, sorted_filenames, sorted_labels, _ = zip(*temp_data)

        all_images.extend(sorted_images)
        all_labels.extend(sorted_labels)
        all_filenames.extend(sorted_filenames)
        scenario_labels.extend([folder] * len(sorted_labels))

    return all_images, all_labels, all_filenames, scenario_labels


In [10]:
def list_folders(main_directory):
    return [f for f in os.listdir(main_directory) if os.path.isdir(os.path.join(main_directory, f))]

# Main SEN12FLOOD directory, whole data is there
main_directory = '/home/pedro/Documents/JADS/DeepLearning/SEN12FLOOD'

# List all folders
all_folders = list_folders(main_directory)

# Manually select folders for training and testing
# test_folders = [str(i) for i in range(68)]  # Example folders for training
# train_folders = ['0001', '0002', '0003', '0004', '0005', '0006', '0007', '0008', '0009', '0010', '0011', '0012', '0013', '0014', '0015', '0016', '0018', '0020', '0021', '0022', '0023', '0024', '0025', '0026', '0027', '0028', '0029', '0030', '0031', '0033', '0034', '0035', '0036', '0037', '0042', '0043', '0044', '0045', '0046', '0047', '0048', '0050', '0053', '0054', '0055', '0057', '0059', '0060', '0061', '0063', '0065', '0066', '0067', '0068', '0069', '0070', '0071', '0072', '0073', '0074', '0075', '0076', '0077', '0079', '0080', '0081', '0082', '0084', '0085', '0086', '0088', '0089', '0090', '0091', '0093', '0094', '0095', '0096', '0097', '0098', '0099', '0100', '0101', '0102', '0103', '0104', '0105', '0106', '0107', '0108', '0109', '0111', '0115', '0116', '0117', '0118', '0120', '0121', '0122', '0123', '0124', '0125', '0126', '0127', '0128', '0130', '0131', '0132', '0133', '0134', '0135', '0137', '0138', '0139', '0140', '0141', '0143', '0144', '0145', '0146', '0147', '0148', '0149', '0150', '0151', '0154', '0155', '0156', '0157', '0158', '0159', '0160', '0161', '0162', '0163', '0165', '0166', '0167', '0168', '0169', '0170', '0171', '0173', '0174', '0176', '0177', '0178', '0181', '0182', '0184', '0186', '0187', '0188', '0191', '0192', '0193', '0194', '0196', '0198', '0199', '0200', '0201', '0203', '0204', '0205', '0206', '0207', '0208', '0209', '0210', '0212', '0213', '0214', '0215', '0216', '0217', '0218', '0219', '0220', '0221', '0222', '0223', '0225', '0226', '0227', ' 0229', '0230', '0231', '0232', '0233', '0234', '0235', '0236', '0238', '0240', '0241', '0243', '0244', '0245', '0246', '0247', '0248', '0249', '0250', '0253', '0254', '0255', '0256', '0257', '0258', '0259', '0260', '0261', '0262', '0263', '0266', '0267', '0271', '0272', '0273', '0274', '0275', '0276', '0277', '0278', '0279', '0280', '0281', '0282', '0285', '0286', '0287', '0288', '0290', '0293', '0294', '0295', '0296', '0298', '0299', '0300', '0301', '0303', ' 0304', '0305', '0306', '0307', '0308', '0309', '0310', '0311', '0313', '0316', '0318', '0319', '0320', '0321', '0323', '0324', '0325', '0326', '0327', '0328', '0329', '0330', '0331', '0332', '0333', '0334', '0335', '0336']   # Example folders for testing
train_folders = ['0200', '0001', '0002']
test_folders = ['26']
validation_folders = ['61']

In [11]:
# KEEPNG THE CODE BELOW. DISCUSS IT WITH PEDRO

In [12]:
# ## Preprocess train and test folders
# train_images, train_labels, train_scenario_labels = process_folders(train_folders, main_directory, data_s1, data_s2)
# test_images, test_labels, test_scenario_labels = process_folders(test_folders, main_directory, data_s1, data_s2)
# validation_images, validation_labels, validation_scenario_labels = process_folders(validation_folders, main_directory, data_s1, data_s2)

In [13]:
### Preprocess train and test folders
train_images, train_labels, train_filenames, train_scenarios = process_folders(train_folders, main_directory, data_s1, data_s2)
test_images, test_labels, test_filenames, test_scenarios = process_folders(test_folders, main_directory, data_s1, data_s2)
validation_images, validation_labels, validation_filenames, validation_scenarios = process_folders(validation_folders, main_directory, data_s1, data_s2)

Processing folder: 0200
Filename: S1A_IW_GRDH_1SDV_20190412T031706_20190412T031731_026751_030125_194F_corrected_VV.tif, Extracted Date: 2019-04-12
Filename: S2_2019-03-25_B12.tif, Extracted Date: 2019-03-25
Filename: S1A_IW_GRDH_1SDV_20190318T162351_20190318T162420_026394_02F415_5335_corrected_VH.tif, Extracted Date: 2019-03-18
Filename: S1A_IW_GRDH_1SDV_20190331T031706_20190331T031731_026576_02FABC_DD51_corrected_VH.tif, Extracted Date: 2019-03-31
Filename: S2_2019-03-10_B09.tif, Extracted Date: 2019-03-10
Filename: S2_2019-04-24_B08.tif, Extracted Date: 2019-04-24
Filename: S2_2019-03-10_B02.tif, Extracted Date: 2019-03-10
Filename: S2_2019-04-04_B01.tif, Extracted Date: 2019-04-04
Filename: S2_2019-02-28_B05.tif, Extracted Date: 2019-02-28
Filename: S2_2019-02-28_B04.tif, Extracted Date: 2019-02-28
Filename: S2_2019-04-29_B02.tif, Extracted Date: 2019-04-29
Filename: S2_2019-04-29_B07.tif, Extracted Date: 2019-04-29
Filename: S2_2019-02-28_B06.tif, Extracted Date: 2019-02-28
Filenam

Filename: S2_2019-02-13_B05.tif, Extracted Date: 2019-02-13
Filename: S1A_IW_GRDH_1SDV_20190412T031706_20190412T031731_026751_030125_194F_corrected_VV.tif, Extracted Date: 2019-04-12
Filename: S2_2019-03-25_B12.tif, Extracted Date: 2019-03-25
Filename: S1A_IW_GRDH_1SDV_20190318T162351_20190318T162420_026394_02F415_5335_corrected_VH.tif, Extracted Date: 2019-03-18
Filename: S1A_IW_GRDH_1SDV_20190331T031706_20190331T031731_026576_02FABC_DD51_corrected_VH.tif, Extracted Date: 2019-03-31
Filename: S2_2019-03-10_B09.tif, Extracted Date: 2019-03-10
Filename: S2_2019-04-24_B08.tif, Extracted Date: 2019-04-24
Filename: S2_2019-03-10_B02.tif, Extracted Date: 2019-03-10
Filename: S2_2019-04-04_B01.tif, Extracted Date: 2019-04-04
Filename: S2_2019-02-28_B05.tif, Extracted Date: 2019-02-28
Filename: S2_2019-02-28_B04.tif, Extracted Date: 2019-02-28
Filename: S1B_IW_GRDH_1SDV_20190319T161451_20190319T161520_015425_01CE3C_A401_corrected_VH.tif, Extracted Date: 2019-03-19
Filename: S2_2019-04-19_B04.

In [14]:
import numpy as np
from skimage.transform import resize

# Define the target shape for resizing images
target_shape = (522, 544)  # Adjust as needed

# Assuming you have lists of images from previous steps
resized_train_images = [resize(image, target_shape, preserve_range=True) for image in train_images]
resized_test_images = [resize(image, target_shape, preserve_range=True) for image in test_images]
resized_validation_images = [resize(image, target_shape, preserve_range=True) for image in validation_images]

# Convert the resized images and labels into numpy arrays for use in the model
X_train = np.array(resized_train_images)
X_test = np.array(resized_test_images)
X_val = np.array(resized_validation_images)
y_train = np.array(train_labels)
y_test = np.array(test_labels)
y_val = np.array(validation_labels)

# Print sample data from training set to check structure
print("Training data sample:")
for filename, label, scenario in zip(train_filenames[:5], train_labels[:5], train_scenarios[:5]):
    print(f"Filename train: {filename}, Flooding: {label}, Scenario: {scenario}")

# Include scenario labels in the validation and test set print statements
print("\nValidation data sample:")
for filename, label, scenario in zip(validation_filenames[:5], validation_labels[:5], validation_scenarios[:5]):
    print(f"Filename validation: {filename}, Flooding: {label}, Scenario: {scenario}")

print("\nTest data sample:")
for filename, label, scenario in zip(test_filenames[:5], test_labels[:5], test_scenarios[:5]):
    print(f"Filename test: {filename}, Flooding: {label}, Scenario: {scenario}")

# Additional debug prints to check the shape of the arrays
print("\nShape of datasets:")
print("Train Images Shape:", X_train.shape)
print("Test Images Shape:", X_test.shape)
print("Validation Images Shape:", X_val.shape)
print("Train Labels Shape:", y_train.shape)
print("Test Labels Shape:", y_test.shape)
print("Validation Labels Shape:", y_val.shape)

Training data sample:
Filename train: S2_2019-02-23_B04.tif, Flooding: False, Scenario: 0200
Filename train: S2_2019-02-23_B09.tif, Flooding: False, Scenario: 0200
Filename train: S2_2019-02-23_B02.tif, Flooding: False, Scenario: 0200
Filename train: S2_2019-02-23_B01.tif, Flooding: False, Scenario: 0200
Filename train: S2_2019-02-23_B05.tif, Flooding: False, Scenario: 0200

Validation data sample:
Filename validation: S2_2019-02-13_B05.tif, Flooding: False, Scenario: 61
Filename validation: S2_2019-02-13_B11.tif, Flooding: False, Scenario: 61
Filename validation: S2_2019-02-13_B06.tif, Flooding: False, Scenario: 61
Filename validation: S2_2019-02-13_B04.tif, Flooding: False, Scenario: 61
Filename validation: S2_2019-02-13_B09.tif, Flooding: False, Scenario: 61

Test data sample:
Filename test: S2_2018-12-28_B05.tif, Flooding: False, Scenario: 26
Filename test: S2_2018-12-28_B07.tif, Flooding: False, Scenario: 26
Filename test: S2_2018-12-28_B01.tif, Flooding: False, Scenario: 26
Filen

In [None]:
#### Structure the data to prepare the input for the ConvLSTM neural network

In [None]:
import numpy as np 
from collections import defaultdict 

def group_by_scenario_and_type(images, labels, scenarios, filenames):
    grouped_data = defaultdict(lambda: {'S1': [], 'S2': [], 'labels': []})
    for img, lbl, scn, fname in zip(images, labels, scenarios, filenames):
        if 'S1' in fname:
            grouped_data[scn]['S1'].append(img)
        elif 'S2' in fname:
            grouped_data[scn]['S2'].append(img)
        grouped_data[scn]['labels'].append(lbl)  # Assuming labels are common and not split by type
    return grouped_data

In [None]:
train_grouped = group_by_scenario_and_type(resized_train_images, train_labels, train_scenarios, train_filenames)
test_grouped = group_by_scenario_and_type(resized_test_images, test_labels, test_scenarios, test_filenames)
validation_grouped = group_by_scenario_and_type(resized_validation_images, validation_labels, validation_scenarios, validation_filenames)

In [None]:
def prepare_for_conv_lstm(grouped_data):
    X_s1, X_s2, y = [], [], []
    for scenario in grouped_data:
        scenario_images_s1 = np.array(grouped_data[scenario]['S1'])
        scenario_images_s2 = np.array(grouped_data[scenario]['S2'])
        scenario_labels = np.array(grouped_data[scenario]['labels'])  # Adjusted to 'labels'
        
        if len(scenario_images_s1) > 0 and len(scenario_images_s2) > 0:  # Ensure both types have images
            X_s1.append(scenario_images_s1)
            X_s2.append(scenario_images_s2)
            y.append(scenario_labels[0])  # Taking the first label as an example; adjust as needed based on your labeling strategy
    return np.array(X_s1), np.array(X_s2), np.array(y)

In [None]:
#### Erase the previous model. Strategy must be to train, test and validate model 
#### By sequences and not by all te images not considering the scenario (folder) label

#### After consultating with my doctor he has recommend me to go with the 
### Convolutional LSTM as it is convenient for time-series works
### And also can work multidimensionally which make it a good fit for image and video analysis

#### Define the model

# Structure Data Appropriately: Each input for the ConvLSTM needs to be shaped as (samples, time_steps, height, width, channels), where samples is the number of sequences, time_steps is the number of images in each sequence, height and width are the dimensions of each image, and channels refers to the number of channels (1 for grayscale, 3 for RGB).

In [None]:
X_train_s1, X_train_s2, y_train = prepare_for_conv_lstm(train_grouped)

In [None]:
# Example function to prepare the data correctly
def prepare_data(X):
    # Check if the data is nested and attempt to regularize the shape
    if isinstance(X[0], np.ndarray):
        # Assuming all elements are numpy arrays and have the same shape
        try:
            # Stack arrays along the first dimension (adding a new batch dimension)
            X = np.stack(X, axis=0)
        except ValueError as e:
            print("Error stacking arrays:", e)
            # If error in stacking, print shapes of individual arrays
            for i, arr in enumerate(X):
                print(f"Shape of array {i}: {arr.shape}")
    return X

# Convert and check each training set
X_train_s1 = prepare_data(X_train_s1)
X_train_s2 = prepare_data(X_train_s2)

# Check shapes and types again
print("X_train_s1 shape:", X_train_s1.shape, "Type:", X_train_s1.dtype)
print("X_train_s2 shape:", X_train_s2.shape, "Type:", X_train_s2.dtype)

# Convert labels to the correct type
y_train = y_train.astype('float32')  # Convert boolean labels to float32 if necessary

In [None]:
### Here we need to introduce "padding" to work in the different lenghts of the 
### sequences as different sequences have a different number of images.
### paddding is the way of fixing this dimensionality problem

#Masking: After padding, using a Masking layer tells the model to ignore the padding during training, which helps in focusing on the meaningful data. Without masking, the model might learn the padding as a significant part of the input pattern, potentially skewing results.
### Padding at the beggining is the common practice when dealing with time series

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Masking, ConvLSTM2D, BatchNormalization, Dense, Flatten, concatenate, GlobalAveragePooling2D
from tensorflow.keras.models import Model

def pad_image_sequences(sequences, maxlen=None):
    ### Pad sequences to the same input length 
    padded_sequences = pad_sequences(sequences, maxlen=maxlen, dtype='float32', padding='post', value=0.0)
    ## Adding a channel dimension, assuming images are grayscale (I dont know what is this honestly)
    padded_sequences = np.expand_dims(padded_sequences, axis=-1)
    return padded_sequences

In [None]:
#### Padding the sequences images for training

max_len = max(max([seq.shape[0] for seq in X_train_s1]), max([seq.shape[0] for seq in X_train_s2]))

X_train_s1_padded = np.array([pad_image_sequences(seq, maxlen=max_len) for seq in X_train_s1])
X_train_s2_padded = np.array([pad_image_sequences(seq, maxlen=max_len) for seq in X_train_s2])

In [None]:


input_shape = (None, 522, 544, 1)

s1_input = Input(shape=input_shape, name='S1_input')
s2_input = Input(shape=input_shape, name='S2_input')

# S1 Branch
s1_branch = ConvLSTM2D(32, (3, 3), activation='relu', return_sequences=True)(s1_input)
s1_branch = BatchNormalization()(s1_branch)
s1_branch = ConvLSTM2D(16, (3, 3), activation='relu', return_sequences=False)(s1_branch)
s1_branch = BatchNormalization()(s1_branch)
s1_branch = Flatten()(s1_branch)

# S2 Branch
s2_branch = ConvLSTM2D(32, (3, 3), activation='relu', return_sequences=True)(s2_input)
s2_branch = BatchNormalization()(s2_branch)
s2_branch = ConvLSTM2D(16, (3, 3), activation='relu', return_sequences=False)(s2_branch)
s2_branch = BatchNormalization()(s2_branch)
s2_branch = Flatten()(s2_branch)

print("S1 Branch Shape:", s1_branch.shape)
print("S2 Branch Shape:", s2_branch.shape)

# Combine the outputs from both branches
combined = concatenate([s1_branch, s2_branch])

final_layer = Dense(64, activation='relu')(combined)
output = Dense(1, activation='sigmoid')(final_layer)

model = Model(inputs=[s1_input, s2_input], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


In [None]:
### Troubleshoot data types



In [None]:
### Fit the model with the training sequences

In [None]:
model.fit([X_train_s1, X_train_s2], y_train, epochs=10, batch_size=1, validation_split=0.2)

In [None]:
### Predictions 

In [None]:
predicted_probabilities = model.predict(X_test_s1, X_test_s2)
predicted_labels = (predicted_probabilities > 0.5).astype(int)

In [None]:
### Print Results

for i in range(len(predicted_labels)):
    print(f"Test Label: {y_test['i']}, Predicted: {predicted_labels[i]}")