# Dataset creation

This notebook is used for creating the datasets used for the training, validation and testing of the deep-learning model. 

Author of origibal notebook: Antonio Magherini (Antonio.Magherini@deltares.nl).

Notebook modified by Mathias Ruhe (mathias.d.ruhe@gmail.com)

**Modifications:**
- 

In [2]:
# move to root directory

%cd c:\Users\mathi\Desktop\TU Delft\TU Delft year 5\Data_science\Morphology_project\jamunet-morpho-braided

c:\Users\mathi\Desktop\TU Delft\TU Delft year 5\Data_science\Morphology_project\jamunet-morpho-braided


In [3]:
# reload modules to avoid restarting the notebook every time these are updated

%load_ext autoreload
%autoreload 2

In [4]:
# import modules 

import torch 

from preprocessing.dataset_generation import * 

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


## path definition

In [13]:
# Set river
rivers = ['Jamuna','Ganges', 'Indus', 'Ghangara']
river = rivers[1]  # change index to select different river

# 2. Build the specific paths
base_dir = os.path.join('data', 'satellite', f'{river}_images')
dir_orig = os.path.join(base_dir, 'original')
dir_proc = os.path.join(base_dir, 'preprocessed')
dir_dataset = os.path.join(base_dir, 'dataset')
print(f'Base directory set to: {base_dir}')

# Subdirectories
dir_dataset_1024x512 = os.path.join(base_dir, 'dataset_1024x512')
dir_dataset_jan = os.path.join(dir_proc, 'month_1')
dir_dataset_feb = os.path.join(dir_proc, 'month_2')
dir_dataset_mar = os.path.join(dir_proc, 'month_3')
dir_dataset_apr = os.path.join(dir_proc, 'month_4')

# Available collections
JRC = r'JRC_GSW1_4_MonthlyHistory'

Base directory set to: data\satellite\Ganges_images


Set string variables.

In [14]:
train = 'training'
val = 'validation'
test = 'testing'

train_val_test_list = [train, val, test]
train_list = [train]

The next cells are used just to show how the different functions work. 

1. Create the input and target datasets: all images are loaded regardless of their quality.

In [25]:
# input_mar, target_mar = create_datasets(train, 1, 5, dir_folders=dir_dataset_mar)

# Modified code:

In [17]:
import os
import numpy as np
import pandas as pd
from osgeo import gdal

# --- Helper Loading Functions (Assumed you have them, provided here for safety) ---
def load_image_array(path, scaled_classes=True):
    ds = gdal.Open(path)
    if ds is None: return None
    arr = ds.ReadAsArray()
    # If scaled, ensure mapping is correct (-1, 0, 1) or keep raw
    return arr

def load_avg(train_val_test, reach, year, dir_averages):
    # Construct path to the specific average file
    # Structure: averages / average_training_r1 / average_1988_training_r1.csv
    folder_name = f'average_{train_val_test}_r{reach}'
    file_name = f'average_{year}_{train_val_test}_r{reach}.csv'
    
    full_path = os.path.join(dir_averages, folder_name, file_name)
    
    if os.path.exists(full_path):
        # Read CSV without header/index
        return pd.read_csv(full_path, header=None).values
    else:
        # Fallback if specific year is missing (return None or zeros)
        # print(f"Warning: Average for {year} not found at {full_path}")
        return None

# --- CORRECTED FUNCTIONS ---

def create_list_images(train_val_test, reach, dir_folders, collection):
    '''
    Robust version: Searches for the folder ending in '_r{reach}' 
    instead of guessing the full name.
    '''
    list_dir_images = []
    
    # 1. Find the correct reach folder inside the month folder (dir_folders)
    if not os.path.exists(dir_folders):
        print(f"‚ùå Error: Directory not found: {dir_folders}")
        return []

    target_folder_path = None
    
    # Search for folder ending in "_r1" (e.g.)
    for folder_name in os.listdir(dir_folders):
        # We check if it matches the reach ID
        if folder_name.endswith(f'_r{reach}'):
            target_folder_path = os.path.join(dir_folders, folder_name)
            break
            
    if target_folder_path is None:
        print(f"‚ùå Error: No folder found for reach {reach} in {dir_folders}")
        return []

    # 2. Collect .tif images
    # Sort them to ensure years are in order (1988, 1989...)
    sorted_files = sorted(os.listdir(target_folder_path))
    
    for image in sorted_files:
        if image.endswith('.tif'):
            path_image = os.path.join(target_folder_path, image)
            list_dir_images.append(path_image)
            
    return list_dir_images

def create_datasets(train_val_test, reach, year_target=5, nodata_value=-1, dir_folders=r'data\satellite\dataset', 
                    collection=r'JRC_GSW1_4_MonthlyHistory', scaled_classes=True):
    
    # 1. Get Images
    list_dir_images = create_list_images(train_val_test, reach, dir_folders, collection)
    
    if not list_dir_images:
        return [], [] # Return empty if path failed

    # 2. Load Images into Arrays
    images_array = []
    valid_indices = [] # Keep track of which years actually loaded
    
    # We extract the year from the filename to match with averages
    # Assuming filename format: ..._1988-03_r1.tif
    loaded_years = []

    for idx, path in enumerate(list_dir_images):
        img = load_image_array(path, scaled_classes=scaled_classes)
        if img is not None:
            images_array.append(img)
            
            # Extract year safely
            filename = os.path.basename(path)
            # Find year (4 digits)
            try:
                # Split by '_' or '-' and find the item that looks like a year
                parts = filename.replace('-', '_').split('_')
                year = next(p for p in parts if p.isdigit() and len(p) == 4)
                loaded_years.append(int(year))
            except:
                # Fallback if naming is weird, assume sequential start 1988
                loaded_years.append(1988 + idx)

    # 3. Load Averages
    # Point to the correct averages directory
    # If dir_folders is ".../Ganges_images/preprocessed/month_3"
    # We want ".../Ganges_images/averages"
    base_proj_dir = os.path.dirname(os.path.dirname(dir_folders)) # Go up two levels
    dir_averages = os.path.join(base_proj_dir, 'averages')

    avg_imgs = []
    for year in loaded_years:
        avg = load_avg(train_val_test, reach, year, dir_averages)
        # If avg is missing for a year, use a zero-array as fallback to prevent crash
        if avg is None:
            avg = np.zeros_like(images_array[0]) 
        avg_imgs.append(avg)

    # 4. Replace No-Data (Binary Conversion)
    good_images_array = [np.where(image == nodata_value, avg_imgs[i], image) 
                         for i, image in enumerate(images_array)]
        
    input_dataset = []
    target_dataset = []
    
    # 5. Create Sequences (n-to-1)
    # Ensure we have enough images for the sequence
    if len(good_images_array) < year_target:
        print(f"‚ö†Ô∏è Not enough images for Reach {reach} (Found {len(good_images_array)}, need {year_target})")
        return [], []

    for i in range(len(good_images_array) - year_target + 1):
        # Input: Sequence of (year_target - 1) images
        input_seq = good_images_array[i : i + year_target - 1]
        
        # Target: The next image
        target_seq = [good_images_array[i + year_target - 1]]
        
        input_dataset.append(input_seq)
        target_dataset.append(target_seq)

    return input_dataset, target_dataset

In [18]:
# --- CONFIGURATION ---
rivers = ['Jamuna','Ganges', 'Indus', 'Ghangara']
river = rivers[1] # Ganges

base_dir = os.path.join('data', 'satellite', f'{river}_images')
dir_proc = os.path.join(base_dir, 'preprocessed')

# Point explicitly to Month 3 (March)
dir_dataset_mar = os.path.join(dir_proc, 'month_3')

print(f"Reading from: {dir_dataset_mar}")

# --- EXECUTION ---
# Note: 'train_val_test' string is used mainly for finding the Average file
input_mar, target_mar = create_datasets(
    train_val_test='training', # Use string 'training', not variable 'train'
    reach=1, 
    year_target=5, 
    dir_folders=dir_dataset_mar
)

print(f"Success! Created {len(input_mar)} samples.")

Reading from: data\satellite\Ganges_images\preprocessed\month_3




Success! Created 19 samples.


In [20]:
# --- CONFIGURATION ---
train_val_test = 'training'  # Ensure this is a string
reach_id = 1                 # The reach you are targeting
prediction_horizon = 5       # 5th year prediction
# ---------------------

# Dictionary to store results: {'month_1': (input, target), 'month_2': ...}
all_datasets = {}

print(f"üöÄ Starting dataset generation for {train_val_test} (Reach {reach_id})...\n")

for month in range(1, 5):
    # 1. Construct the path dynamically (month_1, month_2, etc.)
    month_folder = os.path.join(dir_proc, f'month_{month}')
    
    # 2. Check if folder exists
    if not os.path.exists(month_folder):
        print(f"‚ö†Ô∏è  Skipping Month {month}: Folder not found at {month_folder}")
        continue

    print(f"Processing Month {month}...")

    # 3. Create Dataset for this month
    # We pass the string 'training' explicitly
    inputs, targets = create_datasets(
        train_val_test=train_val_test, 
        reach=reach_id, 
        year_target=prediction_horizon, 
        dir_folders=month_folder
    )

    # 4. Store results
    all_datasets[f'month_{month}'] = (inputs, targets)
    
    print(f"   > Generated {len(inputs)} samples.\n")

# --- OPTIONAL: Unpack into variables if you specifically need them ---
# This matches your original variable naming scheme
input_jan, target_jan = all_datasets.get('month_1', ([], []))
input_feb, target_feb = all_datasets.get('month_2', ([], []))
input_mar, target_mar = all_datasets.get('month_3', ([], []))
input_apr, target_apr = all_datasets.get('month_4', ([], []))

print("‚úÖ Done! Variables input_jan, input_feb, etc. are ready.")

üöÄ Starting dataset generation for training (Reach 1)...

Processing Month 1...
   > Generated 19 samples.

Processing Month 2...
   > Generated 19 samples.

Processing Month 3...
   > Generated 19 samples.

Processing Month 4...
   > Generated 19 samples.

‚úÖ Done! Variables input_jan, input_feb, etc. are ready.


In [26]:
def combine_datasets(train_val_test, reach, year_target=5, 
                     nonwater_threshold=480000, nodata_value=-1, nonwater_value=0,   
                     dir_folders=r'data\satellite\dataset', 
                     collection=r'JRC_GSW1_4_MonthlyHistory', scaled_classes=True):
    '''
    Filters image sequences. If any image in the input sequence (or the target) 
    has too many non-water pixels (implying it's just a placeholder or bad data),
    the whole sequence is discarded.
    '''
    
    # 1. Generate the raw dataset using your previous function
    # Note: We rely on the create_datasets function we fixed earlier
    input_dataset, target_dataset = create_datasets(
        train_val_test, reach, year_target, nodata_value, 
        dir_folders, collection, scaled_classes
    )

    filtered_input_dataset = []
    filtered_target_dataset = []

    print(f"  Filtering {len(input_dataset)} sequences (Threshold: < {nonwater_threshold} non-water pixels)...")

    # 2. Filter pairs
    for input_images, target_image_seq in zip(input_dataset, target_dataset):
        
        # Check Inputs: Are ALL images in the sequence "good"?
        # (Good = non-water count is BELOW the threshold, meaning there is enough water)
        is_input_good = True
        for img in input_images:
            # Check non-water pixels (value 0)
            n_nonwater = np.sum(img == nonwater_value)
            if n_nonwater >= nonwater_threshold:
                is_input_good = False
                break
        
        if is_input_good:
            # Check Target: Is the target image also "good"?
            # target_image_seq is a list [img], so we take [0]
            target_img = target_image_seq[0]
            n_nonwater_target = np.sum(target_img == nonwater_value)
            
            if n_nonwater_target < nonwater_threshold:
                # Both input sequence and target are valid
                filtered_input_dataset.append(input_images)
                filtered_target_dataset.append(target_img) # Store as raw image, not list

    print(f"  > Kept {len(filtered_input_dataset)} / {len(input_dataset)} sequences.")
    
    return filtered_input_dataset, filtered_target_dataset

In [27]:
# --- CONFIGURATION ---
train_val_test = 'training'
reach_id = 1
prediction_horizon = 5

# Dictionary to hold the final filtered data
# Structure: {'month_1': (inputs, targets), ...}
final_datasets = {}

print(f"üöÄ Starting Filtering for {train_val_test} (Reach {reach_id})...\n")

for month in range(1, 5):
    # Construct path: .../preprocessed/month_1
    month_path = os.path.join(dir_proc, f'month_{month}')
    
    if not os.path.exists(month_path):
        print(f"Skipping Month {month} (Path not found)")
        continue
        
    print(f"Processing Month {month}...")
    
    # Run Filter
    inputs, targets = combine_datasets(
        train_val_test=train_val_test, 
        reach=reach_id, 
        dir_folders=month_path,
        year_target=prediction_horizon
    )
    
    # Store in dictionary
    final_datasets[f'month_{month}'] = (inputs, targets)
    print("") # Empty line for readability

# --- UNPACKING (If you need individual variables) ---
input_jan_filtered, target_jan_filtered = final_datasets.get('month_1', ([], []))
input_feb_filtered, target_feb_filtered = final_datasets.get('month_2', ([], []))
input_mar_filtered, target_mar_filtered = final_datasets.get('month_3', ([], []))
input_apr_filtered, target_apr_filtered = final_datasets.get('month_4', ([], []))

print("‚úÖ Filtering Complete. Filtered variables are ready.")

üöÄ Starting Filtering for training (Reach 1)...

Processing Month 1...
  Filtering 19 sequences (Threshold: < 480000 non-water pixels)...
  > Kept 17 / 19 sequences.

Processing Month 2...
  Filtering 19 sequences (Threshold: < 480000 non-water pixels)...
  > Kept 18 / 19 sequences.

Processing Month 3...
  Filtering 19 sequences (Threshold: < 480000 non-water pixels)...
  > Kept 14 / 19 sequences.

Processing Month 4...
  Filtering 19 sequences (Threshold: < 480000 non-water pixels)...
  > Kept 18 / 19 sequences.

‚úÖ Filtering Complete. Filtered variables are ready.


In [28]:
print(f"Jan: {len(input_jan_filtered)} samples")
print(f"Feb: {len(input_feb_filtered)} samples")
print(f"Mar: {len(input_mar_filtered)} samples")
print(f"Apr: {len(input_apr_filtered)} samples")

Jan: 17 samples
Feb: 18 samples
Mar: 14 samples
Apr: 18 samples


In [29]:
import torch
import numpy as np
import os
from torch.utils.data import TensorDataset

def create_full_dataset(train_val_test, year_target=5, 
                        nonwater_threshold=480000, nodata_value=-1, nonwater_value=0, 
                        dir_folders=r'data\satellite\dataset', 
                        collection=r'JRC_GSW1_4_MonthlyHistory', 
                        scaled_classes=True, device='cuda:0', dtype=torch.float32):
    '''
    Combines ALL reaches within a specific month folder into a single TensorDataset.
    Optimized to convert numpy arrays to tensors efficiently.
    '''
    
    # initialize lists
    all_inputs = []
    all_targets = []
    
    # 1. Scan the folder for valid reach sub-directories
    if not os.path.exists(dir_folders):
        print(f"‚ùå Error: Path not found: {dir_folders}")
        return TensorDataset(torch.empty(0), torch.empty(0))

    # Look for folders like "JRC_..._r1", "JRC_..._r8", etc.
    potential_folders = [f for f in os.listdir(dir_folders) if os.path.isdir(os.path.join(dir_folders, f))]
    
    print(f"üìÇ Scanning {dir_folders}...")

    count_reaches = 0
    
    for folder_name in potential_folders:
        # We need to extract the Reach ID from the folder name.
        # Naming convention is expected to be: "..._r{number}"
        try:
            reach_id_str = folder_name.split('_r')[-1]
            reach_id = int(reach_id_str)
        except (IndexError, ValueError):
            # Skip folders that don't match the "_rX" pattern
            continue
            
        # 2. Get data for this specific reach
        # combine_datasets performs the loading + filtering
        inputs, targets = combine_datasets(
            train_val_test, 
            reach=reach_id, 
            year_target=year_target, 
            nonwater_threshold=nonwater_threshold,
            nodata_value=nodata_value, 
            nonwater_value=nonwater_value, 
            dir_folders=dir_folders, # Passes the month folder
            collection=collection, 
            scaled_classes=scaled_classes
        )
        
        if len(inputs) > 0:
            all_inputs.extend(inputs)
            all_targets.extend(targets)
            count_reaches += 1
            
    print(f"   > Aggregated data from {count_reaches} reaches.")

    if len(all_inputs) == 0:
        print("‚ö†Ô∏è Warning: No valid samples found after filtering.")
        return TensorDataset(torch.empty(0), torch.empty(0))

    # 3. Convert list of numpy arrays to a single Torch Tensor
    # Converting numpy list -> numpy array -> torch tensor is much faster/safer than list -> tensor
    
    try:
        # Convert list of lists/arrays into a single large numpy array
        input_np = np.array(all_inputs) 
        target_np = np.array(all_targets)
        
        # Create Tensors on the specified device
        input_tensor = torch.tensor(input_np, dtype=dtype, device=device)
        target_tensor = torch.tensor(target_np, dtype=dtype, device=device)
        
        dataset = TensorDataset(input_tensor, target_tensor)
        return dataset
        
    except Exception as e:
        print(f"‚ùå Error creating tensors (Check memory or array shapes): {e}")
        return TensorDataset(torch.empty(0), torch.empty(0))

In [30]:
# --- CONFIGURATION ---
train_val_test = 'training'  # Make sure this is the string 'training'
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
target_dtype = torch.float32

# Store datasets in a dictionary or list
datasets_by_month = {}

print(f"üöÄ Creating Full Datasets on {device}...\n")

# Loop Jan - Apr
for month_idx, month_dir in zip(
    ['Jan', 'Feb', 'Mar', 'Apr'], 
    [dir_dataset_jan, dir_dataset_feb, dir_dataset_mar, dir_dataset_apr]
):
    print(f"Processing {month_idx}...")
    
    ds = create_full_dataset(
        train_val_test=train_val_test, 
        dir_folders=month_dir, 
        device=device, 
        dtype=target_dtype
    )
    
    datasets_by_month[month_idx] = ds
    print(f"   > Created dataset with {len(ds)} samples.\n")


# Print Summary
print(f'Total training samples considering different months:\n\
January --> {len(datasets_by_month["Jan"])}\n\
February --> {len(datasets_by_month["Feb"])}\n\
March --> {len(datasets_by_month["Mar"])}\n\
April --> {len(datasets_by_month["Apr"])}')

üöÄ Creating Full Datasets on cuda:0...

Processing Jan...
üìÇ Scanning data\satellite\Ganges_images\preprocessed\month_1...
  Filtering 19 sequences (Threshold: < 480000 non-water pixels)...
  > Kept 17 / 19 sequences.
  Filtering 19 sequences (Threshold: < 480000 non-water pixels)...
  > Kept 17 / 19 sequences.
  Filtering 19 sequences (Threshold: < 480000 non-water pixels)...
  > Kept 6 / 19 sequences.
  Filtering 19 sequences (Threshold: < 480000 non-water pixels)...
  > Kept 6 / 19 sequences.
  Filtering 19 sequences (Threshold: < 480000 non-water pixels)...
  > Kept 17 / 19 sequences.
  Filtering 19 sequences (Threshold: < 480000 non-water pixels)...
  > Kept 17 / 19 sequences.
  Filtering 19 sequences (Threshold: < 480000 non-water pixels)...
  > Kept 15 / 19 sequences.
  Filtering 19 sequences (Threshold: < 480000 non-water pixels)...
  > Kept 14 / 19 sequences.
  Filtering 19 sequences (Threshold: < 480000 non-water pixels)...
  > Kept 14 / 19 sequences.
  Filtering 19 seque

# Continue normal code

In [22]:
# print(f'Input and target shape month by month (training reach 1):\n\
# March --> input shape: {np.shape(input_mar)} - Target shape: {np.shape(target_mar)}')

In [23]:
# input_jan, target_jan = create_datasets(train, 1, 5, dir_folders=dir_dataset_jan)
# input_feb, target_feb = create_datasets(train, 1, 5, dir_folders=dir_dataset_feb)
# input_mar, target_mar = create_datasets(train, 1, 5, dir_folders=dir_dataset_mar)
# input_apr, target_apr = create_datasets(train, 1, 5, dir_folders=dir_dataset_apr)

In [24]:
print(f'Input and target shape month by month (training reach 1):\n\
January --> input shape: {np.shape(input_jan)} - Target shape: {np.shape(target_jan)}\n\
February --> input shape: {np.shape(input_feb)} - Target shape: {np.shape(target_feb)}\n\
March --> input shape: {np.shape(input_mar)} - Target shape: {np.shape(target_mar)}\n\
April --> input shape: {np.shape(input_apr)} - Target shape: {np.shape(target_apr)}')

Input and target shape month by month (training reach 1):
January --> input shape: (19, 4, 1000, 500) - Target shape: (19, 1, 1000, 500)
February --> input shape: (19, 4, 1000, 500) - Target shape: (19, 1, 1000, 500)
March --> input shape: (19, 4, 1000, 500) - Target shape: (19, 1, 1000, 500)
April --> input shape: (19, 4, 1000, 500) - Target shape: (19, 1, 1000, 500)


2. Combine input and target datasets filtering out bad images (based on <code>no-data</code> and <code>water</code> thresholds). 

In [10]:
input_jan_filtered, target_jan_filtered = combine_datasets(train, 1, dir_folders=dir_dataset_jan)
input_feb_filtered, target_feb_filtered = combine_datasets(train, 1, dir_folders=dir_dataset_feb)
input_mar_filtered, target_mar_filtered = combine_datasets(train, 1, dir_folders=dir_dataset_mar)
input_apr_filtered, target_apr_filtered = combine_datasets(train, 1, dir_folders=dir_dataset_apr)

In [11]:
print(f'Input and target shape month by month after filtering out not suitable images (training reach 1):\n\
January --> input shape: {np.shape(input_jan_filtered)} - Target shape: {np.shape(target_jan_filtered)}\n\
February --> input shape: {np.shape(input_feb_filtered)} - Target shape: {np.shape(target_feb_filtered)}\n\
March --> input shape: {np.shape(input_mar_filtered)} - Target shape: {np.shape(target_mar_filtered)}\n\
April --> input shape: {np.shape(input_apr_filtered)} - Target shape: {np.shape(target_apr_filtered)}')

Input and target shape month by month after filtering out not suitable images (training reach 1):
January --> input shape: (6, 4, 1000, 500) - Target shape: (6, 1000, 500)
February --> input shape: (17, 4, 1000, 500) - Target shape: (17, 1000, 500)
March --> input shape: (13, 4, 1000, 500) - Target shape: (13, 1000, 500)
April --> input shape: (10, 4, 1000, 500) - Target shape: (10, 1000, 500)


### 1. Training dataset

In [12]:
# training
dtype = dtype=torch.float32

dataset_train_jan = create_full_dataset(train, dir_folders=dir_dataset_jan, device=device, dtype=dtype)
dataset_train_feb = create_full_dataset(train, dir_folders=dir_dataset_feb, device=device, dtype=dtype)
dataset_train_mar = create_full_dataset(train, dir_folders=dir_dataset_mar, device=device, dtype=dtype)
dataset_train_apr = create_full_dataset(train, dir_folders=dir_dataset_apr, device=device, dtype=dtype)

print(f'Total training samples considering different months:\n\
January --> {len(dataset_train_jan)}\n\
February --> {len(dataset_train_feb)}\n\
March --> {len(dataset_train_mar)}\n\
April --> {len(dataset_train_apr)}')

Total training samples considering different months:
January --> 378
February --> 402
March --> 413
April --> 262


In [13]:
print(f"Datasets shape (same for every monthly dataset)\n\
Input dataset sample shape: {dataset_train_jan[0][0].shape} - Target dataset sample shape: {dataset_train_jan[0][1].shape}")

Datasets shape (same for every monthly dataset)
Input dataset sample shape: torch.Size([4, 1000, 500]) - Target dataset sample shape: torch.Size([1000, 500])


### 2. Validation dataset

In [14]:
# validation
dataset_val_jan = create_full_dataset(val, dir_folders=dir_dataset_jan, device=device, dtype=dtype)
dataset_val_feb = create_full_dataset(val, dir_folders=dir_dataset_feb, device=device, dtype=dtype)
dataset_val_mar = create_full_dataset(val, dir_folders=dir_dataset_mar, device=device, dtype=dtype)
dataset_val_apr = create_full_dataset(val, dir_folders=dir_dataset_apr, device=device, dtype=dtype)

print(f'Total validation samples considering different months:\n\
January --> {len(dataset_val_jan)}\n\
February --> {len(dataset_val_feb)}\n\
March --> {len(dataset_val_mar)}\n\
April --> {len(dataset_val_apr)}')

Total validation samples considering different months:
January --> 9
February --> 19
March --> 13
April --> 17


### 3. Testing dataset

In [15]:
# testing
dataset_test_jan = create_full_dataset(test, dir_folders=dir_dataset_jan, device=device, dtype=dtype)
dataset_test_feb = create_full_dataset(test, dir_folders=dir_dataset_feb, device=device, dtype=dtype)
dataset_test_mar = create_full_dataset(test, dir_folders=dir_dataset_mar, device=device, dtype=dtype)
dataset_test_apr = create_full_dataset(test, dir_folders=dir_dataset_apr, device=device, dtype=dtype)

print(f'Total validation samples considering different months:\n\
January --> {len(dataset_test_jan)}\n\
February --> {len(dataset_test_feb)}\n\
March --> {len(dataset_test_mar)}\n\
April --> {len(dataset_test_apr)}')

Total validation samples considering different months:
January --> 16
February --> 19
March --> 17
April --> 17
