First, a quality check to see whether the number of NIR band images exactly matches the number of RGB band images:

In [None]:
import os
import csv

def count_files_in_subfolder(path):
    """Return the names of the regular files located directly inside ``path``.

    Despite its name, the function returns a set of file names rather than a
    number. A directory that does not exist yields an empty set.
    """
    try:
        entries = os.listdir(path)
    except FileNotFoundError:
        return set()

    file_names = set()
    for entry in entries:
        # Sub-directories are ignored; only regular files are collected
        if os.path.isfile(os.path.join(path, entry)):
            file_names.add(entry)
    return file_names

def load_csv_data(csv_file):
    """Return the file names that the detailed CSV marks as not imputed.

    Args:
        csv_file: Path to a CSV file with at least the columns ``file_name``
            and ``imputed``.

    Returns:
        Set of ``file_name`` values whose ``imputed`` value equals ``no``
        (compared case-insensitively).

    Raises:
        FileNotFoundError: If ``csv_file`` does not exist.
        KeyError: If one of the required columns is missing.
    """
    # newline='' leaves newline handling to the csv module, as its
    # documentation requires; otherwise line breaks inside quoted fields are
    # altered while reading.
    with open(csv_file, 'r', newline='') as f:
        return {
            row['file_name']
            for row in csv.DictReader(f)
            if row['imputed'].lower() == 'no'
        }

def check_file_consistency(base_dir, folders):
    """Print how the NIR and RGB files of each folder compare with its CSV.

    For every folder the ``NIR`` and ``RGB`` sub-directories are listed and
    compared with the non-imputed file names read from
    ``<folder>_detailed.csv``. The report shows the counts plus up to five
    example names for missing and extra files.
    """
    for folder in folders:
        folder_dir = os.path.join(base_dir, folder)

        nir_files = count_files_in_subfolder(os.path.join(folder_dir, 'NIR'))
        rgb_files = count_files_in_subfolder(os.path.join(folder_dir, 'RGB'))
        non_imputed_files = load_csv_data(os.path.join(folder_dir, f"{folder}_detailed.csv"))

        print(f"Folder: {folder}")
        print(f"  NIR files: {len(nir_files)}")
        print(f"  RGB files: {len(rgb_files)}")
        print(f"  Non-imputed files in CSV: {len(non_imputed_files)}")

        # Each entry pairs a report label with the set of names it describes
        report = (
            ("  Missing non-imputed files in NIR:", non_imputed_files - nir_files),
            ("  Missing non-imputed files in RGB:", non_imputed_files - rgb_files),
            ("  Extra files in NIR:", nir_files - non_imputed_files),
            ("  Extra files in RGB:", rgb_files - non_imputed_files),
        )
        for label, names in report:
            print(label, len(names))
            if names:
                suffix = "..." if len(names) > 5 else ""
                print("    ", ", ".join(list(names)[:5]), suffix)

        print()

# Base directory (hard-coded, machine-specific) that holds one sub-directory
# per entry in folders_to_check
output_base_dir = r'C:\TjallingData\greenearthnet_additional'

# Folders to check; each is expected to contain NIR/ and RGB/ sub-directories
# and a "<folder>_detailed.csv" file
folders_to_check = [
    'iid_chopped',
    'ood-s_chopped',
    'ood-st_chopped',
    'ood-t_chopped',
    'val_chopped',
    'train'
]

# Run the read-only consistency report for every folder
check_file_consistency(output_base_dir, folders_to_check)

The quality check showed that more NIR images than RGB images were extracted, because NIR images are less sensitive to cloud cover. The excess NIR images are deleted in the cell below and imputed afterwards, to keep the NIR set consistent with the RGB images. The cell below repeats the code above and adds an extra delete function.

In [None]:
import os
import csv

def count_files_in_subfolder(path):
    """Return the set of file names found directly in ``path``.

    Sub-directories are left out. If ``path`` does not exist, an empty set is
    returned instead of raising.
    """
    try:
        names = os.listdir(path)
    except FileNotFoundError:
        return set()
    return {name for name in names if os.path.isfile(os.path.join(path, name))}

def load_csv_data(csv_file):
    """Return the file names that the detailed CSV marks as not imputed.

    Args:
        csv_file: Path to a CSV file with at least the columns ``file_name``
            and ``imputed``.

    Returns:
        Set of ``file_name`` values whose ``imputed`` value equals ``no``
        (compared case-insensitively).

    Raises:
        FileNotFoundError: If ``csv_file`` does not exist.
        KeyError: If one of the required columns is missing.
    """
    # newline='' leaves newline handling to the csv module, as its
    # documentation requires; otherwise line breaks inside quoted fields are
    # altered while reading.
    with open(csv_file, 'r', newline='') as f:
        return {
            row['file_name']
            for row in csv.DictReader(f)
            if row['imputed'].lower() == 'no'
        }

def check_file_consistency(base_dir, folders):
    """Print how the NIR and RGB files of each folder compare with its CSV.

    For every folder the ``NIR`` and ``RGB`` sub-directories are listed and
    compared with the non-imputed file names read from
    ``<folder>_detailed.csv``. The report shows the counts plus up to five
    example names for missing and extra files.
    """
    for folder in folders:
        folder_dir = os.path.join(base_dir, folder)

        nir_files = count_files_in_subfolder(os.path.join(folder_dir, 'NIR'))
        rgb_files = count_files_in_subfolder(os.path.join(folder_dir, 'RGB'))
        non_imputed_files = load_csv_data(os.path.join(folder_dir, f"{folder}_detailed.csv"))

        print(f"Folder: {folder}")
        print(f"  NIR files: {len(nir_files)}")
        print(f"  RGB files: {len(rgb_files)}")
        print(f"  Non-imputed files in CSV: {len(non_imputed_files)}")

        # Each entry pairs a report label with the set of names it describes
        report = (
            ("  Missing non-imputed files in NIR:", non_imputed_files - nir_files),
            ("  Missing non-imputed files in RGB:", non_imputed_files - rgb_files),
            ("  Extra files in NIR:", nir_files - non_imputed_files),
            ("  Extra files in RGB:", rgb_files - non_imputed_files),
        )
        for label, names in report:
            print(label, len(names))
            if names:
                suffix = "..." if len(names) > 5 else ""
                print("    ", ", ".join(list(names)[:5]), suffix)

        print()

def delete_extra_nir_files(base_dir, folders, dry_run=False):
    """Delete NIR files that the detailed CSV does not list as non-imputed.

    Deletion is permanent. To avoid wiping a whole folder because of an empty
    or malformed CSV, nothing is deleted for a folder whose CSV lists no
    non-imputed file at all.

    Args:
        base_dir: Directory containing one sub-directory per entry in
            ``folders``.
        folders: Folder names; each is expected to hold an ``NIR``
            sub-directory and a ``<folder>_detailed.csv`` file.
        dry_run: When true, only report which files would be deleted.

    Raises:
        FileNotFoundError: If a folder's CSV file does not exist.
    """
    for folder in folders:
        full_path = os.path.join(base_dir, folder)
        nir_path = os.path.join(full_path, 'NIR')
        csv_file = os.path.join(full_path, f"{folder}_detailed.csv")

        nir_files = count_files_in_subfolder(nir_path)
        non_imputed_files = load_csv_data(csv_file)

        extra_nir = nir_files - non_imputed_files

        print(f"Folder: {folder}")
        print(f"  Extra files to delete in NIR: {len(extra_nir)}")

        if extra_nir and not non_imputed_files:
            # With an empty reference set every NIR file would count as
            # "extra"; refuse to delete rather than empty the folder.
            print(f"  Warning: no non-imputed files listed in {csv_file}; skipping deletion")
            print()
            continue

        # Sorted so that the log is reproducible between runs
        for file_name in sorted(extra_nir):
            if dry_run:
                print(f"    Would delete: {file_name}")
                continue
            try:
                os.remove(os.path.join(nir_path, file_name))
            except OSError as e:
                # Keep going: one locked or vanished file should not stop the batch
                print(f"    Error deleting {file_name}: {e}")
            else:
                print(f"    Deleted: {file_name}")

        print()

# Base directory (hard-coded, machine-specific) that holds one sub-directory
# per entry in folders_to_check
output_base_dir = r'C:\TjallingData\greenearthnet_additional'

# Folders to check; each is expected to contain NIR/ and RGB/ sub-directories
# and a "<folder>_detailed.csv" file
folders_to_check = [
    'iid_chopped',
    'ood-s_chopped',
    'ood-st_chopped',
    'ood-t_chopped',
    'val_chopped',
    'train'
]

# Step 1: report the state before anything is deleted
print("Checking file consistency:")
check_file_consistency(output_base_dir, folders_to_check)

# Step 2: permanently delete NIR files that the CSV does not list as non-imputed
print("\nDeleting extra NIR files:")
delete_extra_nir_files(output_base_dir, folders_to_check)

# Step 3: report again so the effect of the deletion can be verified
print("\nChecking file consistency after deletion:")
check_file_consistency(output_base_dir, folders_to_check)

Here is where the NIR image imputation takes place.

In [None]:
import os
import csv
import numpy as np
from PIL import Image
from datetime import datetime, timedelta

def load_csv_data(csv_file):
    """Group the rows of the detailed CSV by minicube.

    The minicube name is built from the underscore-separated tokens 1 to 4 of
    each row's ``file_name`` (token 0 and everything after token 4 are
    dropped).

    Args:
        csv_file: Path to a CSV file that has a ``file_name`` column.

    Returns:
        Dict mapping each minicube name to the list of its row dicts, in the
        order in which they appear in the file.

    Raises:
        FileNotFoundError: If ``csv_file`` does not exist.
        KeyError: If the ``file_name`` column is missing.
    """
    data = {}
    # newline='' leaves newline handling to the csv module, as its
    # documentation requires; otherwise line breaks inside quoted fields are
    # altered while reading.
    with open(csv_file, 'r', newline='') as f:
        for row in csv.DictReader(f):
            minicube_name = '_'.join(row['file_name'].split('_')[1:5])
            data.setdefault(minicube_name, []).append(row)
    return data

def impute_image(before_img, after_img, total_gap, current_gap):
    """Blend two images linearly according to the position inside a gap.

    Args:
        before_img: Image (or array-like) at the start of the gap.
        after_img: Image (or array-like) at the end of the gap.
        total_gap: Length of the gap between the two images.
        current_gap: Distance from ``before_img`` to the frame being filled.

    Returns:
        The array of ``before_img`` unchanged when ``total_gap`` is 0;
        otherwise the weighted mix of both arrays converted to ``uint8``
        (fractions are truncated by the conversion).
    """
    earlier = np.array(before_img)
    later = np.array(after_img)

    # Without a gap there is nothing to interpolate
    if total_gap == 0:
        return earlier

    # Share of the way from the earlier frame towards the later one
    fraction = current_gap / total_gap
    blended = (1 - fraction) * earlier + fraction * later
    return blended.astype(np.uint8)


def process_folder(input_folder, output_folder, csv_file, expected_count=30):
    """Create interpolated NIR images for every CSV row marked as imputed.

    Rows are grouped per minicube and ordered by date. Each row whose
    ``imputed`` value is ``yes`` is filled by blending the nearest non-imputed
    image on or before its date with the nearest one on or after it, weighted
    by the number of days to each. When non-imputed images exist on one side
    only (gaps at the start or end of the series), the nearest one is copied.

    Args:
        input_folder: Directory holding the non-imputed images.
        output_folder: Directory that receives the results, saved as
            ``NIR_imputed_<file_name>``.
        csv_file: Detailed CSV with the columns ``file_name``, ``date``
            (``YYYY-MM-DD``) and ``imputed``.
        expected_count: Number of images every minicube should have once the
            existing and the imputed images are added up.

    Raises:
        FileNotFoundError: If ``csv_file`` does not exist.
        ValueError: If a ``date`` value is not in ``YYYY-MM-DD`` format.
    """
    csv_data = load_csv_data(csv_file)

    for minicube_name, minicube_data in csv_data.items():
        print(f"Processing minicube: {minicube_name}")

        # Parse every date once and order the rows chronologically
        dated_rows = sorted(
            ((datetime.strptime(row['date'], '%Y-%m-%d').date(), row) for row in minicube_data),
            key=lambda pair: pair[0],
        )
        # Compared case-insensitively, like the quality-check cells do
        real_rows = [(d, row) for d, row in dated_rows if row['imputed'].lower() == 'no']
        imputed_rows = [(d, row) for d, row in dated_rows if row['imputed'].lower() == 'yes']

        if imputed_rows and not real_rows:
            print(f"Warning: No non-imputed images available for minicube {minicube_name}; nothing can be imputed")

        for current_date, row in (imputed_rows if real_rows else []):
            print(f"Imputing image: {row['file_name']}")

            # Nearest non-imputed frames around the gap. Only non-imputed rows
            # are candidates, so the reference files are expected to exist;
            # if one side has none, fall back to the nearest frame on the
            # other side.
            earlier = [pair for pair in real_rows if pair[0] <= current_date]
            later = [pair for pair in real_rows if pair[0] >= current_date]
            before, before_row = earlier[-1] if earlier else later[0]
            after, after_row = later[0] if later else earlier[-1]

            before_img = open_image_with_fallback(input_folder, before_row['file_name'])
            after_img = open_image_with_fallback(input_folder, after_row['file_name'])
            try:
                if before_img is None or after_img is None:
                    print(f"Warning: Could not impute image {row['file_name']} due to missing reference images")
                    continue

                total_gap = (after - before).days
                current_gap = (current_date - before).days
                # impute_image returns the "before" image unchanged when total_gap is 0
                imputed_arr = impute_image(before_img, after_img, total_gap, current_gap)
            finally:
                # Release the file handles; a long run opens many images
                for img in (before_img, after_img):
                    if img is not None:
                        img.close()

            # Save the imputed image
            output_path = os.path.join(output_folder, f"NIR_imputed_{row['file_name']}")
            Image.fromarray(imputed_arr).save(output_path)
            print(f"Saved imputed image: {output_path}")

        # Verify the number of images of THIS minicube (not of the whole folder)
        total_images = sum(
            os.path.isfile(os.path.join(input_folder, row['file_name']))
            for _, row in real_rows
        )
        total_imputed = sum(
            os.path.isfile(os.path.join(output_folder, f"NIR_imputed_{row['file_name']}"))
            for _, row in imputed_rows
        )
        print(f"Total images: {total_images}, Imputed images: {total_imputed}")
        if total_images + total_imputed != expected_count:
            print(f"Warning: Incorrect total number of images for {minicube_name}. Expected {expected_count}, got {total_images + total_imputed}")

def open_image_with_fallback(folder, filename):
    """Open ``filename`` from ``folder``, or return ``None`` if it is missing.

    A warning is printed when the file does not exist. No substitute image is
    produced; callers have to handle the ``None`` result themselves.
    """
    image_path = os.path.join(folder, filename)
    try:
        image = Image.open(image_path)
    except FileNotFoundError:
        print(f"Warning: File not found: {filename}")
        # A real fallback (e.g. a blank image or the most recent available
        # image) could be returned here if ever needed
        return None
    return image

# Driver: run the NIR imputation for every dataset folder

# List of folders to process (hard-coded, machine-specific paths)
folders = [
    r'C:\TjallingData\greenearthnet_additional\iid_chopped',
    r'C:\TjallingData\greenearthnet_additional\ood-s_chopped',
    r'C:\TjallingData\greenearthnet_additional\ood-st_chopped',
    r'C:\TjallingData\greenearthnet_additional\ood-t_chopped',
    r'C:\TjallingData\greenearthnet_additional\val_chopped',
    r'C:\TjallingData\greenearthnet_additional\train',

]


for folder in folders:
    print(f"Processing folder: {folder}")
    # Real images are read from NIR/, imputed images are written to NIR_imputed/
    input_folder = os.path.join(folder, 'NIR')
    output_folder = os.path.join(folder, 'NIR_imputed')
    os.makedirs(output_folder, exist_ok=True)
    
    # The CSV is named after the folder itself: "<folder name>_detailed.csv"
    csv_file = os.path.join(folder, f"{os.path.basename(folder)}_detailed.csv")
    
    process_folder(input_folder, output_folder, csv_file)

print("Imputation process completed for all folders.")

The exact same kind of imputation is then applied to the RGB images.

In [None]:
import os
import csv
import numpy as np
from PIL import Image
from datetime import datetime, timedelta

def load_csv_data(csv_file):
    """Group the rows of the detailed CSV by minicube.

    The minicube name is built from the underscore-separated tokens 1 to 4 of
    each row's ``file_name`` (token 0 and everything after token 4 are
    dropped).

    Args:
        csv_file: Path to a CSV file that has a ``file_name`` column.

    Returns:
        Dict mapping each minicube name to the list of its row dicts, in the
        order in which they appear in the file.

    Raises:
        FileNotFoundError: If ``csv_file`` does not exist.
        KeyError: If the ``file_name`` column is missing.
    """
    data = {}
    # newline='' leaves newline handling to the csv module, as its
    # documentation requires; otherwise line breaks inside quoted fields are
    # altered while reading.
    with open(csv_file, 'r', newline='') as f:
        for row in csv.DictReader(f):
            minicube_name = '_'.join(row['file_name'].split('_')[1:5])
            data.setdefault(minicube_name, []).append(row)
    return data

def impute_image(before_img, after_img, total_gap, current_gap):
    """Blend two images linearly according to the position inside a gap.

    Args:
        before_img: Image (or array-like) at the start of the gap.
        after_img: Image (or array-like) at the end of the gap.
        total_gap: Length of the gap between the two images.
        current_gap: Distance from ``before_img`` to the frame being filled.

    Returns:
        The array of ``before_img`` unchanged when ``total_gap`` is 0;
        otherwise the weighted mix of both arrays converted to ``uint8``
        (fractions are truncated by the conversion).
    """
    start_arr = np.array(before_img)
    end_arr = np.array(after_img)

    if total_gap != 0:
        # Share of the way from the earlier frame towards the later one
        ratio = current_gap / total_gap
        mixed = (1 - ratio) * start_arr + ratio * end_arr
        return mixed.astype(np.uint8)

    # Without a gap there is nothing to interpolate
    return start_arr

def process_folder(input_folder, output_folder, csv_file, expected_count=30):
    """Create interpolated RGB images for every CSV row marked as imputed.

    Rows are grouped per minicube and ordered by date. Each row whose
    ``imputed`` value is ``yes`` is filled by blending the nearest non-imputed
    image on or before its date with the nearest one on or after it, weighted
    by the number of days to each. When non-imputed images exist on one side
    only (gaps at the start or end of the series), the nearest one is copied.
    Output files that already exist are left untouched, so an interrupted run
    can be resumed.

    Args:
        input_folder: Directory holding the non-imputed images.
        output_folder: Directory that receives the results, saved as
            ``RGB_imputed_<file_name>``.
        csv_file: Detailed CSV with the columns ``file_name``, ``date``
            (``YYYY-MM-DD``) and ``imputed``.
        expected_count: Number of images every minicube should have once the
            existing and the imputed images are added up.

    Raises:
        FileNotFoundError: If ``csv_file`` does not exist.
        ValueError: If a ``date`` value is not in ``YYYY-MM-DD`` format.
    """
    csv_data = load_csv_data(csv_file)

    for minicube_name, minicube_data in csv_data.items():
        print(f"Processing minicube: {minicube_name}")

        # Parse every date once and order the rows chronologically
        dated_rows = sorted(
            ((datetime.strptime(row['date'], '%Y-%m-%d').date(), row) for row in minicube_data),
            key=lambda pair: pair[0],
        )
        # Compared case-insensitively, like the quality-check cells do
        real_rows = [(d, row) for d, row in dated_rows if row['imputed'].lower() == 'no']
        imputed_rows = [(d, row) for d, row in dated_rows if row['imputed'].lower() == 'yes']

        if imputed_rows and not real_rows:
            print(f"Warning: No non-imputed images available for minicube {minicube_name}; nothing can be imputed")

        for current_date, row in (imputed_rows if real_rows else []):
            output_filename = f"RGB_imputed_{row['file_name']}"
            output_path = os.path.join(output_folder, output_filename)

            # Check if the imputed image already exists
            if os.path.exists(output_path):
                print(f"Skipping existing imputed image: {output_filename}")
                continue

            print(f"Imputing image: {row['file_name']}")

            # Nearest non-imputed frames around the gap. Only non-imputed rows
            # are candidates, so the reference files are expected to exist;
            # if one side has none, fall back to the nearest frame on the
            # other side.
            earlier = [pair for pair in real_rows if pair[0] <= current_date]
            later = [pair for pair in real_rows if pair[0] >= current_date]
            before, before_row = earlier[-1] if earlier else later[0]
            after, after_row = later[0] if later else earlier[-1]

            before_img = open_image_with_fallback(input_folder, before_row['file_name'])
            after_img = open_image_with_fallback(input_folder, after_row['file_name'])
            try:
                if before_img is None or after_img is None:
                    print(f"Warning: Could not impute image {row['file_name']} due to missing reference images")
                    continue

                total_gap = (after - before).days
                current_gap = (current_date - before).days
                # impute_image returns the "before" image unchanged when total_gap is 0
                imputed_arr = impute_image(before_img, after_img, total_gap, current_gap)
            finally:
                # Release the file handles; a long run opens many images
                for img in (before_img, after_img):
                    if img is not None:
                        img.close()

            # Save the imputed image
            Image.fromarray(imputed_arr).save(output_path)
            print(f"Saved imputed image: {output_path}")

        # Verify the number of images of THIS minicube (not of the whole folder)
        total_images = sum(
            os.path.isfile(os.path.join(input_folder, row['file_name']))
            for _, row in real_rows
        )
        total_imputed = sum(
            os.path.isfile(os.path.join(output_folder, f"RGB_imputed_{row['file_name']}"))
            for _, row in imputed_rows
        )
        print(f"Total images: {total_images}, Imputed images: {total_imputed}")
        if total_images + total_imputed != expected_count:
            print(f"Warning: Incorrect total number of images for {minicube_name}. Expected {expected_count}, got {total_images + total_imputed}")

def open_image_with_fallback(folder, filename):
    """Open ``filename`` from ``folder``, or return ``None`` if it is missing.

    A warning is printed when the file does not exist. No substitute image is
    produced; callers have to handle the ``None`` result themselves.
    """
    image_path = os.path.join(folder, filename)
    try:
        image = Image.open(image_path)
    except FileNotFoundError:
        print(f"Warning: File not found: {filename}")
        # A real fallback (e.g. a blank image or the most recent available
        # image) could be returned here if ever needed
        return None
    return image

def count_existing_imputed(output_folder):
    """Count the entries of ``output_folder`` whose name starts with ``RGB_imputed_``."""
    return sum(1 for name in os.listdir(output_folder) if name.startswith('RGB_imputed_'))

# List of folders to process (hard-coded, machine-specific paths)
folders = [
    r'C:\TjallingData\greenearthnet_additional\iid_chopped',
    r'C:\TjallingData\greenearthnet_additional\ood-s_chopped',
    r'C:\TjallingData\greenearthnet_additional\ood-st_chopped',
    r'C:\TjallingData\greenearthnet_additional\ood-t_chopped',
    r'C:\TjallingData\greenearthnet_additional\val_chopped',
    r'C:\TjallingData\greenearthnet_additional\train'

]

for folder in folders:
    print(f"Processing folder: {folder}")
    # Real images are read from RGB/, imputed images are written to RGB_imputed/
    input_folder = os.path.join(folder, 'RGB')
    output_folder = os.path.join(folder, 'RGB_imputed')
    os.makedirs(output_folder, exist_ok=True)
    
    # The CSV is named after the folder itself: "<folder name>_detailed.csv"
    csv_file = os.path.join(folder, f"{os.path.basename(folder)}_detailed.csv")
    
    # Report how many imputed images a previous run already produced
    existing_imputed = count_existing_imputed(output_folder)
    print(f"Found {existing_imputed} existing imputed images")
    
    process_folder(input_folder, output_folder, csv_file)
    
    # Count again so the number of imputed images after this run is visible
    final_imputed = count_existing_imputed(output_folder)
    print(f"Folder completed. Total imputed images: {final_imputed}")

print("Imputation process completed for all folders.")

Post-imputation quality check:

In [None]:
import os
import csv

def count_files_in_subfolder(path):
    """Return the set of regular-file names directly inside ``path``.

    The result is a set of names, not a number; a non-existent directory
    gives an empty set.
    """
    try:
        listing = os.listdir(path)
    except FileNotFoundError:
        return set()
    return set(filter(lambda name: os.path.isfile(os.path.join(path, name)), listing))

def load_csv_data(csv_file):
    """Split the file names of the detailed CSV into non-imputed and imputed.

    Args:
        csv_file: Path to a CSV file with at least the columns ``file_name``
            and ``imputed``.

    Returns:
        Tuple ``(non_imputed_files, imputed_files)`` of two sets. A row counts
        as non-imputed when its ``imputed`` value equals ``no``
        (case-insensitive); every other row counts as imputed.

    Raises:
        FileNotFoundError: If ``csv_file`` does not exist.
        KeyError: If one of the required columns is missing.
    """
    non_imputed_files = set()
    imputed_files = set()
    # newline='' leaves newline handling to the csv module, as its
    # documentation requires; otherwise line breaks inside quoted fields are
    # altered while reading.
    with open(csv_file, 'r', newline='') as f:
        for row in csv.DictReader(f):
            target = non_imputed_files if row['imputed'].lower() == 'no' else imputed_files
            target.add(row['file_name'])
    return non_imputed_files, imputed_files

def check_folder_consistency(folder_path, files_in_csv, imputed):
    """Compare the files in one sub-folder with the names expected from the CSV.

    Imputed images are expected under the name
    ``<sub-folder name>_<file_name>`` (for example ``NIR_imputed_<file_name>``
    inside ``NIR_imputed``), which is the naming the imputation step uses when
    saving its output. Other images are expected under the plain
    ``file_name``.

    Args:
        folder_path: Directory to inspect.
        files_in_csv: ``file_name`` values that should be present.
        imputed: Whether ``folder_path`` holds imputed images.

    Returns:
        Tuple ``(folder_files, missing_files, extra_files)`` of sets of names.
    """
    folder_files = count_files_in_subfolder(folder_path)
    if imputed:
        # normpath drops a trailing separator so basename is the folder name
        prefix = f"{os.path.basename(os.path.normpath(folder_path))}_"
    else:
        prefix = ""
    expected_files = {f"{prefix}{f}" for f in files_in_csv}

    missing_files = expected_files - folder_files
    extra_files = folder_files - expected_files

    return folder_files, missing_files, extra_files

def print_results(folder_name, folder_files, missing_files, extra_files):
    """Print the file count of a folder plus its missing and extra files.

    For each non-empty group, up to five names are listed, followed by
    ``...`` when the group holds more than five.
    """
    print(f"  {folder_name} files: {len(folder_files)}")

    groups = (
        (f"  Missing files in {folder_name}:", missing_files),
        (f"  Extra files in {folder_name}:", extra_files),
    )
    for label, names in groups:
        print(label, len(names))
        if not names:
            continue
        suffix = "..." if len(names) > 5 else ""
        print("    ", ", ".join(list(names)[:5]), suffix)

def check_file_consistency(base_dir, folders):
    """Print a consistency report for the four image sub-folders of each folder.

    The file names from ``<folder>_detailed.csv`` are split into non-imputed
    and imputed ones; ``NIR`` and ``RGB`` are checked against the former,
    ``NIR_imputed`` and ``RGB_imputed`` against the latter.
    """
    for folder in folders:
        folder_dir = os.path.join(base_dir, folder)
        non_imputed_files, imputed_files = load_csv_data(
            os.path.join(folder_dir, f"{folder}_detailed.csv")
        )

        print(f"Folder: {folder}")
        print(f"  Non-imputed files in CSV: {len(non_imputed_files)}")
        print(f"  Imputed files in CSV: {len(imputed_files)}")

        for subfolder in ('NIR', 'NIR_imputed', 'RGB', 'RGB_imputed'):
            is_imputed = 'imputed' in subfolder.lower()
            if is_imputed:
                expected = imputed_files
            else:
                expected = non_imputed_files

            results = check_folder_consistency(
                os.path.join(folder_dir, subfolder), expected, is_imputed
            )
            print_results(subfolder, *results)

        print()

# Base directory (hard-coded, machine-specific) that holds one sub-directory
# per entry in folders_to_check
output_base_dir = r'C:\TjallingData\greenearthnet_additional'

# Folders to check
# NOTE(review): 'train' is not listed here although the imputation cells
# process it — confirm that leaving it out of this check is intentional
folders_to_check = [
    'iid_chopped',
    'ood-s_chopped',
    'ood-st_chopped',
    'ood-t_chopped',
    'val_chopped'
]

# Run the read-only post-imputation report for every folder
check_file_consistency(output_base_dir, folders_to_check)

After running the post-imputation quality check, it is obvious that many imputed NIR images are missing. In the cells below, I explicitly try to impute these to see where it goes wrong.

First I create a .csv file containing all missing images:

In [None]:
import os
import csv

def count_files_in_subfolder(path):
    """Return the names of the regular files located directly inside ``path``.

    Despite its name, the function returns a set of file names rather than a
    number. A directory that does not exist yields an empty set.
    """
    try:
        entries = os.listdir(path)
    except FileNotFoundError:
        return set()

    file_names = set()
    for entry in entries:
        # Sub-directories are ignored; only regular files are collected
        if os.path.isfile(os.path.join(path, entry)):
            file_names.add(entry)
    return file_names

def load_csv_data(csv_file):
    """Split the file names of the detailed CSV into non-imputed and imputed.

    Args:
        csv_file: Path to a CSV file with at least the columns ``file_name``
            and ``imputed``.

    Returns:
        Tuple ``(non_imputed_files, imputed_files)`` of two sets. A row counts
        as non-imputed when its ``imputed`` value equals ``no``
        (case-insensitive); every other row counts as imputed.

    Raises:
        FileNotFoundError: If ``csv_file`` does not exist.
        KeyError: If one of the required columns is missing.
    """
    non_imputed_files = set()
    imputed_files = set()
    # newline='' leaves newline handling to the csv module, as its
    # documentation requires; otherwise line breaks inside quoted fields are
    # altered while reading.
    with open(csv_file, 'r', newline='') as f:
        for row in csv.DictReader(f):
            target = non_imputed_files if row['imputed'].lower() == 'no' else imputed_files
            target.add(row['file_name'])
    return non_imputed_files, imputed_files

def save_missing_imputed(missing_files, output_file):
    """Write the given file names, sorted, to a one-column CSV file.

    The file starts with the header ``missing_file`` and is overwritten if it
    already exists.
    """
    with open(output_file, 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(['missing_file'])
        writer.writerows([name] for name in sorted(missing_files))

def check_file_consistency(base_dir, folders):
    """Report NIR consistency per folder and save the missing imputed names.

    The ``NIR`` and ``NIR_imputed`` sub-directories of every folder are
    compared with ``<folder>_detailed.csv``. When imputed images are missing,
    their expected names (``NIR_imputed_<file_name>``) are written to
    ``<folder>_missing_imputed.csv`` inside ``base_dir``.
    """
    for folder in folders:
        folder_dir = os.path.join(base_dir, folder)

        nir_files = count_files_in_subfolder(os.path.join(folder_dir, 'NIR'))
        nir_imputed_files = count_files_in_subfolder(os.path.join(folder_dir, 'NIR_imputed'))
        non_imputed_files, imputed_files = load_csv_data(
            os.path.join(folder_dir, f"{folder}_detailed.csv")
        )

        print(f"Folder: {folder}")
        print(f"  NIR files: {len(nir_files)}")
        print(f"  NIR_imputed files: {len(nir_imputed_files)}")
        print(f"  Non-imputed files in CSV: {len(non_imputed_files)}")
        print(f"  Imputed files in CSV: {len(imputed_files)}")

        # Names under which the imputed images are expected on disk
        expected_imputed = {f"NIR_imputed_{f}" for f in imputed_files}

        nir_missing = non_imputed_files - nir_files
        nir_imputed_missing = expected_imputed - nir_imputed_files
        extra_nir = nir_files - non_imputed_files
        extra_nir_imputed = nir_imputed_files - expected_imputed

        print("  Missing non-imputed files in NIR:", len(nir_missing))
        print("  Missing imputed files in NIR_imputed:", len(nir_imputed_missing))
        print("  Extra files in NIR:", len(extra_nir))
        print("  Extra files in NIR_imputed:", len(extra_nir_imputed))

        # Save missing imputed files to CSV
        if nir_imputed_missing:
            output_file = os.path.join(base_dir, f"{folder}_missing_imputed.csv")
            save_missing_imputed(nir_imputed_missing, output_file)
            print(f"  Saved list of missing imputed files to: {output_file}")

        print()

# Base directory (hard-coded, machine-specific); the "<folder>_missing_imputed.csv"
# lists are written directly into this directory
output_base_dir = r'C:\TjallingData\greenearthnet_additional'

# Folders to check
# NOTE(review): 'train' is not listed here although the imputation cells
# process it — confirm that leaving it out of this check is intentional
folders_to_check = [
    'iid_chopped',
    'ood-s_chopped',
    'ood-st_chopped',
    'ood-t_chopped',
    'val_chopped'
]

# Report per folder and write the list of missing imputed NIR images
check_file_consistency(output_base_dir, folders_to_check)

Now I read the missing images from the generated .csv lists and impute them using more refined code:

In [None]:
import os
import csv
import numpy as np
from PIL import Image
from datetime import datetime, timedelta
from dateutil import parser

def load_csv_data(csv_file):
    """Group the rows of the detailed CSV by minicube.

    The minicube name is built from the underscore-separated tokens 1 to 4 of
    each row's ``file_name`` (token 0 and everything after token 4 are
    dropped).

    Args:
        csv_file: Path to a CSV file that has a ``file_name`` column.

    Returns:
        Dict mapping each minicube name to the list of its row dicts, in the
        order in which they appear in the file.

    Raises:
        FileNotFoundError: If ``csv_file`` does not exist.
        KeyError: If the ``file_name`` column is missing.
    """
    data = {}
    # newline='' leaves newline handling to the csv module, as its
    # documentation requires; otherwise line breaks inside quoted fields are
    # altered while reading.
    with open(csv_file, 'r', newline='') as f:
        for row in csv.DictReader(f):
            minicube_name = '_'.join(row['file_name'].split('_')[1:5])
            data.setdefault(minicube_name, []).append(row)
    return data

def parse_date(date_str):
    """Parse a date given as ``DD/MM/YYYY`` or, failing that, ``YYYY-MM-DD``.

    Returns:
        The corresponding ``datetime.date``.

    Raises:
        ValueError: If ``date_str`` matches neither format.
    """
    try:
        parsed = datetime.strptime(date_str, '%d/%m/%Y')
    except ValueError:
        # Not day/month/year: fall back to the ISO layout
        parsed = datetime.strptime(date_str, '%Y-%m-%d')
    return parsed.date()

def process_missing_imputed(base_dir, folder, missing_imputed_csv):
    """Impute the NIR images listed in a ``*_missing_imputed.csv`` file.

    Every listed name has the form ``NIR_imputed_<file_name>``. Its minicube
    (underscore-separated tokens 3 to 6) and its date (last token, without the
    extension) are taken from that name. The image is then built by blending
    the nearest non-imputed images of the minicube on either side of the date
    and is saved under the listed name in ``<folder>/NIR_imputed``.

    This function calls ``impute_image`` and ``open_image_with_fallback``,
    which are not defined in this cell; they must already have been defined by
    an earlier cell.

    Args:
        base_dir: Directory containing ``folder``.
        folder: Name of the dataset folder holding ``NIR`` and
            ``<folder>_detailed.csv``.
        missing_imputed_csv: Path to the one-column CSV (with header) that
            lists the missing imputed file names.
    """
    input_folder = os.path.join(base_dir, folder, 'NIR')
    output_folder = os.path.join(base_dir, folder, 'NIR_imputed')
    os.makedirs(output_folder, exist_ok=True)

    csv_file = os.path.join(base_dir, folder, f"{folder}_detailed.csv")
    csv_data = load_csv_data(csv_file)

    # Date-sorted non-imputed rows per minicube, built once on first use
    real_rows_by_minicube = {}

    # newline='' as required by the csv module for files passed to a reader
    with open(missing_imputed_csv, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip header (tolerates an empty file)
        for record in reader:
            if not record:
                # Blank line in the list
                continue
            missing_file = record[0]

            # Extract minicube_name from missing_file
            minicube_name = '_'.join(missing_file.split('_')[3:7])

            if minicube_name not in csv_data:
                print(f"Warning: Minicube {minicube_name} not found in CSV data")
                continue

            if minicube_name not in real_rows_by_minicube:
                # Only non-imputed rows can serve as reference images
                real_rows_by_minicube[minicube_name] = sorted(
                    (
                        (parse_date(row['date']), row)
                        for row in csv_data[minicube_name]
                        if row['imputed'].lower() == 'no'
                    ),
                    key=lambda pair: pair[0],
                )
            real_rows = real_rows_by_minicube[minicube_name]

            # Skip if there are fewer than 2 non-imputed images
            if len(real_rows) < 2:
                print(f"Warning: Not enough images to impute for {minicube_name}")
                continue

            missing_date = datetime.strptime(missing_file.split('_')[-1].split('.')[0], '%Y-%m-%d').date()

            # Nearest reference frames; outside the covered range both sides
            # resolve to the first (or last) non-imputed frame
            earlier = [pair for pair in real_rows if pair[0] <= missing_date]
            later = [pair for pair in real_rows if pair[0] >= missing_date]
            before, before_row = earlier[-1] if earlier else real_rows[0]
            after, after_row = later[0] if later else real_rows[-1]

            before_img = open_image_with_fallback(input_folder, before_row['file_name'])
            after_img = open_image_with_fallback(input_folder, after_row['file_name'])
            try:
                if before_img is None or after_img is None:
                    print(f"Warning: Could not impute image {missing_file} due to missing reference images")
                    continue

                total_gap = (after - before).days
                current_gap = (missing_date - before).days

                imputed_arr = impute_image(before_img, after_img, total_gap, current_gap)
            finally:
                # Release the file handles; a long run opens many images
                for img in (before_img, after_img):
                    if img is not None:
                        img.close()

            output_path = os.path.join(output_folder, missing_file)
            Image.fromarray(imputed_arr).save(output_path)
            print(f"Saved imputed image: {output_path}")

# Base directory (hard-coded, machine-specific); the "<folder>_missing_imputed.csv"
# lists are looked up directly in this directory
output_base_dir = r'C:\TjallingData\greenearthnet_additional'

# Folders to process
folders_to_process = [
    'iid_chopped',
    'ood-s_chopped',
    'ood-st_chopped',
    'ood-t_chopped',
    'val_chopped',
    'train'
]

for folder in folders_to_process:
    missing_imputed_csv = os.path.join(output_base_dir, f"{folder}_missing_imputed.csv")
    # Only folders for which a list of missing images exists are processed
    if os.path.exists(missing_imputed_csv):
        print(f"Processing missing imputed images for {folder}")
        process_missing_imputed(output_base_dir, folder, missing_imputed_csv)
    else:
        print(f"No missing imputed CSV found for {folder}")

print("Imputation process completed for all folders.")


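Now we combine NIR and NIR_imputed into one single directory (`NIR_total`), and RGB and RGB_imputed into one single directory (`RGB_total`), ready for preprocessing and training. For each separate track we first check whether its metadata .csv file exists; only if it does are the images copied. This is intended as an extra layer of control, so that only tracks with a metadata .csv file are combined. Note that the cell below copies the complete contents of the source folders (it does not move them and does not check individual file names against the .csv file).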

In [None]:
import os
import csv
from collections import defaultdict
from tqdm import tqdm
import shutil


# Base path (hard-coded, machine-specific) that holds one sub-directory per
# entry in subfolders
base_path = r"C:\TjallingData\greenearthnet_additional"

# Dataset folders whose NIR/RGB directories are combined below
subfolders =  ['val_chopped', 'iid_chopped', 'ood-s_chopped', 'ood-st_chopped', 'ood-t_chopped', 'train']


def copy_folders(subfolder_path, source_folders, target_folder):
    """Copy the contents of several source folders into one target folder.

    Files are copied with ``shutil.copy2`` and sub-directories with
    ``shutil.copytree``; the sources are left in place. An item that already
    exists in the target under the same name is overwritten (directories are
    merged). Source folders that do not exist are skipped with a warning.

    Args:
        subfolder_path: Directory that contains the source folders and will
            contain the target folder.
        source_folders: Names of the folders whose contents are copied.
        target_folder: Name of the folder that receives the copies; it is
            created if necessary.
    """
    target_path = os.path.join(subfolder_path, target_folder)

    # Create target folder if it doesn't exist (no separate existence check,
    # so there is no window between checking and creating)
    os.makedirs(target_path, exist_ok=True)

    copied_sources = []
    for source_folder in source_folders:
        source_path = os.path.join(subfolder_path, source_folder)
        if not os.path.isdir(source_path):
            # Report the gap instead of silently claiming success later
            print(f"Warning: source folder not found, skipped: {source_path}")
            continue

        for item in os.listdir(source_path):
            s = os.path.join(source_path, item)
            d = os.path.join(target_path, item)
            if os.path.isdir(s):
                shutil.copytree(s, d, dirs_exist_ok=True)
            else:
                shutil.copy2(s, d)
        copied_sources.append(source_folder)

    # Only name the folders that were actually copied
    if copied_sources:
        print(f"Copied contents of {', '.join(copied_sources)} to {target_folder} in {subfolder_path}")

# Process each subfolder
for subfolder in tqdm(subfolders, desc="Processing subfolders"):
    subfolder_path = os.path.join(base_path, subfolder)
    input_file = os.path.join(subfolder_path, f"{subfolder}_combined_imputed_with_region_normalized_and_season.csv")
    output_file = os.path.join(subfolder_path, f"{subfolder}_combined_imputed_with_region_normalized_season_minicube_and_usable.csv")

    if os.path.exists(input_file):
        print(f"\nProcessing {subfolder}")

        # Copy NIR folders
        copy_folders(subfolder_path, ['NIR', 'NIR_imputed'], 'NIR_total')

        # Copy RGB folders
        copy_folders(subfolder_path, ['RGB', 'RGB_imputed'], 'RGB_total')

print("\nProcessing complete.")