Working Monthly AQUA MODIS CHL merge

In [2]:
import xarray as xr
import pandas as pd
import glob
import os
from pathlib import Path

def extract_date_from_filename(filename):
    """Extract the first date from filename format like 'AQUA_MODIS.20150501_20150531.L3m.MO.CHL.x_chlor_a.nc'"""
    basename = os.path.basename(filename)
    parts = basename.split('.')
    
    # Look for the date range part (e.g., "20150501_20150531")
    for part in parts:
        if '_' in part and len(part) == 17:  # Format: YYYYMMDD_YYYYMMDD
            date_parts = part.split('_')
            if len(date_parts) == 2 and len(date_parts[0]) == 8 and date_parts[0].isdigit():
                try:
                    # Return the first date
                    return pd.to_datetime(date_parts[0], format='%Y%m%d')
                except:
                    continue
    
    # Fallback: look for any 8-digit date
    for part in basename.split('_'):
        if len(part) == 8 and part.isdigit():
            try:
                return pd.to_datetime(part, format='%Y%m%d')
            except:
                continue
    
    return None

def combine_netcdf_files(folder_path, output_file):
    """
    Simple function to combine all NetCDF files in a folder
    """
    # Find all files
    file_pattern = os.path.join(folder_path, "*AQUA_MODIS.*x_chlor_a.nc*")
    files = sorted(glob.glob(file_pattern))
    
    if not files:
        print(f"No files found matching pattern in {folder_path}")
        return
    
    print(f"Found {len(files)} files to combine")
    
    # Quick file size check
    sample_size = sum(os.path.getsize(f) for f in files[:5]) / 5
    total_gb = (sample_size * len(files)) / (1024**3)
    print(f"Estimated total size: {total_gb:.1f} GB")
    
    if total_gb > 20:
        print("Large dataset detected - using chunked processing...")
        return combine_large_dataset(files, output_file)
    else:
        print("Using simple concatenation...")
        return combine_small_dataset(files, output_file)

def combine_small_dataset(files, output_file):
    """For smaller datasets - load everything into memory"""
    datasets = []
    
    for i, file_path in enumerate(files):
        if i % 100 == 0:
            print(f"Processing file {i+1}/{len(files)}")
        
        try:
            ds = xr.open_dataset(file_path)
            
            # Add time coordinate
            date = extract_date_from_filename(file_path)
            if date:
                ds = ds.expand_dims('time')
                ds = ds.assign_coords(time=[date])
                datasets.append(ds)
            else:
                print(f"Warning: Could not extract date from {file_path}")
                
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            continue
    
    if not datasets:
        print("No valid datasets found!")
        return
    
    # Combine all datasets
    print("Concatenating datasets...")
    combined = xr.concat(datasets, dim='time')
    combined = combined.sortby('time')
    
    # Add metadata
    combined.attrs['created_from'] = f"{len(datasets)} files"
    combined.attrs['creation_date'] = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
    
    # Save with compression
    print(f"Saving to {output_file}...")
    encoding = {var: {'zlib': True, 'complevel': 1} for var in combined.data_vars}
    combined.to_netcdf(output_file, encoding=encoding)
    
    # Clean up
    for ds in datasets:
        ds.close()
    combined.close()
    
    print(f" Successfully created {output_file}")
    print(f"Time range: {combined.time.min().values} to {combined.time.max().values}")

def combine_large_dataset(files, output_file):
    """For larger datasets - use xarray's built-in chunking"""
    print("Using xarray open_mfdataset for large dataset...")
    
    # Use xarray's built-in multi-file dataset opener
    def preprocess(ds):
        """Add time coordinate during preprocessing"""
        # This is tricky because we don't have filename in preprocess
        # So we'll use a different approach
        return ds
    
    # Alternative approach for large datasets
    temp_files = []
    batch_size = 500  # Process in smaller batches
    
    for i in range(0, len(files), batch_size):
        batch = files[i:i + batch_size]
        print(f"Processing batch {i//batch_size + 1}/{(len(files)-1)//batch_size + 1}")
        
        batch_datasets = []
        for file_path in batch:
            try:
                ds = xr.open_dataset(file_path)
                date = extract_date_from_filename(file_path)
                if date:
                    ds = ds.expand_dims('time')
                    ds = ds.assign_coords(time=[date])
                    batch_datasets.append(ds)
            except Exception as e:
                print(f"Error with {file_path}: {e}")
                continue
        
        if batch_datasets:
            batch_combined = xr.concat(batch_datasets, dim='time')
            temp_file = f"temp_batch_{i//batch_size}.nc4"
            batch_combined.to_netcdf(temp_file)
            temp_files.append(temp_file)
            
            # Clean up
            batch_combined.close()
            for ds in batch_datasets:
                ds.close()
    
    # Combine all temporary files
    print("Combining all batches...")
    if temp_files:
        final_ds = xr.open_mfdataset(temp_files, concat_dim='time', combine='nested')
        final_ds = final_ds.sortby('time')
        
        # Save final result
        encoding = {var: {'zlib': True, 'complevel': 1} for var in final_ds.data_vars}
        final_ds.to_netcdf(output_file, encoding=encoding)
        final_ds.close()
        
        # Clean up temp files
        for temp_file in temp_files:
            os.remove(temp_file)
        
        print(f"✅ Successfully created {output_file}")

# Test the date extraction function
def test_date_extraction():
    """Test function to verify date extraction works correctly"""
    test_filename = "AQUA_MODIS.20150501_20150531.L3m.MO.CHL.x_chlor_a.nc"
    extracted_date = extract_date_from_filename(test_filename)
    print(f"Test filename: {test_filename}")
    print(f"Extracted date: {extracted_date}")
    print(f"Expected: 2015-05-01")

if __name__ == "__main__":
    # Test the date extraction
    test_date_extraction()
    
    # Simple usage
    folder_path = "CHL_MO_subset_v2"
    output_file = "CHL_MO_subset_combined.nc4"
    
    combine_netcdf_files(folder_path, output_file)

Test filename: AQUA_MODIS.20150501_20150531.L3m.MO.CHL.x_chlor_a.nc
Extracted date: 2015-05-01 00:00:00
Expected: 2015-05-01
Found 121 files to combine
Estimated total size: 1.3 GB
Using simple concatenation...
Processing file 1/121
Processing file 101/121
Concatenating datasets...
Saving to CHL_MO_subset_combined.nc4...
✅ Successfully created CHL_MO_subset_combined.nc4
Time range: 2015-05-01T00:00:00.000000000 to 2025-05-01T00:00:00.000000000


Working 8Day AQUA MODIS CHL merge

In [3]:
import xarray as xr
import pandas as pd
import glob
import os
from pathlib import Path

def extract_date_from_filename(filename):
    """Extract the first date from filename format like 'AQUA_MODIS.20150501_20150508.L3m.8D.CHL.x_chlor_a.nc'"""
    basename = os.path.basename(filename)
    parts = basename.split('.')
    
    # Look for the date range part (e.g., "20150501_20150531")
    for part in parts:
        if '_' in part and len(part) == 17:  # Format: YYYYMMDD_YYYYMMDD
            date_parts = part.split('_')
            if len(date_parts) == 2 and len(date_parts[0]) == 8 and date_parts[0].isdigit():
                try:
                    # Return the first date
                    return pd.to_datetime(date_parts[0], format='%Y%m%d')
                except:
                    continue
    
    # Fallback: look for any 8-digit date
    for part in basename.split('_'):
        if len(part) == 8 and part.isdigit():
            try:
                return pd.to_datetime(part, format='%Y%m%d')
            except:
                continue
    
    return None

def combine_netcdf_files(folder_path, output_file):
    """
    Simple function to combine all NetCDF files in a folder
    """
    # Find all files
    file_pattern = os.path.join(folder_path, "*AQUA_MODIS.*x_chlor_a.nc*")
    files = sorted(glob.glob(file_pattern))
    
    if not files:
        print(f"No files found matching pattern in {folder_path}")
        return
    
    print(f"Found {len(files)} files to combine")
    
    # Quick file size check
    sample_size = sum(os.path.getsize(f) for f in files[:5]) / 5
    total_gb = (sample_size * len(files)) / (1024**3)
    print(f"Estimated total size: {total_gb:.1f} GB")
    
    if total_gb > 20:
        print("Large dataset detected - using chunked processing...")
        return combine_large_dataset(files, output_file)
    else:
        print("Using simple concatenation...")
        return combine_small_dataset(files, output_file)

def combine_small_dataset(files, output_file):
    """For smaller datasets - load everything into memory"""
    datasets = []
    
    for i, file_path in enumerate(files):
        if i % 100 == 0:
            print(f"Processing file {i+1}/{len(files)}")
        
        try:
            ds = xr.open_dataset(file_path)
            
            # Add time coordinate
            date = extract_date_from_filename(file_path)
            if date:
                ds = ds.expand_dims('time')
                ds = ds.assign_coords(time=[date])
                datasets.append(ds)
            else:
                print(f"Warning: Could not extract date from {file_path}")
                
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            continue
    
    if not datasets:
        print("No valid datasets found!")
        return
    
    # Combine all datasets
    print("Concatenating datasets...")
    combined = xr.concat(datasets, dim='time')
    combined = combined.sortby('time')
    
    # Add metadata
    combined.attrs['created_from'] = f"{len(datasets)} files"
    combined.attrs['creation_date'] = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
    
    # Save with compression
    print(f"Saving to {output_file}...")
    encoding = {var: {'zlib': True, 'complevel': 1} for var in combined.data_vars}
    combined.to_netcdf(output_file, encoding=encoding)
    
    # Clean up
    for ds in datasets:
        ds.close()
    combined.close()
    
    print(f"✅ Successfully created {output_file}")
    print(f"Time range: {combined.time.min().values} to {combined.time.max().values}")

def combine_large_dataset(files, output_file):
    """For larger datasets - use xarray's built-in chunking"""
    print("Using xarray open_mfdataset for large dataset...")
    
    # Use xarray's built-in multi-file dataset opener
    def preprocess(ds):
        """Add time coordinate during preprocessing"""
        # This is tricky because we don't have filename in preprocess
        # So we'll use a different approach
        return ds
    
    # Alternative approach for large datasets
    temp_files = []
    batch_size = 500  # Process in smaller batches
    
    for i in range(0, len(files), batch_size):
        batch = files[i:i + batch_size]
        print(f"Processing batch {i//batch_size + 1}/{(len(files)-1)//batch_size + 1}")
        
        batch_datasets = []
        for file_path in batch:
            try:
                ds = xr.open_dataset(file_path)
                date = extract_date_from_filename(file_path)
                if date:
                    ds = ds.expand_dims('time')
                    ds = ds.assign_coords(time=[date])
                    batch_datasets.append(ds)
            except Exception as e:
                print(f"Error with {file_path}: {e}")
                continue
        
        if batch_datasets:
            batch_combined = xr.concat(batch_datasets, dim='time')
            temp_file = f"temp_batch_{i//batch_size}.nc4"
            batch_combined.to_netcdf(temp_file)
            temp_files.append(temp_file)
            
            # Clean up
            batch_combined.close()
            for ds in batch_datasets:
                ds.close()
    
    # Combine all temporary files
    print("Combining all batches...")
    if temp_files:
        final_ds = xr.open_mfdataset(temp_files, concat_dim='time', combine='nested')
        final_ds = final_ds.sortby('time')
        
        # Save final result
        encoding = {var: {'zlib': True, 'complevel': 1} for var in final_ds.data_vars}
        final_ds.to_netcdf(output_file, encoding=encoding)
        final_ds.close()
        
        # Clean up temp files
        for temp_file in temp_files:
            os.remove(temp_file)
        
        print(f"✅ Successfully created {output_file}")

# Test the date extraction function
def test_date_extraction():
    """Test function to verify date extraction works correctly"""
    test_filename = "AQUA_MODIS.20150501_20150508.L3m.8D.CHL.x_chlor_a.nc"
    extracted_date = extract_date_from_filename(test_filename)
    print(f"Test filename: {test_filename}")
    print(f"Extracted date: {extracted_date}")
    print(f"Expected: 2015-05-01")

if __name__ == "__main__":
    # Test the date extraction
    test_date_extraction()
    
    # Simple usage
    folder_path = "CHL_8D_subset_v2"
    output_file = "CHL_8D_subset_combined.nc4"
    
    combine_netcdf_files(folder_path, output_file)

Test filename: AQUA_MODIS.20150501_20150508.L3m.8D.CHL.x_chlor_a.nc
Extracted date: 2015-05-01 00:00:00
Expected: 2015-05-01
Found 463 files to combine
Estimated total size: 3.7 GB
Using simple concatenation...
Processing file 1/463
Processing file 101/463
Processing file 201/463
Processing file 301/463
Processing file 401/463
Concatenating datasets...
Saving to CHL_8D_subset_combined.nc4...
✅ Successfully created CHL_8D_subset_combined.nc4
Time range: 2015-05-01T00:00:00.000000000 to 2025-05-25T00:00:00.000000000


Monthly merge for 2004-2014

In [1]:
import xarray as xr
import pandas as pd
import glob
import os
from pathlib import Path

def extract_date_from_filename(filename):
    """Extract the first date from filename format like 'AQUA_MODIS.20150501_20150531.L3m.MO.CHL.x_chlor_a.nc'"""
    basename = os.path.basename(filename)
    parts = basename.split('.')
    
    # Look for the date range part (e.g., "20150501_20150531")
    for part in parts:
        if '_' in part and len(part) == 17:  # Format: YYYYMMDD_YYYYMMDD
            date_parts = part.split('_')
            if len(date_parts) == 2 and len(date_parts[0]) == 8 and date_parts[0].isdigit():
                try:
                    # Return the first date
                    return pd.to_datetime(date_parts[0], format='%Y%m%d')
                except:
                    continue
    
    # Fallback: look for any 8-digit date
    for part in basename.split('_'):
        if len(part) == 8 and part.isdigit():
            try:
                return pd.to_datetime(part, format='%Y%m%d')
            except:
                continue
    
    return None

def combine_netcdf_files(folder_path, output_file):
    """
    Simple function to combine all NetCDF files in a folder
    """
    # Find all files
    file_pattern = os.path.join(folder_path, "*AQUA_MODIS.*x_chlor_a.nc*")
    files = sorted(glob.glob(file_pattern))
    
    if not files:
        print(f"No files found matching pattern in {folder_path}")
        return
    
    print(f"Found {len(files)} files to combine")
    
    # Quick file size check
    sample_size = sum(os.path.getsize(f) for f in files[:5]) / 5
    total_gb = (sample_size * len(files)) / (1024**3)
    print(f"Estimated total size: {total_gb:.1f} GB")
    
    if total_gb > 20:
        print("Large dataset detected - using chunked processing...")
        return combine_large_dataset(files, output_file)
    else:
        print("Using simple concatenation...")
        return combine_small_dataset(files, output_file)

def combine_small_dataset(files, output_file):
    """For smaller datasets - load everything into memory"""
    datasets = []
    
    for i, file_path in enumerate(files):
        if i % 100 == 0:
            print(f"Processing file {i+1}/{len(files)}")
        
        try:
            ds = xr.open_dataset(file_path)
            
            # Add time coordinate
            date = extract_date_from_filename(file_path)
            if date:
                ds = ds.expand_dims('time')
                ds = ds.assign_coords(time=[date])
                datasets.append(ds)
            else:
                print(f"Warning: Could not extract date from {file_path}")
                
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            continue
    
    if not datasets:
        print("No valid datasets found!")
        return
    
    # Combine all datasets
    print("Concatenating datasets...")
    combined = xr.concat(datasets, dim='time')
    combined = combined.sortby('time')
    
    # Add metadata
    combined.attrs['created_from'] = f"{len(datasets)} files"
    combined.attrs['creation_date'] = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
    
    # Save with compression
    print(f"Saving to {output_file}...")
    encoding = {var: {'zlib': True, 'complevel': 1} for var in combined.data_vars}
    combined.to_netcdf(output_file, encoding=encoding)
    
    # Clean up
    for ds in datasets:
        ds.close()
    combined.close()
    
    print(f" Successfully created {output_file}")
    print(f"Time range: {combined.time.min().values} to {combined.time.max().values}")

def combine_large_dataset(files, output_file):
    """For larger datasets - use xarray's built-in chunking"""
    print("Using xarray open_mfdataset for large dataset...")
    
    # Use xarray's built-in multi-file dataset opener
    def preprocess(ds):
        """Add time coordinate during preprocessing"""
        # This is tricky because we don't have filename in preprocess
        # So we'll use a different approach
        return ds
    
    # Alternative approach for large datasets
    temp_files = []
    batch_size = 500  # Process in smaller batches
    
    for i in range(0, len(files), batch_size):
        batch = files[i:i + batch_size]
        print(f"Processing batch {i//batch_size + 1}/{(len(files)-1)//batch_size + 1}")
        
        batch_datasets = []
        for file_path in batch:
            try:
                ds = xr.open_dataset(file_path)
                date = extract_date_from_filename(file_path)
                if date:
                    ds = ds.expand_dims('time')
                    ds = ds.assign_coords(time=[date])
                    batch_datasets.append(ds)
            except Exception as e:
                print(f"Error with {file_path}: {e}")
                continue
        
        if batch_datasets:
            batch_combined = xr.concat(batch_datasets, dim='time')
            temp_file = f"temp_batch_{i//batch_size}.nc4"
            batch_combined.to_netcdf(temp_file)
            temp_files.append(temp_file)
            
            # Clean up
            batch_combined.close()
            for ds in batch_datasets:
                ds.close()
    
    # Combine all temporary files
    print("Combining all batches...")
    if temp_files:
        final_ds = xr.open_mfdataset(temp_files, concat_dim='time', combine='nested')
        final_ds = final_ds.sortby('time')
        
        # Save final result
        encoding = {var: {'zlib': True, 'complevel': 1} for var in final_ds.data_vars}
        final_ds.to_netcdf(output_file, encoding=encoding)
        final_ds.close()
        
        # Clean up temp files
        for temp_file in temp_files:
            os.remove(temp_file)
        
        print(f"✅ Successfully created {output_file}")

# Test the date extraction function
def test_date_extraction():
    """Test function to verify date extraction works correctly"""
    test_filename = "AQUA_MODIS.20150501_20150531.L3m.MO.CHL.x_chlor_a.nc"
    extracted_date = extract_date_from_filename(test_filename)
    print(f"Test filename: {test_filename}")
    print(f"Extracted date: {extracted_date}")
    print(f"Expected: 2015-05-01")

if __name__ == "__main__":
    # Test the date extraction
    test_date_extraction()
    
    # Simple usage
    folder_path = "CHL_MO_2004_2014_subset"
    output_file = "CHL_MO_2004_2014_subset_combined.nc4"
    
    combine_netcdf_files(folder_path, output_file)

Test filename: AQUA_MODIS.20150501_20150531.L3m.MO.CHL.x_chlor_a.nc
Extracted date: 2015-05-01 00:00:00
Expected: 2015-05-01
Found 121 files to combine
Estimated total size: 1.3 GB
Using simple concatenation...
Processing file 1/121
Processing file 101/121
Concatenating datasets...
Saving to CHL_MO_2004_2014_subset_combined.nc4...
✅ Successfully created CHL_MO_2004_2014_subset_combined.nc4
Time range: 2004-05-01T00:00:00.000000000 to 2014-05-01T00:00:00.000000000


Monthly Merge for Hawaii example

In [1]:
import xarray as xr
import pandas as pd
import glob
import os
from pathlib import Path

def extract_date_from_filename(filename):
    """Extract the first date from filename format like 'AQUA_MODIS.20150501_20150531.L3m.MO.CHL.x_chlor_a.nc'"""
    basename = os.path.basename(filename)
    parts = basename.split('.')
    
    # Look for the date range part (e.g., "20150501_20150531")
    for part in parts:
        if '_' in part and len(part) == 17:  # Format: YYYYMMDD_YYYYMMDD
            date_parts = part.split('_')
            if len(date_parts) == 2 and len(date_parts[0]) == 8 and date_parts[0].isdigit():
                try:
                    # Return the first date
                    return pd.to_datetime(date_parts[0], format='%Y%m%d')
                except:
                    continue
    
    # Fallback: look for any 8-digit date
    for part in basename.split('_'):
        if len(part) == 8 and part.isdigit():
            try:
                return pd.to_datetime(part, format='%Y%m%d')
            except:
                continue
    
    return None

def combine_netcdf_files(folder_path, output_file):
    """
    Simple function to combine all NetCDF files in a folder
    """
    # Find all files
    file_pattern = os.path.join(folder_path, "*AQUA_MODIS.*x_chlor_a.nc*")
    files = sorted(glob.glob(file_pattern))
    
    if not files:
        print(f"No files found matching pattern in {folder_path}")
        return
    
    print(f"Found {len(files)} files to combine")
    
    # Quick file size check
    sample_size = sum(os.path.getsize(f) for f in files[:5]) / 5
    total_gb = (sample_size * len(files)) / (1024**3)
    print(f"Estimated total size: {total_gb:.1f} GB")
    
    if total_gb > 20:
        print("Large dataset detected - using chunked processing...")
        return combine_large_dataset(files, output_file)
    else:
        print("Using simple concatenation...")
        return combine_small_dataset(files, output_file)

def combine_small_dataset(files, output_file):
    """For smaller datasets - load everything into memory"""
    datasets = []
    
    for i, file_path in enumerate(files):
        if i % 100 == 0:
            print(f"Processing file {i+1}/{len(files)}")
        
        try:
            ds = xr.open_dataset(file_path)
            
            # Add time coordinate
            date = extract_date_from_filename(file_path)
            if date:
                ds = ds.expand_dims('time')
                ds = ds.assign_coords(time=[date])
                datasets.append(ds)
            else:
                print(f"Warning: Could not extract date from {file_path}")
                
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            continue
    
    if not datasets:
        print("No valid datasets found!")
        return
    
    # Combine all datasets
    print("Concatenating datasets...")
    combined = xr.concat(datasets, dim='time')
    combined = combined.sortby('time')
    
    # Add metadata
    combined.attrs['created_from'] = f"{len(datasets)} files"
    combined.attrs['creation_date'] = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
    
    # Save with compression
    print(f"Saving to {output_file}...")
    encoding = {var: {'zlib': True, 'complevel': 1} for var in combined.data_vars}
    combined.to_netcdf(output_file, encoding=encoding)
    
    # Clean up
    for ds in datasets:
        ds.close()
    combined.close()
    
    print(f"✅ Successfully created {output_file}")
    print(f"Time range: {combined.time.min().values} to {combined.time.max().values}")

def combine_large_dataset(files, output_file):
    """For larger datasets - use xarray's built-in chunking"""
    print("Using xarray open_mfdataset for large dataset...")
    
    # Use xarray's built-in multi-file dataset opener
    def preprocess(ds):
        """Add time coordinate during preprocessing"""
        # This is tricky because we don't have filename in preprocess
        # So we'll use a different approach
        return ds
    
    # Alternative approach for large datasets
    temp_files = []
    batch_size = 500  # Process in smaller batches
    
    for i in range(0, len(files), batch_size):
        batch = files[i:i + batch_size]
        print(f"Processing batch {i//batch_size + 1}/{(len(files)-1)//batch_size + 1}")
        
        batch_datasets = []
        for file_path in batch:
            try:
                ds = xr.open_dataset(file_path)
                date = extract_date_from_filename(file_path)
                if date:
                    ds = ds.expand_dims('time')
                    ds = ds.assign_coords(time=[date])
                    batch_datasets.append(ds)
            except Exception as e:
                print(f"Error with {file_path}: {e}")
                continue
        
        if batch_datasets:
            batch_combined = xr.concat(batch_datasets, dim='time')
            temp_file = f"temp_batch_{i//batch_size}.nc4"
            batch_combined.to_netcdf(temp_file)
            temp_files.append(temp_file)
            
            # Clean up
            batch_combined.close()
            for ds in batch_datasets:
                ds.close()
    
    # Combine all temporary files
    print("Combining all batches...")
    if temp_files:
        final_ds = xr.open_mfdataset(temp_files, concat_dim='time', combine='nested')
        final_ds = final_ds.sortby('time')
        
        # Save final result
        encoding = {var: {'zlib': True, 'complevel': 1} for var in final_ds.data_vars}
        final_ds.to_netcdf(output_file, encoding=encoding)
        final_ds.close()
        
        # Clean up temp files
        for temp_file in temp_files:
            os.remove(temp_file)
        
        print(f"✅ Successfully created {output_file}")

# Test the date extraction function
def test_date_extraction():
    """Test function to verify date extraction works correctly"""
    test_filename = "AQUA_MODIS.20150501_20150531.L3m.MO.CHL.x_chlor_a.nc"
    extracted_date = extract_date_from_filename(test_filename)
    print(f"Test filename: {test_filename}")
    print(f"Extracted date: {extracted_date}")
    print(f"Expected: 2015-05-01")

if __name__ == "__main__":
    # Test the date extraction
    test_date_extraction()
    
    # Simple usage
    folder_path = "CHLHawaii_MO"
    output_file = "CHL_MO_Hawaii.nc4"
    
    combine_netcdf_files(folder_path, output_file)

Test filename: AQUA_MODIS.20150501_20150531.L3m.MO.CHL.x_chlor_a.nc
Extracted date: 2015-05-01 00:00:00
Expected: 2015-05-01
Found 126 files to combine
Estimated total size: 0.0 GB
Using simple concatenation...
Processing file 1/126
Error reading CHLHawaii_MO\AQUA_MODIS.20150101_20150131.L3m.MO.CHL.x_chlor_a.nc.aux.xml: did not find a match in any of xarray's currently installed IO backends ['netcdf4', 'h5netcdf', 'scipy', 'gini', 'kerchunk', 'pydap', 'rasterio', 'zarr']. Consider explicitly selecting one of the installed engines via the ``engine`` parameter, or installing additional IO dependencies, see:
https://docs.xarray.dev/en/stable/getting-started-guide/installing.html
https://docs.xarray.dev/en/stable/user-guide/io.html
Processing file 101/126
Concatenating datasets...
Saving to CHL_MO_Hawaii.nc4...
✅ Successfully created CHL_MO_Hawaii.nc4
Time range: 2015-01-01T00:00:00.000000000 to 2025-05-01T00:00:00.000000000
