# Test making daily Ameriflux files
I am copying most of the code from the `read_results_toNC.py`.

In [1]:
import os
import glob
import zipfile
import shutil
import re
import pandas as pd
import numpy as np
from datetime import datetime
import xarray as xr

import argparse


# Configure logging
import logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("process_log.log", mode="w"),
    ],
)


In [2]:

def extract_zip_files(root_dir, start_dt, end_dt, temp_csv_dir):
    """Extracts all .zip files within a given date and time range."""
    year_month = start_dt.strftime("%Y/%m")

    month_dir = os.path.join(root_dir, "results", year_month)

    if os.path.isdir(month_dir):
        # Find all .zip files
        zip_files = glob.glob(os.path.join(month_dir, "*.zip"))

        # Filter files for date range
        files_in_range = []
        for zip_file in zip_files:
            # Extract date from (assumes filename structure YYYY-MM-DDTHHMMSS)
            match = re.search(r"(\d{4}-\d{2}-\d{2}T\d{6})", os.path.basename(zip_file))
            if match:
                file_datetime_str = match.group(1)
                file_dt = datetime.strptime(file_datetime_str, "%Y-%m-%dT%H%M%S")

                if start_dt <= file_dt <= end_dt:
                    files_in_range.append(zip_file)

        logging.info(f"Found {len(files_in_range)} files in the date range {start_dt} to {end_dt}")

        for zip_file in files_in_range:
            logging.info(f"Extracting {zip_file}")
            try:
                with zipfile.ZipFile(zip_file, "r") as zip_ref:
                    zip_ref.extractall(temp_csv_dir)
            except zipfile.BadZipFile as e:
                logging.error(f"Failed to extract {zip_file}: {e}")



In [5]:
root_dir = '/Users/bhupendra/projects/crocus/data/flux_data/data'
prefix = 'US-CU1_HH'
temp_csv_dir = os.path.join(root_dir, "temp", "csv")
os.makedirs(temp_csv_dir, exist_ok=True)
start_datetime = datetime.strptime('2024-08-01T00:00:00', "%Y-%m-%dT%H:%M:%S")
end_datetime = datetime.strptime('2024-08-01T23:59:59', "%Y-%m-%dT%H:%M:%S")

extract_zip_files(root_dir, start_datetime, end_datetime, temp_csv_dir)

2025-02-11 22:31:13,013 [INFO] Found 48 files in the date range 2024-08-01 00:00:00 to 2024-08-01 23:59:59
2025-02-11 22:31:13,014 [INFO] Extracting /Users/bhupendra/projects/crocus/data/flux_data/data/results/2024/08/2024-08-01T070000_smart3-00694.zip
2025-02-11 22:31:13,018 [INFO] Extracting /Users/bhupendra/projects/crocus/data/flux_data/data/results/2024/08/2024-08-01T083000_smart3-00694.zip
2025-02-11 22:31:13,019 [INFO] Extracting /Users/bhupendra/projects/crocus/data/flux_data/data/results/2024/08/2024-08-01T020000_smart3-00694.zip
2025-02-11 22:31:13,021 [INFO] Extracting /Users/bhupendra/projects/crocus/data/flux_data/data/results/2024/08/2024-08-01T023000_smart3-00694.zip
2025-02-11 22:31:13,022 [INFO] Extracting /Users/bhupendra/projects/crocus/data/flux_data/data/results/2024/08/2024-08-01T080000_smart3-00694.zip
2025-02-11 22:31:13,024 [INFO] Extracting /Users/bhupendra/projects/crocus/data/flux_data/data/results/2024/08/2024-08-01T073000_smart3-00694.zip
2025-02-11 22:31:

In [6]:
temp_csv_dir = os.path.join(root_dir, "temp", "csv")
csv_files = glob.glob(os.path.join(temp_csv_dir, "output", "eddypro_exp_fluxnet*_exp.csv"))
if not csv_files:
    logging.error("No CSV files found for the given date range.")
csv_files

['/Users/bhupendra/projects/crocus/data/flux_data/data/temp/csv/output/eddypro_exp_fluxnet_2024-08-01T110051_exp.csv',
 '/Users/bhupendra/projects/crocus/data/flux_data/data/temp/csv/output/eddypro_exp_fluxnet_2024-08-01T190047_exp.csv',
 '/Users/bhupendra/projects/crocus/data/flux_data/data/temp/csv/output/eddypro_exp_fluxnet_2024-08-01T210045_exp.csv',
 '/Users/bhupendra/projects/crocus/data/flux_data/data/temp/csv/output/eddypro_exp_fluxnet_2024-08-01T080050_exp.csv',
 '/Users/bhupendra/projects/crocus/data/flux_data/data/temp/csv/output/eddypro_exp_fluxnet_2024-08-01T030044_exp.csv',
 '/Users/bhupendra/projects/crocus/data/flux_data/data/temp/csv/output/eddypro_exp_fluxnet_2024-08-01T023047_exp.csv',
 '/Users/bhupendra/projects/crocus/data/flux_data/data/temp/csv/output/eddypro_exp_fluxnet_2024-08-01T073034_exp.csv',
 '/Users/bhupendra/projects/crocus/data/flux_data/data/temp/csv/output/eddypro_exp_fluxnet_2024-08-01T163051_exp.csv',
 '/Users/bhupendra/projects/crocus/data/flux_dat

In [10]:
def combine_csv_files(file_paths):
    dataframes = []

    for file_path in file_paths:
        # Read each file into a DataFrame
        df = pd.read_csv(file_path)
        dataframes.append(df)

    # Concatenate all DataFrames along the rows
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Sort the DataFrame by 'time'. May not be need, but keep it.
    combined_df = combined_df.sort_values(by="TIMESTAMP_START").reset_index(drop=True)

    logging.info(f"Combined DataFrame time range: {combined_df['TIMESTAMP_START'].min()} to {combined_df['TIMESTAMP_START'].max()}")

    return combined_df

combined_df = combine_csv_files(csv_files)
combined_df.head()

2025-02-11 22:50:11,690 [INFO] Combined DataFrame time range: 202408010000 to 202408012330


Unnamed: 0,TIMESTAMP_START,TIMESTAMP_END,DOY_START,DOY_END,FILENAME_HF,SW_IN_POT,NIGHT,EXPECT_NR,FILE_NR,CUSTOM_FILTER_NR,...,HPATH_GA_NONE,VPATH_GA_NONE,RESPONSE_TIME_GA_NONE,NUM_CUSTOM_VARS,CUSTOM_VIN_SF_MEAN,CUSTOM_CO2_MEAN,CUSTOM_H2O_MEAN,CUSTOM_DEW_POINT_MEAN,CUSTOM_CO2_SIGNAL_STRENGTH_7500_MEAN,NUM_BIOMET_VARS
0,202408010000,202408010030,214.0,214.0208,2024-08-01T000000_smart3-00694.ghg,0.0,1,18000,18000,18000,...,,,,5,23.98,465.903,23.0334,19.5559,98.394,0
1,202408010030,202408010100,214.0208,214.0416,2024-08-01T003000_smart3-00694.ghg,0.0,1,18000,18000,18000,...,,,,5,23.98,471.74,25.0695,20.9332,98.3387,0
2,202408010100,202408010130,214.0416,214.0624,2024-08-01T010000_smart3-00694.ghg,0.0,1,18000,18000,18000,...,,,,5,23.9865,426.182,24.6792,20.6809,98.325,0
3,202408010130,202408010200,214.0624,214.0833,2024-08-01T013000_smart3-00694.ghg,0.0,1,18000,18000,18000,...,,,,5,23.99,403.114,24.9673,20.8787,98.3283,0
4,202408010200,202408010230,214.0833,214.1041,2024-08-01T020000_smart3-00694.ghg,0.0,1,18000,18000,18000,...,,,,5,23.99,402.175,24.5888,20.6465,98.3273,0


In [13]:
year_month_str = end_datetime.strftime("%Y%m")
ts_start = combined_df["TIMESTAMP_START"].min()
ts_end = combined_df["TIMESTAMP_START"].max()
fp_dir = os.path.join(root_dir, "AmeriFlux", year_month_str)
os.makedirs(fp_dir, exist_ok=True)
out_filename = f"{prefix}_{ts_start}_{ts_end}.csv"
out_filepath = os.path.join(fp_dir, out_filename)
combined_df.to_csv(out_filepath, index=False)