In [24]:
import pandas as pd
import numpy as np
import re
from pathlib import Path

# Paths and constants

In [30]:
input_dir = Path(r"C:\Users\ocron\OneDrive - Massachusetts Institute of Technology\Documents\Research\Generals\Research_Report3\Data\AP\Raw")
output_dir = Path(r"C:\Users\ocron\OneDrive - Massachusetts Institute of Technology\Documents\Research\Generals\Research_Report3\Data\AP\Processed")
volfilt_file = Path(r"C:\Users\ocron\OneDrive - Massachusetts Institute of Technology\Documents\Research\Generals\Research_Report3\Data\AP\ap_VolFilt.csv")

# Make sure output dir exists
output_dir.mkdir(parents=True, exist_ok=True)

# Read volume filtered data
volfilt_data = pd.read_csv(volfilt_file)

# Fixed parameters
filterarea  = np.pi * 1.05**2 * 1e-4  # cm^2 â†’ m^2
zerowvlnmin = 780
zerowvlnmax = 800

# Collect raw files

In [31]:
all_raw_files = list(input_dir.rglob("*_RawData.txt"))
all_raw_names = [f.name for f in all_raw_files]

# Files referenced in CSV (for info only)
csv_files = pd.concat(
    [volfilt_data["p_filename"], volfilt_data["d_filename"]],
    ignore_index=True
)
csv_files = csv_files.dropna().unique().tolist()

In [32]:
# Track missing files (referenced in CSV but not found) and processed bottles
missing_files = []
processed_keys = set()   # (cruise, cast, niskin)


# Processing functions

In [33]:
def read_spectrum_file(fname):
    """
    Read a LabSolutions UV-Vis text file where the numeric table starts
    after a line like:

        Wavelength (nm),Absorbance

    Returns
    -------
    wvln : np.ndarray
        Wavelengths (nm)
    absorb : np.ndarray
        Absorbance (unitless)
    """
    fname = Path(fname)
    header_row = None

    with fname.open("r") as f:
        for i, line in enumerate(f):
            if line.strip().startswith("Wavelength"):
                header_row = i
                break

    if header_row is None:
        raise ValueError(f"Could not find 'Wavelength' header in {fname}")

    df = pd.read_csv(
        fname,
        skiprows=header_row + 1,
        header=None,
        names=["Wavelength", "Absorbance"]
    )

    if df.shape[1] < 2:
        raise ValueError(
            f"File {fname} has only {df.shape[1]} column(s) after header."
        )

    return df["Wavelength"].to_numpy(), df["Absorbance"].to_numpy()


def apspecproc(apfilename, adfilename, volfilt, filterarea,
               zerowvlnmin, zerowvlnmax):
    """
    Process raw absorbance data into absorption coefficients.
    Python version of apspecproc_Dec2012.
    """
    Mitcha = 0.392
    Mitchb = 0.655

    # ap
    wvln, abs_ap = read_spectrum_file(apfilename)

    zero_range = (wvln >= zerowvlnmin) & (wvln <= zerowvlnmax)
    if not np.any(zero_range):
        raise ValueError(
            f"No wavelengths in [{zerowvlnmin}, {zerowvlnmax}] in {apfilename}"
        )

    ap_zero = abs_ap[zero_range].mean()
    ap_corrected = abs_ap - ap_zero
    ap_od = Mitcha * ap_corrected + Mitchb * ap_corrected**2
    ap = 2.3 * ap_od / (volfilt / filterarea)

    # ad
    if adfilename is not None:
        _, abs_ad = read_spectrum_file(adfilename)
        ad_zero = abs_ad[zero_range].mean()
        ad_corrected = abs_ad - ad_zero
        ad_od = Mitcha * ad_corrected + Mitchb * ad_corrected**2
        ad = 2.3 * ad_od / (volfilt / filterarea)
    else:
        abs_ad = np.full_like(abs_ap, np.nan, dtype=float)
        ad = np.full_like(ap, -9999.0, dtype=float)

    aph = ap - ad

    return wvln, ap, aph, ad, abs_ap, abs_ad


def parse_key_from_filename(name):
    """
    Parse (cruise, cast, niskin) from a filename stem such as:
        EN715_C23N14da_RawData
    or with extension:
        EN715_C23N14da_RawData.txt
    Returns (cruise, cast, niskin) or None if pattern not found.
    """
    # Use only the stem part for safety
    stem = Path(name).stem              # EN715_C23N14da_RawData
    m = re.match(r"([A-Za-z0-9]+)_C(\d+)N(\d+)", stem)
    if not m:
        return None
    cruise = m.group(1).upper()
    cast = int(m.group(2))
    niskin = int(m.group(3))
    return (cruise, cast, niskin)


# Main loop over rows

In [34]:
for i, row in volfilt_data.iterrows():
    apfile = row.get("p_filename")
    adfile = row.get("d_filename")

    # Missing filenames in CSV
    if pd.isna(apfile) or pd.isna(adfile):
        print(f"Skipping pair: {apfile} / {adfile} (filename missing in CSV)")
        if isinstance(apfile, str):
            missing_files.append(apfile)
        if isinstance(adfile, str):
            missing_files.append(adfile)
        continue

    ap_path = input_dir / apfile
    ad_path = input_dir / adfile

    # Check existence
    if not ap_path.is_file() or not ad_path.is_file():
        print(f"Skipping pair: {apfile} / {adfile} (file not found)")
        if not ap_path.is_file():
            missing_files.append(apfile)
        if not ad_path.is_file():
            missing_files.append(adfile)
        continue

    # Volume in mL to m^3
    if "Ap_vol" in volfilt_data.columns:
        vol_mL = row["Ap_vol"]
    elif "volume_filtered" in volfilt_data.columns:
        vol_mL = row["volume_filtered"]
    else:
        raise ValueError(
            f"No Ap_vol or volume_filtered column found in {volfilt_file}"
        )

    if not np.isfinite(vol_mL):
        print(f"Skipping {apfile}: invalid volume {vol_mL}.")
        continue

    volfilt = vol_mL * 1e-6

    # Process spectra
    try:
        wvln, ap, aph, ad, abs_ap, abs_ad = apspecproc(
            ap_path, ad_path, volfilt, filterarea,
            zerowvlnmin, zerowvlnmax
        )
    except Exception as e:
        print(f"Error processing {apfile} / {adfile}: {e}")
        continue

    # Output name from ap filename
    base = Path(apfile).stem  # EN715_C23N14da_RawData
    base_clean = re.sub(r"(p[a-z]|d[a-z])_RawData$", "", base)
    processed_name = base_clean + "_Processed.csv"
    output_file = output_dir / processed_name

    output_table = pd.DataFrame({
        "Wavelength": wvln,
        "Particle_Absorption": ap,
        "Phytoplankton_Absorption": aph,
        "Detritus_Absorption": ad,
        "Raw_Absorbance_ap": abs_ap,
        "Raw_Absorbance_ad": abs_ad,
    })

    output_table.to_csv(output_file, index=False)

    # Mark this bottle as processed based on the ap file name
    key = parse_key_from_filename(apfile)
    if key is not None:
        processed_keys.add(key)


Processed and saved: C:\Users\ocron\OneDrive - Massachusetts Institute of Technology\Documents\Research\Generals\Research_Report3\Data\AP\Processed\AE2426_C01N02_Processed.csv
Processed and saved: C:\Users\ocron\OneDrive - Massachusetts Institute of Technology\Documents\Research\Generals\Research_Report3\Data\AP\Processed\AE2426_C01N02_Processed.csv
Processed and saved: C:\Users\ocron\OneDrive - Massachusetts Institute of Technology\Documents\Research\Generals\Research_Report3\Data\AP\Processed\AE2426_C01N02_Processed.csv
Processed and saved: C:\Users\ocron\OneDrive - Massachusetts Institute of Technology\Documents\Research\Generals\Research_Report3\Data\AP\Processed\AE2426_C01N13_Processed.csv
Processed and saved: C:\Users\ocron\OneDrive - Massachusetts Institute of Technology\Documents\Research\Generals\Research_Report3\Data\AP\Processed\AE2426_C01N13_Processed.csv
Processed and saved: C:\Users\ocron\OneDrive - Massachusetts Institute of Technology\Documents\Research\Generals\Researc

# Report unused and missing

In [35]:
unused_files = []

for f in all_raw_files:
    key = parse_key_from_filename(f.name)
    if key is None:
        continue
    # If this bottle key was never processed, then this file belongs
    # to a bottle that had no usable run
    if key not in processed_keys:
        unused_files.append(f.name)

unused_files = sorted(set(unused_files))

if unused_files:
    print("\nRaw files whose bottle (cruise, cast, niskin) was never processed:")
    for fname in unused_files:
        print("   ", fname)
else:
    print("\nAll bottles had at least one representative file processed.")


Raw files whose bottle (cruise, cast, niskin) was never processed:
    EN608_C05N05da_RawData.txt
    EN608_C05N05db_RawData.txt
    EN608_C05N05pa_RawData.txt
    EN706_C20N11pa_RawData.txt


In [36]:
if missing_files:
    print("\nFiles referenced in CSV but missing on disk:")
    for fname in sorted(set(missing_files)):
        print("   ", fname)

print("\nBatch processing completed.")


Batch processing completed.
