In [25]:
import pandas as pd
import os
import shutil
import scipy.io
import glob
import numpy as np

In [30]:
# Cruise ID to process; it is interpolated into the input and export paths below.
# The trailing commented value is an alternative cruise ID kept for reference.
cruise = "EN720" #"EN608"

# Import Directory

In [45]:
# Absolute paths to the four input CSVs: AP volumes, Attune summary table,
# FCS file list and EDI table. The three Attune paths depend on `cruise`.
# NOTE(review): edi_dir points into Research_Report2 while the other three
# paths point into Research_Report3 — confirm this is intentional.
ap_dir = rf'C:\Users\ocron\OneDrive - Massachusetts Institute of Technology\Documents\Research\Generals\Research_Report3\Data\AP\ap_VolFilt.csv'
attune_summary_dir = rf'C:\Users\ocron\OneDrive - Massachusetts Institute of Technology\Documents\Research\Generals\Research_Report3\Data\Attune\{cruise}\SummaryTable.csv'
fcs_dir = rf'C:\Users\ocron\OneDrive - Massachusetts Institute of Technology\Documents\Research\Generals\Research_Report3\Data\Attune\{cruise}\FCSList.csv'
edi_dir = rf'C:\Users\ocron\OneDrive - Massachusetts Institute of Technology\Documents\Research\Generals\Research_Report2\Data\Attune\{cruise}\EDI_table.csv'

# Set Up Export Directory

In [46]:
# Locate the Attune class-output folder for this cruise on the shared drive.
# glob does not guarantee any ordering, so sort the hits to make the choice
# reproducible when more than one folder matches the pattern.
matches = sorted(glob.glob(fr"Z:\Attune\cruise_data\*_{cruise}\preserved\outputs\class"))

if matches:
    folder_path = matches[0]
    if len(matches) > 1:
        print(f"Multiple folders found for {cruise}; using {folder_path}")
else:
    # Nothing matched: folder_path stays None and anything that joins paths
    # onto it will fail until the source location is fixed.
    folder_path = None
    print(f"No folder found for {cruise}")

# Local export root for this cruise
dst_dir = rf"C:\Users\ocron\OneDrive - Massachusetts Institute of Technology\Documents\Research\Generals\Research_Report3\Data\Attune\{cruise}"

# Sub-folder that receives the copied .mat files (created if absent)
mat_dir = os.path.join(dst_dir, "rawAttune")
os.makedirs(mat_dir, exist_ok=True)

# Read in and Clean Data

## Read in AP volume data

In [32]:
# Load the AP volume table with a lowercased header, then preview it
ap = pd.read_csv(ap_dir).rename(columns=str.lower)
ap.head()

Unnamed: 0,cruise,cast,niskin,p_filename,d_filename,ap_vol,station,depth,date_utc,lat,lon
0,AE2426,1,2,AE2426_C01N02pa_RawData.txt,AE2426_C01N02da_RawData.txt,537,L1,17,20241106.0,41.19714,-70.8827
1,AE2426,1,2,AE2426_C01N02pa_RawData.txt,AE2426_C01N02da_RawData.txt,537,L1,17,20241106.0,41.19714,-70.8827
2,AE2426,1,2,AE2426_C01N02pa_RawData.txt,AE2426_C01N02da_RawData.txt,537,L1,17,20241106.0,41.19714,-70.8827
3,AE2426,1,13,AE2426_C01N13pa_RawData.txt,AE2426_C01N13da_RawData.txt,547,L1,3,20241106.0,41.1959,-70.88358
4,AE2426,1,13,AE2426_C01N13pa_RawData.txt,AE2426_C01N13da_RawData.txt,547,L1,3,20241106.0,41.1959,-70.88358


In [33]:
# Normalise the join keys: lowercase cruise, nullable-integer cast / niskin
ap = ap.assign(
    cruise=ap["cruise"].astype(str).str.lower(),
    cast=pd.to_numeric(ap["cast"], errors="coerce").astype("Int64"),
    niskin=pd.to_numeric(ap["niskin"], errors="coerce").astype("Int64"),
)

# Rows belonging to the cruise being processed
ap_sub = ap.loc[ap["cruise"] == cruise.lower()].copy()

# Distinct rows of key + ap_vol + position/date columns.
# NOTE(review): de-duplication is over all seven columns, so a cast/niskin
# whose lat/lon/date differ between rows would keep more than one row — confirm.
ap_meta = ap_sub.loc[
    :, ["cruise", "cast", "niskin", "ap_vol", "lat", "lon", "date_utc"]
].drop_duplicates()
ap_meta.head()

Unnamed: 0,cruise,cast,niskin,ap_vol,lat,lon,date_utc
817,en720,2,2,547,41.1954,-70.88002,20240906.0
820,en720,2,13,548,41.19574,-70.88034,20240906.0
823,en720,3,6,532,41.03274,-70.8836,20240907.0
826,en720,3,10,547,41.03324,-70.88358,20240907.0
829,en720,3,18,539,41.0341,-70.88344,20240907.0


## Read in Summary Table

In [34]:
# Load the Attune summary table (parsing the sampling timestamp) and
# lowercase its header; the last line lists the resulting column names.
attune = (
    pd.read_csv(attune_summary_dir, parse_dates=['date_sampled'])
    .rename(columns=str.lower)
)
attune.columns

Index(['cruise', 'cast', 'niskin', 'latitude', 'longitude', 'nearest_station',
       'salinity', 'potemp090c', 'depth_m', 'date_sampled', 'date_processed',
       'synfile', 'eukfile', 'bacteriafile', 'profile', 'euk_per_ml',
       'syn_per_ml', 'pro_per_ml', 'bac_per_ml', 'low_pe_euk_per_ml',
       'high_pe_euk_per_ml', 'median_volumes_euk', 'median_volumes_syn',
       'median_volumes_bact', 'median_volumes_pro',
       'median_volumes_low_pe_euk', 'median_volumes_high_pe_euk'],
      dtype='object')

In [35]:
# Preview the identifier, file-name and per-ml concentration columns
attune.loc[
    :,
    [
        'cruise', 'cast', 'niskin', 'nearest_station',
        'synfile', 'eukfile', 'bacteriafile', 'profile',
        'euk_per_ml', 'syn_per_ml', 'pro_per_ml', 'bac_per_ml',
        'low_pe_euk_per_ml', 'high_pe_euk_per_ml',
    ],
].head()

Unnamed: 0,cruise,cast,niskin,nearest_station,synfile,eukfile,bacteriafile,profile,euk_per_ml,syn_per_ml,pro_per_ml,bac_per_ml,low_pe_euk_per_ml,high_pe_euk_per_ml
0,EN720,2,2,L1,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,NESLTER_EN720_Sept2024_preserved(2)_hbac_SYBR_...,,8331.25,39315.625,,926400.0,9.375,56.25
1,EN720,2,7,L1,NESLTER_EN720_Sept2024_preserved(2)_phyto_PE_S...,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,NESLTER_EN720_Sept2024_preserved(2)_hbac_SYBR_...,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,12206.25,54043.75,6041.666667,1013650.0,46.875,9.375
2,EN720,2,13,L1,NESLTER_EN720_Sept2024_preserved(2)_phyto_PE_S...,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,NESLTER_EN720_Sept2024_preserved(2)_hbac_SYBR_...,,16028.125,52365.625,,1047812.5,15.625,56.25
3,EN720,3,3,L2,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,NESLTER_EN720_Sept2024_preserved(2)_hbac_SYBR_...,,3006.25,16884.375,,1142325.0,43.75,71.875
4,EN720,3,6,L2,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,NESLTER_EN720_Sept2024_preserved(2)_hbac_SYBR_...,,6528.125,30109.375,,1251037.5,243.75,209.375


In [36]:
# Per-sample Attune metadata: distinct rows of the key, station, sampling
# time and depth columns, with the cruise ID lowercased for joining.
meta_cols = ["cruise", "cast", "niskin", "nearest_station", "date_sampled", "depth_m"]
attune_meta = (
    attune.loc[:, meta_cols]
    .drop_duplicates()
    .assign(cruise=lambda meta: meta["cruise"].astype(str).str.lower())
)

## Read in FCS List

In [37]:
# Load the FCS file list with a lowercased header
fcs_list = pd.read_csv(fcs_dir).rename(columns=str.lower)

In [38]:
# Rename the file-name column to "filename", then drop every row whose
# file name contains "hbac" (case-insensitive; missing names are kept).
fcs_list = (
    fcs_list
    .rename(columns={"fcslist": "filename"})
    .loc[lambda tbl: ~tbl["filename"].str.contains("hbac", case=False, na=False)]
)
fcs_list.head()

Unnamed: 0,filename,cast,niskin,date_processed,vol_analyzed_ml,trigger_1,trigger_2,trigger_hv1,trigger_hv2
29,NESLTER_EN720_Sept2024_preserved(1)_phyto_CHL_...,11,2,10-Jun-2025,0.32,"AND_SSC,300","AND_BL3,500",220,340
30,NESLTER_EN720_Sept2024_preserved(1)_phyto_CHL_...,11,3,12-Jun-2025,0.32,"AND_SSC,300","AND_BL3,500",220,340
31,NESLTER_EN720_Sept2024_preserved(1)_phyto_CHL_...,11,4,06-Jun-2025,0.32,"AND_SSC,300","AND_BL3,500",220,340
32,NESLTER_EN720_Sept2024_preserved(1)_phyto_CHL_...,11,7,12-Jun-2025,0.32,"AND_SSC,300","AND_BL3,500",220,340
33,NESLTER_EN720_Sept2024_preserved(1)_phyto_CHL_...,11,9,17-Jun-2025,0.32,"AND_SSC,300","AND_BL3,500",220,340


In [39]:
# Distinct values of the analyzed-volume column remaining in the FCS list
fcs_list['vol_analyzed_ml'].unique()

array([0.32, 0.12])

## Read in EDI Table

In [40]:
# Read the Attune EDI table and lowercase its column names
edi = pd.read_csv(edi_dir).rename(columns=str.lower)

In [41]:
# Put the join keys of all three tables on a common footing (in place):
# lowercase cruise strings and nullable-integer cast / niskin.
for table in (attune, ap, edi):
    table["cruise"] = table["cruise"].astype(str).str.lower()
    for key in ("cast", "niskin"):
        table[key] = pd.to_numeric(table[key], errors="coerce").astype("Int64")

In [42]:
# Distinct values of the hetprok analyzed-volume column in the EDI table
edi['hetprok_volume_analyzed_ml'].unique()

array([0.08  , 0.0064,    nan])

In [44]:
# Preview the keys alongside each group's file name and analyzed volume
edi.loc[
    :,
    [
        'cruise', 'cast', 'niskin',
        'syn_volume_analyzed_ml', 'syn_filename',
        'redeuk_volume_analyzed_ml', 'redeuk_filename',
        'hetprok_volume_analyzed_ml', 'hetprok_filename',
    ],
].head()

Unnamed: 0,cruise,cast,niskin,syn_volume_analyzed_ml,syn_filename,redeuk_volume_analyzed_ml,redeuk_filename,hetprok_volume_analyzed_ml,hetprok_filename
0,en720,2,2,0.32,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,0.32,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,0.08,NESLTER_EN720_Sept2024_preserved(2)_hbac_SYBR_...
1,en720,2,7,0.32,NESLTER_EN720_Sept2024_preserved(2)_phyto_PE_S...,0.32,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,0.08,NESLTER_EN720_Sept2024_preserved(2)_hbac_SYBR_...
2,en720,2,13,0.32,NESLTER_EN720_Sept2024_preserved(2)_phyto_PE_S...,0.32,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,0.08,NESLTER_EN720_Sept2024_preserved(2)_hbac_SYBR_...
3,en720,3,3,0.32,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,0.32,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,0.08,NESLTER_EN720_Sept2024_preserved(2)_hbac_SYBR_...
4,en720,3,6,0.32,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,0.32,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,0.08,NESLTER_EN720_Sept2024_preserved(2)_hbac_SYBR_...


# Extract Volumes

In [47]:
def apply_volume_fallback(df, fcs_list, min_valid_ml=0.01):
    """
    Replace missing or too-small analyzed volumes with the FCS-list volume.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with "cast", "niskin", "filename" and "vol_analyzed_ml" columns.
    fcs_list : pandas.DataFrame
        Lookup table with the same four columns; its "vol_analyzed_ml" is the
        fallback value.
    min_valid_ml : float, default 0.01
        Volumes in `df` below this threshold (or NaN) are treated as unusable
        and replaced by the FCS-list volume for the same cast/niskin/filename.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of `df`, one output row per input row,
        and "vol_analyzed_ml" filled where a fallback applied. Rows without a
        match in `fcs_list` receive NaN when their own volume is unusable.
    """
    keys = ["cast", "niskin", "filename"]

    # One fallback volume per key. Without the de-duplication, repeated keys
    # in fcs_list would multiply the rows of df in the left merge (the first
    # occurrence is kept).
    lookup = (
        fcs_list[keys + ["vol_analyzed_ml"]]
        .rename(columns={"vol_analyzed_ml": "vol_fcs"})
        .drop_duplicates(subset=keys)
    )
    merged = df.merge(lookup, on=keys, how="left")

    # Keep the existing volume when present and large enough, else fall back
    current = merged["vol_analyzed_ml"]
    usable = current.notna() & (current >= min_valid_ml)
    merged["vol_analyzed_ml"] = np.where(usable, current, merged["vol_fcs"])

    return merged.drop(columns=["vol_fcs"])

## Syn

In [48]:
# Syn file table: file name and analyzed volume come from the EDI table,
# every row is tagged group "syn", and the Attune metadata is joined on
# cruise/cast/niskin. (apply_volume_fallback is not applied here; the call
# is disabled.)
syn_long = (
    edi.loc[:, ["cruise", "cast", "niskin", "syn_filename", "syn_volume_analyzed_ml"]]
    .rename(columns={
        "syn_filename": "filename",
        "syn_volume_analyzed_ml": "vol_analyzed_ml",
    })
    .assign(group="syn")
    .merge(attune_meta, on=["cruise", "cast", "niskin"], how="left")
)

In [49]:
# Display the assembled Syn file table
syn_long

Unnamed: 0,cruise,cast,niskin,filename,vol_analyzed_ml,group,nearest_station,date_sampled,depth_m
0,en720,2,2,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,0.32,syn,L1,2024-09-06 18:50:30+00:00,15.299
1,en720,2,7,NESLTER_EN720_Sept2024_preserved(2)_phyto_PE_S...,0.32,syn,L1,2024-09-06 18:50:30+00:00,9.428
2,en720,2,13,NESLTER_EN720_Sept2024_preserved(2)_phyto_PE_S...,0.32,syn,L1,2024-09-06 18:50:30+00:00,4.111
3,en720,3,3,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,0.32,syn,L2,2024-09-07 01:58:41+00:00,35.297
4,en720,3,6,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,0.32,syn,L2,2024-09-07 01:58:41+00:00,27.816
...,...,...,...,...,...,...,...,...,...
67,en720,20,17,NESLTER_EN720_Sept2024_preserved_phyto_CHL_SSC...,0.32,syn,L3,2024-09-10 20:34:28+00:00,15.522
68,en720,20,20,NESLTER_EN720_Sept2024_preserved_phyto_CHL_SSC...,0.32,syn,L3,2024-09-10 20:34:28+00:00,3.939
69,en720,24,2,NESLTER_EN720_Sept2024_preserved_phyto_PE_SSC_...,0.32,syn,MVCO,2024-09-11 05:33:35+00:00,15.633
70,en720,24,4,NESLTER_EN720_Sept2024_preserved_phyto_CHL_SSC...,0.32,syn,MVCO,2024-09-11 05:33:35+00:00,8.467


## Euks

In [50]:
# Euk file table: red-euk file name and analyzed volume come from the EDI
# table, every row is tagged group "euk", and the Attune metadata is joined
# on cruise/cast/niskin. (apply_volume_fallback is not applied here; the
# call is disabled.)
euk_long = (
    edi.loc[:, ["cruise", "cast", "niskin", "redeuk_filename", "redeuk_volume_analyzed_ml"]]
    .rename(columns={
        "redeuk_filename": "filename",
        "redeuk_volume_analyzed_ml": "vol_analyzed_ml",
    })
    .assign(group="euk")
    .merge(attune_meta, on=["cruise", "cast", "niskin"], how="left")
)


In [51]:
# Preview the first rows of the Euk file table
euk_long.head()

Unnamed: 0,cruise,cast,niskin,filename,vol_analyzed_ml,group,nearest_station,date_sampled,depth_m
0,en720,2,2,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,0.32,euk,L1,2024-09-06 18:50:30+00:00,15.299
1,en720,2,7,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,0.32,euk,L1,2024-09-06 18:50:30+00:00,9.428
2,en720,2,13,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,0.32,euk,L1,2024-09-06 18:50:30+00:00,4.111
3,en720,3,3,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,0.32,euk,L2,2024-09-07 01:58:41+00:00,35.297
4,en720,3,6,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,0.32,euk,L2,2024-09-07 01:58:41+00:00,27.816


## Pro

In [52]:
# Pro file table: file names come from the "profile" column of the Attune
# summary table; volumes are taken from the EDI hetprok volume column.
# NOTE(review): the non-hbac FCS list shows analyzed volumes of 0.32 and
# 0.12 ml only, whereas the hetprok volume applied here is 0.08 ml — confirm
# that the hetprok volume is the right volume for the Pro files.
pro_long_list = []

# Both inputs are required: the Pro file names and a volume column to pair
# with them. Checking only one of them would raise a KeyError below.
has_pro_files = "profile" in attune.columns
has_pro_volume = "hetprok_volume_analyzed_ml" in edi.columns

if has_pro_files and has_pro_volume:
    pro_long = (
        attune[["cruise", "cast", "niskin", "profile"]]
        .rename(columns={"profile": "filename"})
        .dropna(subset=["filename"])   # samples without a Pro file are dropped
    )
    pro_long["group"] = "pro"

    # Bring in hetprok volumes from EDI as Pro volumes
    pro_long = pro_long.merge(
        edi[["cruise", "cast", "niskin", "hetprok_volume_analyzed_ml"]],
        on=["cruise", "cast", "niskin"],
        how="left"
    ).rename(columns={"hetprok_volume_analyzed_ml": "vol_analyzed_ml"})

    # (apply_volume_fallback is not applied here; the call is disabled.)

    # Attach metadata (depth, station, date)
    pro_long = pro_long.merge(attune_meta, on=["cruise", "cast", "niskin"], how="left")

    pro_long_list.append(pro_long)

else:
    # Pro cannot be processed for this cruise; say which input is missing
    if not has_pro_files:
        print("No Pro file column found in SummaryTable for this cruise; only Syn/Euk will be processed.")
    else:
        print("No hetprok volume column found in EDI table for this cruise; only Syn/Euk will be processed.")
    pro_long = pd.DataFrame(columns=syn_long.columns)  # empty placeholder


In [53]:
# Preview the first rows of the Pro file table
pro_long.head()

Unnamed: 0,cruise,cast,niskin,filename,group,vol_analyzed_ml,nearest_station,date_sampled,depth_m
0,en720,2,7,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,pro,0.08,L1,2024-09-06 18:50:30+00:00,9.428
1,en720,6,5,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,pro,0.08,L4,2024-09-07 09:10:20+00:00,33.167
2,en720,6,18,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,pro,0.08,L4,2024-09-07 09:10:20+00:00,4.243
3,en720,7,3,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,pro,0.08,L5,2024-09-07 17:07:12+00:00,57.314
4,en720,7,6,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,pro,0.08,L5,2024-09-07 17:07:12+00:00,42.156


## Concatenate Syn/Euk/Pro volume dataframes

In [54]:
# Stack the per-group tables, leaving out any that have no rows
dfs = [tbl for tbl in (syn_long, euk_long, pro_long) if not tbl.empty]

# Keep rows that have both a file name and a volume, and exclude any
# remaining bacteria (hbac) files.
file_table = (
    pd.concat(dfs, ignore_index=True)
    .dropna(subset=["filename", "vol_analyzed_ml"])
    .loc[lambda tbl: ~tbl["filename"].str.contains("hbac", case=False, na=False)]
)

# Sanity check: volume statistics per group
print("Volume summary by group:")
print(file_table.groupby("group")["vol_analyzed_ml"].describe())

Volume summary by group:
       count  mean           std   min   25%   50%   75%   max
group                                                         
euk     72.0  0.32  1.677021e-16  0.32  0.32  0.32  0.32  0.32
pro     34.0  0.08  2.817298e-17  0.08  0.08  0.08  0.08  0.08
syn     72.0  0.32  1.677021e-16  0.32  0.32  0.32  0.32  0.32


In [55]:
# Display the combined Syn/Euk/Pro file table
file_table

Unnamed: 0,cruise,cast,niskin,filename,vol_analyzed_ml,group,nearest_station,date_sampled,depth_m
0,en720,2,2,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,0.32,syn,L1,2024-09-06 18:50:30+00:00,15.299
1,en720,2,7,NESLTER_EN720_Sept2024_preserved(2)_phyto_PE_S...,0.32,syn,L1,2024-09-06 18:50:30+00:00,9.428
2,en720,2,13,NESLTER_EN720_Sept2024_preserved(2)_phyto_PE_S...,0.32,syn,L1,2024-09-06 18:50:30+00:00,4.111
3,en720,3,3,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,0.32,syn,L2,2024-09-07 01:58:41+00:00,35.297
4,en720,3,6,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,0.32,syn,L2,2024-09-07 01:58:41+00:00,27.816
...,...,...,...,...,...,...,...,...,...
173,en720,19,15,NESLTER_EN720_Sept2024_preserved_phyto_CHL_SSC...,0.08,pro,L9,2024-09-10 06:14:31+00:00,31.708
174,en720,19,17,NESLTER_EN720_Sept2024_preserved_phyto_CHL_SSC...,0.08,pro,L9,2024-09-10 06:14:31+00:00,15.554
175,en720,19,20,NESLTER_EN720_Sept2024_preserved_phyto_CHL_SSC...,0.08,pro,L9,2024-09-10 06:14:31+00:00,6.011
176,en720,20,15,NESLTER_EN720_Sept2024_preserved_phyto_CHL_SSC...,0.08,pro,L3,2024-09-10 20:34:28+00:00,31.352


# Merge with AP so that only samples with both AP and Attune data are kept

In [56]:
# Attach the AP columns by cruise/cast/niskin, then keep only the rows that
# actually received an ap_vol and renumber the index.
file_table = (
    file_table
    .merge(ap_meta, on=["cruise", "cast", "niskin"], how="left")
    .dropna(subset=["ap_vol"])
    .reset_index(drop=True)
)
file_table.head()

Unnamed: 0,cruise,cast,niskin,filename,vol_analyzed_ml,group,nearest_station,date_sampled,depth_m,ap_vol,lat,lon,date_utc
0,en720,2,2,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,0.32,syn,L1,2024-09-06 18:50:30+00:00,15.299,547.0,41.1954,-70.88002,20240906.0
1,en720,2,13,NESLTER_EN720_Sept2024_preserved(2)_phyto_PE_S...,0.32,syn,L1,2024-09-06 18:50:30+00:00,4.111,548.0,41.19574,-70.88034,20240906.0
2,en720,3,6,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,0.32,syn,L2,2024-09-07 01:58:41+00:00,27.816,532.0,41.03274,-70.8836,20240907.0
3,en720,3,10,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,0.32,syn,L2,2024-09-07 01:58:41+00:00,21.277,547.0,41.03324,-70.88358,20240907.0
4,en720,3,18,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,0.32,syn,L2,2024-09-07 01:58:41+00:00,4.921,539.0,41.0341,-70.88344,20240907.0


# Get target attune filenames

In [57]:
# Distinct, non-null file names referenced by the merged file table
all_filenames = file_table["filename"].dropna().unique()

In [58]:
#all_filenames

## Copy matched files to RawAttune and change from fcs to mat file

In [59]:
# Copy the .mat counterpart of every selected file from the source folder
# into mat_dir, reporting any that are not present at the source.

# folder_path is None when no source folder was found; joining onto None
# would raise an opaque TypeError, so stop with an explicit message instead.
if folder_path is None and len(all_filenames) > 0:
    raise FileNotFoundError(
        f"No source folder found for {cruise}; cannot copy .mat files."
    )

for fcs_name in all_filenames:
    base_name = os.path.splitext(fcs_name)[0]           # strip the extension
    mat_name = f"{base_name}.mat"                       # same stem, .mat extension
    src_path = os.path.join(folder_path, mat_name)
    dst_path = os.path.join(mat_dir, mat_name)

    if os.path.exists(src_path):
        shutil.copy2(src_path, dst_path)
    else:
        print(f"Missing: {src_path}")

# Process Attune and Copy Over to ProcessedAttune

In [60]:
def _mat_vector(values):
    """Return a .mat variable as a 1-D array, including the 0- and 1-element cases."""
    flat = np.atleast_1d(np.squeeze(values))   # a single value squeezes to 0-d
    if flat.size == 0:
        flat = flat.reshape(0)                 # e.g. a (0, 0) array from an empty file
    return flat


def process_attune(attune_data, vol_analyzed, file_group):
    """
    Convert one loaded Attune .mat into a tidy per-cell dataframe
    and keep ONLY the classes appropriate for this file_group.

    Parameters
    ----------
    attune_data : mapping
        Loaded .mat content; the 'class' and 'volume' entries are read.
    vol_analyzed : float
        Analyzed volume, stored unchanged in the "ml_analyzed" column.
    file_group : str
        One of "syn", "euk", "pro". Any other value yields an empty frame.

    Returns
    -------
    pandas.DataFrame
        Columns "Biovolume", "ml_analyzed", "class", "group"; one row per
        cell whose class is allowed for `file_group` and whose biovolume is
        positive. The index holds the cell's position in the input arrays.
    """
    output_cols = ["Biovolume", "ml_analyzed", "class", "group"]

    # Allowed classes for each file type
    allowed = {
        "syn": [2],
        "euk": [1, 5, 6],
        "pro": [4],
    }

    if file_group not in allowed:
        # Unknown group: nothing to extract
        return pd.DataFrame(columns=output_cols)

    # Extract arrays from .mat; _mat_vector keeps them 1-D even when the
    # file holds a single event or none at all, which a bare squeeze() does not.
    classes = _mat_vector(attune_data['class'])
    biovol = _mat_vector(attune_data['volume'])

    df = pd.DataFrame({
        "class": classes.astype(int),
        "Biovolume": biovol.astype(float),
    })

    # Class filter and positive-biovolume filter in one pass; copy so the
    # column assignments below act on an independent frame.
    keep = df["class"].isin(allowed[file_group]) & (df["Biovolume"] > 0)
    df = df.loc[keep].copy()

    # Attach metadata
    df["ml_analyzed"] = vol_analyzed
    df["group"] = file_group

    return df[output_cols]


In [61]:
# Display the per-sample Attune metadata table
attune_meta

Unnamed: 0,cruise,cast,niskin,nearest_station,date_sampled,depth_m
0,en720,2,2,L1,2024-09-06 18:50:30+00:00,15.299
1,en720,2,7,L1,2024-09-06 18:50:30+00:00,9.428
2,en720,2,13,L1,2024-09-06 18:50:30+00:00,4.111
3,en720,3,3,L2,2024-09-07 01:58:41+00:00,35.297
4,en720,3,6,L2,2024-09-07 01:58:41+00:00,27.816
...,...,...,...,...,...,...
67,en720,20,17,L3,2024-09-10 20:34:28+00:00,15.522
68,en720,20,20,L3,2024-09-10 20:34:28+00:00,3.939
69,en720,24,2,MVCO,2024-09-11 05:33:35+00:00,15.633
70,en720,24,4,MVCO,2024-09-11 05:33:35+00:00,8.467


In [62]:
# Convert each copied .mat file into a per-cell CSV under processedAttune,
# and record every (file, group) that yields no cells.
csv_dir = os.path.join(dst_dir, "processedAttune")
os.makedirs(csv_dir, exist_ok=True)

skipped = []        # one record per (file, group) with no valid cells
frames_by_csv = {}  # csv_path -> {group: per-cell frame}

for _, row in file_table.iterrows():
    fcs_name  = row["filename"]
    base_name = os.path.splitext(fcs_name)[0]
    mat_path  = os.path.join(mat_dir, f"{base_name}.mat")
    csv_path  = os.path.join(csv_dir, f"{base_name}.csv")

    if not os.path.exists(mat_path):
        print(f"MAT file missing: {mat_path}")
        continue

    try:
        # Loading happens inside the try so one unreadable file is reported
        # and skipped instead of aborting the whole run.
        mat_data   = scipy.io.loadmat(mat_path)
        file_group = row["group"]   # 'syn', 'euk', 'pro'
        attune_df  = process_attune(mat_data, row["vol_analyzed_ml"], file_group)

        # Skip empty outputs
        if attune_df.empty:
            # Values are kept in a record rather than in bare names such as
            # `cruise`, which would overwrite the notebook-level cruise ID
            # that later cells interpolate into paths.
            skip_record = {
                "filename": base_name,
                "cruise": row.get("cruise", "NA"),
                "cast": row.get("cast", "NA"),
                "niskin": row.get("niskin", "NA"),
                "station": row.get("nearest_station", "NA"),
                "file_group": row.get("group", "NA"),
                "reason": "no valid cells",
            }
            print(
                f"Skipping {base_name}: no valid {skip_record['file_group']} cells "
                f"(cruise={skip_record['cruise']}, cast={skip_record['cast']}, "
                f"niskin={skip_record['niskin']}, station={skip_record['station']})."
            )
            skipped.append(skip_record)
            continue

        # Add sample metadata, including ap_vol from the AP table
        attune_df = attune_df.assign(
            cruise=row["cruise"],
            cast=row["cast"],
            niskin=row["niskin"],
            depth_m=row["depth_m"],
            nearest_station=row["nearest_station"],
            datetime=row["date_sampled"],
            ap_vol=row["ap_vol"],
        )

        # The CSV name depends only on the file name, so two groups read from
        # the same file share one CSV. Collect per group and write once, so
        # a later group does not overwrite an earlier one; a repeated
        # (file, group) row still replaces its earlier result.
        frames_by_csv.setdefault(csv_path, {})[file_group] = attune_df
    except Exception as e:
        print(f"Failed to process {base_name}: {e}")

# Write one CSV per source file, holding all of its groups
for out_path, group_frames in frames_by_csv.items():
    try:
        pd.concat(list(group_frames.values()), ignore_index=True).to_csv(out_path, index=False)
    except Exception as e:
        print(f"Failed to write {out_path}: {e}")


Skipping NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_SSC_pro_C06N05: no valid pro cells (cruise=en720, cast=6, niskin=5, station=L4).
Skipping NESLTER_EN720_Sept2024_preserved_phyto_CHL_SSC_pro_C24N02: no valid pro cells (cruise=en720, cast=24, niskin=2, station=MVCO).


In [63]:
# Tabulate the skipped-file records. The column list is given explicitly so
# that a run with nothing skipped still yields a frame (and later a CSV)
# with the expected header instead of a column-less one.
skipped_df = pd.DataFrame(
    skipped,
    columns=["filename", "cruise", "cast", "niskin", "station", "file_group", "reason"],
)
skipped_df

Unnamed: 0,filename,cruise,cast,niskin,station,file_group,reason
0,NESLTER_EN720_Sept2024_preserved(2)_phyto_CHL_...,en720,6,5,L4,pro,no valid cells
1,NESLTER_EN720_Sept2024_preserved_phyto_CHL_SSC...,en720,24,2,MVCO,pro,no valid cells


In [64]:
# Save the skipped-file log next to the other exports for this cruise.
# dst_dir is reused instead of re-spelling the path from `cruise`, so the
# log lands in the export folder that was created earlier even if `cruise`
# has been rebound since then.
outdir = dst_dir
outfile = os.path.join(outdir, rf"{cruise}_skipped_attune_files.csv")

skipped_df.to_csv(outfile, index=False)

print(f"Saved skipped file log to:\n{outfile}")

Saved skipped file log to:
C:\Users\ocron\OneDrive - Massachusetts Institute of Technology\Documents\Research\Generals\Research_Report3\Data\Attune\en720\en720_skipped_attune_files.csv
