In [1]:
import pandas as pd
import numpy as np
import os
import shutil

In [163]:
# User-defined variables
# Cruise identifier. Later cells use it (lower-cased where needed) to build the
# CTD metadata path, the IFCB output folder, the cruise-year lookup and the
# name of the output CSV.
cruise = "AE2426"

# File paths and output directory

In [164]:
# Root folder shared by every input and output of this notebook. It is defined
# once so that relocating the project only requires editing this single line.
data_root = r"C:\Users\ocron\OneDrive - Massachusetts Institute of Technology\Documents\Research\Generals\Research_Report3\Data"

# NOTE: despite the `_dir` suffix, the first three names hold CSV *file* paths;
# the names are kept unchanged because later cells refer to them.
transect_dir = rf"{data_root}\NESLTER_transect_metadata.csv"   # IFCB transect metadata
cruise_dir = rf"{data_root}\processed_cruises_meta.csv"        # per-cruise metadata (year lookup)
ctd_dir = rf"{data_root}\CTD\Metadata\{cruise.lower()}_ctd_metadata.csv"  # CTD casts for this cruise

# Folder that receives this cruise's IFCB metadata output; created if missing.
output_dir = rf"{data_root}\IFCB\{cruise}"
os.makedirs(output_dir, exist_ok=True)

# Read in Metadata

In [165]:
# CTD data
# Load the CTD metadata CSV for the selected cruise from the path in `ctd_dir`.
ctd_meta = pd.read_csv(ctd_dir)

In [166]:
# Display the CTD metadata table for a visual check.
ctd_meta

Unnamed: 0,cruise,cast,date,latitude,longitude,nearest_station,distance_km
0,AE2426,1,2024-11-06 16:23:16+00:00,41.198,-70.882667,L1,0.154
1,AE2426,2,2024-11-07 05:26:59+00:00,41.030333,-70.77,u2a,0.213
2,AE2426,3,2024-11-07 07:23:54+00:00,41.030167,-70.994333,d2a,0.022
3,AE2426,4,2024-11-07 08:31:37+00:00,41.030833,-70.882833,L2,0.101
4,AE2426,5,2024-11-07 13:05:01+00:00,40.513167,-70.8835,L5,0.023
5,AE2426,6,2024-11-08 04:28:08+00:00,39.770167,-70.883,L11,0.349
6,AE2426,7,2024-11-08 06:22:58+00:00,39.773833,-70.776,u11a,0.104
7,AE2426,8,2024-11-08 08:04:11+00:00,39.771833,-70.992167,d11a,0.168
8,AE2426,10,2024-11-08 10:36:41+00:00,39.772333,-70.885,L11,0.181
9,AE2426,11,2024-11-08 17:51:06+00:00,39.939,-70.882167,L10,0.147


In [167]:
# IFCB data
# Load the IFCB transect metadata as-is.
transect_raw = pd.read_csv(transect_dir)

# Keep only rows whose cast value is made up purely of digits; anything else
# (missing values, non-numeric labels) cannot be converted to an integer cast.
has_digit_cast = transect_raw["cast"].map(lambda value: str(value).isdigit())

# Take an explicit copy of the filtered rows so the column assignment below
# writes to an independent frame instead of a slice of `transect_raw`
# (assigning on the filtered slice triggers pandas' SettingWithCopyWarning).
transect_meta = transect_raw.loc[has_digit_cast].copy()
transect_meta["cast"] = transect_meta["cast"].astype(int)
transect_meta.head()

Unnamed: 0,dataset,pid,sample_time,ifcb,ml_analyzed,latitude,longitude,depth,cruise,cast,...,sample_type,n_images,tag1,tag2,tag3,tag4,tag5,comment_summary,trigger_selection,skip
202,NESLTER_transect,D20170504T124320_IFCB115,2017-05-04 12:43:20+00:00,115,4.710733,,,0.0,AR16,1,...,cast,493,phat,surface,vanmooy,,,,2,0
253,NESLTER_transect,D20170504T180839_IFCB115,2017-05-04 18:08:39+00:00,115,4.363601,,,0.0,AR16,4,...,cast,1446,30__light,phat,vanmooy,,,,2,0
254,NESLTER_transect,D20170504T181042_IFCB009,2017-05-04 18:10:42+00:00,9,1.076917,,,0.0,AR16,4,...,cast,495,30__light,phat,vanmooy,,,,2,0
255,NESLTER_transect,D20170504T182112_IFCB009,2017-05-04 18:21:12+00:00,9,0.699981,,,0.0,AR16,4,...,cast,606,30__light,phat,vanmooy,,,,2,0
256,NESLTER_transect,D20170504T182843_IFCB009,2017-05-04 18:28:43+00:00,9,1.361373,,,0.0,AR16,4,...,cast,470,phat,surface,vanmooy,,,,2,0


# Merge CTD and IFCB Data

In [168]:
def _tidy_cruise(labels):
    """Return cruise labels as stripped, lower-case strings."""
    return labels.astype(str).str.strip().str.lower()


def _as_nullable_int(values):
    """Coerce values to pandas' nullable Int64; unparseable entries become <NA>."""
    return pd.to_numeric(values, errors="coerce").astype("Int64")


# Apply the same cruise/cast normalisation to both tables so that their
# join keys compare equal.
for frame in (ctd_meta, transect_meta):
    frame["cruise"] = _tidy_cruise(frame["cruise"])
    frame["cast"] = _as_nullable_int(frame["cast"])

# Optional: niskin is normalised as well, in case it is used as a key later.
transect_meta["niskin"] = _as_nullable_int(transect_meta["niskin"])


In [169]:
# Left-join the IFCB sample records onto the CTD casts, keyed on cruise + cast.
# niskin is carried along as a column; it is not part of the join key.
ifcb_columns = ["cruise", "cast", "pid", "ml_analyzed", "sample_type", "niskin", "depth", "skip"]
merged = (
    ctd_meta
    .merge(transect_meta[ifcb_columns], how="left", on=["cruise", "cast"])
    .dropna(subset=["pid"])  # discard rows that ended up without an IFCB pid
)

# Check output
merged.head(20)

Unnamed: 0,cruise,cast,date,latitude,longitude,nearest_station,distance_km,pid,ml_analyzed,sample_type,niskin,depth,skip
0,ae2426,1,2024-11-06 16:23:16+00:00,41.198,-70.882667,L1,0.154,D20241106T181510_IFCB188,3.261301,cast,13,2.879,0.0
1,ae2426,1,2024-11-06 16:23:16+00:00,41.198,-70.882667,L1,0.154,D20241106T181609_IFCB109,3.405544,cast,2,16.618,0.0
2,ae2426,1,2024-11-06 16:23:16+00:00,41.198,-70.882667,L1,0.154,D20241106T183922_IFCB188,3.210241,cast,13,2.879,0.0
3,ae2426,1,2024-11-06 16:23:16+00:00,41.198,-70.882667,L1,0.154,D20241106T184031_IFCB109,3.304301,cast,2,16.618,0.0
4,ae2426,1,2024-11-06 16:23:16+00:00,41.198,-70.882667,L1,0.154,D20241106T190334_IFCB188,3.244334,cast,13,2.879,0.0
5,ae2426,1,2024-11-06 16:23:16+00:00,41.198,-70.882667,L1,0.154,D20241106T190454_IFCB109,3.327905,cast,2,16.618,0.0
6,ae2426,1,2024-11-06 16:23:16+00:00,41.198,-70.882667,L1,0.154,D20241106T195320_IFCB188,3.243289,cast,7,10.937,0.0
7,ae2426,1,2024-11-06 16:23:16+00:00,41.198,-70.882667,L1,0.154,D20241106T195539_IFCB109,3.558224,cast,7,10.937,0.0
8,ae2426,1,2024-11-06 16:23:16+00:00,41.198,-70.882667,L1,0.154,D20241106T201732_IFCB188,3.269522,cast,7,10.937,0.0
9,ae2426,1,2024-11-06 16:23:16+00:00,41.198,-70.882667,L1,0.154,D20241106T202002_IFCB109,3.515777,cast,7,10.937,0.0


In [170]:
# Coerce depth to a numeric dtype (non-numeric entries turn into NaN), then
# discard every row that is left without a usable depth.
depth_numeric = pd.to_numeric(merged["depth"], errors="coerce")
merged = merged.assign(depth=depth_numeric).dropna(subset=["depth"])

# Locate IFCB feature files

In [171]:
# Per-cruise metadata table; the "Cruise" and "year" columns are used here.
cruise_meta = pd.read_csv(cruise_dir)

# Case-insensitive match of the requested cruise against the "Cruise" column.
cruise_rows = cruise_meta[cruise_meta["Cruise"].str.lower() == cruise.lower()]
if cruise_rows.empty:
    # Without this guard, `.iloc[0]` fails with a generic out-of-bounds message
    # that names neither the cruise nor the file. The exception type is kept
    # as IndexError so the failure mode is unchanged, only more informative.
    raise IndexError(
        f"Cruise {cruise!r} not found in the 'Cruise' column of {cruise_dir}"
    )

# The year of the first matching row selects the feature directory later on.
cruise_year = cruise_rows["year"].iloc[0]
print(cruise_year)

2024


In [172]:
# Generate expected filenames: one feature CSV and one multiblob CSV per pid.
file_list = merged['pid'].tolist()
fea_list = [f + "_fea_v4.csv" for f in file_list]
multi_list = [f + "_multiblob_v4.csv" for f in file_list]

# Sets give constant-time membership tests. Checking every file on disk
# against the lists above would cost a linear scan per file.
fea_names = set(fea_list)
multi_names = set(multi_list)

# Define source root (feature files are organised under a per-year folder)
src_root = rf"Y:\NESLTER_transect\features\D{cruise_year}"

# os.walk ignores directory-listing errors by default, so a missing or
# unmounted source root would otherwise just report zero matches.
if not os.path.isdir(src_root):
    print(f"Warning: source directory not found: {src_root}")

# Storage: filename -> full path of the file found on disk
fea_paths = {}
multi_paths = {}

# Walk through source directory and collect full paths of the expected files.
# If a filename occurs in several sub-folders, the last one visited wins.
for root, dirs, files in os.walk(src_root):
    for fname in files:
        if fname in fea_names:
            fea_paths[fname] = os.path.join(root, fname)
        elif fname in multi_names:
            multi_paths[fname] = os.path.join(root, fname)

print(f"Matched {len(fea_paths)} fea files and {len(multi_paths)} multiblob files.")


Matched 154 fea files and 154 multiblob files.


In [173]:
def _resolve_paths(pids, suffix, found):
    """Map each pid to the path stored under `<pid><suffix>` in `found`, or NaN if absent."""
    return pids.map(lambda pid: found.get(f"{pid}{suffix}", np.nan))


# Attach the on-disk location of each sample's feature and multiblob file.
merged["fea_file"] = _resolve_paths(merged["pid"], "_fea_v4.csv", fea_paths)
merged["multi_file"] = _resolve_paths(merged["pid"], "_multiblob_v4.csv", multi_paths)
merged.head()

Unnamed: 0,cruise,cast,date,latitude,longitude,nearest_station,distance_km,pid,ml_analyzed,sample_type,niskin,depth,skip,fea_file,multi_file
0,ae2426,1,2024-11-06 16:23:16+00:00,41.198,-70.882667,L1,0.154,D20241106T181510_IFCB188,3.261301,cast,13,2.879,0.0,Y:\NESLTER_transect\features\D2024\D20241106\D...,Y:\NESLTER_transect\features\D2024\D20241106\m...
1,ae2426,1,2024-11-06 16:23:16+00:00,41.198,-70.882667,L1,0.154,D20241106T181609_IFCB109,3.405544,cast,2,16.618,0.0,Y:\NESLTER_transect\features\D2024\D20241106\D...,Y:\NESLTER_transect\features\D2024\D20241106\m...
2,ae2426,1,2024-11-06 16:23:16+00:00,41.198,-70.882667,L1,0.154,D20241106T183922_IFCB188,3.210241,cast,13,2.879,0.0,Y:\NESLTER_transect\features\D2024\D20241106\D...,Y:\NESLTER_transect\features\D2024\D20241106\m...
3,ae2426,1,2024-11-06 16:23:16+00:00,41.198,-70.882667,L1,0.154,D20241106T184031_IFCB109,3.304301,cast,2,16.618,0.0,Y:\NESLTER_transect\features\D2024\D20241106\D...,Y:\NESLTER_transect\features\D2024\D20241106\m...
4,ae2426,1,2024-11-06 16:23:16+00:00,41.198,-70.882667,L1,0.154,D20241106T190334_IFCB188,3.244334,cast,13,2.879,0.0,Y:\NESLTER_transect\features\D2024\D20241106\D...,Y:\NESLTER_transect\features\D2024\D20241106\m...


In [174]:
# Write the combined metadata table into this cruise's output folder.
# Missing values are written as the literal text "NaN"; the index is omitted.
output_csv = f"{output_dir}/{cruise.lower()}_ifcb_metadata.csv"
merged.to_csv(output_csv, na_rep="NaN", index=False)