In [1]:
import pandas as pd
import numpy as np
import os
import shutil

# User-defined variables

In [2]:
# Cruise identifier. It is lower-cased to build the CTD metadata and output CSV
# filenames, used as-is for the output folder name, and compared
# case-insensitively against the "Cruise" column of the cruise metadata table.
cruise = "EN617"

# File paths and output directory

In [3]:
# Single root folder for every input and output below; edit only this line
# when the data folder moves.
data_root = r"C:\Users\ocron\OneDrive - Massachusetts Institute of Technology\Documents\Research\Generals\Research_Report3\Data"

# Input CSV files (the *_dir names are kept for compatibility with later cells,
# although these three are file paths, not folders).
transect_dir = rf"{data_root}\NESLTER_transect_metadata.csv"
cruise_dir = rf"{data_root}\processed_cruises_meta.csv"
ctd_dir = rf"{data_root}\CTD\Metadata\{cruise.lower()}_ctd_metadata.csv"

# Per-cruise output folder; created up front so the final save cannot fail on a
# missing directory.
output_dir = rf"{data_root}\IFCB\{cruise}"
os.makedirs(output_dir, exist_ok=True)

# Read in Metadata

In [4]:
# CTD Data
# Load the cruise-specific CTD metadata CSV located at `ctd_dir`.
ctd_meta = pd.read_csv(ctd_dir)

In [5]:
# IFCB data
transect_meta = pd.read_csv(transect_dir)

# Keep only rows whose cast label consists purely of digits; missing values and
# labels containing any non-digit character are dropped.
has_numeric_cast = transect_meta["cast"].apply(lambda x: str(x).isdigit())

# Take an explicit copy so the column assignment below writes to an independent
# frame rather than to a filtered view (avoids SettingWithCopyWarning).
transect_meta = transect_meta[has_numeric_cast].copy()
transect_meta["cast"] = transect_meta["cast"].astype(int)
transect_meta.head()

Unnamed: 0,dataset,pid,sample_time,ifcb,ml_analyzed,latitude,longitude,depth,cruise,cast,...,sample_type,n_images,tag1,tag2,tag3,tag4,tag5,comment_summary,trigger_selection,skip
202,NESLTER_transect,D20170504T124320_IFCB115,2017-05-04 12:43:20+00:00,115,4.710733,,,0.0,AR16,1,...,cast,493,phat,surface,vanmooy,,,,2,0
253,NESLTER_transect,D20170504T180839_IFCB115,2017-05-04 18:08:39+00:00,115,4.363601,,,0.0,AR16,4,...,cast,1446,30__light,phat,vanmooy,,,,2,0
254,NESLTER_transect,D20170504T181042_IFCB009,2017-05-04 18:10:42+00:00,9,1.076917,,,0.0,AR16,4,...,cast,495,30__light,phat,vanmooy,,,,2,0
255,NESLTER_transect,D20170504T182112_IFCB009,2017-05-04 18:21:12+00:00,9,0.699981,,,0.0,AR16,4,...,cast,606,30__light,phat,vanmooy,,,,2,0
256,NESLTER_transect,D20170504T182843_IFCB009,2017-05-04 18:28:43+00:00,9,1.361373,,,0.0,AR16,4,...,cast,470,phat,surface,vanmooy,,,,2,0


# Merge CTD and IFCB Data

In [6]:
# Normalise the join keys identically on both tables so they compare equal:
#   cruise -> stripped, lower-case string
#   cast   -> nullable integer (unparseable values become <NA>)
for frame in (ctd_meta, transect_meta):
    cruise_text = frame["cruise"].astype(str)
    frame["cruise"] = cruise_text.str.strip().str.lower()

    cast_numeric = pd.to_numeric(frame["cast"], errors="coerce")
    frame["cast"] = cast_numeric.astype("Int64")

# Niskin bottle number gets the same nullable-integer treatment; it is not a
# join key here, but this keeps it ready for use as one later.
niskin_numeric = pd.to_numeric(transect_meta["niskin"], errors="coerce")
transect_meta["niskin"] = niskin_numeric.astype("Int64")


In [7]:
# Attach IFCB sample records to the CTD casts, matching on cruise and cast.
# (Niskin is carried along as a column; it is not a join key.)
ifcb_columns = ["cruise", "cast", "pid", "ifcb", "ml_analyzed", "sample_type", "niskin", "depth", "skip"]
merged = (
    ctd_meta
    .merge(transect_meta[ifcb_columns], how="left", on=["cruise", "cast"])
    # A missing pid means the CTD cast had no matching IFCB sample; drop it.
    .dropna(subset=["pid"])
)

# Check output
merged.head(20)

Unnamed: 0,cruise,cast,date,latitude,longitude,nearest_station,distance_km,pid,ifcb,ml_analyzed,sample_type,niskin,depth,skip
0,en617,1,2018-07-20 17:23:53+00:00,41.200667,-70.885333,L1,0.472,D20180720T184414_IFCB109,109.0,1.572351,cast,12,1.796,0.0
1,en617,1,2018-07-20 17:23:53+00:00,41.200667,-70.885333,L1,0.472,D20180720T190050_IFCB109,109.0,1.547778,cast,12,1.796,0.0
2,en617,1,2018-07-20 17:23:53+00:00,41.200667,-70.885333,L1,0.472,D20180720T191853_IFCB109,109.0,1.508746,cast,12,1.796,0.0
3,en617,1,2018-07-20 17:23:53+00:00,41.200667,-70.885333,L1,0.472,D20180720T193536_IFCB109,109.0,1.017873,cast,8,7.261,0.0
4,en617,1,2018-07-20 17:23:53+00:00,41.200667,-70.885333,L1,0.472,D20180720T195316_IFCB109,109.0,2.590303,cast,8,7.261,0.0
5,en617,1,2018-07-20 17:23:53+00:00,41.200667,-70.885333,L1,0.472,D20180720T201724_IFCB109,109.0,2.569993,cast,8,7.261,0.0
6,en617,2,2018-07-20 22:57:14+00:00,41.030333,-70.880667,L2,0.225,D20180720T234550_IFCB127,127.0,1.995168,cast,11,20.104,0.0
7,en617,2,2018-07-20 22:57:14+00:00,41.030333,-70.880667,L2,0.225,D20180721T000950_IFCB127,127.0,1.989656,cast,11,20.104,0.0
8,en617,2,2018-07-20 22:57:14+00:00,41.030333,-70.880667,L2,0.225,D20180721T003350_IFCB127,127.0,2.008439,cast,11,20.104,0.0
9,en617,2,2018-07-20 22:57:14+00:00,41.030333,-70.880667,L2,0.225,D20180721T012949_IFCB127,127.0,4.038344,cast,16,3.22,0.0


In [8]:
# Coerce depth to a numeric dtype (unparseable entries become NaN), then
# discard every row left without a usable depth.
merged = (
    merged
    .assign(depth=pd.to_numeric(merged["depth"], errors="coerce"))
    .dropna(subset=["depth"])
)

# Locate IFCB feature files

In [9]:
cruise_meta = pd.read_csv(cruise_dir)

# Case-insensitive lookup of the configured cruise in the "Cruise" column.
cruise_rows = cruise_meta[cruise_meta["Cruise"].str.lower() == cruise.lower()]

# Fail with an explicit message instead of the opaque "single positional
# indexer is out-of-bounds" that .iloc[0] raises on an empty selection.
if cruise_rows.empty:
    raise IndexError(f"Cruise {cruise!r} not found in {cruise_dir}")

# If several rows match, the first one is used.
cruise_year = cruise_rows["year"].iloc[0]
print(cruise_year)

2018


In [10]:
# Generate expected filenames (one feature file and one multiblob file per pid)
file_list = merged['pid'].tolist()
fea_list = [pid + "_fea_v4.csv" for pid in file_list]
multi_list = [pid + "_multiblob_v4.csv" for pid in file_list]

# Sets give constant-time membership tests; testing against the lists would
# rescan every expected name for every file found under the source root.
fea_names = set(fea_list)
multi_names = set(multi_list)

# Define source root (one folder per year)
src_root = rf"Y:\NESLTER_transect\features\D{cruise_year}"

# os.walk silently yields nothing for a missing directory, so report the
# problem explicitly rather than only showing "Matched 0 ... files".
if not os.path.isdir(src_root):
    print(f"Warning: source directory not found: {src_root}")

# Storage: filename -> full path of the file on disk
fea_paths = {}
multi_paths = {}

# Walk through source directory and collect full paths.
# If the same filename occurs in several folders, the last one visited wins.
for root, _dirs, files in os.walk(src_root):
    for fname in files:
        if fname in fea_names:
            fea_paths[fname] = os.path.join(root, fname)
        elif fname in multi_names:
            multi_paths[fname] = os.path.join(root, fname)

print(f"Matched {len(fea_paths)} fea files and {len(multi_paths)} multiblob files.")


Matched 97 fea files and 0 multiblob files.


In [11]:
# Record, for every sample, where its feature and multiblob files were found;
# samples with no file on disk get NaN.
sample_ids = merged["pid"]

def _fea_path_for(sample_id):
    return fea_paths.get(f"{sample_id}_fea_v4.csv", np.nan)

def _multi_path_for(sample_id):
    return multi_paths.get(f"{sample_id}_multiblob_v4.csv", np.nan)

merged["fea_file"] = sample_ids.map(_fea_path_for)
merged["multi_file"] = sample_ids.map(_multi_path_for)
merged.head()

Unnamed: 0,cruise,cast,date,latitude,longitude,nearest_station,distance_km,pid,ifcb,ml_analyzed,sample_type,niskin,depth,skip,fea_file,multi_file
0,en617,1,2018-07-20 17:23:53+00:00,41.200667,-70.885333,L1,0.472,D20180720T184414_IFCB109,109.0,1.572351,cast,12,1.796,0.0,Y:\NESLTER_transect\features\D2018\D20180720\D...,
1,en617,1,2018-07-20 17:23:53+00:00,41.200667,-70.885333,L1,0.472,D20180720T190050_IFCB109,109.0,1.547778,cast,12,1.796,0.0,Y:\NESLTER_transect\features\D2018\D20180720\D...,
2,en617,1,2018-07-20 17:23:53+00:00,41.200667,-70.885333,L1,0.472,D20180720T191853_IFCB109,109.0,1.508746,cast,12,1.796,0.0,Y:\NESLTER_transect\features\D2018\D20180720\D...,
3,en617,1,2018-07-20 17:23:53+00:00,41.200667,-70.885333,L1,0.472,D20180720T193536_IFCB109,109.0,1.017873,cast,8,7.261,0.0,Y:\NESLTER_transect\features\D2018\D20180720\D...,
4,en617,1,2018-07-20 17:23:53+00:00,41.200667,-70.885333,L1,0.472,D20180720T195316_IFCB109,109.0,2.590303,cast,8,7.261,0.0,Y:\NESLTER_transect\features\D2018\D20180720\D...,


In [12]:
# Save the merged CTD/IFCB metadata table for this cruise.
# os.path.join uses the platform's separator instead of appending "/" to a
# backslash-separated folder path.
output_csv = os.path.join(output_dir, f"{cruise.lower()}_ifcb_metadata.csv")

# Missing values are written as the literal text "NaN"; the row index is omitted.
merged.to_csv(output_csv, index=False, na_rep="NaN")