# In [1]:
import os
import pandas as pd
import re
import rasterio
from glob import glob
from collections import Counter, defaultdict
def make_unique(columns):
    """Return the column names with repeated names made distinct.

    The first occurrence of a name is kept unchanged; the k-th occurrence
    (k >= 2) becomes ``"<name>_<k>"``.  Input order is preserved.
    """
    seen = Counter()
    result = []
    for name in columns:
        seen[name] += 1
        occurrence = seen[name]
        result.append(name if occurrence == 1 else f"{name}_{occurrence}")
    return result
def merge_priority_columns(df, primary, secondary):
    """Fold column ``secondary`` into column ``primary``, modifying ``df`` in place.

    * Both columns present: gaps in ``primary`` are filled from
      ``secondary`` and ``secondary`` is dropped.
    * Only ``secondary`` present: it is renamed to ``primary``.
    * ``secondary`` absent: nothing changes.

    Returns the same ``df`` object that was passed in.
    """
    if secondary not in df.columns:
        return df

    if primary in df.columns:
        filled = df[primary].combine_first(df[secondary])
        df[primary] = filled
        df.drop(columns=[secondary], inplace=True)
    else:
        df.rename(columns={secondary: primary}, inplace=True)
    return df
def extract_base_name(col):
    """Return the short name of a column label.

    Labels that start with "flag" (case-insensitive) map to ``"flag"``;
    otherwise the text before the first ``:`` is returned, stripped of
    surrounding whitespace.
    """
    if col.lower().startswith("flag"):
        return "flag"
    head, _, _ = col.partition(':')
    return head.strip()
def merge_duplicate_columns_take_first(df):
    """Collapse columns whose names differ only by a trailing ``_<digits>``.

    Columns are grouped by their name with any ``_<digits>`` suffix removed.
    For every group with more than one member, a column named after the
    group is set to the first non-null value per row (members taken in
    column order) and the other members are dropped.  Columns that are
    entirely null are then removed.  The input frame is not modified; a
    new frame is returned.
    """
    suffix = re.compile(r'_\d+$')
    families = {}
    for name in df.columns:
        families.setdefault(suffix.sub('', name), []).append(name)

    result = df.copy()
    for base, members in families.items():
        if len(members) < 2:
            continue
        result[base] = result[members].bfill(axis=1).iloc[:, 0]
        extras = [member for member in members if member != base]
        result.drop(columns=extras, inplace=True)
    result.dropna(axis=1, how='all', inplace=True)
    return result
def clean_lat_long_pair(lat_raw, lon_raw):
    """Normalise a raw latitude/longitude pair to compact ``<digits><dir>`` strings.

    Every character other than a digit or one of ``N S E W`` (either case)
    is discarded, so separators such as spaces, ``.``, ``,``, ``v`` and
    degree/minute/second marks all disappear.  What remains must be one or
    more digits optionally followed by a single direction letter.

    A latitude without ``N``/``S`` defaults to ``N`` and a longitude without
    ``E``/``W`` defaults to ``E``.  A notice is printed when both values
    carry a direction that belongs to the same axis.

    Args:
        lat_raw: Raw latitude text.
        lon_raw: Raw longitude text.

    Returns:
        ``(lat, lon)`` where each item is the digit string followed by an
        upper-case direction letter, e.g. ``("123456S", "0771230W")``.

    Raises:
        ValueError: If a cleaned value is not ``digits + optional letter``.
    """
    def extract_digits_and_dir(value):
        # One pass is equivalent to the earlier replace/strip/regex chain:
        # every step only removed characters outside this set.
        compact = re.sub(r'[^0-9NSEWnsew]+', '', value)
        match = re.match(r'^(\d+)([NSEWnsew]?)$', compact)
        if not match:
            raise ValueError(f"Invalid coordinate format: {compact}")
        digits, direction = match.groups()
        return digits, (direction.upper() if direction else None)

    lat_digits, lat_dir = extract_digits_and_dir(lat_raw)
    lon_digits, lon_dir = extract_digits_and_dir(lon_raw)

    # Report axis mix-ups *before* the defaults below overwrite the parsed
    # letters; once the defaults have run these conditions can never hold.
    if lat_dir in ('E', 'W') and lon_dir in ('E', 'W'):
        print(" Latitude has direction like longitude — correcting to N")
    elif lat_dir in ('N', 'S') and lon_dir in ('N', 'S'):
        print(" Longitude has direction like latitude — correcting to E")

    if lat_dir not in ('N', 'S'):
        lat_dir = 'N'
    if lon_dir not in ('E', 'W'):
        lon_dir = 'E'
    return lat_digits + lat_dir, lon_digits + lon_dir
def parse_latitude(lat_str):
    """Convert a packed ``<digits><letter>`` latitude to signed decimal degrees.

    The last character is taken as the hemisphere letter; the characters
    before it are split into degree / minute / second fields whose widths
    depend on how many there are and on the leading digit:

    ======  =====================  ==================
    digits  leading digit          widths (deg, min, sec)
    ======  =====================  ==================
    1       any                    1, -, -
    2       any                    2, -, -
    3       ``0`` / other          2, 1, -  /  1, 2, -
    4       ``0`` / other          2, 2, -  /  1, 3, -
    5       ``0 1 2`` / other      2, 2, 1  /  1, 2, 2
    6       ``0 1 2`` / other      2, 2, 2  /  1, 2, 3
    7       any                    2, 2, 3
    ======  =====================  ==================

    A seconds field above 59 is scaled down (by 10 up to 599, otherwise by
    100).  The result is negated for ``S``/``W`` and rounded to 4 decimals.

    NOTE(review): the 4-digit non-zero-led layout yields a 3-digit minutes
    field that is divided by 60 without rescaling — confirm this is intended.

    Raises:
        ValueError: For more than 7 digits, for no digits at all, or when a
            field is not an integer literal.
    """
    length = len(lat_str)
    hemisphere = lat_str[-1].upper()
    digits = lat_str[:-1]
    n_digits = length - 1

    # Field widths as (degrees, minutes, seconds); 0 means "field absent".
    if n_digits in (0, 1):
        widths = (1, 0, 0)
    elif n_digits == 2:
        widths = (2, 0, 0)
    elif n_digits in (3, 4):
        if digits[0] == '0':
            widths = (2, n_digits - 2, 0)
        else:
            widths = (1, n_digits - 1, 0)
    elif n_digits in (5, 6):
        if digits[0] in ('0', '1', '2'):
            widths = (2, 2, n_digits - 4)
        else:
            widths = (1, 2, n_digits - 3)
    elif n_digits == 7:
        widths = (2, 2, 3)
    else:
        raise ValueError("unsupported")

    deg_w, min_w, sec_w = widths
    min_end = deg_w + min_w
    degrees = int(digits[:deg_w])
    minutes = int(digits[deg_w:min_end]) if min_w else 0
    seconds = int(digits[min_end:min_end + sec_w]) if sec_w else 0

    # Over-long seconds fields carry implied decimal places.
    if seconds > 599:
        seconds = seconds / 100
    elif seconds > 59:
        seconds = seconds / 10

    value = degrees + (minutes / 60) + (seconds / 3600)
    if hemisphere in ('S', 'W'):
        value *= -1
    return round(value, 4)
def parse_longitude(lon_str):
    """Convert a packed ``<digits><letter>`` longitude to signed decimal degrees.

    The last character is the hemisphere letter.  The preceding characters
    are cut into degree / minute / second fields: values that start with
    ``0`` use a three-character degree field, all others a two-character
    one, and the remaining characters are shared between minutes and
    seconds according to the table in the code.  With exactly three digits
    and no leading zero only the first two are read.

    A seconds field above 59 is scaled down (by 10 up to 599, otherwise by
    100).  The result is negated for ``S``/``W`` and rounded to 4 decimals.

    NOTE(review): values without a leading zero are always read with a
    two-digit degree field — confirm that longitudes >= 100 are expected to
    arrive zero-padded or not at all.

    Raises:
        ValueError: For zero or more than nine digits, or when a field is
            not an integer literal.
    """
    length = len(lon_str)
    hemisphere = lon_str[-1].upper()
    digits = lon_str[:-1]

    # digit count -> (widths when zero-led, widths otherwise),
    # widths being (degrees, minutes, seconds) with 0 = field absent.
    layouts = {
        1: ((2, 0, 0), (2, 0, 0)),
        2: ((2, 0, 0), (2, 0, 0)),
        3: ((3, 0, 0), (2, 0, 0)),
        4: ((3, 1, 0), (2, 2, 0)),
        5: ((3, 2, 0), (2, 2, 1)),
        6: ((3, 2, 1), (2, 2, 2)),
        7: ((3, 2, 2), (2, 2, 3)),
        8: ((3, 2, 3), (2, 2, 4)),
        9: ((3, 3, 3), (2, 3, 4)),
    }
    choice = layouts.get(length - 1)
    if choice is None:
        raise ValueError("unsupported")
    zero_led, plain = choice
    deg_w, min_w, sec_w = zero_led if lon_str[0] == '0' else plain

    min_end = deg_w + min_w
    degrees = int(digits[:deg_w])
    minutes = int(digits[deg_w:min_end]) if min_w else 0
    seconds = int(digits[min_end:min_end + sec_w]) if sec_w else 0

    # Over-long seconds fields carry implied decimal places.
    if seconds > 599:
        seconds = seconds / 100
    elif seconds > 59:
        seconds = seconds / 10

    value = degrees + (minutes / 60) + (seconds / 3600)
    if hemisphere in ('S', 'W'):
        value *= -1
    return round(value, 4)
def parse_cnv(file_path, relative_path):
    """Parse a ``.cnv`` text file into a metadata frame and a data frame.

    Lines before the ``*END*`` marker are treated as header: they are
    searched for ship, cruise, station, latitude and longitude entries and
    for ``# start_time`` / ``# name`` lines.  Non-empty lines after the
    marker that do not start with ``#`` are read as rows of
    whitespace-separated floats; rows that fail float conversion are skipped.

    Args:
        file_path: File to read.  UTF-8 is tried first, then Latin-1.
        relative_path: Value stored in the ``folderpath_filename`` column of
            both returned frames.

    Returns:
        ``(meta_df, data_df, column_names)``.  ``meta_df`` has one row.
        ``data_df`` is an empty ``DataFrame`` when the numeric section cannot
        be turned into a table, or has at most two columns although more
        than five parameters were declared.  ``column_names`` are the
        de-duplicated short names taken from the ``# name`` lines.

    Raises:
        RuntimeError: If the Latin-1 fallback read fails.
        ValueError: If the header declares no ``# name`` parameters.
    """
    print(f"Parsing {file_path}")
    metadata = {}
    parameters = []
    data_started = False
    data_lines = []
    lat_raw = None
    lon_raw = None
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except UnicodeDecodeError:
        try:
            with open(file_path, 'r', encoding='latin1') as f:
                lines = f.readlines()
        except Exception as e:
            # Keep the original cause attached for debugging.
            raise RuntimeError(f"Failed to decode {file_path}: {e}") from e
    meta_patterns = {
        "Ship": re.compile(r"\bShip[\s.:]*\s*(.+)", re.IGNORECASE),
        "Cruise ID": re.compile(r"\bCruise\s*:\s*(.+)", re.IGNORECASE),
        "Station": re.compile(r"\bStation\s*:\s*(.+)", re.IGNORECASE),
        "Latitude": re.compile(r"\bLatitude\s*:\s*(.+)", re.IGNORECASE),
        "Longitude": re.compile(r"\bLongitude\s*:\s*(.+)", re.IGNORECASE),
    }
    for line in lines:
        line = line.strip()
        if line.startswith("*END*"):
            data_started = True
            continue
        if not data_started:
            for key, pattern in meta_patterns.items():
                match = pattern.search(line)
                if match:
                    value = match.group(1).strip()
                    if key == "Latitude":
                        lat_raw = value
                    elif key == "Longitude":
                        lon_raw = value
                    else:
                        metadata[key] = value
            # Convert the position once, as soon as both raw values are known.
            if lat_raw and lon_raw and "Latitude" not in metadata:
                try:
                    lat_clean, lon_clean = clean_lat_long_pair(lat_raw, lon_raw)
                    metadata["Latitude"] = parse_latitude(lat_clean)
                    metadata["Longitude"] = parse_longitude(lon_clean)
                except Exception as e:
                    metadata["Latitude"] = "NA"
                    metadata["Longitude"] = "NA"
                    print(f"Unsupported lat/lon in {file_path}: {e}")
            if line.startswith("# start_time"):
                parts = line.split("=", 1)
                if len(parts) > 1:
                    metadata["Start Time"] = parts[1].strip()
            elif line.startswith("# name"):
                parts = line.split("=", 1)
                if len(parts) > 1:
                    parameters.append(parts[1].strip())
        elif line and not line.startswith("#"):
            data_lines.append(line)
    if not parameters:
        raise ValueError(f"No parameters found in {file_path}")
    column_names = make_unique([param.split(':')[0].strip() for param in parameters])

    def clean_str(s):
        # Missing fields become "NA"; spaces are removed and ":" becomes "-".
        return str(s).replace(" ", "").replace(":", "-") if s is not None else "NA"

    trace_id = "_".join(
        clean_str(metadata.get(field))
        for field in ("Cruise ID", "Station", "Latitude", "Longitude", "Start Time")
    )
    metadata["TraceID"] = trace_id
    keys = ["Ship", "Cruise ID", "Station", "Latitude", "Longitude", "Start Time"]
    meta_values = [metadata.get(k, None) for k in keys]
    meta_df = pd.DataFrame([meta_values], columns=keys)
    meta_df["Parameters"] = ", ".join(column_names)
    meta_df["TraceID"] = trace_id
    meta_df["folderpath_filename"] = relative_path
    data = []
    for line in data_lines:
        try:
            values = list(map(float, line.split()))
            data.append(values)
        except ValueError:
            continue
    try:
        # Column count follows the first row; a mismatch raises and is
        # reported below as malformed data.
        data_df = pd.DataFrame(data, columns=column_names[:len(data[0])] if data else column_names)
        data_df.columns = make_unique(data_df.columns)
        data_df = merge_duplicate_columns_take_first(data_df)
    except Exception:
        print(f" CNV data malformed or binary in {file_path} → skipping CNV")
        return meta_df, pd.DataFrame(), column_names
    if not data_df.empty and data_df.shape[1] <= 2 and len(column_names) > 5:
        print(f" CNV data malformed or binary in {file_path} → skipping CNV")
        return meta_df, pd.DataFrame(), column_names
    data_df["TraceID"] = trace_id
    data_df["folderpath_filename"] = relative_path
    return meta_df, data_df, column_names
def parse_asc(file_path, expected_columns=None):
    """Read a whitespace-separated ``.asc`` table into a DataFrame.

    The first non-blank line is treated as a header unless it starts with a
    digit or consists solely of numbers (so a first data row such as
    ``-1.5 2.0`` is kept as data).  Lines that cannot be converted to floats
    are skipped.

    Column names are chosen as follows:

    * ``expected_columns`` given: the names are taken from it, sized to the
      first data line; surplus expected columns are filled with NaN and
      surplus values get ``extra_col_<i>`` names.
    * otherwise, a header line: split on runs of two or more spaces when
      that yields one name per value, else split on any whitespace.
    * otherwise: ``col_0``, ``col_1``, ...

    Args:
        file_path: File to read.
        expected_columns: Optional list of column names to impose.

    Returns:
        ``(df, header)`` where ``header`` is the list of column names when
        they were detected from the file's header line and ``None``
        otherwise.  On any error a message is printed and
        ``(empty DataFrame, None)`` is returned.
    """
    def is_numeric_row(text):
        # True when every whitespace-separated token parses as a float.
        try:
            for token in text.split():
                float(token)
        except ValueError:
            return False
        return True

    def first_data_line(candidates):
        return next((l for l in candidates if re.match(r'^[\d\.\-]', l)), "")

    try:
        with open(file_path, 'r') as f:
            lines = [line.strip() for line in f if line.strip()]
        header_line = None
        data_lines = lines
        if lines and not lines[0][0].isdigit() and not is_numeric_row(lines[0]):
            header_line = lines[0]
            data_lines = lines[1:]
        detected_from_header = False
        extra_columns = []
        if expected_columns:
            expected = list(expected_columns)
            num_values = len(first_data_line(data_lines).split())
            if num_values <= len(expected):
                used_columns = expected[:num_values]
                extra_columns = expected[num_values:]
            else:
                used_columns = expected + [f"extra_col_{i}" for i in range(num_values - len(expected))]
        elif header_line:
            # Default to plain whitespace splitting so single-spaced headers
            # work; prefer the wide-gap split only when it matches the data.
            used_columns = header_line.split()
            if "  " in header_line:
                wide = [col.strip() for col in re.split(r' {2,}', header_line) if col.strip()]
                sample_data_line = first_data_line(data_lines)
                num_values = len(sample_data_line.split()) if sample_data_line else 0
                if len(wide) == num_values:
                    used_columns = wide
            detected_from_header = True
        else:
            num_values = len(data_lines[0].split()) if data_lines else 0
            used_columns = [f"col_{i}" for i in range(num_values)]
        data = []
        for line in data_lines:
            try:
                values = list(map(float, line.split()))
            except ValueError:
                continue
            row = values[:len(used_columns)]
            row += [float('nan')] * len(extra_columns)
            data.append(row)
        full_columns = used_columns + extra_columns
        df = pd.DataFrame(data, columns=full_columns)
        return df, full_columns if detected_from_header else None

    except Exception as e:
        print(f"Error reading ASC file: {file_path} → {str(e)}")
        return pd.DataFrame(), None
def _load_asc_fallback(file_path, relative_path, column_names, trace_id, missing_message):
    """Load the ``.asc`` file that sits next to a CNV file, or return ``None``.

    Prints ``missing_message`` when no such file exists and a notice when it
    parses to an empty table.
    """
    asc_path = os.path.splitext(file_path)[0] + ".asc"
    if not os.path.exists(asc_path):
        print(missing_message)
        return None
    asc_df, asc_header = parse_asc(asc_path, expected_columns=None)
    if asc_df.empty:
        print(f" ASC file found but empty: {asc_path}")
        return None
    if asc_header and len(asc_df.columns) == len(asc_header):
        asc_df.columns = asc_header
    else:
        asc_df.columns = column_names[:asc_df.shape[1]]
    asc_df["TraceID"] = trace_id
    # Swap only the file extension: a plain str.replace would miss ".CNV"
    # and would also rewrite ".cnv" inside directory names.
    asc_df["folderpath_filename"] = os.path.splitext(relative_path)[0] + ".asc"
    return asc_df


def preprocess_cnv_folder(folder_path):
    """Parse every ``.cnv`` file under ``folder_path`` (recursively).

    Each file is parsed with ``parse_cnv``.  When that yields no usable data
    table, the ``.asc`` file with the same base name is used instead, if it
    exists.  Afterwards, equivalent column spellings in every data frame
    are merged into one preferred name.  A failure on one file is printed
    and does not stop the run.

    Args:
        folder_path: Root directory to search.

    Returns:
        ``(all_metadata, all_data)``: a list of one-row metadata frames (one
        per CNV file that parsed) and a list of data frames.
    """
    # (preferred name, alternative spellings...) applied in this order.
    merge_pairs = [
        ('depSM', 'depS', 'DepS'),
        ('T090C', 'T090'),
        ('sal00', 'sal00_2'),
        ('sal00', 'sal', 'Sal00'),
        ('sbeox0ML/L', 'Sbeox0ML/L', 'oxML/L', 'OxML/L'),
        ('prDM', 'pr', 'Pr'),
        ('c0mS/cm', 'c0S/m', 'C0S/m'),
        ('t090C', 't068', 'T068'),
        ('t090C', 't068C'),
        ('potemp090C', 'potemp090'),
        ('potemp090C', 'pta090C'),
        ('svCM', 'svC'),
        ('potemp068', 'Potemp068', 'pta068', 'Pta068'),
        ('potemp068', 'potemp068_2'),
        ('potemp090C', 'potemp068'),
        ('xmiss', 'Xmiss'),
        ('timeS', 'TimeS'),
        ('nbin', 'Nbin'),
        ('sva', 'Sva'),
        ('avgSvC', 'SvC'),
        ('sigma-t00', 'Sigma-t00', 'sigma-t00_2'),
    ]
    rename_case_map = {
        'sal00': 'Sal00',
    }

    all_metadata = []
    all_data = []
    all_files = []

    for dirpath, _dirnames, filenames in os.walk(folder_path):
        all_files.extend(
            os.path.join(dirpath, f) for f in filenames if f.lower().endswith(".cnv")
        )

    print(f"Found {len(all_files)} CNV files...")

    for file_path in all_files:
        print(f"Processing: {os.path.basename(file_path)}")
        try:
            relative_path = os.path.relpath(file_path, folder_path).replace("\\", "/")
            meta_df, df, column_names = parse_cnv(file_path, relative_path)
            all_metadata.append(meta_df)
            if df.empty:
                missing_message = f" No ASC found for: {os.path.basename(file_path)}"
            elif df.shape[1] <= 2 and len(column_names) > 5:
                print(f" CNV data malformed in {file_path}, trying ASC fallback...")
                missing_message = f" No ASC found for malformed CNV: {os.path.basename(file_path)}"
            else:
                all_data.append(df)
                continue
            asc_df = _load_asc_fallback(
                file_path,
                relative_path,
                column_names,
                meta_df["TraceID"].values[0],
                missing_message,
            )
            if asc_df is not None:
                all_data.append(asc_df)
        except Exception as e:
            print(f"Error processing {file_path}: {e}")

    merged_data = []
    for df in all_data:
        for primary, *secondaries in merge_pairs:
            for secondary in secondaries:
                df = merge_priority_columns(df, primary, secondary)
        df.rename(columns=rename_case_map, inplace=True)
        merged_data.append(df)
    all_data = merged_data
    return all_metadata, all_data
#----------ASCDAT--------
def merge_duplicate_columns_take_first(df):
    """Merge columns that share a name once a trailing ``_<digits>`` is removed.

    For each such group of two or more columns, the group's base name
    receives the first non-null value per row (in column order) and the
    remaining group members are dropped.  All-null columns are removed
    afterwards.  Works on a copy; the input frame is left untouched.

    NOTE: a function with this name is also defined earlier in this file;
    this definition rebinds the name.
    """
    by_base = defaultdict(list)
    for column in df.columns:
        by_base[re.sub(r'_\d+$', '', column)].append(column)

    merged = df.copy()
    for base, names in by_base.items():
        if len(names) <= 1:
            continue
        first_valid = merged[names].bfill(axis=1).iloc[:, 0]
        merged[base] = first_valid
        redundant = [name for name in names if name != base]
        merged.drop(columns=redundant, inplace=True)
    merged.dropna(axis=1, how='all', inplace=True)
    return merged

# ------------ MAIN ASC-DAT PAIR PARSER ----------------

def parse_asc_dat_pair(asc_path, dat_path, relative_path):
    """Build metadata and data frames from an ``.asc`` table and its ``.dat`` header.

    The DAT file (read as Latin-1) supplies ship, cruise, station, latitude,
    longitude and start time from lines beginning with ``**`` and from
    ``# start_time`` / ``system upload time`` lines.  The ASC file supplies
    the table: the first line containing one of the known column tokens is
    the header, and later lines are kept when they parse as floats with
    exactly one value per header column.

    Args:
        asc_path: Path of the ASC data file (UTF-8 first, Latin-1 fallback).
        dat_path: Path of the DAT header file.
        relative_path: Value stored in the ``folderpath_filename`` column.

    Returns:
        ``(meta_df, data_df)``, or ``(None, None)`` when either file cannot
        be read, no header or no data rows are found, or frame construction
        fails.  The reason is printed.
    """
    print(f"Processing ASC-DAT pair: {asc_path} with {dat_path}")

    header_tokens = ['DepS', 'T068', 'Sal', 'Pr', 'Sigma', 'OxML']

    metadata = {
        "Ship": "NA", "Cruise ID": "NA", "Station": "NA",
        "Latitude": "NA", "Longitude": "NA", "Start Time": "NA",
        "_debug": {"file": dat_path, "lines": []}
    }

    def get_value(line, prefixes):
        # Text after the first matching prefix (case-insensitive), else None.
        lowered = line.lower()
        for prefix in prefixes:
            if lowered.startswith(prefix.lower()):
                return line[len(prefix):].lstrip(":. ").strip()
        return None

    try:
        with open(dat_path, 'r', encoding='latin1') as f:
            lines = f.readlines()
        metadata['_debug']['lines'] = lines[:10]
        for line in lines:
            line = line.strip()
            if not line:
                continue
            if (value := get_value(line, ['** Ship', '**Ship'])) is not None:
                metadata['Ship'] = value or "NA"
            elif (value := get_value(line, ['** Cruise', '**Cruise'])) is not None:
                metadata['Cruise ID'] = value or "NA"
            elif (value := get_value(line, ['** Station', '**Station'])) is not None:
                # Clean cases like "No :    1357"
                value = re.sub(r'^\s*No\s*[:]*\s*', '', value, flags=re.IGNORECASE)
                metadata['Station'] = value or "NA"
            elif (value := get_value(line, ['** Latitude', '**Latitude', '** Lat', '**Lat'])) is not None:
                metadata['Latitude'] = value or "NA"
            elif (value := get_value(line, ['** Longitude', '**Longitude', '** Long', '**Long'])) is not None:
                metadata['Longitude'] = value or "NA"
            elif any(x in line.lower() for x in ['# start_time', 'system upload time']):
                # Split once, on whichever of "=" / ":" comes first.
                parts = re.split(r'[=:]', line, maxsplit=1)
                metadata['Start Time'] = parts[-1].strip() if len(parts) > 1 else "NA"
    except Exception as e:
        print(f"Error reading DAT file {dat_path}: {str(e)}")
        print(f"Debug info: {metadata['_debug']}")
        return None, None
    try:
        if metadata['Latitude'] != "NA" and metadata['Longitude'] != "NA":
            lat_clean, lon_clean = clean_lat_long_pair(metadata['Latitude'], metadata['Longitude'])
            metadata['Latitude'] = parse_latitude(lat_clean)
            metadata['Longitude'] = parse_longitude(lon_clean)
        else:
            metadata['Latitude'] = "NA"
            metadata['Longitude'] = "NA"
    except Exception as e:
        print(f" Failed to convert lat/lon for {dat_path}: {e}")
        metadata['Latitude'] = "NA"
        metadata['Longitude'] = "NA"
    try:
        # Expected headers include non-ASCII names, so do not depend on the
        # locale default encoding: try UTF-8, then fall back to Latin-1.
        try:
            with open(asc_path, 'r', encoding='utf-8') as f:
                lines = [line.strip() for line in f if line.strip()]
        except UnicodeDecodeError:
            with open(asc_path, 'r', encoding='latin1') as f:
                lines = [line.strip() for line in f if line.strip()]
    except Exception as e:
        print(f"Error reading ASC file: {e}")
        return None, None
    if not lines:
        print(f"Empty ASC file: {asc_path}")
        return None, None
    header_index = next(
        (i for i, line in enumerate(lines) if any(x in line for x in header_tokens)),
        None,
    )
    if header_index is None:
        print(f"No valid header found in {asc_path}")
        return None, None
    column_names = lines[header_index].split()
    data = []
    for line in lines[header_index + 1:]:
        # Repeated header lines inside the table are ignored.
        if any(x in line for x in header_tokens):
            continue
        try:
            values = list(map(float, line.split()))
        except ValueError:
            continue
        if len(values) == len(column_names):
            data.append(values)
    if not data:
        print(f"No valid data found in {asc_path}")
        return None, None
    try:
        data_df = pd.DataFrame(data, columns=make_unique(column_names))
        data_df = merge_duplicate_columns_take_first(data_df)
        trace_id = "_".join([
            str(metadata.get("Cruise ID")).replace(" ", "").replace(":", "-"),
            str(metadata.get("Station")).replace(" ", ""),
            str(metadata.get("Latitude")),
            str(metadata.get("Longitude")),
            str(metadata.get("Start Time")).replace(" ", "_")
        ])
        data_df["TraceID"] = trace_id
        data_df["folderpath_filename"] = relative_path
        # unified column name -> alternative spellings found in ASC headers
        column_groups = {
            "sbeox0ML/L": ["OxML/L", "ox0ML/L", "Sbeox0ML/L"],
            "oxsatML/L": ["OxsatML/L"],
            "c0mS/cm": ["C0S/m"],
            "Sal00": ["Sal00", "Sal"],
            "potemp068C": ["Potemp068", "Pta068"],
            "avgSvC": ["SvC"],
            "depSM": ["DepS"],
            "prDM": ["Pr"],
            "potemp090C": ["Potemp090"],
            "xmiss": ["Xmiss"],
            "sigma-é00": ["Sigma-é00"],
            "sigma-t00": ["Sigma-t00"],
            "nbin": ["Nbin"],
            "acc": ["Acc"],
            "bat": ["Bat"],
            "sva": ["Sva"],
            "gpa": ["Gpa"],
            "dm": ["Dm"],
            "tsa": ["Tsa"],
            "flag": ["Flag"],
            "dz/dt": ["Dz/dt"],
            "specc": ["Specc"]
        }
        for unified_col, variants in column_groups.items():
            present = [col for col in data_df.columns if col in variants]
            if not present:
                continue
            sources = present
            if unified_col in data_df.columns and unified_col not in present:
                # Keep values already stored under the unified name; the
                # variants only fill its gaps instead of overwriting it.
                sources = [unified_col] + present
            data_df[unified_col] = data_df[sources].bfill(axis=1).iloc[:, 0]
            to_drop = [col for col in present if col != unified_col]
            data_df.drop(columns=to_drop, inplace=True)
        meta_df = pd.DataFrame([{
            "Ship": metadata["Ship"],
            "Cruise ID": metadata["Cruise ID"],
            "Station": metadata["Station"],
            "Latitude": metadata["Latitude"],
            "Longitude": metadata["Longitude"],
            "Start Time": metadata["Start Time"],
            "Parameters": ", ".join(column_names),
            "TraceID": trace_id,
            "folderpath_filename": relative_path
        }])
        return meta_df, data_df
    except Exception as e:
        print(f"Error creating DataFrames: {e}")
        return None, None
# ------------ MAIN LOOP ----------------
def map_asc_dat_files(folder_path):
    """Parse every ASC/DAT pair under ``folder_path`` outside CNV folders.

    A directory is skipped when it, or one of its ancestors inside the
    tree, directly contains a ``.cnv`` file.  For each remaining ``.asc``
    file the matching ``.dat`` (same base name) is looked up, in order, in
    ``<dir>/datfiles``, ``<dir>``, ``<parent>/data`` and ``<parent>/dat``;
    pairs that are found are handed to ``parse_asc_dat_pair``.

    Args:
        folder_path: Root directory to search.

    Returns:
        ``(all_metadata, all_data)``: lists of the metadata and data frames
        of every pair that parsed successfully.
    """
    all_metadata = []
    all_data = []

    cnv_folders = {
        os.path.abspath(root)
        for root, _dirs, files in os.walk(folder_path)
        if any(f.lower().endswith('.cnv') for f in files)
    }

    def inside_cnv_folder(path):
        # Compare whole path components: a plain substring test would also
        # match siblings such as ".../ctd2" for a CNV folder ".../ctd".
        path = os.path.abspath(path)
        return any(
            path == cnv or path.startswith(cnv.rstrip(os.sep) + os.sep)
            for cnv in cnv_folders
        )

    for dirpath, _dirnames, filenames in os.walk(folder_path):
        if inside_cnv_folder(dirpath):
            print(f"Skipping {dirpath} (parent folder contains CNV files)")
            continue
        parent_dir = os.path.dirname(dirpath)
        for asc_file in filenames:
            if not asc_file.lower().endswith('.asc'):
                continue
            dat_file = os.path.splitext(asc_file)[0] + '.dat'
            candidates = [
                os.path.join(dirpath, 'datfiles', dat_file),
                os.path.join(dirpath, dat_file),
                os.path.join(parent_dir, 'data', dat_file),
                os.path.join(parent_dir, 'dat', dat_file),
            ]
            dat_path = next((c for c in candidates if os.path.exists(c)), None)
            if dat_path is None:
                continue
            try:
                asc_path = os.path.join(dirpath, asc_file)
                relative_path = os.path.relpath(asc_path, folder_path).replace("\\", "/")
                meta_df, data_df = parse_asc_dat_pair(asc_path, dat_path, relative_path)
                if meta_df is not None and data_df is not None:
                    all_metadata.append(meta_df)
                    all_data.append(data_df)
            except Exception as e:
                print(f"Error processing {asc_file} with {dat_file}: {e}")

    return all_metadata, all_data
#-------------ASCHDR--------------------------

def merge_duplicate_columns_take_first(df):
    """Return a copy of ``df`` with ``name`` / ``name_<digits>`` columns merged.

    Columns are bucketed by their name minus any trailing ``_<digits>``.
    Each bucket holding several columns is reduced to a single column named
    after the bucket, carrying the first non-null value of every row (bucket
    members considered in column order).  Columns with no non-null values
    are dropped at the end.

    NOTE: a function with this name is also defined earlier in this file;
    this definition rebinds the name.
    """
    buckets = defaultdict(list)
    for label in df.columns:
        stem = re.sub(r'_\d+$', '', label)
        buckets[stem].append(label)

    out = df.copy()
    multi = [(stem, labels) for stem, labels in buckets.items() if len(labels) > 1]
    for stem, labels in multi:
        out[stem] = out[labels].bfill(axis=1).iloc[:, 0]
        out.drop(columns=[label for label in labels if label != stem], inplace=True)
    out.dropna(axis=1, how='all', inplace=True)
    return out

# ------------ MAIN ASC-HDR PAIR PARSER ----------------

def parse_asc_hdr_pair(asc_path, hdr_path, relative_path):
    """Parse an ASC data file and its companion HDR file into two DataFrames.

    The HDR file supplies station metadata (ship, cruise, station, position,
    start time). The ASC file supplies a whitespace-separated numeric table
    whose header row is recognised by a set of known column-name tokens.

    Args:
        asc_path: Path of the ASC data file.
        hdr_path: Path of the HDR metadata file.
        relative_path: Value stored in the ``folderpath_filename`` column of
            both returned frames.

    Returns:
        ``(meta_df, data_df)`` on success. ``(None, None)`` when either file
        cannot be read, the ASC file is empty, no header row or no numeric
        rows are found, or building the frames fails. Failures are reported
        with ``print`` rather than raised.
    """
    print(f"Processing ASC-HDR pair: {asc_path} with {hdr_path}")

    metadata = {
        "Ship": "NA", "Cruise ID": "NA", "Station": "NA",
        "Latitude": "NA", "Longitude": "NA", "Start Time": "NA",
        "_debug": {"file": hdr_path, "lines": []}
    }

    # (metadata key, accepted line prefixes). Order matters: the first field
    # with a matching prefix wins, and longer prefixes are listed before the
    # abbreviations they contain.
    prefix_fields = (
        ("Ship", ('** Ship', '**Ship')),
        ("Cruise ID", ('** Cruise', '**Cruise')),
        ("Station", ('** Station', '**Station')),
        ("Latitude", ('** Latitude', '**Latitude', '** Lat', '**Lat')),
        ("Longitude", ('** Longitude', '**Longitude', '** Long', '**Long')),
    )

    def value_after_prefix(line, prefixes):
        """Return the text following the first matching prefix, else None."""
        lowered = line.lower()
        for prefix in prefixes:
            if lowered.startswith(prefix.lower()):
                return line[len(prefix):].lstrip(":. ").strip()
        return None

    try:
        with open(hdr_path, 'r', encoding='latin1') as f:
            lines = f.readlines()
        metadata['_debug']['lines'] = lines[:10]
        for line in lines:
            line = line.strip()
            if not line:
                continue
            for key, prefixes in prefix_fields:
                value = value_after_prefix(line, prefixes)
                if value is not None:
                    # An empty value after the prefix is recorded as "NA"
                    metadata[key] = value or "NA"
                    break
            else:
                lowered = line.lower()
                if '# start_time' in lowered or 'system upload time' in lowered:
                    parts = re.split(r'[=:]', line, maxsplit=1)
                    metadata['Start Time'] = parts[-1].strip() if len(parts) > 1 else "NA"
    except Exception as e:
        print(f"Error reading HDR file {hdr_path}: {str(e)}")
        print(f"Debug info: {metadata['_debug']}")
        return None, None

    # Position is only converted when both coordinates were found; any
    # conversion failure resets both to "NA".
    try:
        if metadata['Latitude'] != "NA" and metadata['Longitude'] != "NA":
            lat_clean, lon_clean = clean_lat_long_pair(metadata['Latitude'], metadata['Longitude'])
            metadata['Latitude'] = parse_latitude(lat_clean)
            metadata['Longitude'] = parse_longitude(lon_clean)
        else:
            metadata['Latitude'] = "NA"
            metadata['Longitude'] = "NA"
    except Exception as e:
        print(f" Failed to convert lat/lon for {hdr_path}: {e}")
        metadata['Latitude'] = "NA"
        metadata['Longitude'] = "NA"

    try:
        # Same encoding as the HDR file: latin1 decodes every byte, so a stray
        # non-ASCII character cannot abort the read on platforms whose default
        # encoding is stricter.
        with open(asc_path, 'r', encoding='latin1') as f:
            lines = [line.strip() for line in f if line.strip()]
    except Exception as e:
        print(f"Error reading ASC file: {e}")
        return None, None

    if not lines:
        print(f"Empty ASC file: {asc_path}")
        return None, None

    header_tokens = ('DepS', 'T068', 'Sal', 'Pr', 'Sigma', 'OxML')

    def looks_like_header(line):
        return any(token in line for token in header_tokens)

    header_index = next(
        (i for i, line in enumerate(lines) if looks_like_header(line)),
        None,
    )
    if header_index is None:
        print(f"No valid header found in {asc_path}")
        return None, None

    column_names = lines[header_index].split()

    data = []
    for line in lines[header_index + 1:]:
        # Repeated header rows inside the table are ignored
        if looks_like_header(line):
            continue
        try:
            values = list(map(float, line.split()))
        except ValueError:
            continue
        # Rows whose width differs from the header are dropped
        if len(values) == len(column_names):
            data.append(values)

    if not data:
        print(f"No valid data found in {asc_path}")
        return None, None

    try:
        data_df = pd.DataFrame(data, columns=make_unique(column_names))
        data_df = merge_duplicate_columns_take_first(data_df)
        trace_id = "_".join([
            str(metadata.get("Cruise ID")).replace(" ", "").replace(":", "-"),
            str(metadata.get("Station")).replace(" ", ""),
            str(metadata.get("Latitude")),
            str(metadata.get("Longitude")),
            str(metadata.get("Start Time")).replace(" ", "_")
        ])
        data_df["TraceID"] = trace_id
        data_df["folderpath_filename"] = relative_path

        # unified column name -> source column names folded into it
        column_groups = {
            "sbeox0ML/L": ["OxML/L", "ox0ML/L", "Sbeox0ML/L"],
            "oxsatML/L": ["OxsatML/L"],
            "c0mS/cm": ["C0S/m"],
            "Sal00": ["Sal00", "Sal"],
            "potemp068C": ["Potemp068", "Pta068"],
            "avgSvC": ["SvC"],
            "depSM": ["DepS"],
            "prDM": ["Pr"],
            "potemp090C": ["Potemp090"],
            "xmiss": ["Xmiss"],
            "sigma-t00": ["Sigma-t00"],
            "gpa": ["Gpa"],
            "dm": ["Dm"],
            "flag": ["Flag"],
            "nbin": ["Nbin"]
        }

        for unified_col, variants in column_groups.items():
            present = [col for col in data_df.columns if col in variants]
            if present:
                # First non-null value across the variants, in column order
                data_df[unified_col] = data_df[present].bfill(axis=1).iloc[:, 0]
                to_drop = [col for col in present if col != unified_col]
                data_df.drop(columns=to_drop, inplace=True)

        meta_df = pd.DataFrame([{
            "Ship": metadata["Ship"],
            "Cruise ID": metadata["Cruise ID"],
            "Station": metadata["Station"],
            "Latitude": metadata["Latitude"],
            "Longitude": metadata["Longitude"],
            "Start Time": metadata["Start Time"],
            "Parameters": ", ".join(column_names),
            "TraceID": trace_id,
            "folderpath_filename": relative_path
        }])

        return meta_df, data_df

    except Exception as e:
        print(f"Error creating DataFrames: {e}")
        return None, None

# ------------ MAIN LOOP ----------------
def map_asc_hdr_files(folder_path):
    """Pair every ASC file under ``folder_path`` with an HDR file and parse it.

    Folders that contain CNV or DAT files, and everything beneath them, are
    excluded. HDR files are indexed by their upper-cased base name and, when
    that name starts with ``HDR``, also by the remainder. Each ASC file is
    looked up by its base name (and by the part after a leading ``STN``).

    Args:
        folder_path: Root directory to scan recursively.

    Returns:
        ``(all_metadata, all_data)``: two parallel lists of DataFrames, one
        entry per ASC file that ``parse_asc_hdr_pair`` parsed successfully.
    """
    all_metadata = []
    all_data = []

    # Walk once; the same listing feeds all three passes below.
    tree = list(os.walk(folder_path))

    skip_folders = {
        root
        for root, _dirs, files in tree
        if any(f.lower().endswith(('.cnv', '.dat')) for f in files)
    }
    separators = tuple(sep for sep in (os.sep, os.altsep) if sep)

    def is_skipped(root):
        """True when root is a skip folder or lies inside one.

        Compared on path-component boundaries so that e.g. ``ctd10`` is not
        excluded merely because its sibling ``ctd1`` holds CNV files.
        """
        for skipped in skip_folders:
            if root == skipped:
                return True
            prefix = skipped if skipped.endswith(separators) else skipped + os.sep
            if root.startswith(prefix):
                return True
        return False

    # NOTE(review): the index is keyed by base name only, so HDR files with
    # the same name in different folders overwrite each other (last one
    # walked wins) — confirm this is acceptable for multi-cruise trees.
    hdr_map = {}
    for root, _dirs, files in tree:
        if is_skipped(root):
            continue
        for f in files:
            if not f.lower().endswith('.hdr'):
                continue
            full_path = os.path.join(root, f)
            upper_base = os.path.splitext(f)[0].upper()
            if upper_base.startswith('HDR'):
                hdr_map[upper_base[3:]] = full_path
            hdr_map[upper_base] = full_path

    for root, _dirs, files in tree:
        if is_skipped(root):
            print(f"Skipping {root} (contains CNV/DAT files)")
            continue

        asc_files = [f for f in files if f.lower().endswith('.asc')]
        for asc_file in asc_files:
            base = os.path.splitext(asc_file)[0]
            candidates = []
            if base.upper().startswith('STN'):
                candidates.append(base[3:])
            candidates.append(base)
            candidates.append(base.upper())
            # De-duplicate while keeping priority order, so a failed parse is
            # not retried against the very same HDR file.
            possible_codes = list(dict.fromkeys(candidates))

            asc_path = os.path.join(root, asc_file)
            for code in possible_codes:
                if code not in hdr_map:
                    continue
                hdr_path = hdr_map[code]
                try:
                    relative_path = os.path.relpath(asc_path, folder_path).replace("\\", "/")
                    meta_df, data_df = parse_asc_hdr_pair(asc_path, hdr_path, relative_path)
                    if meta_df is not None and data_df is not None:
                        all_metadata.append(meta_df)
                        all_data.append(data_df)
                        print(f" Matched: {asc_file} with {os.path.basename(hdr_path)}")
                        break
                except Exception as e:
                    print(f" Error processing {asc_file} with {hdr_path}: {e}")
    return all_metadata, all_data
def main():
    """Run the CNV, ASC-DAT and ASC-HDR extraction passes and save the results.

    The per-pass lists of frames are concatenated (DAT, then HDR, then CNV)
    and written to ``fallback.csv`` (metadata) and ``fallback_data.csv``
    (data). A file is only written when its combined frame is non-empty.
    """
    folder = r"C:\Users\aishwarya\OneDrive\Desktop\NEW1\CMLRE_CTD"

    # Pass order is kept: CNV first, then ASC-DAT, then ASC-HDR
    meta_asc, data_asc = preprocess_cnv_folder(folder)
    meta_dat, data_dat = map_asc_dat_files(folder)
    meta_hdr, data_hdr = map_asc_hdr_files(folder)

    # Fall back to an empty frame when no pass produced anything
    if meta_dat or meta_hdr or meta_asc:
        all_meta = pd.concat(meta_dat + meta_hdr + meta_asc, ignore_index=True)
    else:
        all_meta = pd.DataFrame()

    if data_dat or data_hdr or data_asc:
        all_data = pd.concat(data_dat + data_hdr + data_asc, ignore_index=True)
    else:
        all_data = pd.DataFrame()

    outputs = (
        (all_meta, 'fallback.csv', " Saved all metadata → fallback.csv"),
        (all_data, 'fallback_data.csv', " Saved all data → fallback_data.csv"),
    )
    for frame, target, message in outputs:
        if frame.empty:
            continue
        frame.to_csv(target, index=False, na_rep='NA')
        print(message)

# Run the extraction pipeline only when executed as a script.
if __name__ == "__main__":
    main()
#--------QC----------
# NOTE: everything below is at module level, outside the __main__ guard, so
# it runs whenever this file is executed or imported.
print("\n Running QC on extracted data...")

# Reload the CSVs written by main(). main() skips writing a file when its
# frame is empty, so these reads rely on files from this or an earlier run.
meta = pd.read_csv('fallback.csv')
data = pd.read_csv('fallback_data.csv')
# Bathymetry raster used by is_at_sea(); the dataset handle stays open for
# the rest of the script.
raster = rasterio.open(r"C:\Users\aishwarya\Downloads\ETOPO1_Bed_g_geotiff\ETOPO1_Bed_g_geotiff.tif")
# Band 1 loaded fully into memory; is_at_sea() indexes it as [row, col].
bathymetry = raster.read(1)

def is_at_sea(lat, lon):
    """Return True when the bathymetry cell at (lat, lon) is below zero.

    Uses the module-level ``raster`` to convert the coordinates to array
    indices and the module-level ``bathymetry`` array for the lookup.

    Args:
        lat: Latitude of the point.
        lon: Longitude of the point.

    Returns:
        ``True`` if the cell value is negative. ``False`` if it is not, if
        either coordinate is missing, if the point falls outside the array,
        or if the lookup fails for any other reason.
    """
    # Missing coordinates cannot be located; checking here also keeps NaN out
    # of the index computation, where it would be cast to a meaningless int.
    if pd.isna(lat) or pd.isna(lon):
        return False
    try:
        row, col = raster.index(lon, lat)
        n_rows, n_cols = bathymetry.shape
        # Negative indices would silently wrap around to the far edge of the
        # array, so points outside the grid are rejected explicitly.
        if not (0 <= row < n_rows and 0 <= col < n_cols):
            return False
        return bool(bathymetry[row, col] < 0)
    except Exception:
        # Best-effort lookup: any failure is treated as "not at sea"
        return False
# === Profile Envelope QC Range for TEMP (GTSPP)
# Each layer covers depths in the half-open interval [min_depth, max_depth)
# and gives the inclusive range of acceptable values within it.
TEMP_PROFILE_ENVELOPE = [
    {"min_depth": 0, "max_depth": 1100, "min_value": -2.0, "max_value": 40.0},
    {"min_depth": 1100, "max_depth": 3000, "min_value": -1.5, "max_value": 18.0},
]


def get_profile_envelope(depth, envelope_table):
    """Look up the allowed value range for ``depth``.

    Args:
        depth: Depth to look up.
        envelope_table: Sequence of dicts with ``min_depth``, ``max_depth``,
            ``min_value`` and ``max_value`` keys.

    Returns:
        ``(min_value, max_value)`` of the first layer whose
        ``[min_depth, max_depth)`` interval contains ``depth``, or
        ``(None, None)`` when no layer matches.
    """
    for layer in envelope_table:
        if layer["min_depth"] <= depth < layer["max_depth"]:
            return layer["min_value"], layer["max_value"]
    return None, None


def profile_envelope_qc(df, depth_col='depSM', param_col='t090C', envelope_table=TEMP_PROFILE_ENVELOPE):
    """Flag each row of ``df`` against a depth-dependent value envelope.

    Flags: 1 = value inside the envelope for its depth, 4 = value outside it,
    9 = not evaluated (depth or value missing, or depth not covered by any
    layer of the table).

    Args:
        df: DataFrame holding the depth and parameter columns.
        depth_col: Name of the depth column.
        param_col: Name of the column being checked.
        envelope_table: Envelope layers as accepted by ``get_profile_envelope``.

    Returns:
        A Series named ``f'{param_col}_PROFILE_QC'`` that shares ``df``'s
        index, so assigning it back to ``df`` aligns row for row.
    """
    depths = df.get(depth_col)
    values = df.get(param_col)

    if depths is None or values is None:
        # A missing column means nothing can be evaluated
        flags = [9] * len(df)
    else:
        flags = []
        for depth, value in zip(depths, values):
            if pd.isna(depth) or pd.isna(value):
                flags.append(9)
                continue
            min_val, max_val = get_profile_envelope(depth, envelope_table)
            if min_val is None:
                flags.append(9)
            elif min_val <= value <= max_val:
                flags.append(1)
            else:
                flags.append(4)

    # Reuse the caller's index; a fresh RangeIndex would misalign (and yield
    # NaN) when the result is assigned to a frame with any other index.
    return pd.Series(flags, index=df.index, name=f'{param_col}_PROFILE_QC')

# === QC 1: Valid datetime
# Unparseable start times become NaT (errors='coerce'); their year compares
# False against 1997, so they receive flag 4 like any date in 1997 or earlier.
meta['datetime'] = pd.to_datetime(meta['Start Time'], errors='coerce')
meta['DATE_QC'] = pd.to_datetime(meta['Start Time'], errors='coerce').dt.year.gt(1997).map({True: 1, False: 4})

# === QC 2: Valid position
# Flag 1 only when latitude is within [-40, 30] and longitude within
# [20, 160] (both bounds inclusive); missing coordinates fail the check.
valid_lat = meta['Latitude'].between(-40, 30)
valid_lon = meta['Longitude'].between(20, 160)
meta['POS_QC'] = ((valid_lat & valid_lon)).map({True: 1, False: 4})

# === QC 3: Location at Sea
# Row-wise lookup through is_at_sea(): 1 when it returns a truthy value, else 4.
print(" Checking location at sea...")
meta['SEA_QC'] = meta.apply(lambda row: 1 if is_at_sea(row['Latitude'], row['Longitude']) else 4, axis=1)

# === Combine all three station-level QC tests
# Keep only stations that passed date, position and at-sea checks.
meta_valid = meta[(meta['DATE_QC'] == 1) & (meta['POS_QC'] == 1) & (meta['SEA_QC'] == 1)]

# === Filter data to only valid profiles
# NOTE: the filter below is commented out, so valid_trace_ids is computed but
# not applied and the variable-level QC runs on every row of `data`.
valid_trace_ids = meta_valid['TraceID'].tolist()
#data = data[data['TraceID'].isin(valid_trace_ids)]

# === Gradient and Spike QC Functions
def gradient_test(series, threshold):
    """Flag samples that deviate too far from the mean of their neighbours.

    For each sample the statistic is ``|v[i] - (v[i+1] + v[i-1]) / 2|``.

    Flags: 1 = statistic <= threshold, 4 = statistic > threshold,
    9 = not evaluated. A sample is not evaluated when the statistic is
    undefined, i.e. at the first and last position or when the sample or a
    neighbour is missing; 9 matches the not-evaluated flag used by the
    profile envelope check in this file. Previously such samples were
    reported as 4 because a NaN comparison is False.

    Args:
        series: Numeric pandas Series in measurement order.
        threshold: Largest accepted value of the statistic.

    Returns:
        Series of integer flags with the same index as ``series``.
    """
    deviation = (series - (series.shift(-1) + series.shift(1)) / 2).abs()
    flags = (deviation <= threshold).map({True: 1, False: 4})
    return flags.where(deviation.notna(), 9)

def spike_test(series, threshold):
    """Flag isolated spikes relative to the neighbouring samples.

    For each sample the statistic is
    ``|v[i] - (v[i+1] + v[i-1]) / 2| - |(v[i+1] - v[i-1]) / 2|``.

    Flags: 1 = statistic <= threshold, 4 = statistic > threshold,
    9 = not evaluated. A sample is not evaluated when the statistic is
    undefined, i.e. at the first and last position or when the sample or a
    neighbour is missing; 9 matches the not-evaluated flag used by the
    profile envelope check in this file. Previously such samples were
    reported as 4 because a NaN comparison is False.

    Args:
        series: Numeric pandas Series in measurement order.
        threshold: Largest accepted value of the statistic.

    Returns:
        Series of integer flags with the same index as ``series``.
    """
    following = series.shift(-1)
    preceding = series.shift(1)
    offset_from_mean = (series - (following + preceding) / 2).abs()
    half_neighbour_step = ((following - preceding) / 2).abs()
    statistic = offset_from_mean - half_neighbour_step
    flags = (statistic <= threshold).map({True: 1, False: 4})
    return flags.where(statistic.notna(), 9)

# === QC 4–6: Variable-level QC for TEMP and PSAL
# Each group is skipped when its source column is absent from `data`.
# NOTE(review): gradient_test/spike_test receive the whole column, so the
# neighbours of a sample can belong to a different TraceID at profile
# boundaries — confirm whether these should run per profile.
if 't090C' in data.columns:
    # Range check: 1 inside [-2, 40], 4 otherwise (missing values also get 4)
    data['TEMP_QC'] = data['t090C'].between(-2, 40).map({True: 1, False: 4})
    data['TEMP_GRAD_QC'] = gradient_test(data['t090C'], 10.0)
    data['TEMP_SPIKE_QC'] = spike_test(data['t090C'], 2.0)

if 'Sal00' in data.columns:
    # Range check: 1 inside [0, 41], 4 otherwise (missing values also get 4)
    data['PSAL_QC'] = data['Sal00'].between(0, 41).map({True: 1, False: 4})
    data['PSAL_GRAD_QC'] = gradient_test(data['Sal00'], 5.0)
    data['PSAL_SPIKE_QC'] = spike_test(data['Sal00'], 0.3)
# === QC 7: Profile Envelope Test
# Needs both depth and temperature; uses the default temperature envelope.
if 'depSM' in data.columns and 't090C' in data.columns:
    data['TEMP_PROFILE_QC'] = profile_envelope_qc(data, depth_col='depSM', param_col='t090C')


# Export the QC-annotated tables; missing values are written as 'NaN'
# (the intermediate fallback CSVs use 'NA').
meta.to_csv("meta.csv", index=False,na_rep='NaN')
data.to_csv("data.csv", index=False,na_rep='NaN')

print(" QC complete. Saved meta.csv and data.csv ")    





Found 1034 CNV files...
Processing: 1153.cnv
Parsing C:\Users\aishwarya\OneDrive\Desktop\NEW1\CMLRE_CTD\cr205\CTD205\1153.cnv
Processing: 1153PP.CNV
Parsing C:\Users\aishwarya\OneDrive\Desktop\NEW1\CMLRE_CTD\cr205\CTD205\1153PP.CNV
Error processing C:\Users\aishwarya\OneDrive\Desktop\NEW1\CMLRE_CTD\cr205\CTD205\1153PP.CNV: No parameters found in C:\Users\aishwarya\OneDrive\Desktop\NEW1\CMLRE_CTD\cr205\CTD205\1153PP.CNV
Processing: 1154a.cnv
Parsing C:\Users\aishwarya\OneDrive\Desktop\NEW1\CMLRE_CTD\cr205\CTD205\1154a.cnv
Processing: 1154B.CNV
Parsing C:\Users\aishwarya\OneDrive\Desktop\NEW1\CMLRE_CTD\cr205\CTD205\1154B.CNV
Processing: 1154c.cnv
Parsing C:\Users\aishwarya\OneDrive\Desktop\NEW1\CMLRE_CTD\cr205\CTD205\1154c.cnv
Processing: 1154mic1.cnv
Parsing C:\Users\aishwarya\OneDrive\Desktop\NEW1\CMLRE_CTD\cr205\CTD205\1154mic1.cnv
Processing: 1154micr.cnv
Parsing C:\Users\aishwarya\OneDrive\Desktop\NEW1\CMLRE_CTD\cr205\CTD205\1154micr.cnv
Processing: 1154papa.cnv
Parsing C:\Users\ais

  new_rows = np.floor(new_rows).astype(dtype="int32")
  new_cols = np.floor(new_cols).astype(dtype="int32")


 QC complete. Saved meta.csv and data.csv 
