In [1]:
import os
import pandas as pd
import re
import rasterio
def make_unique(columns):
    """Return the labels of ``columns`` with duplicates made distinct.

    The first occurrence of a label is kept unchanged; each later occurrence
    receives a numeric suffix (``name_2``, ``name_3`` ...).  A suffixed
    candidate that would clash with a label already emitted (for example the
    input ``["a", "a", "a_2"]``) is bumped again until it is free, so the
    returned list never contains duplicates.

    Args:
        columns: Iterable of hashable column labels.

    Returns:
        A list with one label per input element, in the original order.
    """
    counts = {}
    taken = set()
    result = []
    for col in columns:
        name = col
        # Keep incrementing the per-label counter until the candidate is unused.
        while name in taken:
            counts[col] = counts.get(col, 1) + 1
            name = f"{col}_{counts[col]}"
        taken.add(name)
        result.append(name)
    return result
def is_number(val):
    """Return True when ``float(val)`` succeeds, False otherwise.

    Strings that ``float`` accepts, such as ``"nan"`` or ``"inf"``, therefore
    count as numbers.  Only the conversion errors raised by ``float`` are
    treated as "not a number"; interpreter-level signals such as
    ``KeyboardInterrupt`` are no longer swallowed.

    Args:
        val: Any value (typically a CSV cell).

    Returns:
        bool: whether the value can be converted to a float.
    """
    try:
        float(val)
    except (TypeError, ValueError, OverflowError):
        return False
    return True
def process_csv_same_headers_per_file(file_path):
    """Split one CSV file into (metadata dict, data DataFrame) pairs.

    The first row of the file is taken as the metadata header.  Every later
    row is classified by the number and kind of its non-null cells:

    * metadata row - at least half as many cells as the metadata header;
    * data header  - the first three-cell row without any numeric cell;
    * data rows    - runs of three-cell, all-numeric rows.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list of ``(metadata, data_frame)`` tuples produced by zipping the
        metadata rows with the data blocks in order of appearance.  ``zip``
        stops at the shorter list, so unmatched metadata rows or data blocks
        are dropped.

    Raises:
        Whatever ``pandas.read_csv`` raises for an unreadable or malformed
        file; nothing is caught here.
    """
    print(f"Reading: {file_path}")
    # header=None keeps the first line as row 0 instead of using it for column names.
    df = pd.read_csv(file_path, header=None)
    all_meta = []
    data_blocks = []

    # Metadata field names: the non-null string cells of the first row.
    meta_header = df.iloc[0].dropna().tolist()
    meta_header = [x for x in meta_header if isinstance(x, str)]

    i = 1
    data_header = None

    while i < len(df):
        row_vals = df.iloc[i].dropna().tolist()

        # Metadata row: short rows are padded with '' up to the header length;
        # surplus values beyond the header length are discarded by zip().
        # NOTE(review): when meta_header has fewer than two entries the
        # threshold is 0 and every row lands in this branch.
        if len(row_vals) >= len(meta_header) // 2:
            if len(row_vals) < len(meta_header):
                row_vals += [''] * (len(meta_header) - len(row_vals))
            metadata = dict(zip(meta_header, row_vals))
            metadata['SourceFile'] = os.path.basename(file_path)
            metadata['Folder'] = os.path.basename(os.path.dirname(file_path))
            all_meta.append(metadata)
            i += 1
        # First three-cell row with no numeric cell: remember it as the data header.
        elif data_header is None and len(row_vals) == 3 and all(not is_number(x) for x in row_vals):
            data_header = row_vals
            i += 1

        # A repetition of the data header carries no data; skip it.
        elif data_header is not None and row_vals == data_header:
            i += 1

        # Start of a numeric block that belongs to the known data header.
        elif data_header is not None and len(row_vals) == 3 and all(is_number(x) for x in row_vals):
            data_rows = []
            while i < len(df):
                data_row = df.iloc[i].dropna().tolist()
                # Repeated header lines inside a block are skipped, so the
                # rows on both sides end up in the same block.
                if data_row == data_header:
                    i += 1
                    continue
                if len(data_row) == 3 and all(is_number(x) for x in data_row):
                    data_rows.append(data_row)
                    i += 1
                else:
                    # Leave the non-data row for the outer loop to classify.
                    break
            if data_rows:
                if len(set(data_header)) != len(data_header):
                    print(f"Duplicate column names detected in {file_path}: {data_header}")
                    # NOTE(review): data_header is overwritten with the
                    # de-duplicated names, so later raw header rows no longer
                    # compare equal to it - confirm this is intended.
                    data_header = make_unique(data_header)
                temp_df = pd.DataFrame(data_rows, columns=data_header)
                temp_df['SourceFile'] = os.path.basename(file_path)
                temp_df['Folder'] = os.path.basename(os.path.dirname(file_path))
                data_blocks.append(temp_df)
        else:
            # Numeric block seen before any data header: use placeholder
            # column names Col1..Col3.
            if data_header is None and len(row_vals) == 3 and all(is_number(x) for x in row_vals):
                data_rows = []
                col_count = 3
                while i < len(df):
                    data_row = df.iloc[i].dropna().tolist()
                    if len(data_row) == col_count and all(is_number(x) for x in data_row):
                        data_rows.append(data_row)
                        i += 1
                    else:
                        break
                if data_rows:
                    dummy_header = [f"Col{j+1}" for j in range(col_count)]
                    temp_df = pd.DataFrame(data_rows, columns=dummy_header)
                    temp_df['SourceFile'] = os.path.basename(file_path)
                    temp_df['Folder'] = os.path.basename(os.path.dirname(file_path))
                    data_blocks.append(temp_df)
            else:
                # Unrecognised row: ignore it.
                i += 1
    # Pair the n-th metadata row with the n-th data block.
    pairs = list(zip(all_meta, data_blocks))
    return pairs
def debug_process_csv(file_path):
    """Recover (metadata, data) pairs from a CSV by reading it as plain text.

    Lines are split on commas and empty cells are discarded.  The first line
    with at least six non-numeric cells becomes the metadata header; a line
    whose cell count equals that header's is a metadata row; a line of two or
    more non-numeric cells is a data header followed by numeric rows of the
    same width; a bare run of numeric rows gets ``Col1..ColN`` names.  Data
    blocks are only kept once a metadata row has been seen.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list of ``(metadata dict copy, DataFrame)`` tuples; an empty list
        when the file cannot be processed (the error is printed).
    """
    print(f"Debug processing: {file_path}")
    try:
        with open(file_path, 'r') as handle:
            raw_lines = handle.readlines()

        total = len(raw_lines)
        source_name = os.path.basename(file_path)
        folder_name = os.path.basename(os.path.dirname(file_path))

        def cells_of(text):
            # Comma-split a raw line, dropping blank cells.
            return [cell.strip() for cell in text.strip().split(',') if cell.strip()]

        def all_numeric(cells):
            return all(is_number(cell) for cell in cells)

        def none_numeric(cells):
            return all(not is_number(cell) for cell in cells)

        def take_numeric_rows(start, width):
            # Gather consecutive all-numeric rows of the given width;
            # return them with the index of the first row not consumed.
            gathered = []
            pos = start
            while pos < total:
                cells = cells_of(raw_lines[pos])
                if len(cells) != width or not all_numeric(cells):
                    break
                gathered.append(cells)
                pos += 1
            return gathered, pos

        def tagged_frame(rows, header):
            # Build the block DataFrame and stamp it with its provenance.
            frame = pd.DataFrame(rows, columns=header)
            frame['SourceFile'] = source_name
            frame['Folder'] = folder_name
            return frame

        blocks = []
        meta_header = None
        last_meta = None
        idx = 0
        while idx < total:
            fields = cells_of(raw_lines[idx])
            if not fields:
                # Blank line, or a line made only of separators.
                idx += 1
                continue

            if meta_header is None and len(fields) >= 6 and none_numeric(fields):
                meta_header = fields
                idx += 1
                continue

            if meta_header and len(fields) == len(meta_header):
                last_meta = dict(zip(meta_header, fields))
                last_meta['SourceFile'] = source_name
                last_meta['Folder'] = folder_name
                idx += 1
                continue

            if len(fields) >= 2 and none_numeric(fields):
                header = make_unique(fields)
                rows, idx = take_numeric_rows(idx + 1, len(header))
                if rows and last_meta:
                    blocks.append((last_meta.copy(), tagged_frame(rows, header)))
                continue

            if last_meta and len(fields) >= 2 and all_numeric(fields):
                width = len(fields)
                rows, idx = take_numeric_rows(idx, width)
                if rows:
                    placeholder = [f"Col{j+1}" for j in range(width)]
                    blocks.append((last_meta.copy(), tagged_frame(rows, placeholder)))
                continue

            idx += 1
        return blocks
    except Exception as e:
        print(f"Debug processing failed for {file_path}: {e}")
        return []
def log_parser_error_details(file_path):
    """Scan ``file_path`` line by line, tracking the width of the last header row.

    Each non-blank line is split on tab characters; a line without any numeric
    cell is treated as a header and its cell count is remembered.

    NOTE(review): despite its name this function currently reports nothing -
    a later row whose width differs from the header is detected but neither
    printed nor returned.  The only visible output is the failure message
    when the file cannot be read.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        None.
    """
    try:
        with open(file_path, "r") as handle:
            raw_lines = handle.readlines()

        header_width = None
        for raw_line in raw_lines:
            cells = [part.strip() for part in raw_line.strip().split('\t') if part.strip()]
            if not cells:
                continue
            if not any(is_number(cell) for cell in cells):
                # A row with no numeric cell opens a new block.
                header_width = len(cells)
            elif header_width is not None and len(cells) != header_width:
                # Width mismatch against the current header: nothing is recorded.
                continue
    except Exception as e:
        print(f"Failed to inspect file: {e}")
def process_csv_with_fallback(file_path):
    """Parse a CSV with the primary parser, falling back to the text parser.

    The fallback (``debug_process_csv``) is used when the primary parser
    returns no pairs, or when pandas raises a ``ParserError`` whose message
    contains both "Expected" and "saw".  Any other ``ParserError`` is
    re-raised; every other exception is printed and yields an empty list.

    Args:
        file_path: Path of the CSV file to process.

    Returns:
        A list of ``(metadata, DataFrame)`` pairs, possibly empty.
    """
    try:
        result = process_csv_same_headers_per_file(file_path)
        if not result:
            print(f"⚠ Format issue detected in {file_path}, attempting recovery with debug mode...")
            log_parser_error_details(file_path) 
            result = debug_process_csv(file_path)
        return result
    except pd.errors.ParserError as parse_err:
        message = str(parse_err)
        if not ("Expected" in message and "saw" in message):
            raise
        log_parser_error_details(file_path)
        return debug_process_csv(file_path)
    except Exception as err:
        print(f"Unexpected error processing {file_path}: {err}")
        return []
def generate_trace_id(row):
    """Build an identifier by joining six metadata fields with underscores.

    Each field is read with ``row.get``, converted to ``str`` and stripped; a
    missing field or one that is empty after stripping contributes ``"NA"``.

    Args:
        row: Mapping-like object exposing ``get`` (for example a dict).

    Returns:
        str: ship name, cruise number, station number, latitude, longitude
        and start date, in that order, separated by ``_``.
    """
    pieces = []
    for key in ('ship_name', 'ORG_CR_NO', 'ORG_ST_NO', 'latitude', 'LONGITUDE', 'ST_DATE'):
        text = str(row.get(key, "NA")).strip()
        pieces.append(text if text else "NA")
    return "_".join(pieces)
def folder_has_cnv(folder_path):
    """Tell whether ``folder_path`` or any folder below it holds a ``.cnv`` file.

    The extension comparison is case-insensitive.

    Args:
        folder_path: Directory at which the recursive search starts.

    Returns:
        bool: True as soon as one matching file name is found, else False.
    """
    return any(
        name.lower().endswith(".cnv")
        for _root, _subdirs, names in os.walk(folder_path)
        for name in names
    )
def process_csv_folder_recursive(folder_path, output_meta="00_Moutput.csv", output_data="00_doutput.csv"):
    """Extract every CSV profile under ``folder_path`` into two summary files.

    Directories for which ``folder_has_cnv`` is true are skipped.  Each
    remaining ``.csv`` file is parsed with ``process_csv_with_fallback``; for
    every (metadata, data) pair the function

    * fills missing trace fields with ``"NA"`` and normalises ``ST_DATE`` to
      ``"%b %d %Y %H:%M:%S"`` (``"NA"`` when it cannot be parsed),
    * derives a ``TraceID`` and stamps it on both the metadata and the data,
    * records the measured column names in the metadata ``Parameters`` field
      (provenance columns, including ``folderpath_filename``, are excluded),
    * renames placeholder ``ColN`` columns by position and merges known
      depth / temperature / salinity name variants into ``depSM``, ``t090C``
      and ``Sal00``.

    Args:
        folder_path: Root directory to walk.
        output_meta: Destination CSV for the per-profile metadata.
        output_data: Destination CSV for the concatenated measurements.

    Returns:
        None.  The two CSV files are written (when non-empty) and a summary
        line is printed for each.
    """
    trace_fields = ('ship_name', 'ORG_CR_NO', 'ORG_ST_NO', 'latitude', 'LONGITUDE', 'ST_DATE')
    # Bookkeeping columns that must not be reported as measured parameters.
    provenance_cols = {'TraceID', 'SourceFile', 'Folder', 'folderpath_filename'}
    # Unified column name -> spellings found in the source files.
    column_groups = {
        'depSM': ['DepS', 'DepSM', 'Depth', 'Deps', 'depth', 'meters', 'Meters'],
        't090C': ['Temp', 'Temp.(C)', 'T090C', 'T090', 'T068', 'temp'],
        'Sal00': ['Sal00', 'Sal', 'salinity', 'Sal(psu)', 'Salinity'],
    }
    # Meaning of placeholder columns Col1..ColN, keyed by how many there are.
    positional_names = {
        2: ['depSM', 't090C'],
        3: ['depSM', 't090C', 'Sal00'],
        4: ['Sequence', 'depSM', 't090C', 'Sal00'],
    }

    all_meta = []
    all_data = []
    for dirpath, _, filenames in os.walk(folder_path):
        if folder_has_cnv(dirpath):
            continue
        for fname in filenames:
            if not fname.lower().endswith(".csv"):
                continue
            fpath = os.path.join(dirpath, fname)
            source_name = os.path.basename(fpath)
            folder_name = os.path.basename(os.path.dirname(fpath))
            for meta_row, data_df in process_csv_with_fallback(fpath):
                for field in trace_fields:
                    meta_row.setdefault(field, "NA")
                try:
                    dt = pd.to_datetime(meta_row['ST_DATE'], errors='coerce')
                    meta_row['ST_DATE'] = dt.strftime("%b %d %Y %H:%M:%S") if not pd.isna(dt) else "NA"
                except Exception:
                    # Best effort: an unparseable start date must not abort the run.
                    meta_row['ST_DATE'] = "NA"

                traceid = generate_trace_id(meta_row)
                meta_row['TraceID'] = traceid
                data_df['TraceID'] = traceid
                data_df['SourceFile'] = source_name
                data_df['Folder'] = folder_name
                data_df['folderpath_filename'] = folder_name + os.sep + source_name

                # Parameters lists the columns as named in the source file,
                # i.e. before placeholder renaming and variant merging.
                param_cols = [col for col in data_df.columns if col not in provenance_cols]
                meta_row['Parameters'] = ', '.join(param_cols)
                meta_row['folderpath_filename'] = os.path.join(meta_row['Folder'], meta_row['SourceFile'])

                data_df.columns = [col.strip() for col in data_df.columns]

                # Placeholder columns, ordered by their numeric suffix.
                generic = sorted(
                    (col for col in data_df.columns if re.fullmatch(r'Col\d+', col)),
                    key=lambda name: int(name[3:]),
                )
                new_names = positional_names.get(len(generic))
                # Four placeholders are only renamed when the last one holds data.
                if len(generic) == 4 and not data_df[generic[-1]].notna().any():
                    new_names = None
                if new_names:
                    data_df.rename(columns=dict(zip(generic, new_names)), inplace=True)

                for unified_col, variants in column_groups.items():
                    present = [col for col in variants if col in data_df.columns]
                    if present:
                        # First non-null value across the variants, in list order.
                        data_df[unified_col] = data_df[present].bfill(axis=1).iloc[:, 0]
                        to_drop = [col for col in present if col != unified_col]
                        data_df.drop(columns=to_drop, inplace=True)

                all_meta.append(pd.DataFrame([meta_row]))
                all_data.append(data_df)

    final_meta = pd.concat(all_meta, ignore_index=True) if all_meta else pd.DataFrame()
    final_data = pd.concat(all_data, ignore_index=True) if all_data else pd.DataFrame()

    if not final_meta.empty:
        final_meta.rename(columns={
            'latitude': 'Latitude',
            'LONGITUDE': 'Longitude',
            'ST_DATE': 'Start Time',
            'ORG_CR_NO': 'Cruise ID',
            'ship_name': 'Ship',
            'ORG_ST_NO': 'Station',
        }, inplace=True)
        final_meta.drop(columns=['SourceFile', 'Folder'], errors='ignore', inplace=True)
        final_meta.to_csv(output_meta, index=False, na_rep='NaN')
        print(f"Saved metadata to: {output_meta} ({len(final_meta)} rows)")
    else:
        print("No metadata found")

    if not final_data.empty:
        final_data.drop(columns=['SourceFile', 'Folder'], errors='ignore', inplace=True)
        final_data.to_csv(output_data, index=False, na_rep='NaN')
        print(f"Saved data to: {output_data} ({len(final_data)} rows)")
    else:
        print("No data found")
# Root folder scanned for CSV files (hard-coded, machine-specific path).
folder = r"C:\Users\aishwarya\OneDrive\Desktop\303\NIO"
# Runs at module level with the default output names, i.e. it writes
# 00_Moutput.csv and 00_doutput.csv into the current working directory.
process_csv_folder_recursive(folder)
#--------QC----------
print("\n Running QC on extracted data...")

# Reload the two files produced by the extraction step above.
meta = pd.read_csv('00_Moutput.csv')
data = pd.read_csv('00_doutput.csv')
# Elevation raster used by is_at_sea(); presumably the ETOPO1 global relief
# grid, judging by the file name - TODO confirm.
# NOTE(review): the dataset handle is not closed anywhere in the visible code.
raster = rasterio.open(r"C:\Users\aishwarya\Downloads\ETOPO1_Bed_g_geotiff\ETOPO1_Bed_g_geotiff.tif")
# Band 1 loaded into memory; is_at_sea() treats negative cells as sea.
bathymetry = raster.read(1)

def is_at_sea(lat, lon, dataset=None, elevation=None):
    """Return True when the elevation cell containing (lat, lon) is below zero.

    Coordinates that map to a cell outside the grid are reported as not at
    sea.  Without the explicit bounds check a negative row or column index
    would silently wrap around to the opposite edge of the array and return
    the value of an unrelated cell.

    Args:
        lat: Latitude of the position.
        lon: Longitude of the position.
        dataset: Object exposing ``index(x, y) -> (row, col)``; defaults to
            the module-level ``raster``.
        elevation: 2-D array indexed ``[row, col]``; defaults to the
            module-level ``bathymetry``.

    Returns:
        bool: True for a negative elevation value, False otherwise, including
        when the lookup fails (for example for missing coordinates).
    """
    try:
        source = raster if dataset is None else dataset
        grid = bathymetry if elevation is None else elevation
        row, col = source.index(lon, lat)
        n_rows, n_cols = grid.shape[:2]
        if not (0 <= row < n_rows and 0 <= col < n_cols):
            return False
        return bool(grid[row, col] < 0)
    except Exception:
        # Best effort: a position that cannot be looked up is "not at sea".
        return False
# === Profile Envelope QC Range for TEMP (GTSPP)
# One entry per depth layer.  get_profile_envelope() selects the layer with
# min_depth <= depth < max_depth and returns its (min_value, max_value) pair;
# depths outside every layer (below 0, or 3000 and deeper) have no envelope.
# Units are presumably metres and degrees Celsius - TODO confirm.
TEMP_PROFILE_ENVELOPE = [
    {"min_depth": 0, "max_depth": 1100, "min_value": -2.0, "max_value": 40.0},
    {"min_depth": 1100, "max_depth": 3000, "min_value": -1.5, "max_value": 18.0},
]

def get_profile_envelope(depth, envelope_table):
    """Look up the allowed value range for ``depth``.

    Args:
        depth: Depth to classify.
        envelope_table: Sequence of dicts with ``min_depth``, ``max_depth``,
            ``min_value`` and ``max_value`` keys.

    Returns:
        ``(min_value, max_value)`` of the first layer whose half-open range
        ``[min_depth, max_depth)`` contains ``depth``, or ``(None, None)``
        when no layer matches.
    """
    matching = next(
        (layer for layer in envelope_table if layer["min_depth"] <= depth < layer["max_depth"]),
        None,
    )
    if matching is None:
        return None, None
    return matching["min_value"], matching["max_value"]
def profile_envelope_qc(df, depth_col='depSM', param_col='t090C', envelope_table=TEMP_PROFILE_ENVELOPE):
    """Flag each row by whether its value lies inside the envelope for its depth.

    Flags: 1 = inside the envelope, 4 = outside it, 9 = not evaluated (depth
    or value missing, or no envelope layer covers the depth).

    Args:
        df: DataFrame holding the measurements.
        depth_col: Name of the depth column.
        param_col: Name of the measured-parameter column.
        envelope_table: Layer table accepted by ``get_profile_envelope``.

    Returns:
        pandas.Series named ``"<param_col>_PROFILE_QC"`` that shares
        ``df.index``, so assigning it back to ``df`` stays row-aligned even
        when ``df`` has been filtered or re-indexed.
    """
    def column_values(name):
        # A column that is absent behaves like a column of missing values.
        return df[name] if name in df.columns else [None] * len(df)

    flags = []
    for depth, value in zip(column_values(depth_col), column_values(param_col)):
        if pd.isna(depth) or pd.isna(value):
            flags.append(9)
            continue

        min_val, max_val = get_profile_envelope(depth, envelope_table)
        if min_val is None:
            flags.append(9)
        elif min_val <= value <= max_val:
            flags.append(1)
        else:
            flags.append(4)
    return pd.Series(flags, index=df.index, name=f'{param_col}_PROFILE_QC')

# === QC 1: Valid datetime
# Unparseable start times become NaT; they fail the year comparison below
# and are therefore flagged 4.
meta['datetime'] = pd.to_datetime(meta['Start Time'], errors='coerce')
# Flag 1 when the parsed year is later than 1997, otherwise 4.
# NOTE(review): the start time is parsed a second time here instead of
# reusing meta['datetime'].
meta['DATE_QC'] = pd.to_datetime(meta['Start Time'], errors='coerce').dt.year.gt(1997).map({True: 1, False: 4})

# === QC 2: Valid position
# Accepted box: latitude -40..30 and longitude 20..160, bounds included.
valid_lat = meta['Latitude'].between(-40, 30)
valid_lon = meta['Longitude'].between(20, 160)
meta['POS_QC'] = ((valid_lat & valid_lon)).map({True: 1, False: 4})

# === QC 3: Location at Sea
print("🔹 Checking location at sea...")
# One raster lookup per station; flag 1 when is_at_sea() is true, else 4.
meta['SEA_QC'] = meta.apply(lambda row: 1 if is_at_sea(row['Latitude'], row['Longitude']) else 4, axis=1)

# === Combine all three station-level QC tests
# Stations that passed the date, position and at-sea checks.
meta_valid = meta[(meta['DATE_QC'] == 1) & (meta['POS_QC'] == 1) & (meta['SEA_QC'] == 1)]

# === Filter data to only valid profiles
valid_trace_ids = meta_valid['TraceID'].tolist()
# NOTE(review): the filter below is disabled, so valid_trace_ids is not used
# in the visible code and every profile goes on to the variable-level QC.
#data = data[data['TraceID'].isin(valid_trace_ids)]

# === Gradient and Spike QC Functions
def gradient_test(series, threshold):
    """Flag samples whose deviation from the mean of their neighbours is too large.

    The test value for sample ``i`` is ``|v[i] - (v[i+1] + v[i-1]) / 2|``.

    Flags: 1 = test value within ``threshold``, 4 = test value above it,
    9 = not evaluated.  A sample cannot be evaluated when it is the first or
    last of the series or when it or one of its neighbours is missing; such
    samples are flagged 9 rather than being reported as failures.

    Args:
        series: pandas Series of measurements in profile order.
        threshold: Largest accepted test value.

    Returns:
        pandas.Series of integer flags with the same index as ``series``.
    """
    deviation = (series - (series.shift(-1) + series.shift(1)) / 2).abs()
    flags = (deviation <= threshold).map({True: 1, False: 4})
    flags[deviation.isna()] = 9
    return flags

def spike_test(series, threshold):
    """Flag samples that stand out from both neighbours as spikes.

    The test value for sample ``i`` is
    ``|v[i] - (v[i+1] + v[i-1]) / 2| - |(v[i+1] - v[i-1]) / 2|``.

    Flags: 1 = test value within ``threshold``, 4 = test value above it,
    9 = not evaluated.  A sample cannot be evaluated when it is the first or
    last of the series or when it or one of its neighbours is missing; such
    samples are flagged 9 rather than being reported as failures.

    Args:
        series: pandas Series of measurements in profile order.
        threshold: Largest accepted test value.

    Returns:
        pandas.Series of integer flags with the same index as ``series``.
    """
    offset_from_mean = (series - (series.shift(-1) + series.shift(1)) / 2).abs()
    half_neighbour_gap = ((series.shift(-1) - series.shift(1)) / 2).abs()
    excess = offset_from_mean - half_neighbour_gap
    flags = (excess <= threshold).map({True: 1, False: 4})
    flags[excess.isna()] = 9
    return flags

# === QC 4–6: Variable-level QC for TEMP and PSAL
def _per_profile(column, test, threshold):
    """Run a neighbour-based test on ``data[column]`` one profile at a time.

    ``data`` holds every profile stacked one after another, so running the
    test on the whole column would compare the last sample of one profile
    with the first sample of the next.  Grouping by ``TraceID`` keeps the
    neighbours inside the same profile.  Without a ``TraceID`` column the
    test falls back to the whole column.
    """
    if 'TraceID' not in data.columns:
        return test(data[column], threshold)
    return data.groupby('TraceID')[column].transform(lambda profile: test(profile, threshold))


if 't090C' in data.columns:
    # Global range check: -2..40, bounds included.
    data['TEMP_QC'] = data['t090C'].between(-2, 40).map({True: 1, False: 4})
    data['TEMP_GRAD_QC'] = _per_profile('t090C', gradient_test, 10.0)
    data['TEMP_SPIKE_QC'] = _per_profile('t090C', spike_test, 2.0)

if 'Sal00' in data.columns:
    # Global range check: 0..41, bounds included.
    data['PSAL_QC'] = data['Sal00'].between(0, 41).map({True: 1, False: 4})
    data['PSAL_GRAD_QC'] = _per_profile('Sal00', gradient_test, 5.0)
    data['PSAL_SPIKE_QC'] = _per_profile('Sal00', spike_test, 0.3)

# === QC 7: Profile Envelope Test
if 'depSM' in data.columns and 't090C' in data.columns:
    data['TEMP_PROFILE_QC'] = profile_envelope_qc(data, depth_col='depSM', param_col='t090C')

meta.to_csv("meta9.csv", index=False, na_rep='NaN')
data.to_csv("data9.csv", index=False, na_rep='NaN')

print(" QC complete. Saved meta9.csv and data9.csv ")





Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk103.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk104.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk105a.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk109.csv
Debug processing: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk109.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk110.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk110a.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk113.csv
Debug processing: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk113.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk115.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk116.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk118.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk119.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk121.csv
Reading: C:\Us

  df = pd.read_csv(file_path, header=None)


Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk148.csv
Debug processing: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk148.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk149D.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk175.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk179.csv
Debug processing: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk179.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk189.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk194.csv
Debug processing: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk194.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk195.csv


  df = pd.read_csv(file_path, header=None)


Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk200.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk207.csv
Debug processing: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk207.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk212.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk219.csv
Debug processing: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk219.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk220.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk227.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk63.csv


  df = pd.read_csv(file_path, header=None)


Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk70.csv
Debug processing: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\sk70.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\SK124\sk124-1_10stn_no_salinity.csv
⚠ Format issue detected in C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\SK124\sk124-1_10stn_no_salinity.csv, attempting recovery with debug mode...
Debug processing: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\SK124\sk124-1_10stn_no_salinity.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\SK124\sk124.csv
Debug processing: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\SK124\sk124.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\SK138C\sk138c-07.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\SK138C\sk138c-10.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\SK138C\sk138c-13.csv
Reading: C:\Users\aishwarya\OneDrive\Desktop\303\NIO\CTD1m\SK138C\sk138c-5-19.csv
Reading: C:\Users\aish