In [2]:
import os
import pandas as pd
from collections import Counter
import rasterio

def make_unique(columns):
    """Return the column names with duplicates disambiguated by a numeric suffix.

    The first occurrence of a name is kept unchanged; the n-th repeat becomes
    ``"<name>_<n>"`` (``["a", "a"]`` -> ``["a", "a_2"]``).  If the suffixed
    name is itself already in use (``["a", "a", "a_2"]``), the counter keeps
    advancing until an unused name is found, so the result never contains
    duplicates.

    Args:
        columns: Iterable of hashable column labels (normally strings).

    Returns:
        list: New labels, same length and order as ``columns``.
    """
    counts = Counter()
    taken = set()
    result = []
    for col in columns:
        counts[col] += 1
        name = col if counts[col] == 1 else f"{col}_{counts[col]}"
        # A plain "<col>_<n>" may clash with a label that already exists in
        # the input; bump the counter until the name is free.
        while name in taken:
            counts[col] += 1
            name = f"{col}_{counts[col]}"
        taken.add(name)
        result.append(name)
    return result

def _fit_row(parts, width):
    """Return ``parts`` padded with ``None`` or truncated to exactly ``width`` items."""
    if len(parts) < width:
        return parts + [None] * (width - len(parts))
    return parts[:width]


def _numeric_frame(rows, columns):
    """Build a DataFrame from token rows and coerce every column to numeric."""
    frame = pd.DataFrame(rows, columns=columns)
    frame.columns = make_unique(frame.columns)
    for col in frame.columns:
        # Unparseable tokens become NaN instead of raising.
        frame[col] = pd.to_numeric(frame[col], errors='coerce')
    return frame


def parse_txt_combined(file_path, relative_path=""):
    """Parse one text file into a one-row metadata frame and a data frame.

    The file is read as latin-1 and blank lines are dropped.  Header lines
    starting with ``** Ship:``, ``** Cruise:``, ``** Station:``,
    ``** Latitude:``, ``** Longitude:`` and ``# start_time ... =`` fill the
    metadata; fields that are not found stay NaN.  The data table is then
    extracted with the first strategy that applies:

    1. A header line containing one of ``Pressure``, ``Depth``,
       ``Temperature`` or ``Theta``: every later line that contains a digit
       and no header keyword is a data row.
    2. Block headers (all-alphabetic tokens mentioning ``press``, ``temp``,
       ``cond`` ...): each header starts a new block and the blocks are
       concatenated.
    3. No header at all: lines with at least 8 tokens are mapped onto a fixed
       list of 14 column names.

    Rows are padded with ``None`` or truncated to the column count, values
    are coerced to numeric (except the fallback ``time`` column, kept as
    text), and known column-name variants are merged into unified names.

    Args:
        file_path: Path of the text file to read.
        relative_path: Value stored in the ``folderpath_filename`` column.

    Returns:
        tuple: ``(metadata_df, data_df)``.  ``metadata_df`` has a single row;
        ``data_df`` ends with the ``TraceID`` and ``folderpath_filename``
        columns.

    Raises:
        ValueError: If no data rows could be extracted.
    """
    with open(file_path, 'r', encoding='latin1') as f:
        lines = [line.strip() for line in f if line.strip()]

    metadata = {
        "Ship": float('nan'),
        "Cruise ID": float('nan'),
        "Station": float('nan'),
        "Latitude": float('nan'),
        "Longitude": float('nan'),
        "Start Time": float('nan'),
    }

    # Header prefix -> metadata key for the "** Name: value" style lines.
    colon_fields = (
        ("** Ship:", "Ship"),
        ("** Cruise:", "Cruise ID"),
        ("** Station:", "Station"),
        ("** Latitude:", "Latitude"),
        ("** Longitude:", "Longitude"),
    )

    for line in lines:
        for prefix, key in colon_fields:
            if line.startswith(prefix):
                metadata[key] = line.split(":", 1)[1].strip()
                break
        else:
            if line.startswith("# start_time"):
                parts = line.split("=", 1)
                if len(parts) > 1:
                    metadata["Start Time"] = parts[1].strip()

    def safe_str(v):
        return "NaN" if pd.isna(v) else str(v)

    # TraceID identifies the profile: cruise, station, position and start time.
    metadata["TraceID"] = "_".join(
        safe_str(metadata[key])
        for key in ("Cruise ID", "Station", "Latitude", "Longitude", "Start Time")
    )
    metadata["folderpath_filename"] = relative_path

    # === Type 1: Standard headers like Pressure, Depth, etc.
    header_keywords = ['Pressure', 'Depth', 'Temperature', 'Theta']

    def has_header_keyword(text):
        return any(k in text for k in header_keywords)

    header_line = next((line for line in lines if has_header_keyword(line)), None)

    if header_line:
        columns = make_unique(header_line.split())
        start_index = lines.index(header_line) + 1
        # Repeated header lines and lines without any digit are not data.
        data_rows = [
            _fit_row(line.split(), len(columns))
            for line in lines[start_index:]
            if not has_header_keyword(line) and any(c.isdigit() for c in line)
        ]

        if not data_rows:
            raise ValueError("No valid data rows")

        df = _numeric_frame(data_rows, columns)

    else:
        # === Type 2: Block headers like "Press Temp Cond ..."
        block_headers = ['press', 'temp', 'cond', 'sal', 'o2', 'ph', 'chl', 'tr%', 'memory']
        data_blocks = []
        current_columns = []
        current_data = []
        pre_header_data = []

        for line in lines:
            # Skip terminal noise: command echoes, mostly-punctuation lines
            # and lines containing non-printable characters.
            if "Keyb.Cmd" in line or "<ESC>" in line:
                continue
            if sum(c.isalnum() for c in line) < len(line) * 0.3:
                continue
            if not all(32 <= ord(c) <= 126 or c.isspace() for c in line):
                continue

            tokens = line.split()
            token_match = all(
                any(c.isalpha() for c in t) and all(32 <= ord(c) <= 126 for c in t)
                for t in tokens
            )
            header_match = any(k in line.lower() for k in block_headers)
            if token_match and header_match:
                # A new header closes the block collected so far.
                if current_columns and current_data:
                    data_blocks.append(_numeric_frame(current_data, current_columns))
                    current_data = []
                current_columns = make_unique(tokens)
                # Numeric rows seen before the first header belong to it.
                current_data.extend(
                    _fit_row(row, len(current_columns)) for row in pre_header_data
                )
                pre_header_data = []
                continue
            if not current_columns:
                if any(c.isdigit() for c in line):
                    pre_header_data.append(tokens)
                continue

            # === Regular data row ===
            current_data.append(_fit_row(tokens, len(current_columns)))

        if current_columns and current_data:
            data_blocks.append(_numeric_frame(current_data, current_columns))

        # === Fallback: no headers at all ===
        if not data_blocks:
            print(f" No header found. Trying fixed-column parsing for: {os.path.basename(file_path)}")

            data_rows = []
            for line in lines:
                if not any(c.isdigit() for c in line):
                    continue
                tokens = line.split()
                if len(tokens) < 8:
                    continue
                # Drop a leading non-numeric label token.
                if not any(char.isdigit() for char in tokens[0]):
                    tokens = tokens[1:]
                data_rows.append(tokens)

            if not data_rows:
                raise ValueError("No valid data rows")

            fallback_columns = [
                "Press", "Temp", "Cond", "Sal.", "O2%", "O2ppm", "pH", "Chl(a)", "Turb.Alt.",  "Tr%",   "Rho." ,"memory", "flag", "time"
            ]

            # Pad as well as truncate: if every row is shorter than the column
            # list, pandas refuses to build the frame.
            data_rows = [_fit_row(row, len(fallback_columns)) for row in data_rows]

            df = pd.DataFrame(data_rows, columns=fallback_columns)

            # "time" stays textual; every other column is coerced to numeric.
            time_col = df.pop("time").astype(str)
            for col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            df["time"] = time_col

        else:
            df = pd.concat(data_blocks, ignore_index=True)

    # === Normalize column variants ===
    column_groups = {
        "prDM": ["Press", "Pressure"],
        "t090C": ["Temp", "Temperature"],
        "Sal00": ["Sal.", "Sal", "Salinity"],
        "depSM":["Depth"],
        "c0mS/cm":["Cond"],
        "sbeox0ML/L":["Oxygen"],
        "sigma-t00":["SigmaT"],
    }

    df.columns = [col.strip() for col in df.columns]

    for unified_col, variants in column_groups.items():
        present = [col for col in df.columns if col in variants]
        if present:
            # Take, per row, the first non-null value among the variants.
            df[unified_col] = df[present].bfill(axis=1).iloc[:, 0]
            to_drop = [col for col in present if col != unified_col]
            df.drop(columns=to_drop, inplace=True)
    if "Oxygen_2" in df.columns:
        df.drop(columns=["Oxygen_2"], inplace=True)
    df["TraceID"] = metadata["TraceID"]
    df["folderpath_filename"] = metadata["folderpath_filename"]
    # Everything except the two identifier columns appended just above.
    metadata["parameters"] = ", ".join(df.columns[:-2])

    return pd.DataFrame([metadata]), df

def preprocess_txt_folder(folder_path, output_meta_csv='textn.csv', output_data_csv='textnd.csv'):
    """Parse every ``.txt`` file below ``folder_path`` and write two CSV tables.

    Files are discovered recursively (extension match is case-insensitive).
    Each file is parsed with ``parse_txt_combined``; a file that fails to
    parse, or whose data frame has duplicate column names, is reported on
    stdout and skipped.  The concatenated data goes to ``output_data_csv`` and
    one metadata row per file goes to ``output_meta_csv``.  Neither file is
    written when nothing was parsed successfully.

    Args:
        folder_path: Root directory to search.
        output_meta_csv: Destination of the per-file metadata table.
        output_data_csv: Destination of the combined data table.
    """
    txt_paths = [
        os.path.join(root, name)
        for root, _, names in os.walk(folder_path)
        for name in names
        if name.lower().endswith('.txt')
    ]

    print(f"\n Found {len(txt_paths)} text files in: {folder_path}")

    meta_records = []
    data_frames = []

    for txt_path in txt_paths:
        print(f" Processing: {os.path.basename(txt_path)}")
        try:
            # Store the path relative to the root, always with forward slashes.
            rel_path = os.path.relpath(txt_path, folder_path).replace("\\", "/")
            meta_df, df = parse_txt_combined(txt_path, rel_path)

            dup_mask = df.columns.duplicated()
            if dup_mask.any():
                raise ValueError(f"Duplicate columns found in {txt_path}: {df.columns[dup_mask].tolist()}")

            meta_records.append(meta_df.to_dict(orient='records')[0])
            data_frames.append(df)
        except Exception as e:
            # Best effort: one bad file must not abort the whole batch.
            print(f" Error in {txt_path}: {e}")

    if data_frames:
        combined = pd.concat(data_frames, ignore_index=True)
        combined.to_csv(output_data_csv, index=False, na_rep="NaN")
        print(f" Data saved to: {output_data_csv}")

    if meta_records:
        pd.DataFrame(meta_records).to_csv(output_meta_csv, index=False, na_rep="NaN")
        print(f" Metadata saved to: {output_meta_csv}")

# Script entry point: parse every .txt file below the hard-coded folder and
# write the metadata / data CSVs that the QC stage below reads back.
if __name__ == "__main__":
    folder = r"C:\Users\aishwarya\Videos\Captures\Desktop\303"
    preprocess_txt_folder(folder, output_meta_csv='textn.csv', output_data_csv='textnd.csv')
#--------QC----------
# NOTE(review): from here on the statements are at module level, outside the
# __main__ guard, so they also execute when this file is imported.
print("\n Running QC on extracted data...")

# Reload the two tables from the file names passed to preprocess_txt_folder above.
meta = pd.read_csv('textn.csv')
data = pd.read_csv('textnd.csv')
# Raster consulted by is_at_sea; band 1 is loaded into `bathymetry`.
# NOTE(review): the dataset handle is never closed in the visible code.
raster = rasterio.open(r"C:\Users\aishwarya\Downloads\ETOPO1_Bed_g_geotiff\ETOPO1_Bed_g_geotiff.tif")
bathymetry = raster.read(1)

def is_at_sea(lat, lon):
    """Return True when the bathymetry cell at ``(lat, lon)`` is below zero.

    The position is converted to a row/column of the module-level ``raster``
    and looked up in ``bathymetry``.  Positions that are missing, fall outside
    the grid, or cannot be converted are reported as not at sea.

    Args:
        lat: Latitude of the station.
        lon: Longitude of the station.

    Returns:
        bool: True if the grid value at that position is negative.
    """
    # Missing coordinates cannot be placed on the grid; without this guard
    # NaN is cast to an arbitrary integer index.
    if pd.isna(lat) or pd.isna(lon):
        return False
    try:
        row, col = raster.index(lon, lat)
        n_rows, n_cols = bathymetry.shape
        # Off-grid positions yield indices outside the array; negative ones
        # would silently wrap around to the opposite edge.
        if not (0 <= row < n_rows and 0 <= col < n_cols):
            return False
        return bool(bathymetry[row, col] < 0)
    except Exception:
        # Best effort: unparseable coordinates count as "not at sea".
        return False
# === Profile Envelope QC Range for TEMP (GTSPP)
# Each layer maps a depth interval to the value range accepted inside it.
# get_profile_envelope matches a layer when min_depth <= depth < max_depth,
# so a depth of 3000 or more (or a negative depth) matches no layer.
# NOTE(review): depths are presumably metres and values degrees Celsius —
# confirm against the GTSPP specification.
TEMP_PROFILE_ENVELOPE = [
    {"min_depth": 0, "max_depth": 1100, "min_value": -2.0, "max_value": 40.0},
    {"min_depth": 1100, "max_depth": 3000, "min_value": -1.5, "max_value": 18.0},
]

def get_profile_envelope(depth, envelope_table):
    """Look up the allowed value range for ``depth``.

    Args:
        depth: Depth to look up.
        envelope_table: Sequence of dicts with ``min_depth``, ``max_depth``,
            ``min_value`` and ``max_value`` keys.

    Returns:
        tuple: ``(min_value, max_value)`` of the first layer for which
        ``min_depth <= depth < max_depth``, or ``(None, None)`` when no layer
        contains the depth.
    """
    matching_layer = next(
        (
            layer
            for layer in envelope_table
            if layer["min_depth"] <= depth < layer["max_depth"]
        ),
        None,
    )
    if matching_layer is None:
        return None, None
    return matching_layer["min_value"], matching_layer["max_value"]
def profile_envelope_qc(df, depth_col='depSM', param_col='t090C', envelope_table=TEMP_PROFILE_ENVELOPE):
    """Flag each row by whether its value lies inside the depth-dependent envelope.

    Flags:
        1 - value is within the range allowed at that depth
        4 - value is outside the allowed range
        9 - depth or value is missing, or no envelope layer covers the depth

    Args:
        df: Data frame holding the depth and parameter columns.
        depth_col: Name of the depth column.
        param_col: Name of the column to check.
        envelope_table: Layer table as accepted by ``get_profile_envelope``.

    Returns:
        pandas.Series: One flag per row, named ``"<param_col>_PROFILE_QC"`` and
        indexed like ``df`` so it aligns when assigned back as a column.
    """
    flags = []
    for _, row in df.iterrows():
        depth = row.get(depth_col)
        value = row.get(param_col)

        if pd.isna(depth) or pd.isna(value):
            flags.append(9)  # missing
            continue

        min_val, max_val = get_profile_envelope(depth, envelope_table)
        if min_val is None:
            flags.append(9)  # depth not covered by any layer
        elif min_val <= value <= max_val:
            flags.append(1)
        else:
            flags.append(4)
    # Reuse df's index: a default RangeIndex would misalign (and yield NaN)
    # when the result is assigned to a filtered or re-indexed frame.
    return pd.Series(flags, index=df.index, name=f'{param_col}_PROFILE_QC')

# === QC 1: Valid datetime
# Unparseable start times become NaT, whose year is never > 1997, so they get flag 4.
meta['datetime'] = pd.to_datetime(meta['Start Time'], errors='coerce')
meta['DATE_QC'] = meta['datetime'].dt.year.gt(1997).map({True: 1, False: 4})

# === QC 2: Valid position
# Accepted box: latitude -40..30, longitude 20..160 (bounds inclusive).
valid_lat = meta['Latitude'].between(-40, 30)
valid_lon = meta['Longitude'].between(20, 160)
meta['POS_QC'] = (valid_lat & valid_lon).map({True: 1, False: 4})

# === QC 3: Location at Sea
print(" Checking location at sea...")
meta['SEA_QC'] = meta.apply(
    lambda station: 1 if is_at_sea(station['Latitude'], station['Longitude']) else 4,
    axis=1,
)

# === Combine all three station-level QC tests
# A station is valid only when every station-level flag equals 1.
meta_valid = meta[meta[['DATE_QC', 'POS_QC', 'SEA_QC']].eq(1).all(axis=1)]

# === Filter data to only valid profiles
# The filter itself is currently disabled; only the list of valid IDs is built.
valid_trace_ids = meta_valid['TraceID'].tolist()
#data = data[data['TraceID'].isin(valid_trace_ids)]

# === Gradient and Spike QC Functions
def gradient_test(series, threshold):
    """Flag samples that deviate too far from the mean of their two neighbours.

    For each sample the test value is ``|v - (v_next + v_prev) / 2|``.

    Flags:
        1 - test value is <= ``threshold``
        4 - test value exceeds ``threshold``
        9 - test value cannot be computed (first/last sample, or the sample
            or one of its neighbours is NaN)

    Args:
        series: Numeric pandas Series in profile order.
        threshold: Largest accepted test value.

    Returns:
        pandas.Series: One flag per sample, indexed like ``series``.
    """
    deviation = (series - (series.shift(-1) + series.shift(1)) / 2).abs()
    flags = (deviation <= threshold).map({True: 1, False: 4})
    # NaN <= threshold is False, which would mark untestable samples as
    # failed; report them as "not evaluated" instead.
    return flags.where(deviation.notna(), 9)

def spike_test(series, threshold):
    """Flag samples that spike relative to their two neighbours.

    For each sample the test value is
    ``|v - (v_next + v_prev) / 2| - |(v_next - v_prev) / 2|``.

    Flags:
        1 - test value is <= ``threshold``
        4 - test value exceeds ``threshold``
        9 - test value cannot be computed (first/last sample, or the sample
            or one of its neighbours is NaN)

    Args:
        series: Numeric pandas Series in profile order.
        threshold: Largest accepted test value.

    Returns:
        pandas.Series: One flag per sample, indexed like ``series``.
    """
    following = series.shift(-1)
    preceding = series.shift(1)
    part1 = (series - (following + preceding) / 2).abs()
    part2 = ((following - preceding) / 2).abs()
    test_value = part1 - part2
    flags = (test_value <= threshold).map({True: 1, False: 4})
    # NaN <= threshold is False, which would mark untestable samples as
    # failed; report them as "not evaluated" instead.
    return flags.where(test_value.notna(), 9)

# === QC 4–6: Variable-level QC for TEMP and PSAL
# Range check, gradient test and spike test; flag 1 = pass, 4 = fail.
# NOTE(review): `data` holds all profiles concatenated, so the gradient and
# spike tests compare the last sample of one profile with the first sample of
# the next. Consider applying them per TraceID.
if 't090C' in data.columns:
    data['TEMP_QC'] = data['t090C'].between(-2, 40).map({True: 1, False: 4})
    data['TEMP_GRAD_QC'] = gradient_test(data['t090C'], 10.0)
    data['TEMP_SPIKE_QC'] = spike_test(data['t090C'], 2.0)

if 'Sal00' in data.columns:
    data['PSAL_QC'] = data['Sal00'].between(0, 41).map({True: 1, False: 4})
    data['PSAL_GRAD_QC'] = gradient_test(data['Sal00'], 5.0)
    data['PSAL_SPIKE_QC'] = spike_test(data['Sal00'], 0.3)
# === QC 7: Profile Envelope Test
if 'depSM' in data.columns and 't090C' in data.columns:
    data['TEMP_PROFILE_QC'] = profile_envelope_qc(data, depth_col='depSM', param_col='t090C')

#meta.drop(columns=['at_sea'], inplace=True)
# The metadata file name now matches the completion message printed below
# (it was previously written as "texnQC.csv").
meta.to_csv("textnQC.csv", index=False, na_rep='NaN')
data.to_csv("textndQC.csv", index=False, na_rep='NaN')

print(" QC complete. Saved textnQC.csv and textndQC.csv ")





 Found 11 text files in: C:\Users\aishwarya\Videos\Captures\Desktop\303
 Processing: SN031ctd01.TXT
 Processing: SN031ctd02.txt
 Processing: SN031ctd04.txt
 Processing: SN031ctd07.txt
 Processing: SN031ctd08.TXT
 Processing: SN031ctd09.TXT
 No header found. Trying fixed-column parsing for: SN031ctd09.TXT
 Processing: SN031ctd10.TXT
 Processing: SN031ctd3.txt
 Processing: SN031ctd5001.txt
 Processing: SN031ctd5002.txt
 Processing: SN031ctd6.txt
 Data saved to: textnd.csv
 Metadata saved to: textn.csv

 Running QC on extracted data...


  data = pd.read_csv('textnd.csv')


 Checking location at sea...


  new_rows = np.floor(new_rows).astype(dtype="int32")
  new_cols = np.floor(new_cols).astype(dtype="int32")


 QC complete. Saved textnQC.csv and textndQC.csv 
