In [None]:
"""
Parse CF4 galaxy distance catalog from a text file, clean column names,
infer data types, filter invalid entries (non-finite RA), and export each
column as a separate dataset to an HDF5 file. Fields containing 'pgc' are
stored as int32.
"""

import csv
import numpy as np
import h5py

filename = "/Users/rstiskalek/Downloads/CF4_table.txt"

# --- Load and parse file ---
# Pull the whole catalogue into memory, one string per line (newlines kept).
with open(filename, "r", encoding="utf-8") as f:
    lines = list(f)

# The comma-separated column names are read from line index 1; everything
# from line index 5 onwards is treated as CSV records.
header_line = lines[1]
col_names = header_line.strip().split(",")
data_lines = lines[5:]

# Discard records in which every field is empty (e.g. blank lines).
rows = list(filter(any, csv.reader(data_lines)))

# --- Dtype inference ---
def infer_dtype(col):
    """Infer a NumPy dtype specification for one column of raw text values.

    Blank entries (empty or whitespace-only strings) are ignored when
    deciding the type.

    Parameters
    ----------
    col : iterable of str
        Raw cell values of a single column.

    Returns
    -------
    type or str
        ``np.int32`` if every non-blank entry parses as an integer that fits
        in 32 bits, ``np.int64`` if they only fit in 64 bits, ``np.float64``
        if every non-blank entry parses as a float (or the column has no
        non-blank entries at all), otherwise ``'U<n>'`` where ``n`` is the
        length of the longest non-blank entry.
    """
    clean = [v for v in col if v.strip()]

    # Nothing to inspect: stay with float64 so that every entry of the column
    # can be represented as NaN.
    if not clean:
        return np.float64

    # Integers must be tested before floats: any string accepted by int() is
    # also accepted by float(), so the opposite order can never yield an
    # integer dtype.
    try:
        ints = [int(v) for v in clean]
    except ValueError:
        ints = None

    if ints is not None:
        for int_type in (np.int32, np.int64):
            bounds = np.iinfo(int_type)
            if all(bounds.min <= n <= bounds.max for n in ints):
                return int_type
        # Values too large for a fixed-width integer fall through to float.

    try:
        for v in clean:
            float(v)
    except ValueError:
        maxlen = max(len(v) for v in clean)
        return f'U{maxlen}'
    return np.float64

# Transpose the records so that each column can be typed on its own.
columns = list(zip(*rows))
dtypes = [(label, infer_dtype(columns[idx])) for idx, label in enumerate(col_names)]

# --- Build structured array ---
arr = np.empty(len(rows), dtype=dtypes)

# Per-field lookup tables, indexed by position within a record.
field_names = [label for label, _ in dtypes]
field_kinds = [np.dtype(spec).kind for _, spec in dtypes]

# Placeholder written when a cell cannot be parsed as its column's numeric
# type; any non-numeric kind falls back to the empty string.
missing = {'f': np.nan, 'i': -1}

for i, row in enumerate(rows):
    for j, val in enumerate(row):
        name = field_names[j]
        kind = field_kinds[j]
        try:
            if kind == 'f':
                arr[i][name] = float(val)
            elif kind == 'i':
                arr[i][name] = int(val)
            else:
                arr[i][name] = val
        except ValueError:
            arr[i][name] = missing.get(kind, "")

# --- Filter valid RA ---
# Keep only the records whose "RA" value is neither NaN nor infinite.
finite_ra = np.isfinite(arr["RA"])
arr = arr[finite_ra]

# --- Clean up field names ---
# Strip every "*" from the field names and copy the data, field by field,
# into a new structured array that uses the cleaned names.
original_names = arr.dtype.names
clean_names = [field.replace("*", "") for field in original_names]
new_dtype = np.dtype(
    [(new, arr.dtype[old]) for old, new in zip(original_names, clean_names)]
)
clean_arr = np.empty(arr.shape, dtype=new_dtype)
for old, new in zip(original_names, clean_names):
    clean_arr[new] = arr[old]

# --- Write to HDF5 ---
# Swap only a trailing ".txt" for ".hdf5". Anchoring on the suffix (rather
# than replacing every ".txt" in the path) guarantees the output path always
# differs from the input path, so opening it in "w" mode below can never
# truncate the source catalogue.
stem = filename[:-len(".txt")] if filename.endswith(".txt") else filename
fname_write = stem + ".hdf5"
print(f"Writing to {fname_write}")

# Every field of `clean_arr` becomes its own dataset inside the "cf4" group.
with h5py.File(fname_write, "w") as f:
    g = f.create_group("cf4")
    for name in clean_arr.dtype.names:
        data = clean_arr[name]
        if "pgc" in name.lower() and data.dtype.kind in "fi":
            # Store PGC-like fields as int32, using -1 wherever the value is
            # not finite. Only finite entries are cast, and the output buffer
            # pins the dtype instead of relying on promotion rules.
            finite = np.isfinite(data)
            ids = np.full(data.shape, -1, dtype=np.int32)
            with np.errstate(invalid="ignore"):
                ids[finite] = data[finite].astype(np.int32)
            data = ids
        elif data.dtype.kind == "U":
            # h5py cannot store NumPy fixed-width unicode ('U') arrays;
            # encode them to UTF-8 byte strings first.
            data = np.char.encode(data, "utf-8")
        g.create_dataset(name, data=data)
        

Writing to /Users/rstiskalek/Downloads/CF4_table.hdf5
