In [4]:
# ...existing code...
import re
import codecs
import pandas as pd
from typing import List

# Column names that parse_cdgm_report assigns, in this order, to the leading
# whitespace-separated tokens of every data row. "Name", "Stat" and "Formula"
# are kept as text there; every other column is converted to a number.
FIXED_COLS = [
    "Name",
    "Stat",
    "Nd",
    "Vd",
    "dPgF",
    "Density",
    "Melt",
    "Cost",
    "CR",
    "FR",
    "SR",
    "AR",
    "PR",
    "LambdaMin",
    "LambdaMax",
    "TCE",
    "D0",
    "D1",
    "D2",
    "E0",
    "E1",
    "Ltk",
    "Formula",
]

# Coefficient columns get_glass_info reports when Formula == "Sellmeier1":
# K1, L1, K2, L2, K3, L3 (interleaved K/L pairs).
SELLMEIER1_KEYS = [f"{prefix}{idx}" for idx in (1, 2, 3) for prefix in ("K", "L")]

# Coefficient columns get_glass_info reports when Formula == "Schott": A0..A5.
SCHOTT_KEYS = [f"A{idx}" for idx in range(6)]

def _read_lines_auto_encoding(path: str) -> List[str]:
    """Read a text file of unknown encoding and return its lines.

    The encoding is chosen from the byte-order mark when one is present
    (UTF-16 LE/BE or UTF-8). Without a BOM the bytes are decoded as strict
    UTF-8, falling back to cp1252 if that fails.

    Args:
        path: Path of the file to read.

    Returns:
        The decoded text split with ``str.splitlines()`` (no line endings),
        with any BOM and all NUL characters removed.
    """
    # Context manager so the handle is closed even if read() raises.
    with open(path, "rb") as fh:
        data = fh.read()

    if data.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
        # The generic "utf-16" codec uses the BOM to pick the byte order and
        # consumes it. The explicit "-le"/"-be" codecs would instead leave it
        # in the text as a leading "\ufeff" glued to the first line.
        text = data.decode("utf-16", errors="replace")
    elif data.startswith(codecs.BOM_UTF8):
        text = data.decode("utf-8-sig", errors="replace")
    else:
        try:
            text = data.decode("utf-8", errors="strict")
        except UnicodeDecodeError:
            text = data.decode("cp1252", errors="replace")

    # BOM-less UTF-16 content made of ASCII characters decodes above with a
    # NUL after every character; dropping the NULs recovers readable text.
    text = text.replace("\x00", "")
    return text.splitlines()

def parse_cdgm_report(path: str) -> pd.DataFrame:
    """Parse a whitespace-separated glass report into a DataFrame.

    The header is the first line that starts with ``Name`` and also contains
    ``Formula``. Every later non-blank line with at least ``len(FIXED_COLS)``
    tokens becomes one row: the leading tokens are mapped onto ``FIXED_COLS``
    and the remaining tokens are read as ``key value`` pairs (keys such as
    ``K1`` or ``A0``) until a token that does not look like a key is met.
    Shorter lines are skipped.

    Args:
        path: Path of the report file.

    Returns:
        A DataFrame with the ``FIXED_COLS`` columns plus one column per
        coefficient key found. ``Name``, ``Stat`` and ``Formula`` stay as
        text; all other columns are numeric, with unparseable values as NaN.
        When no data row is found the frame is empty but still has the
        ``FIXED_COLS`` columns.

    Raises:
        ValueError: If no header line is found; the message includes the
            first 25 lines of the file.
    """
    lines = _read_lines_auto_encoding(path)

    # Find the header line robustly (whitespace normalised).
    header_idx = None
    for i, line in enumerate(lines):
        # A byte-order mark that survived decoding is not whitespace, so it
        # would defeat the "^Name" anchor if the header is the first line.
        norm = re.sub(r"\s+", " ", line.lstrip("\ufeff")).strip()
        if re.match(r"^Name\b", norm) and re.search(r"\bFormula\b", norm):
            header_idx = i
            break
    if header_idx is None:
        # Helpful debug: show a few candidate lines.
        sample = "\n".join(lines[:25])
        raise ValueError("Header line not found. First lines were:\n" + sample)

    n_fixed = len(FIXED_COLS)
    coeff_key = re.compile(r"[A-Za-z]\d+")  # K1, L2, A0, ...

    rows = []
    for line in lines[header_idx + 1:]:
        # str.split() with no argument strips and splits on whitespace runs;
        # a blank line yields [] and is dropped by the length check.
        tokens = line.split()
        if len(tokens) < n_fixed:
            continue

        record = dict(zip(FIXED_COLS, tokens[:n_fixed]))
        rest = tokens[n_fixed:]

        # Walk the tail as (key, value) pairs; stop at the first token in key
        # position that is not a coefficient name. A trailing unpaired token
        # is ignored.
        for key, val in zip(rest[0::2], rest[1::2]):
            if not coeff_key.fullmatch(key):
                break
            record[key] = val

        rows.append(record)

    # pd.DataFrame([]) has no columns at all, which would make the numeric
    # conversion below (and later "Name" lookups) fail with KeyError.
    df = pd.DataFrame(rows) if rows else pd.DataFrame(columns=FIXED_COLS)

    # Numeric conversion for every fixed and coefficient column except the
    # text ones. errors="coerce" already maps placeholders such as "?" to NaN.
    text_cols = {"Name", "Stat", "Formula"}
    for c in df.columns:
        if c in text_cols:
            continue
        df[c] = pd.to_numeric(df[c], errors="coerce")

    return df


def get_glass_info(df: pd.DataFrame, name: str) -> dict:
    """Look up one glass by name and return its key data.

    The name is matched case-insensitively after stripping surrounding
    whitespace. If several rows match, the first one is used.

    Args:
        df: Frame as produced by ``parse_cdgm_report``; must have ``Name``,
            ``Nd`` and ``Vd`` columns.
        name: Glass name to look up.

    Returns:
        A dict with keys ``Name``, ``Nd``, ``Vd``, ``Formula`` and
        ``Coefficients``. ``Nd``/``Vd`` and each coefficient are floats, or
        ``None`` where the value is missing. ``Coefficients`` holds
        ``SELLMEIER1_KEYS`` for formula ``"Sellmeier1"``, ``SCHOTT_KEYS`` for
        ``"Schott"``, and otherwise every coefficient column of ``df``.

    Raises:
        KeyError: If no row matches; the message lists up to 15 names that
            contain the query as a substring.
    """
    name = name.strip()
    matches = df[df["Name"].str.casefold() == name.casefold()]

    if matches.empty:
        # Offer partial matches to help the caller correct the name.
        contains = df["Name"].str.contains(re.escape(name), case=False, na=False)
        near = df[contains]["Name"].head(15).tolist()
        raise KeyError(f"Glass '{name}' not found. Similar: {near}")

    # Duplicates are uncommon; the first matching row wins.
    row = matches.iloc[0]
    formula = str(row.get("Formula", ""))

    if formula == "Sellmeier1":
        keys = SELLMEIER1_KEYS
    elif formula == "Schott":
        keys = SCHOTT_KEYS
    else:
        # Fallback: all parsed coefficient columns. Fixed columns such as
        # D0/D1/D2/E0/E1 also match the key pattern, so they are excluded
        # explicitly, the same way parse_cdgm_report tells the two apart.
        keys = [
            c for c in df.columns
            if c not in FIXED_COLS and re.fullmatch(r"[A-Za-z]\d+", c)
        ]

    coeffs = {}
    for key in keys:
        if key not in df.columns:
            continue
        value = row.get(key)
        coeffs[key] = None if pd.isna(value) else float(value)

    return {
        "Name": row["Name"],
        "Nd": None if pd.isna(row["Nd"]) else float(row["Nd"]),
        "Vd": None if pd.isna(row["Vd"]) else float(row["Vd"]),
        "Formula": formula,
        "Coefficients": coeffs,
    }


# ---- Usage (interactive) ----
df = parse_cdgm_report("CDGM_glass_report.txt")


def _show_glass(info):
    """Print one get_glass_info() result as a small text block."""
    print(f"\nName: {info['Name']}")
    print(f"Nd: {info['Nd']}")
    print(f"Vd: {info['Vd']}")
    print(f"Formula: {info['Formula']}")
    print("Coefficients:")
    for k, v in info["Coefficients"].items():
        print(f"  {k} = {v}")
    print()


# iter(callable, sentinel) keeps prompting until the stripped answer is empty.
for q in iter(lambda: input("Glass name (empty to quit): ").strip(), ""):
    try:
        _show_glass(get_glass_info(df, q))
    except Exception as e:
        # Report the problem (e.g. unknown glass) and keep the prompt alive.
        print(e)


Name: H-FK55
Nd: 1.5502
Vd: 75.224681
Formula: Sellmeier1
Coefficients:
  K1 = 0.534931974
  L1 = 0.00128059279
  K2 = 0.839050115
  L2 = 0.0116444604
  K3 = 0.886511647
  L3 = 143.252236

