In [1]:

import pandas as pd
import numpy as np
from pathlib import Path


# 1. Set up paths

# Project root (modify if needed)
# Every input and output location of this cell is derived from project_root.
project_root = Path("/Users/v/Desktop/Visual/glaucoma_DL_project")
data_path = project_root / "data" / "uwhvf-master" / "CSV" / "VF_Data.csv"
output_path = project_root / "data" / "processed"
output_path.mkdir(parents=True, exist_ok=True)

print(f"Loading data from: {data_path}")

# 2. Load and inspect data

df = pd.read_csv(data_path)
print("Dataset loaded:", df.shape)
print("Columns:", len(df.columns))
print("Unique patients:", df['PatID'].nunique())
print("Unique eyes:", df[['PatID','Eye']].drop_duplicates().shape[0])

# Order rows by patient, eye and FieldN so each eye's visits are contiguous and ascending.
df = df.sort_values(by=["PatID", "Eye", "FieldN"]).reset_index(drop=True)

# 3. Construct triplets (v1, v2 -> v3)

# Per-visit summary columns copied into every triplet record.
SUMMARY_FIELDS = ("Age", "MTD", "PSD")

# Explicit output schema: keeps the column order stable and guarantees the
# saved CSV still has a header when no eye has three or more visits.
TRIPLET_COLUMNS = (
    ["PatID", "Eye"]
    + [f"v{k}_FieldN" for k in (1, 2, 3)]
    + [f"{field}_v{k}" for field in SUMMARY_FIELDS for k in (1, 2, 3)]
)

triplets = []

for (pid, eye), group in df.groupby(["PatID", "Eye"]):
    group = group.sort_values("FieldN")

    # An eye needs at least three visits to form one (v1, v2 -> v3) triplet.
    if len(group) < 3:
        continue

    # Slide a window of three consecutive visits over this eye's records.
    for i in range(len(group) - 2):
        visits = [group.iloc[i + offset] for offset in range(3)]

        record = {"PatID": pid, "Eye": eye}
        for k, visit in enumerate(visits, start=1):
            record[f"v{k}_FieldN"] = visit["FieldN"]
        for field in SUMMARY_FIELDS:
            for k, visit in enumerate(visits, start=1):
                record[f"{field}_v{k}"] = visit[field]
        triplets.append(record)

triplet_df = pd.DataFrame(triplets, columns=TRIPLET_COLUMNS)
print("Triplets created:", len(triplet_df))

# 4. Save triplets for later model training

save_path = output_path / "triplets_summary.csv"
triplet_df.to_csv(save_path, index=False)
print("Saved triplets summary to:", save_path)

# Preview: a bare expression is only rendered when it is the LAST statement of
# the cell, so the preview lives here rather than in the middle of the cell.
triplet_df.head(10)

Loading data from: /Users/v/Desktop/Visual/glaucoma_DL_project/data/uwhvf-master/CSV/VF_Data.csv
Dataset loaded: (28943, 184)
Columns: 184
Unique patients: 3871
Unique eyes: 7428
Triplets created: 14117
Saved triplets summary to: /Users/v/Desktop/Visual/glaucoma_DL_project/data/processed/triplets_summary.csv


In [2]:
# ================================================================
# Build model-ready X/Y from TD_1–54
# ================================================================
import pandas as pd
import numpy as np
from pathlib import Path

# -----------------------------
# 1) Paths & settings
# -----------------------------
# All inputs and outputs of this cell live under project_root.
project_root = Path("/Users/v/Desktop/Visual/glaucoma_DL_project")
csv_dir      = project_root / "data" / "uwhvf-master" / "CSV"
vf_path      = csv_dir / "VF_Data.csv"
coord_path   = csv_dir / "Coord_242.csv"
out_proc     = project_root / "data" / "processed"
out_feat     = project_root / "data" / "features"
for out_dir in (out_proc, out_feat):
    out_dir.mkdir(parents=True, exist_ok=True)

# How prepare_vector treats NaNs: "median" (impute) or "drop" (skip the sample).
MISSING_STRATEGY = "median"
# (low, high) bounds applied to TD values to limit the effect of extreme outliers.
TD_CLIP = (-35.0, 5.0)

print("Loading VF data:", vf_path)
df = pd.read_csv(vf_path)
print("Loading coordinates:", coord_path)
coords = pd.read_csv(coord_path)

# -----------------------------
# 2) Identify TD columns & remove blind spots
# -----------------------------
# Every column whose name begins with "TD_" is treated as one test location.
td_cols = [name for name in df.columns if name.startswith("TD_")]

# Rows of the coordinate table with Cluster == 0 mark the blind-spot locations;
# each LocID is translated into the matching "TD_<LocID>" column name, and only
# names that actually exist among the TD columns are kept.
is_blind = coords["Cluster"] == 0
blind_loc_ids = coords.loc[is_blind, "LocID"].astype(int).tolist()
blind_candidates = (f"TD_{loc_id}" for loc_id in blind_loc_ids)
blind_td_cols = [col for col in blind_candidates if col in td_cols]

print(f"Detected {len(td_cols)} TD columns total.")
print(f"Blind-spot TD columns to exclude: {blind_td_cols}")

# Remaining (non-blind) locations, in their original column order.
td_keep = [name for name in td_cols if name not in blind_td_cols]
L = len(td_keep)  # number of locations in every feature vector
print(f"Using {L} TD locations after removing blind spots.")

# -----------------------------
# 3) (Optional) Pre-compute medians for imputation
# -----------------------------
# One median per kept location, computed over all rows of the VF table;
# prepare_vector uses these to fill missing values.
col_median = df[td_keep].median()

def prepare_vector(row, clip=TD_CLIP, missing=MISSING_STRATEGY):
    """
    Return the TD vector (length L) of one visit, after NaN handling and clipping.

    Parameters
    ----------
    row : pandas.Series
        One visit record; it must contain every column listed in ``td_keep``.
    clip : tuple of (low, high) or None
        Bounds passed to ``np.clip``. ``None`` disables clipping.
    missing : {"median", "drop", None}
        * ``"median"`` - replace each NaN with the median of its location,
          taken from the module-level ``col_median``.
        * ``"drop"``   - return ``None`` when the vector contains any NaN so
          the caller can skip the sample.
        * ``None``     - leave NaNs untouched.

    Returns
    -------
    numpy.ndarray or None
        Float array with one entry per column in ``td_keep``, or ``None``
        when ``missing == "drop"`` and the row has missing values.

    Raises
    ------
    ValueError
        If ``missing`` is not one of the supported strategies. Without this
        check a misspelled strategy would silently let NaNs through.
    """
    if missing not in ("median", "drop", None):
        raise ValueError(
            f"Unknown missing-value strategy {missing!r}; expected 'median', 'drop' or None."
        )

    # copy=True: to_numpy may otherwise return a view of the Series' buffer
    # (read-only when pandas Copy-on-Write is active), and the imputation
    # below writes into the array in place.
    v = row[td_keep].to_numpy(dtype=float, copy=True)

    nan_mask = np.isnan(v)
    if nan_mask.any():
        if missing == "median":
            # Fill each missing location with that location's median.
            v[nan_mask] = col_median[nan_mask].to_numpy()
        elif missing == "drop":
            # Signal the caller to skip this sample.
            return None

    # Clip extremes
    if clip is not None:
        v = np.clip(v, clip[0], clip[1])
    return v

# -----------------------------
# 4) Build triplets & assemble X / Y tensors
# -----------------------------
# Order rows by patient, eye and FieldN so each eye's visits are contiguous and ascending.
df = df.sort_values(by=["PatID","Eye","FieldN"]).reset_index(drop=True)

X_list, Y_list, meta = [], [], []

for (pid, eye), g in df.groupby(["PatID","Eye"]):
    g = g.sort_values("FieldN")
    n_visits = len(g)
    # An eye needs at least three visits to form one (v1, v2 -> v3) sample.
    if n_visits < 3:
        continue

    # Prepare every visit exactly once. A visit can belong to up to three
    # overlapping windows, so calling prepare_vector per window would redo
    # the same work.
    visit_rows = [g.iloc[k] for k in range(n_visits)]
    visit_vecs = [prepare_vector(visit) for visit in visit_rows]

    # Slide a window of three consecutive visits over this eye's records.
    for i in range(n_visits - 2):
        window_rows = visit_rows[i:i + 3]
        window_vecs = visit_vecs[i:i + 3]

        # prepare_vector returns None under the "drop" strategy when a visit
        # has missing values; such windows are skipped entirely.
        if any(vec is None for vec in window_vecs):
            continue

        v1, v2, v3 = window_vecs

        # Input: the two earlier visits stacked as channels -> (2, L).
        # Target: the third visit -> (L,).
        X_list.append(np.stack([v1, v2], axis=0))
        Y_list.append(v3)

        # Metadata row, kept in the same order as X_list / Y_list.
        record = {"PatID": pid, "Eye": eye}
        for k, visit in enumerate(window_rows, start=1):
            record[f"v{k}_FieldN"] = int(visit["FieldN"])
        for field in ("Age", "MTD", "PSD"):
            for k, visit in enumerate(window_rows, start=1):
                record[f"{field}_v{k}"] = float(visit[field])
        meta.append(record)

# np.stack cannot stack an empty list; fail with a message that names the cause.
if not X_list:
    raise ValueError(
        "No triplets could be built: no eye has three usable consecutive visits "
        "(check the input data and MISSING_STRATEGY)."
    )

X = np.stack(X_list, axis=0)  # (N, 2, L)
Y = np.stack(Y_list, axis=0)  # (N, L)
meta_df = pd.DataFrame(meta)

print(f"Built X shape: {X.shape}  (N samples, 2 channels, {L} locations)")
print(f"Built Y shape: {Y.shape}  (N samples, {L} locations)")
print("Meta rows:", meta_df.shape)

# -----------------------------
# 5) Save arrays and metadata
# -----------------------------
# File names carry the number of locations so outputs built with a different
# location set do not overwrite each other.
x_path = out_feat / f"X_td2ch_L{L}.npy"
y_path = out_feat / f"Y_td_L{L}.npy"
m_path = out_proc / f"triplets_meta_L{L}.csv"

np.save(x_path, X)
np.save(y_path, Y)
meta_df.to_csv(m_path, index=False)

print("Saved:")
print("  X ->", x_path)
print("  Y ->", y_path)
print("  meta ->", m_path)

Loading VF data: /Users/v/Desktop/Visual/glaucoma_DL_project/data/uwhvf-master/CSV/VF_Data.csv
Loading coordinates: /Users/v/Desktop/Visual/glaucoma_DL_project/data/uwhvf-master/CSV/Coord_242.csv
Detected 54 TD columns total.
Blind-spot TD columns to exclude: ['TD_26', 'TD_35']
Using 52 TD locations after removing blind spots.
Built X shape: (14117, 2, 52)  (N samples, 2 channels, 52 locations)
Built Y shape: (14117, 52)  (N samples, 52 locations)
Meta rows: (14117, 14)
Saved:
  X -> /Users/v/Desktop/Visual/glaucoma_DL_project/data/features/X_td2ch_L52.npy
  Y -> /Users/v/Desktop/Visual/glaucoma_DL_project/data/features/Y_td_L52.npy
  meta -> /Users/v/Desktop/Visual/glaucoma_DL_project/data/processed/triplets_meta_L52.csv
