
# 🧪 Validate Models from W&B CSV — v2 (Robust Name/Config Parsing)

This improved version:
- Recovers the **model name** from inside the parsed `config` first, then falls back to multiple CSV fields (matched case-insensitively).
- Writes sidecar `<name>.arch.json` files next to matched checkpoints.
- Provides **debug summaries** if something can't be matched.
- Guards sorting and displays informative messages if no results are found.


## 🔧 Parameters

In [None]:

# Path of the W&B export; it is read with pandas.read_csv when entries are built.
CSV_PATH = "/mnt/data/wandb_export_2025-08-22T14_23_07.979+01_00.csv"  # your uploaded CSV

# Directories searched recursively, in this order, for "<model name>.ckpt";
# the first root that yields any match is the one used.
SEARCH_ROOTS = [
    "/home/admindi/sbenites/WirelessLocation",
    "/mnt/data",
]

# Validation subsets: each label must be a key of SUBSET_MAP
# ("garage", "outdoor", "indoor", "all") or the special label "collections".
SUBSETS = ["garage", "outdoor", "indoor"]  # or ["all"] or include "collections"
# Explicit collection list; the run aborts if "collections" is requested while this is empty.
COLLECTIONS = []                            # used only if "collections" in SUBSETS

# Database name handed to get_dataset for every validation collection.
DB_NAME = "wifi_fingerprinting_data"
# Number of rows sent through the model per forward pass during evaluation.
BATCH_SIZE = 4096

# Directories appended to sys.path so the project-local modules can be imported.
PROJECT_PATHS = [
    "/home/admindi/sbenites/WirelessLocation",
    "/mnt/data",
]


## 📦 Imports & Environment

In [None]:

import os, glob, json, ast, math, re
from typing import Dict, Any, List, Tuple, Optional

import numpy as np
import pandas as pd
import torch
import sys

# Make the project directories importable. Paths are appended (not prepended),
# so modules that already resolve keep priority; entries already on sys.path
# are skipped to avoid duplicates when the cell is re-run.
for p in PROJECT_PATHS:
    if p not in sys.path:
        sys.path.append(p)

# Project-local modules; they resolve through the sys.path entries added above,
# so these imports must stay below that loop.
# NOTE(review): "gpu_fucntion" is spelled exactly as imported here — presumably
# it matches the module's real filename; confirm before correcting the spelling.
from data_processing import get_dataset, combine_arrays, shuffle_array, split_combined_data
from model_generation import GeneratedModel
from gpu_fucntion import LightningWrapper

# Report the runtime so the validation output records which backend was used.
print("PyTorch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())


## 🗂️ Collections & Subsets

In [None]:

# Every collection name that can be used for validation. Each name contains
# its location tag ("garage", "outdoor" or "indoor"); SUBSET_MAP relies on
# that by selecting collections through substring matching on these tags.
ALL_COLLECTIONS = [
    "equilatero_grande_garage",
    "equilatero_grande_outdoor",
    "equilatero_medio_garage",
    "equilatero_medio_outdoor",
    "isosceles_grande_indoor",
    "isosceles_grande_outdoor",
    "isosceles_medio_outdoor",
    "obtusangulo_grande_outdoor",
    "obtusangulo_pequeno_outdoor",
    "reto_grande_garage",
    "reto_grande_indoor",
    "reto_grande_outdoor",
    "reto_medio_garage",
    "reto_medio_outdoor",
    "reto_n_quadrado_grande_indoor",
    "reto_n_quadrado_grande_outdoor",
    "reto_n_quadrado_pequeno_outdoor",
    "reto_pequeno_garage",
    "reto_pequeno_outdoor",
]

def group_by_location(collections: List[str], locations: List[str]) -> List[str]:
    """Return the collections whose name contains at least one location tag.

    Matching is a plain substring test. The input order of ``collections`` is
    preserved and each collection appears at most once, even if several tags
    match it.
    """
    selected: List[str] = []
    for candidate in collections:
        for tag in locations:
            if tag in candidate:
                selected.append(candidate)
                break  # one matching tag is enough
    return selected

# Subset label -> collections validated under that label. The three location
# subsets are derived from the collection names; "all" is the full list itself.
SUBSET_MAP = {
    location: group_by_location(ALL_COLLECTIONS, [location])
    for location in ("garage", "outdoor", "indoor")
}
SUBSET_MAP["all"] = ALL_COLLECTIONS


## 🧩 CSV → Architecture Config & Name

In [None]:

def parse_jsonish(text: str) -> Optional[Dict[str, Any]]:
    """Best-effort decoding of a JSON-like or Python-literal string.

    Four spellings of the stripped text are tried in order: as given, with
    single quotes swapped for double quotes, with ``None``/``True``/``False``
    rewritten to their JSON names, and that rewrite with quotes swapped.
    Each spelling goes through ``json.loads`` first and then
    ``ast.literal_eval``.

    Returns:
        The first successful result, or ``None`` when the input is not a
        non-blank string or nothing parses. Note that a ``json.loads`` result
        is returned as-is even when it is not a dict (for example a list),
        whereas an ``ast.literal_eval`` result is accepted only if it is a
        dict.
    """
    if not isinstance(text, str):
        return None
    raw = text.strip()
    if not raw:
        return None

    # Rewrite Python keyword literals to their JSON counterparts.
    jsonified = raw
    for py_word, json_word in (("None", "null"), ("True", "true"), ("False", "false")):
        jsonified = re.sub(r'\b' + py_word + r'\b', json_word, jsonified)

    attempts = (
        raw,
        raw.replace("'", '"'),
        jsonified,
        jsonified.replace("'", '"'),
    )
    for attempt in attempts:
        try:
            return json.loads(attempt)
        except Exception:
            pass
        try:
            literal = ast.literal_eval(attempt)
        except Exception:
            continue
        if isinstance(literal, dict):
            return literal
    return None

def set_in_nested(d: Dict[str, Any], path_parts: List[Any], value: Any):
    """Store ``value`` in ``d`` at the nested location described by ``path_parts``.

    String parts address dict keys and integer parts address list positions,
    so ``["layers", 0, "units"]`` writes ``d["layers"][0]["units"]``.
    Intermediate containers are created on demand, and their type (list or
    dict) is chosen from the *next* path part so that every new container is
    attached to its parent. Lists are padded with empty dicts up to the
    requested index.

    If an existing intermediate value has the wrong type for the next part
    (a scalar, a dict where a list is needed, or the reverse) it is replaced
    by a fresh empty container, so the most recent write wins.

    Args:
        d: Root dict, modified in place. Because the root cannot be turned
            into a list, a leading integer part is stored as a plain dict key.
        path_parts: Sequence of str keys and int indices; an empty sequence
            is a no-op.
        value: Object to store at the final path part.
    """
    if not path_parts:
        return
    cur: Any = d
    last = len(path_parts) - 1
    for i, key in enumerate(path_parts):
        if isinstance(cur, list):
            # Grow the list so that index `key` exists before touching it.
            while len(cur) <= key:
                cur.append({})
        if i == last:
            cur[key] = value
            return
        # Pick the child container type from the part that will index into it.
        wanted = list if isinstance(path_parts[i + 1], int) else dict
        child = cur[key] if isinstance(cur, list) else cur.get(key)
        if not isinstance(child, wanted):
            child = wanted()
            cur[key] = child  # attach to the parent so later writes are kept
        cur = child

def parse_flattened_to_config(row: pd.Series) -> Optional[Dict[str, Any]]:
    """Rebuild a nested config dict from flattened, dot-separated CSV columns.

    Only columns starting with one of the known roots (``config.``,
    ``architecture_config.``, ``arch_config.``, ``model_config.``,
    ``architecture.``, ``cnn_config.``) are considered. The root segment is
    dropped, the remaining dotted segments become nested keys, and a trailing
    ``[N]`` on a segment becomes an integer index. Missing (NaN) cells are
    skipped.

    String cells are coerced: ``true``/``false`` (any case) to bool, numeric
    text to int or float (floats with an integral value become int), and
    anything else is offered to ``parse_jsonish``, keeping the raw string
    when that returns ``None``.

    Returns:
        The nested dict, or ``None`` when no matching column contributed a
        value.
    """
    roots = ("config", "architecture_config", "arch_config", "model_config", "architecture", "cnn_config")
    dotted_roots = tuple(root + "." for root in roots)

    flat_columns = [col for col in row.index if col.startswith(dotted_roots)]
    if not flat_columns:
        return None

    def coerce(raw: str) -> Any:
        """Turn a CSV string cell into a bool, number, parsed structure, or itself."""
        lowered = raw.lower()
        if lowered in ("true", "false"):
            return lowered == "true"
        try:
            if "." not in raw:
                return int(raw)
            as_float = float(raw)
            return int(as_float) if as_float.is_integer() else as_float
        except Exception:
            decoded = parse_jsonish(raw)
            return raw if decoded is None else decoded

    segment_re = re.compile(r"^(.*?)(\[(\d+)\])?$")

    def split_path(column: str) -> List[Any]:
        """Split a dotted column name into keys/indices, minus the root segment."""
        keys: List[Any] = []
        for segment in column.split("."):
            hit = segment_re.match(segment)
            if hit is None:
                keys.append(segment)
                continue
            label, index = hit.group(1), hit.group(3)
            if label:
                keys.append(label)
            if index is not None:
                keys.append(int(index))
        if keys and keys[0] in roots:
            keys = keys[1:]
        return keys

    nested: Dict[str, Any] = {}
    for column in flat_columns:
        cell = row[column]
        if pd.isna(cell):
            continue
        if isinstance(cell, str):
            cell = coerce(cell)
        keys = split_path(column)
        if keys:
            set_in_nested(nested, keys, cell)
    return nested or None

def row_to_arch_config(row: pd.Series) -> Optional[Dict[str, Any]]:
    """Extract the architecture config dict from one CSV row.

    First looks for a single column holding the whole config as a JSON-like
    string (checked in a fixed priority order); the first one that
    ``parse_jsonish`` decodes to a dict wins. Otherwise falls back to
    rebuilding the config from flattened ``prefix.key`` columns via
    ``parse_flattened_to_config``.

    Returns:
        The config dict, or ``None`` if neither strategy yields a non-empty
        dict.
    """
    blob_columns = (
        "architecture_config", "arch_config", "config", "model_config",
        "architecture", "cnn_config", "config_json", "hparams", "hyperparameters",
        "Hyperparameters", "Config", "CONFIG",
    )
    # Strategy 1: a whole-config string stored in one cell.
    for column in blob_columns:
        if column not in row:
            continue
        cell = row[column]
        if not isinstance(cell, str):
            continue
        parsed = parse_jsonish(cell)
        if isinstance(parsed, dict):
            return parsed

    # Strategy 2: config spread over dotted columns.
    flattened = parse_flattened_to_config(row)
    if isinstance(flattened, dict) and flattened:
        return flattened
    return None

def pick_model_name(row: pd.Series, cfg: Optional[Dict[str, Any]]) -> Optional[str]:
    """Determine the model name for a CSV row.

    The parsed config is consulted first (keys ``name`` then ``model_name``);
    if it has no usable value, the CSV columns ``name``, ``model_name``,
    ``ckpt_stem``, ``run_name``, ``run name``, ``id`` and ``slug`` are tried
    in that order, matched case-insensitively against the row's column names.

    A usable value is a non-blank string. It is reduced to its file stem:
    surrounding whitespace, any directory part and the final extension are
    removed.

    Returns:
        The stem of the first usable value, or ``None`` if there is none.
    """
    def is_usable(text: Any) -> bool:
        return isinstance(text, str) and bool(text.strip())

    def stem_of(text: str) -> str:
        filename = os.path.basename(text.strip())
        return os.path.splitext(filename)[0]

    # 1) the config written by the training code
    if isinstance(cfg, dict):
        for key in ("name", "model_name"):
            candidate = cfg.get(key)
            if is_usable(candidate):
                return stem_of(candidate)

    # 2) CSV columns, ignoring the case of the header
    by_lower = {column.lower(): column for column in row.index}
    for wanted in ("name", "model_name", "ckpt_stem", "run_name", "run name", "id", "slug"):
        if wanted not in by_lower:
            continue
        cell = row[by_lower[wanted]]
        if is_usable(cell):
            return stem_of(cell)
    return None


## 📥 Load CSV & Build Entries

In [None]:

# Load the W&B export and show its size and leading columns, which helps when
# diagnosing why a config or name column was not recognised.
df_raw = pd.read_csv(CSV_PATH)
print("CSV rows:", len(df_raw))
print("First 20 columns:", list(df_raw.columns)[:20])

# One dict per usable CSV row, filled by the loop below; "ckpt_path" is None
# when no checkpoint file was found for the name.
entries = []  # {name, arch_config, ckpt_path}

def find_ckpt_for_name(name: str, search_roots: Optional[List[str]] = None) -> Optional[str]:
    """Locate the checkpoint file ``<name>.ckpt`` under the search roots.

    Roots are searched recursively in order, and the first root containing at
    least one match decides the result; within that root the most recently
    modified match is returned.

    The root and the name are escaped before being placed in the glob
    pattern, so names containing glob metacharacters (``[``, ``]``, ``*``,
    ``?``) match the file literally instead of being read as a pattern.

    Args:
        name: Checkpoint file stem, without the ``.ckpt`` extension.
        search_roots: Directories to search; defaults to the module-level
            ``SEARCH_ROOTS`` when omitted.

    Returns:
        Path of the matching checkpoint, or ``None`` if no root contains one.
    """
    roots = SEARCH_ROOTS if search_roots is None else search_roots
    filename = glob.escape(name) + ".ckpt"
    for root in roots:
        pattern = os.path.join(glob.escape(root), "**", filename)
        found = glob.glob(pattern, recursive=True)
        if found:
            # Several copies may exist; prefer the newest one.
            return max(found, key=os.path.getmtime)
    return None

# Turn every CSV row that yields both a config dict and a model name into an
# entry. Rows missing either one are skipped silently. The checkpoint lookup
# may return None; such entries are still kept so they can be reported as
# missing later.
for idx, row in df_raw.iterrows():
    cfg = row_to_arch_config(row)
    name = pick_model_name(row, cfg)
    if not (isinstance(cfg, dict) and name):
        continue
    ckpt = find_ckpt_for_name(name)
    entries.append({"name": name, "arch_config": cfg, "ckpt_path": ckpt})

# Summary plus a small sample for a quick sanity check of the parsing.
print(f"Total parsed entries (config+name): {len(entries)}")
print("Preview:", entries[:3])


## 🧷 Write Sidecars & Coverage Report

In [None]:

# Write each entry's architecture config next to its checkpoint as
# "<checkpoint stem>.arch.json", and report which models have no checkpoint.
written = 0
missing_ckpt = []

for e in entries:
    name, cfg, ckpt = e["name"], e["arch_config"], e["ckpt_path"]
    if not ckpt:
        missing_ckpt.append(name)
        continue
    # Swap only the final extension: str.replace would also rewrite any
    # ".ckpt" that occurs earlier in the path (e.g. in a directory name).
    sidecar = os.path.splitext(ckpt)[0] + ".arch.json"
    try:
        # Serialize before opening the file so a config that cannot be
        # encoded does not leave a truncated sidecar behind.
        payload = json.dumps(cfg)
        with open(sidecar, "w", encoding="utf-8") as f:
            f.write(payload)
        written += 1
        e["sidecar"] = sidecar
    except Exception as ex:
        # Best effort: report and move on; the entry is then excluded from
        # entries_ready because it has no "sidecar" key.
        print(f"⚠️ Sidecar failed for {name}: {ex}")

print(f"Sidecars written: {written}")
if missing_ckpt:
    print(f"❗ {len(missing_ckpt)} models missing .ckpt (showing up to 20):")
    for n in missing_ckpt[:20]:
        print(" -", n)

# Only entries with both a checkpoint and a written sidecar are validated.
entries_ready = [e for e in entries if e.get("ckpt_path") and e.get("sidecar")]
print("Entries ready:", len(entries_ready))


## 📏 Metrics & Data Loading

In [None]:

def mse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Mean squared error, averaged over every element of the arrays."""
    residual = y_true - y_pred
    return float(np.square(residual).mean())

def rmse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Root mean squared error: the square root of ``mse`` for the same inputs."""
    mean_squared = mse(y_true, y_pred)
    return float(np.sqrt(mean_squared))

def mae(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Mean absolute error, averaged over every element of the arrays."""
    abs_error = np.abs(y_true - y_pred)
    return float(abs_error.mean())

def r2_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Coefficient of determination, pooled over all elements.

    The total sum of squares is taken around the mean along axis 0 of
    ``y_true``, and both sums run over every element. Returns NaN when the
    total sum of squares is exactly zero (constant targets).
    """
    centered = y_true - np.mean(y_true, axis=0)
    total = np.sum(centered ** 2)
    if total == 0:
        return float("nan")
    residual = np.sum((y_true - y_pred) ** 2)
    return float(1 - residual / total)

def load_val_data(selected_collections: List[str], db_name: str) -> Tuple[np.ndarray, np.ndarray]:
    """Load, merge, shuffle and split the validation data for some collections.

    Each collection is fetched with ``get_dataset``; the results are merged
    by ``combine_arrays``, reordered by ``shuffle_array`` and separated into
    two arrays by ``split_combined_data``.

    Returns:
        The ``(X, y)`` pair produced by ``split_combined_data``
        (presumably features and targets — confirm against data_processing).
    """
    print(f"📡 Loading validation datasets: {selected_collections}")
    per_collection = []
    for collection_name in selected_collections:
        per_collection.append(get_dataset(collection_name, db_name))
    merged = combine_arrays(per_collection)
    features, targets = split_combined_data(shuffle_array(merged))
    return features, targets


## 🧠 Load Model & Evaluate

In [None]:

def load_model_from_arch_and_ckpt(ckpt_path: str, arch_config: Dict[str, Any], input_size: int, output_size: int, device: torch.device) -> LightningWrapper:
    """Rebuild a model from its architecture config and load checkpoint weights.

    A ``GeneratedModel`` is created from ``arch_config`` and wrapped in a
    ``LightningWrapper`` with one-row placeholder tensors as train/val data,
    since only inference is intended. Optimizer settings come from
    ``arch_config`` with defaults (learning rate 1e-3, weight decay 0.0,
    optimizer "adam").

    The checkpoint is read onto the CPU; its ``"state_dict"`` entry is used
    when present, otherwise the loaded object itself. Weights are loaded
    non-strictly, and up to five missing and five unexpected keys are
    printed as warnings.

    Returns:
        The wrapper, moved to ``device`` and switched to eval mode.
    """
    def placeholder_pair() -> Tuple[torch.Tensor, torch.Tensor]:
        # The wrapper requires datasets; these dummies are never trained on here.
        return (torch.empty(1, input_size), torch.empty(1, output_size))

    network = GeneratedModel(
        input_size=input_size,
        output_size=output_size,
        architecture_config=arch_config,
    )
    wrapper = LightningWrapper(
        model=network,
        train_data=placeholder_pair(),
        val_data=placeholder_pair(),
        learning_rate=arch_config.get("learning_rate", 1e-3),
        weight_decay=arch_config.get("weight_decay", 0.0),
        optimizer_name=arch_config.get("optimizer", "adam"),
    )

    # NOTE(review): torch.load may unpickle arbitrary objects; only point this
    # at checkpoints from a trusted source.
    checkpoint = torch.load(ckpt_path, map_location="cpu")
    weights = checkpoint.get("state_dict", checkpoint)
    missing, unexpected = wrapper.load_state_dict(weights, strict=False)
    if missing:
        print(f"⚠️ Missing keys: {missing[:5]}{'...' if len(missing)>5 else ''}")
    if unexpected:
        print(f"⚠️ Unexpected keys: {unexpected[:5]}{'...' if len(unexpected)>5 else ''}")

    wrapper.to(device)
    wrapper.eval()
    return wrapper

@torch.inference_mode()
def evaluate(wrapper: LightningWrapper, X: np.ndarray, y: np.ndarray, device: torch.device, batch_size: int = 4096):
    """Run ``wrapper.model`` over ``X`` in batches and score it against ``y``.

    ``X`` is converted to a float32 tensor on ``device`` in one go, then fed
    to the model ``batch_size`` rows at a time. Batch outputs are moved to
    the CPU and stacked into one NumPy array.

    Returns:
        Dict with the keys ``"mse"``, ``"rmse"``, ``"mae"`` and ``"r2"``.
    """
    inputs = torch.as_tensor(X, dtype=torch.float32, device=device)
    total_rows = inputs.size(0)
    batch_outputs = [
        wrapper.model(inputs[start:start + batch_size]).detach().cpu().numpy()
        for start in range(0, total_rows, batch_size)
    ]
    y_pred = np.vstack(batch_outputs)

    scorers = (("mse", mse), ("rmse", rmse), ("mae", mae), ("r2", r2_score))
    return {label: scorer(y, y_pred) for label, scorer in scorers}


## 🚀 Run Validation (with safe sorting & debug)

In [None]:

# Resolve every requested subset label to the collections it covers.
subset_to_collections = {}
for sub in SUBSETS:
    if sub == "collections":
        if not COLLECTIONS:
            raise SystemExit('You included "collections" but COLLECTIONS is empty.')
        subset_to_collections[sub] = COLLECTIONS
    elif sub in SUBSET_MAP:
        subset_to_collections[sub] = SUBSET_MAP[sub]
    else:
        # Fail with an actionable message instead of a bare KeyError.
        raise SystemExit(f'Unknown subset {sub!r}; expected one of {sorted(SUBSET_MAP)} or "collections".')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

results = []
failed = []  # (model name, subset, error text) for evaluations that raised
# Validation data depends only on the subset, not on the model, so each
# subset is loaded once (on first use) and reused for every model. All
# requested subsets therefore stay in memory for the duration of the run.
val_data_cache = {}

if not entries_ready:
    print("❗ No entries are ready for validation. Check earlier cells for why (no ckpt found, no config, or no names).")

for e in entries_ready:
    name, ckpt, cfg = e["name"], e["ckpt_path"], e["arch_config"]
    print("\n" + "="*100)
    print("Model:", name)
    for subset_name, collections in subset_to_collections.items():
        # Data-loading errors are deliberately not caught: they affect every model.
        if subset_name not in val_data_cache:
            val_data_cache[subset_name] = load_val_data(collections, DB_NAME)
        X_val, y_val = val_data_cache[subset_name]
        input_size = X_val.shape[1]
        output_size = y_val.shape[1]
        try:
            wrapper = load_model_from_arch_and_ckpt(ckpt, cfg, input_size, output_size, device)
            metrics = evaluate(wrapper, X_val, y_val, device, batch_size=BATCH_SIZE)
        except Exception as ex:
            # One unreadable checkpoint or mismatched architecture should not
            # discard the results of every other model; report it and go on.
            print(f"⚠️ Validation failed for {name} on {subset_name}: {ex}")
            failed.append((name, subset_name, str(ex)))
            continue
        row = {"name": name, "ckpt": ckpt, "subset": subset_name, "collections": ",".join(collections), **metrics}
        print(row)
        results.append(row)

if failed:
    print(f"❗ {len(failed)} model/subset evaluations failed (see warnings above).")

df_results = pd.DataFrame(results)
if not df_results.empty:
    # Sort only by columns that exist, so a partial table cannot raise here.
    cols_to_sort = [c for c in ["name","subset"] if c in df_results.columns]
    if cols_to_sort:
        df_results = df_results.sort_values(cols_to_sort).reset_index(drop=True)
else:
    print("ℹ️ No results produced — likely zero matched (name → ckpt) or parsing failed.")

df_results


## 💾 Save Results

In [None]:

# Persist the metrics table as CSV, unless validation produced no rows.
if df_results.empty:
    print("No results to save.")
else:
    out_csv = "/mnt/data/validation_results_from_wandb_v2.csv"
    df_results.to_csv(out_csv, index=False)
    print("Saved:", out_csv)
