In [6]:
import pandas as pd
import json
import ast

def try_parse_cell(cell):
    """Decode a catalog cell into a Python object, best-effort.

    Args:
        cell: ``None``, an already-decoded ``dict``/``list``, or any value
            whose ``str()`` form may hold a JSON document or a Python literal.

    Returns:
        ``cell`` itself when it is ``None``, a dict or a list; otherwise the
        decoded value, or ``None`` when the text is blank, equals "nan"
        (case-insensitive), or cannot be decoded by either parser.
    """
    # Nothing to decode: pass None and ready-made containers straight through.
    if cell is None or isinstance(cell, (dict, list)):
        return cell

    text = str(cell).strip()
    if not text or text.lower() == "nan":
        return None

    # JSON is attempted first; Python-literal syntax (e.g. single-quoted
    # dicts) is the fallback.
    for decode in (json.loads, ast.literal_eval):
        try:
            return decode(text)
        except Exception:
            continue
    return None

def _coerce_count(value):
    """Convert a catalog count to int, returning 0 when missing or malformed."""
    if value is None or value == "":
        return 0
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        pass
    # Second chance for numerals written with a decimal point, e.g. "4.0".
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        return 0


def _info_dict(row, key):
    """Return the dict stored under ``key`` in ``row``, or {} if absent/undecodable."""
    cell = row.get(key)
    if not cell:
        return {}
    parsed = try_parse_cell(cell)
    return parsed if isinstance(parsed, dict) else {}


def extract_minimal_resources_from_row(row):
    """Extract InstanceType, vCPUs, MemoryMiB, GPUs from a full catalog row.

    Args:
        row: Mapping supporting ``.get``. The keys read are "InstanceType"
            (falling back to "instance_type"), "VCpuInfo", "MemoryInfo" and
            "GpuInfo". Each info cell may be a dict or a string that
            ``try_parse_cell`` can decode into a dict.
            NOTE(review): the key names look like the AWS EC2
            DescribeInstanceTypes schema -- confirm against the catalog source.

    Returns:
        dict with keys "InstanceType", "vCPUs", "MemoryMiB" and "GPUs".
        The three numeric fields are ints and fall back to 0 when the
        corresponding cell is missing, undecodable, or holds a non-numeric
        value. "GPUs" is the sum of "Count" over every dict entry of
        ``GpuInfo["Gpus"]``.
    """
    name = row.get("InstanceType") or row.get("instance_type")

    # vCPUs: accept either spelling of the key; the alternate spelling is
    # also used when the primary one is present but null.
    vcpu_info = _info_dict(row, "VCpuInfo")
    raw_vcpus = vcpu_info.get("DefaultVCpus")
    if raw_vcpus is None:
        raw_vcpus = vcpu_info.get("DefaultVCPUs")
    vcpu = _coerce_count(raw_vcpus)

    mem = _coerce_count(_info_dict(row, "MemoryInfo").get("SizeInMiB"))

    # GPUs: add up every device group instead of reading only the first one,
    # and skip entries that are not dicts rather than raising on them.
    gpus = 0
    devices = _info_dict(row, "GpuInfo").get("Gpus")
    if isinstance(devices, list):
        gpus = sum(
            _coerce_count(device.get("Count"))
            for device in devices
            if isinstance(device, dict)
        )

    return {"InstanceType": name, "vCPUs": vcpu, "MemoryMiB": mem, "GPUs": gpus}

def preprocess_catalog(input_csv, output_csv):
    """Reduce a full instance catalog CSV to a minimal resource table.

    The input is read with every column as a string and missing cells replaced
    by "". Each row is passed to ``extract_minimal_resources_from_row``; rows
    whose resulting "InstanceType" is empty are dropped. The remaining rows
    are written to ``output_csv`` without the index, and a one-line summary is
    printed.

    Args:
        input_csv: Path (or anything ``pd.read_csv`` accepts) of the full catalog.
        output_csv: Destination path for the reduced catalog.

    Returns:
        None.
    """
    output_columns = ["InstanceType", "vCPUs", "MemoryMiB", "GPUs"]

    catalog = pd.read_csv(input_csv, dtype=str).fillna("")
    if "InstanceType" not in catalog.columns and "instance_type" in catalog.columns:
        catalog = catalog.rename(columns={"instance_type": "InstanceType"})

    extracted = (
        extract_minimal_resources_from_row(record)
        for record in catalog.to_dict(orient="records")
    )
    minimal_rows = [minimal for minimal in extracted if minimal["InstanceType"]]

    # Explicit columns keep the header row in the output even when no input
    # row survives; a column-less frame would be written without it.
    minimal_df = pd.DataFrame(minimal_rows, columns=output_columns)
    minimal_df.to_csv(output_csv, index=False)
    print(f"Saved preprocessed catalog to {output_csv}, {len(minimal_df)} rows.")

In [7]:
# Build the reduced catalog (InstanceType, vCPUs, MemoryMiB, GPUs) from the full export.
preprocess_catalog(
    input_csv="grouped_full_catalog.csv",
    output_csv="preprocessed_catalog.csv",
)

Saved preprocessed catalog to preprocessed_catalog.csv, 1044 rows.
