In [None]:
import os
import json
from collections import Counter, defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:

# ======= 1. Configure root directory =======
# BASE_DIR must be the folder that directly contains the per-seed sub-folders
# (seed_1, seed_2, ...). It is built from the number of canonical components.
# NOTE: the variable is spelled `n_componets` (sic); the spelling is kept
# because the plotting cell further down refers to it under this name.
n_componets = 3
BASE_DIR = "../output/seed_runs_dim{}".format(n_componets)  # TODO: change to your actual path


def normalize_item_name(name: str) -> str:
    """
    Return `name` with every run of whitespace collapsed to a single space.

    Leading and trailing whitespace is dropped too, so two item labels that
    differ only in spacing or line breaks end up as the same string.
    """
    tokens = name.split()  # no-arg split: splits on any whitespace, drops empties
    return " ".join(tokens)


def main(base_dir=None):
    """
    Count, per canonical component, in how many seed runs each top item appears.

    Every ``seed_<id>`` sub-folder of the root directory is expected to hold a
    ``summary_seed_<id>.json`` file. From each file the mapping
    ``data["cca"]["top_items"]`` (CC name -> list of dicts with an ``"item"``
    key) is read, item names are whitespace-normalised, and each distinct item
    contributes +1 to its CC for that seed. The function then

    * prints the 20 most frequent items per CC,
    * writes ``cc_top_items_across_seeds.csv`` (columns ``CC,item,seeds_count``)
      into the root directory, and
    * reports summary files that were missing or could not be parsed.

    Parameters
    ----------
    base_dir : str, optional
        Directory that contains the ``seed_*`` folders. When omitted, the
        module-level ``BASE_DIR`` is used (the original behaviour).

    Returns
    -------
    None

    Raises
    ------
    OSError
        If the root directory cannot be listed or the CSV cannot be written.
    """
    import csv  # local import: only this function needs it

    root = BASE_DIR if base_dir is None else base_dir

    # item_seed_count_by_cc[CC_name][item] = number of seeds in which item appears
    item_seed_count_by_cc = defaultdict(Counter)

    num_seeds = 0
    missing_files = []
    failed_files = []

    def ranked(counter):
        # Highest count first; ties broken by item name so that the printed
        # ranking and the CSV are identical from run to run.
        return sorted(counter.items(), key=lambda pair: (-pair[1], pair[0]))

    # ======= 2. Iterate over all seed_* folders =======
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for folder in sorted(os.listdir(root)):
        folder_path = os.path.join(root, folder)
        if not folder.startswith("seed_") or not os.path.isdir(folder_path):
            continue

        seed_id = folder.split("_", 1)[-1]  # "seed_1" -> "1"
        summary_path = os.path.join(folder_path, f"summary_seed_{seed_id}.json")

        if not os.path.exists(summary_path):
            missing_files.append(summary_path)
            continue

        try:
            with open(summary_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole scan.
            print(f"[ERROR] Failed to read {summary_path}: {e}")
            failed_files.append(summary_path)
            continue

        if not isinstance(data, dict):
            print(f"[ERROR] Failed to read {summary_path}: top-level JSON value is not an object")
            failed_files.append(summary_path)
            continue

        # Count the seed only once its summary has actually been loaded, so the
        # "successfully processed" total excludes unreadable files.
        num_seeds += 1

        cca = data.get("cca")
        top_items = cca.get("top_items") if isinstance(cca, dict) else None
        if not isinstance(top_items, dict):
            top_items = {}

        # ======= 3. For each CC, record unique top items in this seed =======
        for cc_name, items_list in top_items.items():
            seen_items_this_seed_cc = set()

            for item_info in items_list or []:
                if not isinstance(item_info, dict):
                    continue
                raw_name = (item_info.get("item") or "").strip()
                if not raw_name:
                    continue
                seen_items_this_seed_cc.add(normalize_item_name(raw_name))

            # Each unique item in this CC receives +1 seed count
            for item in seen_items_this_seed_cc:
                item_seed_count_by_cc[cc_name][item] += 1

    print(f"\nTotal summary files successfully processed: {num_seeds}")
    if failed_files:
        print(f"Summary files that could not be parsed: {len(failed_files)}")

    # ======= 4. Print top items for each CC (top 20 by seed frequency) =======
    for cc_name in sorted(item_seed_count_by_cc):
        print(f"\n=== {cc_name}: Top items appearing in the most seeds (Top 20) ===")
        for item, count in ranked(item_seed_count_by_cc[cc_name])[:20]:
            print(f"{count:3d} seeds  |  {item}")

    # ======= 5. Save consolidated CSV: CC, item, seeds_count =======
    # csv.writer quotes/escapes every field as needed (CC names included).
    output_csv = os.path.join(root, "cc_top_items_across_seeds.csv")
    with open(output_csv, "w", encoding="utf-8-sig", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["CC", "item", "seeds_count"])
        for cc_name in sorted(item_seed_count_by_cc):
            for item, count in ranked(item_seed_count_by_cc[cc_name]):
                writer.writerow([cc_name, item, count])

    print(f"\nSaved consolidated table to: {output_csv}")

    # ======= 6. Report missing summary files =======
    if missing_files:
        print("\nThe following summary files were NOT found (check naming/path issues):")
        for p in missing_files[:20]:
            print("  ", p)
        if len(missing_files) > 20:
            print(f"... and {len(missing_files) - 20} more not shown")



# Entry point: run the aggregation when this code executes as the main module.
# NOTE(review): in a Jupyter/IPython kernel __name__ is "__main__", so running
# this cell calls main() directly (the captured output under the cell is
# consistent with that).
if __name__ == "__main__":
    main()



Total summary files successfully processed: 491

=== CC1: Top items appearing in the most seeds (Top 20) ===
491 seeds  |  Sudden changes in mood or feelings Súbitos cambios de humor o sentimientos
488 seeds  |  Can't get their mind off certain thoughts; obsessions
486 seeds  |  Stubborn, sullen, or irritable Obstinado(a), malhumorado(a), irritable
485 seeds  |  Can't concentrate, can't pay attention for long No puede concentrarse o prestar atención por mucho tiempo; Can't sit still, restless, or hyperactive No puede quedarse quieto(a); es inquieto(a) o hiperactivo(a); Inattentive or easily distracted No presta atención o se distrae fácilmente
483 seeds  |  Impulsive or acts without thinking Impulsivo(a); actúa sin pensar
460 seeds  |  Argues a lot Discute mucho
456 seeds  |  Demands a lot of attention Exige mucha atención
444 seeds  |  Fails to finish things they start
432 seeds  |  Temper tantrums or hot temper Le dan rabietas o tiene mal genio
252 seeds  |  Disobedient at home Deso

In [None]:

# ========= 1. Load the consolidated CSV =========
# Built with os.path.join, the same way main() builds the path it writes to.
CSV_PATH = os.path.join(BASE_DIR, "cc_top_items_across_seeds.csv")

df_raw = pd.read_csv(CSV_PATH)

# Check content
print("Loaded:", df_raw.head())


# ========= 2. Pivot into item × CC matrix =========
# Rows = items, columns = CCs, values = number of seeds; absent pairs become 0.
freq_mat = df_raw.pivot_table(
    index="item",
    columns="CC",
    values="seeds_count",
    fill_value=0
)

# Ensure CC columns are ordered CC1..CCn. reindex (instead of freq_mat[[...]])
# gives a component with no rows in the CSV an all-zero column rather than
# raising KeyError.
cc_labels = [f"CC{i+1}" for i in range(n_componets)]
freq_mat = freq_mat.reindex(columns=cc_labels, fill_value=0)

print("Matrix shape:", freq_mat.shape)

# Contouring needs a grid of at least 2 x 2; fail early with a readable message.
if freq_mat.shape[0] < 2 or freq_mat.shape[1] < 2:
    raise ValueError(
        f"Need at least 2 items and 2 CCs to draw a contour map, got {freq_mat.shape}"
    )


# ========= 3. Sort items by primary CC & frequency (makes terrain clearer) =========
primary_cc = freq_mat.idxmax(axis=1)        # which CC has highest frequency
max_freq = freq_mat.max(axis=1)             # that value

sort_info = pd.DataFrame({
    "primary_cc": primary_cc,
    # Column position of the primary CC: sorting on this instead of the label
    # keeps CC10 after CC9 (a string sort would put "CC10" before "CC2").
    "primary_pos": freq_mat.to_numpy().argmax(axis=1),
    "max_freq": max_freq
}, index=freq_mat.index)

sort_info = sort_info.sort_values(
    by=["primary_pos", "max_freq"],
    ascending=[True, False]
)

freq_sorted = freq_mat.loc[sort_info.index]


# ========= 4. Build grid for contour =========
Z = freq_sorted.to_numpy()             # shape (n_items, n_cc)
n_items, n_cc = Z.shape
x = np.arange(n_cc)                    # 0..n_cc-1 -> CC1..CCn
y = np.arange(n_items)                 # one row per item
X, Y = np.meshgrid(x, y)

# Largest seed count present in the data; used for the labels below instead of
# a hard-coded number of seeds.
max_count = int(Z.max())


# ========= 5. Plot contour terrain =========
fig, ax = plt.subplots(figsize=(15, 20))

# Filled contour (terrain)
contourf = ax.contourf(
    X, Y, Z,
    levels=20,
    cmap="terrain"         # matplotlib's "terrain" (topographic-map style) colormap
)

# Contour lines
contour = ax.contour(
    X, Y, Z,
    levels=20,
    colors="k",
    linewidths=0.3,
    alpha=0.6
)

ax.clabel(contour, fontsize=6, inline=True)

# X-axis labels = CC names
ax.set_xticks(x)
ax.set_xticklabels(cc_labels)

# Y-axis labels: show roughly 25 evenly spaced items rather than every row
step = max(1, n_items // 25)
yticks = np.arange(0, n_items, step)
yticklabels = [freq_sorted.index[i] for i in yticks]
ax.set_yticks(yticks)
ax.set_yticklabels(yticklabels)

# Put the first (most frequent, CC1-dominant) items at the top of the figure.
ax.invert_yaxis()

ax.set_title(f"Contour Terrain Map of Item Frequencies across CCs (max {max_count} seeds)")
ax.set_xlabel("Canonical Components")
ax.set_ylabel("CBCL Items (sorted by primary CC & frequency)")

fig.colorbar(contourf, ax=ax, label=f"Frequency (0–{max_count})")

fig.tight_layout()

OUT_PATH = os.path.join(
    os.path.dirname(CSV_PATH),
    "frequency_contour_terrain.png"
)
fig.savefig(OUT_PATH, dpi=300, bbox_inches="tight")
plt.close(fig)

print("Saved contour terrain map to:", OUT_PATH)


Loaded:     CC                                               item  seeds_count
0  CC1  Sudden changes in mood or feelings Súbitos cam...          491
1  CC1  Can't get their mind off certain thoughts; obs...          488
2  CC1  Stubborn, sullen, or irritable Obstinado(a), m...          486
3  CC1  Can't concentrate, can't pay attention for lon...          485
4  CC1  Impulsive or acts without thinking Impulsivo(a...          483
Matrix shape: (102, 5)


  plt.tight_layout()


Saved contour terrain map to: G:\ABCD\script\trail\output\seed_run_dim2\frequency_contour_terrain.png


: 