# Personality Atlas — Quick Start

**Reproduce the atlas results in under 5 minutes.** No API keys required.

> Raetano, J., Gregor, J., & Tamang, S. (2026). *A Survey and Computational Atlas of Personality Models.* ACM TIST. Under review.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Wildertrek/survey/blob/main/notebooks/atlas_quick_start.ipynb)

---

## 1. Setup (~30 seconds)

In [None]:
# Clone the atlas repository (skip if already cloned)
import os
if not os.path.exists("atlas"):
    !git clone --depth 1 https://github.com/Wildertrek/survey.git atlas
else:
    print("Atlas already cloned — skipping.")

In [None]:
# Install dependencies (uses Colab's pre-installed sklearn — no version conflicts)
!pip install -q faiss-cpu

# Suppress sklearn version warning (models trained with 1.5.0, works fine with newer versions)
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

## 2. Load and Predict — Any of 44 Models (~10 lines)

In [None]:
import ast
import numpy as np
import pandas as pd
import joblib

# Pick any model slug (a file stem under atlas/datasets):
# ocean, hex, mbti, mmpi, scid, npi, dt4, tci, ...
SLUG = "ocean"

# Trait lexicon with its ground-truth "Factor" column.
df = pd.read_csv(f"atlas/datasets/{SLUG}.csv")
# Pre-computed embeddings; each "Embedding" cell is a Python list literal.
embeddings = pd.read_csv(f"atlas/Embeddings/{SLUG}_embeddings.csv")
# NOTE: joblib.load unpickles its input — only load model files you trust.
model = joblib.load(f"atlas/models/{SLUG}_rf_model.pkl")
encoder = joblib.load(f"atlas/models/{SLUG}_label_encoder.pkl")

X = np.array([ast.literal_eval(e) for e in embeddings["Embedding"]])

# Predictions are compared with the dataset row by row, so the two files
# must describe the same traits in the same order.
if len(X) != len(df):
    raise ValueError(
        f"{SLUG}: {len(X)} embeddings but {len(df)} dataset rows — files are out of sync"
    )

predictions = encoder.inverse_transform(model.predict(X))
accuracy = (predictions == df["Factor"].values).mean()

print(f"{SLUG.upper()}: {len(df)} traits, {len(set(predictions))} factors, accuracy = {accuracy:.1%}")
print(f"Factors: {sorted(set(predictions))}")

## 3. Atlas Overview — All 44 Models

In [None]:
import os

# Every CSV under atlas/datasets is one personality model; the file stem is its slug.
slugs = sorted([f.replace(".csv", "") for f in os.listdir("atlas/datasets") if f.endswith(".csv")])
print(f"Atlas contains {len(slugs)} personality models:\n")

results = []
for slug in slugs:
    df = pd.read_csv(f"atlas/datasets/{slug}.csv")
    n_factors = df["Factor"].nunique()

    model = joblib.load(f"atlas/models/{slug}_rf_model.pkl")
    enc = joblib.load(f"atlas/models/{slug}_label_encoder.pkl")
    emb = pd.read_csv(f"atlas/Embeddings/{slug}_embeddings.csv")
    X = np.array([ast.literal_eval(e) for e in emb["Embedding"]])
    preds = enc.inverse_transform(model.predict(X))
    acc = (preds == df["Factor"].values).mean()

    results.append({
        "Model": slug.upper(),
        "Traits": len(df),
        "Factors": n_factors,
        "Accuracy": f"{acc:.1%}",
        "_acc": acc,  # numeric copy, used only for ranking
    })

# Rank on the numeric accuracy. Sorting the formatted "Accuracy" strings is
# lexicographic and would place "98.5%" above "100.0%".
results_df = pd.DataFrame(results).sort_values("_acc", ascending=False).drop(columns=["_acc"])
results_df.index = range(1, len(results_df) + 1)
results_df

## 4. Reproduce PCA — Cross-Model Dimensionality Analysis (Paper Figures 5–8)

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Category assignments for the 7-category taxonomy
CATEGORIES = {
    "Trait-Based": ["ocean", "hex", "mbti", "epm", "sixteenpf", "ftm"],
    "Narcissism-Based": ["npi", "pni", "hsns", "dtm", "dt4", "ffni", "ffni_sf", "narq", "mcmin", "ipn"],
    "Motivational/Value": ["stbv", "sdt", "rft", "aam", "mst", "cs"],
    "Cognitive/Learning": ["pct", "cest", "scm", "fsls"],
    "Clinical/Health": ["mmpi", "scid", "bdi", "gad7", "wais", "tci", "mcmi", "tmp", "rit", "tat"],
    "Interpersonal/Conflict": ["disc", "tki"],
    "Application-Specific": ["riasec", "cmoa", "tei", "bt", "em", "papc"]
}
# Reverse lookup slug -> category. The inner variable is deliberately not
# called `slugs`, to avoid confusion with the global list of model slugs.
slug_to_cat = {s: cat for cat, members in CATEGORIES.items() for s in members}

# Load all embeddings. The three lists below are row-aligned with X_all:
# entry i describes embedding row i.
all_vecs, all_labels, all_cats = [], [], []
for slug in slugs:
    emb = pd.read_csv(f"atlas/Embeddings/{slug}_embeddings.csv")
    X = np.array([ast.literal_eval(e) for e in emb["Embedding"]])
    all_vecs.append(X)
    all_labels.extend([slug.upper()] * len(X))
    all_cats.extend([slug_to_cat.get(slug, "Unknown")] * len(X))

X_all = np.vstack(all_vecs)
n_unknown = all_cats.count("Unknown")
print(f"Loaded {X_all.shape[0]} embeddings ({X_all.shape[1]}-dim) from {len(slugs)} models")
if n_unknown > 0:
    # Name the models missing from CATEGORIES so the mapping can be fixed.
    unknown_slugs = sorted(s for s in slugs if s not in slug_to_cat)
    print(f"WARNING: {n_unknown} embeddings have Unknown category (models: {', '.join(unknown_slugs)})")

In [None]:
# PCA — Scree plot (Paper Figure 5)
# Keep up to 50 components. PCA cannot return more components than
# min(n_samples, n_features), so cap the request for small subsets.
n_components = min(50, *X_all.shape)
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_all)
cumvar = np.cumsum(pca.explained_variance_ratio_) * 100

# Derive axis positions and title counts from the data instead of repeating literals.
pc_numbers = range(1, n_components + 1)
n_models = len(set(all_labels))

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(pc_numbers, pca.explained_variance_ratio_ * 100, alpha=0.6, label="Individual")
ax.plot(pc_numbers, cumvar, "r-o", markersize=3, label="Cumulative")
# Dashed guide at the total variance captured by the retained components.
ax.axhline(y=cumvar[-1], color="gray", linestyle="--", alpha=0.5)
ax.set_xlabel("Principal Component")
ax.set_ylabel("Variance Explained (%)")
ax.set_title(f"PCA Scree Plot — {X_all.shape[0]} Trait Embeddings from {n_models} Models\n{n_components} PCs capture {cumvar[-1]:.1f}% of variance")
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# PCA — Model centroids in PC1-PC2 space (Paper Figure 6)
# One fixed colour per taxonomy category.
cat_colors = {
    "Trait-Based": "#1f77b4",
    "Narcissism-Based": "#ff7f0e",
    "Motivational/Value": "#2ca02c",
    "Cognitive/Learning": "#d62728",
    "Clinical/Health": "#9467bd",
    "Interpersonal/Conflict": "#8c564b",
    "Application-Specific": "#e377c2"
}

# Average the first two principal-component scores of each model's traits.
projected = pd.DataFrame({
    "PC1": X_pca[:, 0], "PC2": X_pca[:, 1],
    "Model": all_labels, "Category": all_cats
})
centroids = projected.groupby(["Model", "Category"])[["PC1", "PC2"]].mean().reset_index()

fig, ax = plt.subplots(figsize=(12, 8))
for cat, color in cat_colors.items():
    subset = centroids.loc[centroids["Category"] == cat]
    ax.scatter(subset["PC1"], subset["PC2"], c=color, s=80, label=cat, alpha=0.8, edgecolors="white", linewidth=0.5)
    # Label each centroid with its model name, nudged off the marker.
    for name, pc1, pc2 in zip(subset["Model"], subset["PC1"], subset["PC2"]):
        ax.annotate(name, (pc1, pc2), fontsize=7, alpha=0.7,
                    xytext=(4, 4), textcoords="offset points")

pc1_share = pca.explained_variance_ratio_[0] * 100
pc2_share = pca.explained_variance_ratio_[1] * 100
ax.set_xlabel(f"PC1 ({pc1_share:.1f}% variance)")
ax.set_ylabel(f"PC2 ({pc2_share:.1f}% variance)")
ax.set_title("44 Personality Models in PC1-PC2 Space (Model Centroids)")
ax.legend(loc="best", fontsize=8)
plt.tight_layout()
plt.show()

In [None]:
# PCA — All traits colored by category (Paper Figure 8)
fig, ax = plt.subplots(figsize=(12, 8))

# Convert once so each category's membership test is a vectorised comparison.
category_of_row = np.asarray(all_cats)
for cat, color in cat_colors.items():
    in_cat = category_of_row == cat
    ax.scatter(X_pca[in_cat, 0], X_pca[in_cat, 1], c=color, s=4, alpha=0.3, label=cat)

pc1_share = pca.explained_variance_ratio_[0] * 100
pc2_share = pca.explained_variance_ratio_[1] * 100
ax.set_xlabel(f"PC1 ({pc1_share:.1f}% variance)")
ax.set_ylabel(f"PC2 ({pc2_share:.1f}% variance)")
ax.set_title(f"All {X_all.shape[0]} Trait Embeddings — 44 Models, 7 Categories")
# markerscale enlarges the tiny scatter dots in the legend only.
ax.legend(markerscale=5, fontsize=8)
plt.tight_layout()
plt.show()

## 5. Cross-Model Search with FAISS

In [None]:
import faiss

# Build FAISS index over entire atlas.
# Rows are scaled to unit length, so the inner-product index scores by cosine similarity.
row_norms = np.linalg.norm(X_all, axis=1, keepdims=True)
X_norm = X_all / row_norms
embedding_dim = X_norm.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(X_norm.astype(np.float32))

# Build metadata for lookups: the factor label of every indexed row,
# collected in the same model order used to stack X_all.
all_factors = []
for slug in slugs:
    df = pd.read_csv(f"atlas/datasets/{slug}.csv")
    all_factors += list(df["Factor"].values)

print(f"FAISS index: {index.ntotal} vectors, {embedding_dim}-dim")
print("Ready for cross-model personality search")

In [None]:
# Query: find similar traits across all 44 models
# This demonstrates cross-category retrieval — the atlas's core value
query_slug = "ocean"
query_factor = "Extraversion"

query_df = pd.read_csv(f"atlas/datasets/{query_slug}.csv")
query_emb = pd.read_csv(f"atlas/Embeddings/{query_slug}_embeddings.csv")

# Find the first row matching the target factor. Work with row *positions*
# (not index labels) because the result is used with .iloc below.
factor_mask = (query_df["Factor"] == query_factor).values
if not factor_mask.any():
    available = sorted(query_df["Factor"].dropna().astype(str).unique())
    raise ValueError(f"Factor {query_factor!r} not found in model {query_slug!r}; available factors: {available}")
idx = int(factor_mask.argmax())  # argmax of a boolean array = position of the first True

q = np.array([ast.literal_eval(query_emb["Embedding"].iloc[idx])]).astype(np.float32)
q = q / np.linalg.norm(q)  # unit length, matching the normalised index vectors

# The query trait is itself part of the index, so it normally comes back as the top hit.
D, I = index.search(q, 20)

query_trait = query_df.iloc[idx]
print(f"Query: {query_slug.upper()} / {query_factor} — \"{query_trait['Adjective']}\"\n")
print(f"{'Rank':<5} {'Model':<12} {'Factor':<35} {'Category':<22} {'Score':.5}")
print("-" * 85)
for rank, (i, score) in enumerate(zip(I[0], D[0]), 1):
    print(f"{rank:<5} {all_labels[i]:<12} {all_factors[i]:<35} {all_cats[i]:<22} {score:.4f}")

In [None]:
# Second query: Dark Triad Machiavellianism → cross-category retrieval
# Shows how a narcissism-based construct connects to clinical, trait, and motivational models
query_slug = "dtm"
query_factor = "Machiavellianism"

query_df = pd.read_csv(f"atlas/datasets/{query_slug}.csv")
query_emb = pd.read_csv(f"atlas/Embeddings/{query_slug}_embeddings.csv")

# Find the first row matching the target factor. Work with row *positions*
# (not index labels) because the result is used with .iloc below.
factor_mask = (query_df["Factor"] == query_factor).values
if not factor_mask.any():
    available = sorted(query_df["Factor"].dropna().astype(str).unique())
    raise ValueError(f"Factor {query_factor!r} not found in model {query_slug!r}; available factors: {available}")
idx = int(factor_mask.argmax())  # argmax of a boolean array = position of the first True

q = np.array([ast.literal_eval(query_emb["Embedding"].iloc[idx])]).astype(np.float32)
q = q / np.linalg.norm(q)  # unit length, matching the normalised index vectors

# The query trait is itself part of the index, so it normally comes back as the top hit.
D, I = index.search(q, 20)

query_trait = query_df.iloc[idx]
print(f"Query: {query_slug.upper()} / {query_factor} — \"{query_trait['Adjective']}\"\n")
print(f"{'Rank':<5} {'Model':<12} {'Factor':<35} {'Category':<22} {'Score':.5}")
print("-" * 85)
for rank, (i, score) in enumerate(zip(I[0], D[0]), 1):
    print(f"{rank:<5} {all_labels[i]:<12} {all_factors[i]:<35} {all_cats[i]:<22} {score:.4f}")

# Count unique categories and models in results
result_cats = set(all_cats[i] for i in I[0])
result_models = set(all_labels[i] for i in I[0])
print(f"\n→ {len(result_cats)} categories, {len(result_models)} models in top 20 — cross-tradition retrieval")

## 6. Lexical Schema — What's in Each Model

In [None]:
# Inspect any model's lexical schema
INSPECT = "hex"  # change to any slug (ocean, hex, mbti, mmpi, scid, npi, dt4, tci, ...)

df = pd.read_csv(f"atlas/datasets/{INSPECT}.csv")
factor_count = df["Factor"].nunique()
print(f"{INSPECT.upper()}: {len(df)} traits across {factor_count} factors\n")

# For each factor, show up to five distinct adjectives as a sample.
for factor, group in df.groupby("Factor"):
    sample = ", ".join(group["Adjective"].unique()[:5])
    print(f"  {factor} ({len(group)} traits): {sample}, ...")

## 7. Experiment 1 — Novel Item Evaluation (Paper Section 6.2)

The accuracy leaderboard in Section 3 measures how well each classifier separates its own training lexicon — a reproduction check, not a generalization test. Models with few traits relative to 1,536 embedding dimensions can reach 100% by memorizing the training set.

Here we evaluate on **5,052 truly novel test items** generated independently by GPT-4o from factor definitions alone, without access to the training data. Pre-computed 1,536-dim embeddings are included in the repository (no API key needed). This is the actual generalization accuracy reported in the paper.

In [None]:
import json
from sklearn.metrics import accuracy_score

# Load pre-computed test items (generated by GPT-4o, embedded offline).
# Context managers close both files once their contents are in memory.
with open("atlas/data/test_items/test_items.json", encoding="utf-8") as fh:
    test_items = json.load(fh)
with np.load("atlas/data/test_items/test_items_embeddings.npz") as npz:
    test_emb = npz["embeddings"]

# Group test-item row positions by model once, rather than rescanning the
# full item list for every model.
items_by_slug = {}
for i, item in enumerate(test_items):
    items_by_slug.setdefault(item["slug"], []).append(i)

# Evaluate all 44 models on novel items
novel_results = []
for slug in slugs:
    idx = items_by_slug.get(slug)
    if not idx:
        continue  # no novel items for this model — skip before loading anything

    model_rf = joblib.load(f"atlas/models/{slug}_rf_model.pkl")
    enc = joblib.load(f"atlas/models/{slug}_label_encoder.pkl")

    X_novel = test_emb[idx]
    y_true = [test_items[i]["expected_factor"] for i in idx]
    y_pred = enc.inverse_transform(model_rf.predict(X_novel))
    acc = accuracy_score(y_true, y_pred)
    # "_acc" keeps the numeric value for sorting and averaging; it is dropped from the display.
    novel_results.append({"Model": slug.upper(), "Items": len(idx), "Novel Accuracy": f"{acc:.1%}", "_acc": acc})

novel_df = pd.DataFrame(novel_results).sort_values("_acc", ascending=False).drop(columns=["_acc"])
novel_df.index = range(1, len(novel_df) + 1)

mean_novel = np.mean([r["_acc"] for r in novel_results])
print(f"Mean novel-item accuracy: {mean_novel:.1%} (vs. ~98% on training data)")
print(f"This is the generalization accuracy reported in the paper.\n")
novel_df

## 8. Experiment 2 — 3072-dim Embedding Upgrade (Optional)

The atlas also includes upgraded **3072-dim** embeddings (`text-embedding-3-large`) and retrained RF classifiers from Experiment 2. These are hosted on [Hugging Face Hub](https://huggingface.co/datasets/Wildertrek/personality-atlas-3072) due to size (547 MB total).

Run the cells below to download and compare 1536 vs 3072 accuracy.

In [None]:
# huggingface_hub provides hf_hub_download, used below to fetch the 3072-dim assets.
!pip install -q huggingface_hub

import os
# Opt out of Hugging Face Hub telemetry.
# NOTE(review): set before the huggingface_hub import below, presumably because
# the library reads this variable at import time — confirm before reordering.
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

import warnings
# Hide warnings whose message mentions HF_TOKEN or unauthenticated access;
# this notebook is meant to run without an API key.
warnings.filterwarnings("ignore", message=".*HF_TOKEN.*")
warnings.filterwarnings("ignore", message=".*unauthenticated.*")

from huggingface_hub import hf_hub_download

# Hub repository id; it is accessed with repo_type="dataset" in the calls below.
HF_REPO = "Wildertrek/personality-atlas-3072"

def load_3072(slug):
    """Fetch one model's 3072-dim assets from the Hugging Face Hub and load them.

    Args:
        slug: Model slug used to build the asset file names (e.g. "ocean").

    Returns:
        A tuple ``(X, model, encoder)``: the embedding matrix parsed from the
        CSV's "Embedding" column, followed by the joblib-loaded RF model and
        label encoder.
    """
    def fetch(filename):
        # Returns the local path of the downloaded file.
        return hf_hub_download(HF_REPO, filename, repo_type="dataset")

    emb_path = fetch(f"Embeddings_3072/{slug}_embeddings.csv")
    model_path = fetch(f"models_3072/{slug}_rf_model.pkl")
    enc_path = fetch(f"models_3072/{slug}_label_encoder.pkl")

    # Each "Embedding" cell holds a Python list literal; parse every row into a vector.
    emb_df = pd.read_csv(emb_path)
    X = np.array(list(map(ast.literal_eval, emb_df["Embedding"])))

    # NOTE: joblib.load unpickles downloaded files — only point HF_REPO at a trusted repository.
    rf_model = joblib.load(model_path)
    label_encoder = joblib.load(enc_path)
    return X, rf_model, label_encoder

print("Ready to download 3072-dim assets from HuggingFace.")

In [None]:
# Compare 1536 vs 3072 accuracy across all 44 models
comparison = []
for slug in slugs:
    df = pd.read_csv(f"atlas/datasets/{slug}.csv")
    y_true = df["Factor"].values

    # 1536-dim (already loaded from repo)
    m1536 = joblib.load(f"atlas/models/{slug}_rf_model.pkl")
    e1536 = joblib.load(f"atlas/models/{slug}_label_encoder.pkl")
    emb1536 = pd.read_csv(f"atlas/Embeddings/{slug}_embeddings.csv")
    X1536 = np.array([ast.literal_eval(e) for e in emb1536["Embedding"]])
    acc1536 = (e1536.inverse_transform(m1536.predict(X1536)) == y_true).mean()

    # 3072-dim (downloaded from HuggingFace)
    X3072, m3072, e3072 = load_3072(slug)
    acc3072 = (e3072.inverse_transform(m3072.predict(X3072)) == y_true).mean()

    delta = (acc3072 - acc1536) * 100
    comparison.append({
        "Model": slug.upper(),
        "1536-dim": f"{acc1536:.1%}",
        "3072-dim": f"{acc3072:.1%}",
        "Delta": f"{delta:+.1f}pp",
        # Underscore-prefixed columns keep the unrounded numbers for sorting
        # and averaging; they are dropped from the displayed table.
        "_delta": delta,
        "_acc1536": acc1536,
        "_acc3072": acc3072,
    })

hidden_cols = ["_delta", "_acc1536", "_acc3072"]
comp_df = pd.DataFrame(comparison).sort_values("_delta", ascending=False).drop(columns=hidden_cols)
comp_df.index = range(1, len(comp_df) + 1)

# A change within ±0.05pp shows as "+0.0pp"/"-0.0pp" and is counted as unchanged.
improved = sum(1 for c in comparison if c["_delta"] > 0.05)
decreased = sum(1 for c in comparison if c["_delta"] < -0.05)
unchanged = len(comparison) - improved - decreased
# Average the raw accuracies; re-parsing the rounded display strings loses precision.
mean_1536 = np.mean([c["_acc1536"] for c in comparison]) * 100
mean_3072 = np.mean([c["_acc3072"] for c in comparison]) * 100

print(f"1536-dim mean: {mean_1536:.1f}% | 3072-dim mean: {mean_3072:.1f}% | Delta: {mean_3072 - mean_1536:+.1f}pp")
print(f"{improved} improved, {decreased} decreased, {unchanged} unchanged\n")
comp_df

### Novel item accuracy: 1536-dim vs 3072-dim

The table above compares accuracy on each model's own training data. Below we repeat the comparison on the **5,052 novel test items** from Section 7, which the classifiers have never seen. Pre-computed 3072-dim embeddings are downloaded from HuggingFace (no API key needed).

In [None]:
# Novel item accuracy: 1536-dim vs 3072-dim on truly novel test items
# Downloads 3072-dim test item embeddings from HuggingFace (no API key needed)
test3072_path = hf_hub_download(HF_REPO, "test_items/test_items_embeddings_3072.npz", repo_type="dataset")
with np.load(test3072_path) as npz:
    test_emb_3072 = npz["embeddings"]

# Group test-item row positions by model once, rather than rescanning per model.
items_by_slug = {}
for i, item in enumerate(test_items):
    items_by_slug.setdefault(item["slug"], []).append(i)

novel_comparison = []
for slug in slugs:
    idx = items_by_slug.get(slug)
    if not idx:
        continue  # no novel items for this model — skip before loading anything

    m1536 = joblib.load(f"atlas/models/{slug}_rf_model.pkl")
    e1536 = joblib.load(f"atlas/models/{slug}_label_encoder.pkl")

    # Only the 3072-dim classifier and label encoder are needed here. Fetch them
    # directly instead of also downloading and parsing the 3072-dim training
    # embeddings, which this cell never uses.
    m3072 = joblib.load(hf_hub_download(HF_REPO, f"models_3072/{slug}_rf_model.pkl", repo_type="dataset"))
    e3072 = joblib.load(hf_hub_download(HF_REPO, f"models_3072/{slug}_label_encoder.pkl", repo_type="dataset"))

    y_true = [test_items[i]["expected_factor"] for i in idx]
    acc_n1536 = accuracy_score(y_true, e1536.inverse_transform(m1536.predict(test_emb[idx])))
    acc_n3072 = accuracy_score(y_true, e3072.inverse_transform(m3072.predict(test_emb_3072[idx])))
    delta = (acc_n3072 - acc_n1536) * 100
    novel_comparison.append({
        "Model": slug.upper(), "Items": len(idx),
        "1536-dim": f"{acc_n1536:.1%}", "3072-dim": f"{acc_n3072:.1%}",
        "Delta": f"{delta:+.1f}pp",
        # Unrounded values for sorting and averaging; dropped from the displayed table.
        "_delta": delta, "_acc1536": acc_n1536, "_acc3072": acc_n3072,
    })

hidden_cols = ["_delta", "_acc1536", "_acc3072"]
nc_df = pd.DataFrame(novel_comparison).sort_values("_delta", ascending=False).drop(columns=hidden_cols)
nc_df.index = range(1, len(nc_df) + 1)

# Average the raw accuracies; re-parsing the rounded display strings loses precision.
mean_n1536 = np.mean([c["_acc1536"] for c in novel_comparison]) * 100
mean_n3072 = np.mean([c["_acc3072"] for c in novel_comparison]) * 100
improved = sum(1 for c in novel_comparison if c["_delta"] > 0.05)
decreased = sum(1 for c in novel_comparison if c["_delta"] < -0.05)
unchanged = len(novel_comparison) - improved - decreased

print(f"Novel item accuracy: 1536-dim mean: {mean_n1536:.1f}% | 3072-dim mean: {mean_n3072:.1f}% | Delta: {mean_n3072 - mean_n1536:+.1f}pp")
print(f"{improved} improved, {decreased} decreased, {unchanged} unchanged\n")
nc_df

## 9. Experiment 3 — External Validation: DSM-5 Clinical Alignment (Paper Section 6.3)

The atlas was built from personality research instruments, but personality constructs overlap heavily with clinical psychology. If the atlas taxonomy is well-structured, clinical constructs should route systematically to the correct category.

We test this by classifying all **222 DSM-5-TR disorders** through the atlas. Each disorder's name and diagnostic keywords are embedded and queried against the FAISS index. The predicted category for each disorder — the most common category among its 10 nearest atlas traits — tells us whether the atlas's Clinical/Health category actually captures clinical constructs, or whether they leak into other categories.

**Data:** `atlas/data/dsm5_disorders.json` (222 disorders, 21 DSM-5 categories, 786 keywords)  
**Pre-computed:** `atlas/data/dsm5_embeddings.csv` (no API key needed)

In [None]:
# Classify 222 DSM-5-TR disorders through the atlas
# Uses pre-computed embeddings — no API key needed
import json
from collections import Counter

# Context manager closes the JSON file once it has been parsed.
with open("atlas/data/dsm5_disorders.json", encoding="utf-8") as fh:
    dsm5 = json.load(fh)
dsm5_emb = pd.read_csv("atlas/data/dsm5_embeddings.csv")
dsm5_vecs = np.array([ast.literal_eval(e) for e in dsm5_emb["Embedding"]]).astype(np.float32)
# Unit-normalise so inner-product scores are comparable with the normalised index vectors.
dsm5_vecs = dsm5_vecs / np.linalg.norm(dsm5_vecs, axis=1, keepdims=True)

# Disorder i is paired with embedding row i below, so the two files must line up.
if len(dsm5_vecs) != len(dsm5):
    raise ValueError(
        f"{len(dsm5)} disorders but {len(dsm5_vecs)} embeddings — DSM-5 files are out of sync"
    )

# Query each disorder against the atlas FAISS index (built in Section 5)
dsm5_results = []
for i, disorder in enumerate(dsm5):
    q = dsm5_vecs[i:i+1]  # slice (not index) keeps the 2-D shape passed to the search call
    D, I = index.search(q, 10)
    top_cats = [all_cats[j] for j in I[0]]
    # Majority vote over the 10 nearest atlas traits.
    predicted_cat = Counter(top_cats).most_common(1)[0][0]
    dsm5_results.append({
        "disorder": disorder["disorder_name"],
        "dsm5_category": disorder["dsm5_category"],
        "predicted_atlas_cat": predicted_cat,
        "is_clinical": predicted_cat == "Clinical/Health",
        "top_cat_counts": dict(Counter(top_cats))
    })

clinical_count = sum(1 for r in dsm5_results if r["is_clinical"])
pct = clinical_count / len(dsm5_results) * 100
print(f"DSM-5 Clinical Alignment: {clinical_count}/{len(dsm5_results)} disorders ({pct:.1f}%) route to Clinical/Health")
print(f"This confirms the atlas taxonomy correctly captures clinical constructs.\n")

# Show the few that don't route to Clinical/Health
non_clinical = [r for r in dsm5_results if not r["is_clinical"]]
if non_clinical:
    print(f"{len(non_clinical)} disorders route elsewhere:")
    for r in non_clinical:
        print(f"  {r['disorder'][:60]:<62} → {r['predicted_atlas_cat']}")
else:
    # Report the actual count rather than a hard-coded number.
    print(f"All {len(dsm5_results)} disorders route to Clinical/Health.")

In [None]:
# Per DSM-5 category breakdown — which clinical domains route where?
dsm5_cats = sorted(set(r["dsm5_category"] for r in dsm5_results))

print(f"{'DSM-5 Category':<45} {'N':>3}  {'Clinical':>8}  {'Other':>5}  {'Pct':>5}")
print("-" * 72)
for cat in dsm5_cats:
    cat_results = [r for r in dsm5_results if r["dsm5_category"] == cat]
    n = len(cat_results)
    clin = sum(1 for r in cat_results if r["is_clinical"])
    other = n - clin
    # Per-category percentage gets its own name so it cannot overwrite the
    # overall figure printed in the total line.
    cat_pct = clin / n * 100
    marker = "" if other == 0 else " *"
    print(f"{cat:<45} {n:>3}  {clin:>8}  {other:>5}  {cat_pct:>4.0f}%{marker}")

# Overall share, computed here so the total line never shows a per-category value.
overall_pct = clinical_count / len(dsm5_results) * 100
print(f"\n* = categories where some disorders route outside Clinical/Health")
print(f"Total: {len(dsm5_results)} disorders, {clinical_count} ({overall_pct:.1f}%) route to Clinical/Health")

---

**Repository:** [github.com/Wildertrek/survey](https://github.com/Wildertrek/survey) | **3072-dim assets:** [Hugging Face Hub](https://huggingface.co/datasets/Wildertrek/personality-atlas-3072)  
**Paper:** Raetano, J., Gregor, J., & Tamang, S. (2026). *A Survey and Computational Atlas of Personality Models.* ACM TIST.  
**License:** MIT