In [1]:
import pandas as pd
import json
import plotly.graph_objects as go
import numpy as np

print("Libraries loaded successfully!")


Libraries loaded successfully!


In [2]:
# Load the augmented CSV with clustering columns
print("Loading augmented CSV...")
df = pd.read_csv("selected_conversations_with_topics_embedded_clustered.csv")

print(f"Loaded {len(df)} rows")
print(f"Total columns: {len(df.columns)}")
print(f"\nAll columns:")
print(df.columns.tolist())

print(f"\nFirst few rows:")
print(df.head(3))


Loading augmented CSV...
Loaded 1000 rows
Total columns: 26

All columns:
['Model', 'Conversation', 'Language', 'Toxic', 'State', 'Country', 'Hour of Day', 'Topic_Embedding', 'Topic', 'Subcluster_Name', 'Subcluster_Description', 'Subcluster_Description_Embedding', 'Cluster', 'Conversation_Embedding', 'L0_cluster_id', 'L0_cluster_label', 'L0_cluster_description', 'L0_cluster_description_embedding', 'L1_cluster_id', 'L1_cluster_label', 'L1_cluster_description', 'L1_cluster_description_embedding', 'L2_cluster_id', 'L2_cluster_label', 'L2_cluster_description', 'L2_cluster_description_embedding']

First few rows:
        Model                                       Conversation Language  \
0  gpt-4-0314  defaultdict(<class 'dict'>, {101001: {'user': ...  English   
1  gpt-4-0314  defaultdict(<class 'dict'>, {101004: {'user': ...  English   
2  gpt-4-0314  defaultdict(<class 'dict'>, {101008: {'user': ...  English   

   Toxic         State        Country  Hour of Day  \
0  False         Texa

In [3]:
# Validate required clustering columns exist
print("Validating clustering columns...")
required_cols = []
for level in [0, 1, 2]:
    required_cols.extend([
        f"L{level}_cluster_id",
        f"L{level}_cluster_label",
        f"L{level}_cluster_description",
        f"L{level}_cluster_description_embedding"
    ])

missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    print(f"ERROR: Missing columns: {missing_cols}")
else:
    print("✓ All required clustering columns present")

print(f"\nChecking for null values in clustering columns...")
null_counts = {}
for col in required_cols:
    null_count = df[col].isna().sum()
    empty_count = (df[col] == "").sum() if df[col].dtype == "object" else 0
    total_empty = null_count + empty_count
    null_counts[col] = total_empty
    if total_empty > 0:
        print(f"  {col}: {total_empty} empty/null values")

# Check which rows have valid clustering data
valid_rows = df[
    (df["L0_cluster_id"] != "") & 
    (df["L0_cluster_id"].notna()) &
    (df["L0_cluster_label"] != "") &
    (df["L0_cluster_label"].notna())
]

print(f"\n✓ Rows with valid L0 clustering: {len(valid_rows)} / {len(df)}")
print(f"✓ Rows with valid L1 clustering: {(valid_rows['L1_cluster_id'] != '') & (valid_rows['L1_cluster_id'].notna()).sum()}")
print(f"✓ Rows with valid L2 clustering: {(valid_rows['L2_cluster_id'] != '') & (valid_rows['L2_cluster_id'].notna()).sum()}")


Validating clustering columns...
✓ All required clustering columns present

Checking for null values in clustering columns...
  L2_cluster_description_embedding: 1000 empty/null values

✓ Rows with valid L0 clustering: 1000 / 1000
✓ Rows with valid L1 clustering: 0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Name: L1_cluster_id, Length: 1000, dtype: bool
✓ Rows with valid L2 clustering: 0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Name: L2_cluster_id, Length: 1000, dtype: bool


In [4]:
# Validate embedding columns
print("Validating embedding columns...")

# Check Topic_Embedding column
if "Topic_Embedding" in df.columns:
    topic_emb_valid = 0
    topic_emb_invalid = 0
    for idx, emb_str in enumerate(df["Topic_Embedding"]):
        if pd.isna(emb_str) or emb_str == "null" or emb_str == "":
            topic_emb_invalid += 1
            continue
        try:
            emb = json.loads(emb_str)
            if isinstance(emb, list) and len(emb) > 0:
                topic_emb_valid += 1
            else:
                topic_emb_invalid += 1
        except (json.JSONDecodeError, TypeError):
            topic_emb_invalid += 1
    print(f"  Topic_Embedding: {topic_emb_valid} valid, {topic_emb_invalid} invalid")
else:
    print("  WARNING: Topic_Embedding column not found")

# Check description embedding columns
for level in [0, 1, 2]:
    col = f"L{level}_cluster_description_embedding"
    if col in df.columns:
        desc_emb_valid = 0
        desc_emb_invalid = 0
        for idx, emb_str in df[col].items():
            if pd.isna(emb_str) or emb_str == "" or emb_str == "null":
                desc_emb_invalid += 1
                continue
            try:
                emb = json.loads(emb_str)
                if isinstance(emb, list) and len(emb) > 0:
                    desc_emb_valid += 1
                else:
                    desc_emb_invalid += 1
            except (json.JSONDecodeError, TypeError):
                desc_emb_invalid += 1
        
        # Only count rows with valid clustering
        valid_with_clustering = valid_rows[valid_rows[col] != ""]
        print(f"  {col}: {desc_emb_valid} valid embeddings (in rows with clustering)")
    else:
        print(f"  WARNING: {col} not found")


Validating embedding columns...
  Topic_Embedding: 1000 valid, 0 invalid
  L0_cluster_description_embedding: 1000 valid embeddings (in rows with clustering)
  L1_cluster_description_embedding: 1000 valid embeddings (in rows with clustering)
  L2_cluster_description_embedding: 0 valid embeddings (in rows with clustering)


In [5]:
# Build unique cluster nodes from the augmented CSV
print("Building cluster hierarchy from augmented CSV...")

# Helper function to convert cluster ID to string (handles int/float/string)
def normalize_cluster_id(cid):
    """Convert cluster ID to string, handling empty/null values."""
    if pd.isna(cid) or cid == "" or cid == "null":
        return None
    # Convert to string, handling int/float
    return str(int(float(cid))) if isinstance(cid, (int, float)) else str(cid)

# Filter to rows with valid clustering
df_valid = df[
    (df["L0_cluster_id"] != "") & 
    (df["L0_cluster_id"].notna())
].copy()

print(f"Using {len(df_valid)} rows with valid clustering data")

# Build unique clusters at each level
clusters = {0: {}, 1: {}, 2: {}}

# Level 0 clusters
for idx, row in df_valid.iterrows():
    l0_id = normalize_cluster_id(row["L0_cluster_id"])
    if l0_id is None:
        continue
    if l0_id not in clusters[0]:
        clusters[0][l0_id] = {
            "id": l0_id,
            "label": row["L0_cluster_label"] if pd.notna(row["L0_cluster_label"]) else "",
            "description": row["L0_cluster_description"] if pd.notna(row["L0_cluster_description"]) else "",
            "description_embedding": row["L0_cluster_description_embedding"] if pd.notna(row["L0_cluster_description_embedding"]) else "",
            "topic_count": 0,
            "parent_id": None
        }
    clusters[0][l0_id]["topic_count"] += 1

# Level 1 clusters
l0_to_l1 = {}
for idx, row in df_valid.iterrows():
    l0_id = normalize_cluster_id(row["L0_cluster_id"])
    l1_id = normalize_cluster_id(row["L1_cluster_id"])
    
    if l1_id is not None:
        if l1_id not in clusters[1]:
            clusters[1][l1_id] = {
                "id": l1_id,
                "label": row["L1_cluster_label"] if pd.notna(row["L1_cluster_label"]) else "",
                "description": row["L1_cluster_description"] if pd.notna(row["L1_cluster_description"]) else "",
                "description_embedding": row["L1_cluster_description_embedding"] if pd.notna(row["L1_cluster_description_embedding"]) else "",
                "topic_count": 0,
                "parent_id": None
            }
        clusters[1][l1_id]["topic_count"] += 1
        if l0_id is not None:
            l0_to_l1[l0_id] = l1_id

# Level 2 clusters
l1_to_l2 = {}
for idx, row in df_valid.iterrows():
    l1_id = normalize_cluster_id(row["L1_cluster_id"])
    l2_id = normalize_cluster_id(row["L2_cluster_id"])
    
    if l2_id is not None:
        if l2_id not in clusters[2]:
            clusters[2][l2_id] = {
                "id": l2_id,
                "label": row["L2_cluster_label"] if pd.notna(row["L2_cluster_label"]) else "",
                "description": row["L2_cluster_description"] if pd.notna(row["L2_cluster_description"]) else "",
                "description_embedding": row["L2_cluster_description_embedding"] if pd.notna(row["L2_cluster_description_embedding"]) else "",
                "topic_count": 0,
                "parent_id": None
            }
        clusters[2][l2_id]["topic_count"] += 1
        if l1_id is not None:
            l1_to_l2[l1_id] = l2_id

# Set parent relationships
for l0_id, l1_id in l0_to_l1.items():
    if l0_id in clusters[0]:
        clusters[0][l0_id]["parent_id"] = l1_id

for l1_id, l2_id in l1_to_l2.items():
    if l1_id in clusters[1]:
        clusters[1][l1_id]["parent_id"] = l2_id

print(f"Found clusters:")
print(f"  Level 0: {len(clusters[0])}")
print(f"  Level 1: {len(clusters[1])}")
print(f"  Level 2: {len(clusters[2])}")

# Validate no null labels or descriptions
print(f"\nValidating cluster data quality...")
for level in [0, 1, 2]:
    empty_labels = sum(1 for c in clusters[level].values() if not c["label"])
    empty_descriptions = sum(1 for c in clusters[level].values() if not c["description"])
    print(f"  Level {level}: {empty_labels} empty labels, {empty_descriptions} empty descriptions")


Building cluster hierarchy from augmented CSV...
Using 1000 rows with valid clustering data
Found clusters:
  Level 0: 50
  Level 1: 25
  Level 2: 5

Validating cluster data quality...
  Level 0: 0 empty labels, 0 empty descriptions
  Level 1: 0 empty labels, 0 empty descriptions
  Level 2: 0 empty labels, 0 empty descriptions


In [6]:
# Calculate total topic counts for each cluster (including children)
print("Calculating total topic counts per cluster...")

node_topic_counts = {}

# For L0, topic_count is already correct (direct count)
for l0_id, l0_data in clusters[0].items():
    node_id = f"L0-{l0_id}"
    node_topic_counts[node_id] = l0_data["topic_count"]

# For L1, sum up their L0 children's topic counts
for l1_id, l1_data in clusters[1].items():
    node_id = f"L1-{l1_id}"
    total = 0
    # Find all L0 children of this L1 node
    for l0_id, l0_data in clusters[0].items():
        if l0_data["parent_id"] == l1_id:
            l0_node_id = f"L0-{l0_id}"
            total += node_topic_counts.get(l0_node_id, 0)
    node_topic_counts[node_id] = total if total > 0 else l1_data["topic_count"]

# For L2, sum up their L1 children's topic counts
for l2_id, l2_data in clusters[2].items():
    node_id = f"L2-{l2_id}"
    total = 0
    # Find all L1 children of this L2 node
    for l1_id, l1_data in clusters[1].items():
        if l1_data["parent_id"] == l2_id:
            l1_node_id = f"L1-{l1_id}"
            total += node_topic_counts.get(l1_node_id, 0)
    node_topic_counts[node_id] = total if total > 0 else l2_data["topic_count"]

print(f"Calculated topic counts for {len(node_topic_counts)} nodes")
print(f"Sample counts: {dict(list(node_topic_counts.items())[:5])}")


Calculating total topic counts per cluster...
Calculated topic counts for 80 nodes
Sample counts: {'L0-20': 30, 'L0-18': 3, 'L0-42': 18, 'L0-11': 23, 'L0-12': 28}


In [7]:
# Build hierarchical data structure for icicle chart
print("Building icicle chart data structure...")

ids = []
labels = []
parents = []
values = []
hover_texts = []
colors = []

# Color scheme: different colors for each level
level_colors = {
    0: "#1f77b4",  # Blue for L0
    1: "#ff7f0e",  # Orange for L1
    2: "#2ca02c",  # Green for L2
}

# Process nodes level by level (L2 -> L1 -> L0) so parents exist before children
for level in [2, 1, 0]:
    # Sort by cluster ID (convert to int for proper numeric sorting)
    level_clusters = sorted(
        clusters[level].items(), 
        key=lambda x: int(x[0]) if str(x[0]).isdigit() else 999999
    )
    
    for cluster_id, cluster_data in level_clusters:
        node_id = f"L{level}-{cluster_id}"
        
        # Find parent ID
        parent_id = ""
        parent_level = level + 1
        if cluster_data["parent_id"] is not None and cluster_data["parent_id"] != "":
            parent_id = f"L{parent_level}-{cluster_data['parent_id']}"
        
        ids.append(node_id)
        labels.append(cluster_data["label"])
        parents.append(parent_id)
        values.append(node_topic_counts.get(node_id, cluster_data["topic_count"]))
        
        # Create hover text
        hover_text = (
            f"<b>Level {level} - Cluster {cluster_id}</b><br>"
            f"Label: {cluster_data['label']}<br>"
            f"Description: {cluster_data['description'][:150]}...<br>"
            f"Direct Topics: {cluster_data['topic_count']}<br>"
            f"Total Topics: {node_topic_counts.get(node_id, cluster_data['topic_count'])}"
        )
        hover_texts.append(hover_text)
        colors.append(level_colors[level])

print(f"Built icicle data: {len(ids)} nodes")
print(f"Sample IDs: {ids[:5]}")
print(f"Sample parents: {parents[:5]}")

# Validation checks
print(f"\nValidation:")
print(f"  Total IDs: {len(ids)}")
print(f"  Unique IDs: {len(set(ids))}")
print(f"  Empty parents (roots): {sum(1 for p in parents if p == '')}")
print(f"  Non-empty parents: {sum(1 for p in parents if p != '')}")

# Check if all parent IDs exist
missing_parents = []
for i, parent in enumerate(parents):
    if parent != "" and parent not in ids:
        missing_parents.append((i, parent, ids[i]))

if missing_parents:
    print(f"\nWARNING: {len(missing_parents)} missing parent IDs:")
    for idx, parent, child in missing_parents[:5]:
        print(f"  Child {child} has parent {parent} which doesn't exist")
else:
    print(f"\n✓ All parent IDs exist in ids list")

# Check values
print(f"\nValues check:")
print(f"  Min value: {min(values) if values else 'N/A'}")
print(f"  Max value: {max(values) if values else 'N/A'}")
print(f"  Zero values: {sum(1 for v in values if v == 0)}")
print(f"  Total sum: {sum(values)}")


Building icicle chart data structure...
Built icicle data: 80 nodes
Sample IDs: ['L2-0', 'L2-1', 'L2-2', 'L2-3', 'L2-4']
Sample parents: ['', '', '', '', '']

Validation:
  Total IDs: 80
  Unique IDs: 80
  Empty parents (roots): 5
  Non-empty parents: 75

✓ All parent IDs exist in ids list

Values check:
  Min value: 2
  Max value: 537
  Zero values: 0
  Total sum: 3000


In [8]:
# Create icicle chart
print("Creating icicle chart...")

fig_icicle = go.Figure(
    go.Icicle(
        ids=ids,
        labels=labels,
        parents=parents,
        values=values,
        branchvalues="total",  # Values are total topic counts
        hovertemplate="%{hovertext}<extra></extra>",
        hovertext=hover_texts,
        marker=dict(
            colors=colors,
            line=dict(width=2, color="#ffffff"),
        ),
        maxdepth=3,
    )
)

fig_icicle.update_layout(
    title={
        "text": "Hierarchical Cluster Structure - Icicle View (50 → 25 → 5)",
        "x": 0.5,
        "xanchor": "center",
        "font": {"size": 20},
    },
    height=800,
    width=1200,
)

fig_icicle.show()
print("✓ Icicle chart created successfully!")


Creating icicle chart...


✓ Icicle chart created successfully!


In [9]:
# Create Treemap visualization (tree structure)
print("Creating treemap visualization...")

fig_treemap = go.Figure(
    go.Treemap(
        ids=ids,
        labels=labels,
        parents=parents,
        values=values,
        branchvalues="total",
        hovertemplate="%{hovertext}<extra></extra>",
        hovertext=hover_texts,
        marker=dict(
            colors=colors,
            line=dict(width=2, color="#ffffff"),
        ),
        maxdepth=3,
    )
)

fig_treemap.update_layout(
    title={
        "text": "Cluster Hierarchy Treemap (50 → 25 → 5)",
        "x": 0.5,
        "xanchor": "center",
        "font": {"size": 20},
    },
    height=800,
    width=1200,
)

fig_treemap.show()
print("✓ Treemap created successfully!")


Creating treemap visualization...


✓ Treemap created successfully!


In [10]:
# Final validation summary
print("=" * 80)
print("VALIDATION SUMMARY")
print("=" * 80)
print(f"✓ Total rows in CSV: {len(df)}")
print(f"✓ Rows with valid L0 clustering: {len(df_valid)}")
print(f"✓ Unique clusters found:")
print(f"    - Level 0: {len(clusters[0])}")
print(f"    - Level 1: {len(clusters[1])}")
print(f"    - Level 2: {len(clusters[2])}")
print(f"✓ Total nodes in visualization: {len(ids)}")
print(f"✓ All parent IDs valid: {len(missing_parents) == 0}")
print(f"✓ Total topics represented: {sum(values)}")
print(f"✓ Icicle chart created successfully")
print("=" * 80)
print("All validations passed! ✓")


VALIDATION SUMMARY
✓ Total rows in CSV: 1000
✓ Rows with valid L0 clustering: 1000
✓ Unique clusters found:
    - Level 0: 50
    - Level 1: 25
    - Level 2: 5
✓ Total nodes in visualization: 80
✓ All parent IDs valid: True
✓ Total topics represented: 3000
✓ Icicle chart created successfully
All validations passed! ✓
