# VERA Clustering 3D Visualization

This notebook loads the VERA dataset, clusters the videos with K-Means, and visualizes the resulting clusters in 3D using PCA and Plotly.
You can rotate, zoom, and hover over points to see video names.

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.stats import zscore
from pathlib import Path

# Configuration
K = 3  # Number of clusters

# CSV read by the load step; the path is relative, so it resolves against the
# working directory the notebook is run from.
DATA_PATH = Path("data/clustering_dataset/master_vector_data_set.csv")

# (column name, trait the column is meant to capture). The order here is the
# column order of the feature matrix, so keep it stable.
_FEATURE_TRAITS = (
    ("body_gesture_activity_mean", "Energy"),
    ("body_posture_openness_mean", "Confidence"),
    ("body_body_sway_mean", "Stability"),
    ("audio_wpm", "Pacing"),
    ("face_head_speed_mean", "Engagement"),
    ("face_smile_mean", "Warmth"),
    ("audio_pitch_std_st", "Expressiveness"),
)
FEATURES = [column for column, _trait in _FEATURE_TRAITS]

In [None]:
# 1. Load Data
df = pd.read_csv(DATA_PATH)
X = df[FEATURES].copy()
video_ids = df["video_name"].astype(str)

# 2. Preprocess (Impute & Outlier Removal)
# Mean-impute missing values. A column with no observed values has a NaN mean,
# so fillna leaves it untouched; fail here with a clear message rather than
# letting the NaNs reach KMeans.
X = X.fillna(X.mean())
unimputable = X.columns[X.isna().any()].tolist()
if unimputable:
    raise ValueError(f"Features with no usable values (cannot mean-impute): {unimputable}")

# A row is an outlier when any single feature lies more than this many
# standard deviations from that feature's mean.
OUTLIER_Z_THRESHOLD = 3.0
abs_z = np.abs(zscore(X))
outliers = (abs_z > OUTLIER_Z_THRESHOLD).any(axis=1)
X_clean = X[~outliers]
video_ids_clean = video_ids[~outliers]

# KMeans needs at least one sample per requested cluster.
if len(X_clean) < K:
    raise ValueError(
        f"Only {len(X_clean)} rows remain after outlier removal; need at least K={K}"
    )

# 3. Scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_clean)

# 4. Cluster
# Fixed random_state keeps the cluster assignment reproducible across runs.
kmeans = KMeans(n_clusters=K, n_init=50, random_state=42)
labels = kmeans.fit_predict(X_scaled)

# 5. Generate Persona Names
def _short_feature_name(feature: str) -> str:
    """Return a feature column name without its modality prefix and stat suffix.

    Removes one leading ``body_`` / ``face_`` / ``audio_`` and any trailing
    ``_st`` / ``_mean``. Only the ends of the name are touched: a blanket
    ``str.replace("_st", "")`` would also eat the ``_st`` inside ``_std``
    (``"pitch_std_st"`` -> ``"pitchd"``), and a blanket ``replace("body_", "")``
    would strip ``body_`` from the middle of ``"body_body_sway_mean"``.
    """
    for prefix in ("body_", "face_", "audio_"):
        if feature.startswith(prefix):
            feature = feature[len(prefix):]
            break
    # "_st" first so a name ending in "_mean_st" loses both suffixes.
    for suffix in ("_st", "_mean"):
        if feature.endswith(suffix):
            feature = feature[: -len(suffix)]
    return feature


df_labeled = X_clean.copy()
df_labeled['cluster'] = labels
centroids = df_labeled.groupby('cluster').mean()
global_mean = X_clean.mean()
global_std = X_clean.std()

# Offset of each cluster centroid from the overall mean, in units of the
# overall standard deviation (rows: clusters, columns: features).
centroid_z = (centroids - global_mean) / global_std

persona_map = {}
for cluster_id, cluster_z in centroid_z.iterrows():
    # The two features on which this cluster deviates most describe the persona.
    top_features = cluster_z.abs().sort_values(ascending=False).head(2).index
    desc_parts = [
        f"{'High' if cluster_z[feat] > 0 else 'Low'} {_short_feature_name(feat)}"
        for feat in top_features
    ]
    persona_map[str(cluster_id)] = ", ".join(desc_parts)

# Two clusters can end up with the same description. Anything keyed on the
# persona string would then treat them as one group, so tag the cluster id
# onto colliding names only.
persona_names = list(persona_map.values())
persona_map = {
    cid: f"{name} (cluster {cid})" if persona_names.count(name) > 1 else name
    for cid, name in persona_map.items()
}

print("Personas:", persona_map)

# 6. Prepare Viz Data
# Start from the cleaned features and append the cluster label (as a string),
# its persona description, and the video name. `assign` evaluates the entries
# in order, so "Persona" can read the freshly added "Cluster ID" column.
df_viz = X_clean.assign(
    **{
        "Cluster ID": labels.astype(str),
        "Persona": lambda frame: frame["Cluster ID"].map(persona_map),
        "Video": video_ids_clean,
    }
)

In [None]:
# 7. PCA to 3D
# Project the scaled feature matrix onto its first three principal components.
pca = PCA(n_components=3)
components = pca.fit_transform(X_scaled)

# Store each component as its own named column on the plotting frame.
for axis, pc_name in enumerate(("PC1", "PC2", "PC3")):
    df_viz[pc_name] = components[:, axis]

# Fraction of variance captured by each component, and by all three together.
var_explained = pca.explained_variance_ratio_
total_var = var_explained.sum()

print(f"Total Variance Explained by 3 Components: {total_var:.1%}")
print(f"PC1: {var_explained[0]:.1%}, PC2: {var_explained[1]:.1%}, PC3: {var_explained[2]:.1%}")

In [None]:
# 8. Interactive 3D Plot
# Axis captions carry each component's share of explained variance.
axis_labels = {
    "PC1": f"PC1 ({var_explained[0]:.1%})",
    "PC2": f"PC2 ({var_explained[1]:.1%})",
    "PC3": f"PC3 ({var_explained[2]:.1%})"
}
plot_title = f"VERA Clustering (K={K}) - Total Variance Explained: {total_var:.1%}"

# One marker per row of df_viz, colored by persona; hovering shows the video
# name together with the raw feature values.
fig = px.scatter_3d(
    df_viz,
    x="PC1", y="PC2", z="PC3",
    color="Persona",
    hover_name="Video",
    hover_data=FEATURES,
    title=plot_title,
    labels=axis_labels,
    opacity=0.8,
    size_max=10,
)

# Trim the margins, leaving room at the top for the title.
fig.update_layout(margin={"l": 0, "r": 0, "b": 0, "t": 30})
fig.show()