# In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import umap
from matplotlib.colors import ListedColormap, BoundaryNorm
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, silhouette_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

# 1. Load data
# The 'utf-8-sig' codec drops a leading byte-order mark if the file has one.
filtered_df = pd.read_csv(
    './data/dementia_MR_combined_IVW4_cleaned.csv',
    encoding='utf-8-sig',
)

# Normalise the header: strip whitespace padding around every column name
filtered_df.columns = filtered_df.columns.str.strip()

# Echo the cleaned column names so the layout can be checked by eye
print(filtered_df.columns)

# Columns looked up by name.
# NOTE: despite its name, pval_fam3c holds the 'pval_anycause' column;
# point it at a different p-value column to visualise another subset.
trend_labels = filtered_df['Trend'].values
pval_fam3c = filtered_df['pval_anycause'].values

# Feature matrix: every column from the third one onward, selected by position
# (the first two are expected to be 'Protein' and 'Trend' - verify against the CSV)
features = filtered_df.iloc[:, 2:].values

# 2. Encode trend labels as integer codes (one code per distinct label)
le = LabelEncoder()
trend_encoded = le.fit_transform(trend_labels)
n_trend_classes = len(le.classes_)

# 3. Standardize features (fit the scaler, then apply it to the same matrix)
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)

# 4. Supervised KMeans (optional, for evaluation)
# The encoded trend, scaled by 0.5, is appended as one extra column, so the
# known labels influence the cluster assignment alongside the features.
label_column = trend_encoded * 0.5
augmented_features = np.column_stack([scaled_features, label_column])

# One cluster per trend class; fixed seed and 20 restarts
kmeans = KMeans(n_clusters=n_trend_classes, n_init=20, random_state=42)
kmeans_labels = kmeans.fit(augmented_features).labels_

# 5. UMAP dimensionality reduction
# All hyperparameters are collected in one mapping so they can be read and
# tweaked in a single place before the model is built.
umap_params = {
    'n_components': 2,        # 2-D embedding for plotting
    'metric': 'euclidean',
    'n_neighbors': 35,
    'local_connectivity': 2,
    'min_dist': 4,            # NOTE(review): unusually large; kept below spread
    'spread': 5,
    'repulsion_strength': 1,
    'target_weight': 0.4,     # applies because labels are passed as y below
    'random_state': 42,       # fixed seed
}
umap_model = umap.UMAP(**umap_params)

# The encoded trend labels are supplied as the target for the fit
umap_result = umap_model.fit_transform(scaled_features, y=trend_encoded)

# 6. Custom colors & point sizes
custom_colors = ['#0001A1', '#037F77', '#C5272D', '#8A4B43',
                 '#F4A99B', '#F98F34', '#415C42', '#FFDC00']

# Fail early with a clear message if there are more classes than colors;
# otherwise the colormap and the norm disagree on the number of bins and the
# legend lookup custom_colors[i] would run past the end of the palette.
n_classes = len(le.classes_)
if n_classes > len(custom_colors):
    raise ValueError(
        f"{n_classes} trend classes found but only "
        f"{len(custom_colors)} custom colors are defined"
    )
cmap = ListedColormap(custom_colors[:n_classes])

# Discrete norm: one bin per encoded label, with bin edges at -0.5, 0.5, 1.5, ...
norm = BoundaryNorm(boundaries=np.arange(-0.5, n_classes + 0.5),
                    ncolors=n_classes)

# Point sizes: significant points (p < 0.05) get k / p, clamped to
# [gray_size + 200, max_size]; every other entry gets the lower bound.
# gray_size itself is the marker size used for the non-significant layer.
gray_size = 120
k = 3
max_size = 1200
min_sig_size = gray_size + 200

# k / p is evaluated for every row, so p == 0 would emit a divide-by-zero
# warning; the resulting inf is clamped to max_size anyway, so silence it here.
with np.errstate(divide='ignore'):
    inverse_p = k / pval_fam3c

sizes = np.where(
    pval_fam3c < 0.05,
    np.clip(inverse_p, min_sig_size, max_size),
    min_sig_size
)

# 7. Plotting
# Create the 12x9 inch figure and its single axes in one call
fig, ax = plt.subplots(figsize=(12, 9))

# Add margin to axis limits for better visualization
def get_compact_range(arr, margin=0.02):
    """Return (low, high) limits that enclose *arr* with a proportional margin.

    Args:
        arr: Array-like of numeric values; must be non-empty.
        margin: Fraction of the data span (max - min) added on each side.

    Returns:
        A ``(low, high)`` tuple: the minimum minus the padding and the
        maximum plus the padding.

    Raises:
        ValueError: If *arr* is empty (raised by the NumPy reduction).
    """
    lo, hi = np.min(arr), np.max(arr)
    span = hi - lo
    if span == 0:
        # All values coincide: pad relative to a unit span so the two limits
        # stay distinct instead of collapsing onto a single value.
        span = 1.0
    pad = span * margin
    return lo - pad, hi + pad

# Pin the axis limits to the embedding's extent plus a small margin
x_limits = get_compact_range(umap_result[:, 0])
y_limits = get_compact_range(umap_result[:, 1])
ax.set_xlim(*x_limits)
ax.set_ylim(*y_limits)

# 7.1 Significant points (p < 0.05): colored by trend, sized by significance,
# drawn above the gray layer (higher zorder)
mask_sig = pval_fam3c < 0.05
sig_points = umap_result[mask_sig]
sig_style = dict(edgecolors='black', linewidth=0.3, alpha=0.9, zorder=3)
scatter_sig = ax.scatter(
    sig_points[:, 0],
    sig_points[:, 1],
    c=trend_encoded[mask_sig],
    cmap=cmap,
    norm=norm,
    s=sizes[mask_sig],
    **sig_style
)

# 7.2 Everything else: uniform light-gray markers underneath
mask_nonsig = np.logical_not(mask_sig)
nonsig_points = umap_result[mask_nonsig]
ax.scatter(
    nonsig_points[:, 0],
    nonsig_points[:, 1],
    c='lightgray',
    s=gray_size,
    alpha=0.3,
    zorder=2
)

# Legend for trend classes: one round proxy marker per class, filled with the
# palette entry at the same index as the class's encoded label
handles = [
    plt.Line2D([0], [0], marker='o', color='w',
               label=cls, markerfacecolor=custom_colors[i],
               markersize=10)
    for i, cls in enumerate(le.classes_)
]
plt.legend(handles=handles, title="Trend", loc='best', fontsize=12)

# Title and labels
# The title previously contained a mojibake sequence (an en dash whose UTF-8
# bytes were decoded as Windows-1252); the \u2013 escape restores the en dash
# and keeps this source line plain ASCII.
plt.title('dementia\u2013anytype', fontsize=14, fontweight='bold', pad=20)
plt.xlabel('UMAP Dimension 1', fontsize=12, labelpad=10)
plt.ylabel('UMAP Dimension 2', fontsize=12, labelpad=10)

# Grid, drawn beneath the data points
ax.grid(True, linestyle=':', alpha=0.4, which='both')
ax.set_axisbelow(True)

plt.tight_layout()

# Make sure the output directory exists so savefig does not fail after the
# whole pipeline has already run
output_path = './results/enhanced_umap_with_dementia-anytype.pdf'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
plt.savefig(output_path, dpi=600, bbox_inches='tight')
plt.show()

# 8. Evaluation metrics
# Silhouette is measured on the standardized features (without the appended
# label column); ARI compares the cluster assignment with the trend labels.
silhouette = silhouette_score(scaled_features, labels=kmeans_labels)
ari = adjusted_rand_score(labels_true=trend_labels, labels_pred=kmeans_labels)

print(f"Adjusted Rand Index: {ari:.3f}")
print(f"Silhouette Score: {silhouette:.3f}")