In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples, accuracy_score, f1_score, roc_auc_score
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from scipy.cluster.hierarchy import dendrogram, linkage
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import geoopt
import torch

# ----------------------
# Configuration
# ----------------------
# Every artifact produced by this script is written below `output_dir`.
output_dir = "result"
os.makedirs(output_dir, exist_ok=True)

# Input CSV locations for the two datasets.
# NOTE(review): both paths are the same placeholder string here; confirm the
# real file names before running.
file1, file2 = "data", "data"

# ----------------------
# Load Datasets
# ----------------------
df1, df2 = (pd.read_csv(csv_path) for csv_path in (file1, file2))

# When df2 carries a "name" column, its subject id is taken from the digits
# found between "_S" and the next "_" of that name.
if "name" in df2.columns:
    extracted_ids = df2["name"].str.extract(r'_S(\d+)_')
    df2["subject#"] = extracted_ids.astype(int)[0]

# Subjects present in both tables, in ascending order.
subjects1 = set(df1["subject#"].unique())
subjects2 = set(df2["subject#"].unique())
common_subjects = sorted(subjects1 & subjects2)
print("Common subjects:", common_subjects)

# ----------------------
# Align rows per subject
# ----------------------
# For every shared subject keep the first n rows of each table, where n is the
# smaller of the two per-subject row counts. After concatenation, row i of
# df1_aligned and row i of df2_aligned belong to the same subject.
# NOTE(review): the pairing is purely positional within a subject; presumably
# both files list a subject's recordings in a matching order. Confirm.
if not common_subjects:
    # pd.concat([]) would fail with an opaque "No objects to concatenate";
    # raise the same exception type with a message that names the cause.
    raise ValueError(
        "No subject ids are shared between the two datasets; nothing to align."
    )

aligned_rows_1 = []
aligned_rows_2 = []

for s in common_subjects:
    rows1 = df1[df1["subject#"] == s]
    rows2 = df2[df2["subject#"] == s]
    n = min(len(rows1), len(rows2))
    aligned_rows_1.append(rows1.iloc[:n, :])
    aligned_rows_2.append(rows2.iloc[:n, :])

df1_aligned = pd.concat(aligned_rows_1).reset_index(drop=True)
df2_aligned = pd.concat(aligned_rows_2).reset_index(drop=True)

print("Aligned df1 shape:", df1_aligned.shape)
print("Aligned df2 shape:", df2_aligned.shape)

# ----------------------
# Preprocessing
def preprocess(df, drop_cols=None):
    """Drop the given columns, coerce the rest to numbers, and standardize.

    Args:
        df: Input table (pandas DataFrame).
        drop_cols: Column label(s) to remove before scaling. Labels that are
            not present in ``df`` are ignored. ``None`` (the default) drops
            nothing.

    Returns:
        A ``(scaled, columns)`` tuple: the result of
        ``StandardScaler().fit_transform`` on the numeric table, and the list
        of the remaining column names in the same order.
    """
    # Use a None sentinel rather than a shared mutable default list.
    if drop_cols is None:
        drop_cols = []
    df = df.drop(columns=drop_cols, errors="ignore")
    # Values that cannot be parsed as numbers become NaN and are then replaced
    # by 0, before the scaler sees the data.
    df_numeric = df.apply(pd.to_numeric, errors="coerce").fillna(0)
    scaler = StandardScaler()
    scaled = scaler.fit_transform(df_numeric)
    return scaled, df_numeric.columns.tolist()

# Feature matrices for both aligned tables. Identifier and time columns are
# removed from df1; identifier and label columns are removed from df2.
X1, cols1 = preprocess(df1_aligned, drop_cols=["subject#", "test_time"])
X2, cols2 = preprocess(df2_aligned, drop_cols=["subject#", "name", "status"])

# Classification metrics are only computed when df2 provides a "status" label.
compute_classification_metrics = "status" in df2_aligned.columns
y = df2_aligned["status"].values if compute_classification_metrics else None

# ----------------------
# Similarity Matrix
# Pairwise cosine similarity between the rows of X1.
similarity_matrix = cosine_similarity(X1)

# ----------------------
# Graph Construction
# Undirected k-nearest-neighbour graph: every row is linked to its k most
# similar *other* rows, with the cosine similarity stored as the edge weight.
k = 10
G = nx.Graph()
for i in range(similarity_matrix.shape[0]):
    ranked = np.argsort(-similarity_matrix[i])
    # Remove the row's own index explicitly instead of assuming it sorts
    # first: with tied similarities (e.g. duplicate rows) another index can
    # take rank 0, which would create a self-loop and drop a real neighbour.
    indices = ranked[ranked != i][:k]
    for j in indices:
        # Plain Python int/float keep node ids and weights free of numpy scalars.
        G.add_edge(i, int(j), weight=float(similarity_matrix[i, j]))

# ----------------------
# Hyperbolic Embedding
# Fit a 2-D embedding on the Poincaré ball whose pairwise manifold distances
# approximate the cosine dissimilarity (1 - similarity) between rows of X1.
torch.manual_seed(0)
manifold = geoopt.PoincareBall()
embedding = torch.nn.Parameter(torch.randn(X1.shape[0], 2) * 1e-2)
optimizer = torch.optim.Adam([embedding], lr=1e-2)

n_epochs = 600
losses, rmse_scores, mae_scores, silhouette_scores = [], [], [], []
accuracy_scores, f1_scores, roc_auc_scores = [], [], []

# The regression target never changes between epochs, so build it once
# instead of re-converting the similarity matrix on every iteration.
target = torch.from_numpy(1 - similarity_matrix).float()

for epoch in range(n_epochs):
    # Full pairwise distance matrix between all embedded points.
    dist = manifold.dist(embedding.unsqueeze(0), embedding.unsqueeze(1))
    loss = torch.mean((dist - target) ** 2)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    with torch.no_grad():
        # Re-project the coordinates with the manifold's projx after the
        # optimizer step.
        embedding.copy_(manifold.projx(embedding))

        # Fit-quality metrics come from this epoch's forward pass (i.e. the
        # distances before the step); no autograd graph is needed for them.
        residual = dist - target
        losses.append(loss.item())
        rmse_scores.append(torch.sqrt(torch.mean(residual ** 2)).item())
        mae_scores.append(torch.mean(torch.abs(residual)).item())

        # Cluster the updated coordinates and record the mean silhouette.
        emb_np = embedding.detach().cpu().numpy()
        cluster = AgglomerativeClustering(n_clusters=4).fit(emb_np)
        sil = silhouette_samples(emb_np, cluster.labels_).mean()
        silhouette_scores.append(sil)

        if compute_classification_metrics:
            # NOTE(review): the forest is scored on the same samples it was
            # fitted on, so these are training-set metrics.
            rf = RandomForestClassifier(n_estimators=50, random_state=42)
            rf.fit(emb_np, y)
            y_pred = rf.predict(emb_np)
            if len(np.unique(y_pred)) < 2:
                acc, f1, auc = np.nan, np.nan, np.nan
            else:
                # Only ask for the class-1 probability column once we know at
                # least two classes are predicted; a forest fitted on a single
                # class has no second column and indexing it would raise.
                y_prob = rf.predict_proba(emb_np)[:, 1]
                acc = accuracy_score(y, y_pred)
                f1 = f1_score(y, y_pred)
                auc = roc_auc_score(y, y_prob)
            accuracy_scores.append(acc)
            f1_scores.append(f1)
            roc_auc_scores.append(auc)
        else:
            accuracy_scores.append(np.nan)
            f1_scores.append(np.nan)
            roc_auc_scores.append(np.nan)

    if epoch % 50 == 0:
        print(f"Epoch {epoch}: Loss={loss.item():.4f}, Silhouette={sil:.4f}")

# Final coordinates and the cluster labels from the last epoch's clustering.
embedding_np = embedding.detach().cpu().numpy()
labels = cluster.labels_

# ----------------------
# Save Embedding Results
# One row per aligned sample: the two embedding coordinates, the cluster label
# and the subject id taken from df1_aligned.
df_embedding = pd.DataFrame(
    {
        "Dim1": embedding_np[:, 0],
        "Dim2": embedding_np[:, 1],
        "Cluster": labels,
        "SubjectID": df1_aligned["subject#"].values,
    }
)
embedding_xlsx_path = os.path.join(output_dir, "embedding_clusters.xlsx")
df_embedding.to_excel(embedding_xlsx_path, index=False)

# ----------------------
# Plotting
def save_fig(name):
    """Write the current matplotlib figure to ``output_dir/name`` and close it.

    The layout is tightened before saving. ``name`` is the file name,
    including its extension, relative to the module-level ``output_dir``.
    """
    destination = os.path.join(output_dir, name)
    plt.tight_layout()
    plt.savefig(destination)
    plt.close()

# 1. Metrics Over Epochs
# Per-epoch curves recorded during training. The classification curves are
# added only when labels were available, so that the figure matches its title
# instead of silently discarding the accuracy / F1 / ROC-AUC histories.
plt.figure(figsize=(12,8))
plt.plot(losses, label="Loss")
plt.plot(rmse_scores, label="RMSE")
plt.plot(mae_scores, label="MAE")
plt.plot(silhouette_scores, label="Silhouette")
if compute_classification_metrics:
    plt.plot(accuracy_scores, label="Accuracy")
    plt.plot(f1_scores, label="F1")
    plt.plot(roc_auc_scores, label="ROC AUC")
plt.xlabel("Epoch")
plt.legend()
plt.title("Embedding and Classification Metrics Over Epochs")
save_fig("metrics_over_epochs.png")

# 2. Embedding Scatter
# Final 2-D coordinates, coloured by cluster label.
fig, ax = plt.subplots()
sns.scatterplot(
    x=embedding_np[:, 0],
    y=embedding_np[:, 1],
    hue=labels,
    palette="tab10",
    ax=ax,
)
ax.set_title("Embedding Scatter")
save_fig("embedding_scatter.png")

# 3. Distance Distribution
# `dist` is the pairwise distance matrix left over from the last training
# iteration's forward pass. Only the strict upper triangle is histogrammed:
# the full matrix also holds each point's distance to itself on the diagonal
# and lists every pair twice, which would distort the distribution.
dist_matrix = dist.detach().cpu().numpy()
dists = dist_matrix[np.triu_indices_from(dist_matrix, k=1)]
plt.figure()
plt.hist(dists, bins=50)
plt.title("Distance Histogram")
save_fig("distance_histogram.png")

# 4. Silhouette Distribution
# Per-sample silhouette values of the final embedding under the final labels.
sil_vals = silhouette_samples(embedding_np, labels)
fig, ax = plt.subplots()
ax.hist(sil_vals, bins=20)
ax.set_title("Silhouette Histogram")
save_fig("silhouette_histogram.png")

# 5. Similarity Heatmap
# Only the leading 100 x 100 corner of the similarity matrix is drawn.
similarity_corner = similarity_matrix[:100, :100]
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(similarity_corner, cmap="viridis", ax=ax)
ax.set_title("Similarity Heatmap (First 100)")
save_fig("similarity_heatmap.png")

# 6. Graph Degree Distribution
degrees = [degree for _, degree in G.degree()]
# Bin edges run over the integers from 1 up to max(degrees) + 1.
degree_bins = range(1, max(degrees) + 2)
fig, ax = plt.subplots()
ax.hist(degrees, bins=degree_bins)
ax.set_title("Graph Degree Distribution")
save_fig("degree_distribution.png")

# 7. t-SNE
# 2-D t-SNE of the scaled df1 features, coloured by the embedding's clusters.
tsne = TSNE(n_components=2, random_state=42)
tsne_proj = tsne.fit_transform(X1)
fig, ax = plt.subplots()
sns.scatterplot(
    x=tsne_proj[:, 0],
    y=tsne_proj[:, 1],
    hue=labels,
    palette="tab10",
    ax=ax,
)
ax.set_title("t-SNE Projection")
save_fig("tsne_projection.png")

# 8. Dendrogram
# Ward linkage computed on the final embedding coordinates.
linked = linkage(embedding_np, method="ward")
fig, ax = plt.subplots(figsize=(12, 5))
dendrogram(linked)
ax.set_title("Dendrogram")
save_fig("dendrogram.png")

# 9. Graph Visualization with Legend
plt.figure(figsize=(8,8))
# Seed the force-directed layout so the saved figure is reproducible, in line
# with the fixed seeds used elsewhere in this script (torch.manual_seed,
# random_state=42).
pos = nx.spring_layout(G, seed=42)

# prepare unique labels
unique_labels = np.unique(labels)
cmap = plt.get_cmap("tab10")

# draw edges
nx.draw_networkx_edges(G, pos, alpha=0.1)

# draw nodes per cluster, one call per cluster so each gets a legend entry
for lab in unique_labels:
    idx = np.where(labels == lab)[0]
    nx.draw_networkx_nodes(
        G, pos,
        # Hand networkx a plain list of ints rather than a numpy array.
        nodelist=idx.tolist(),
        node_size=20,
        node_color=[cmap(lab)],
        label=f"Cluster {lab}"
    )

plt.title("Graph Visualization with Clusters")
plt.legend(title="Clusters", loc="best")
save_fig("graph_visualization.png")

# 10. Correlation Heatmap
# Pairwise correlation between the scaled df1 feature columns.
feature_frame = pd.DataFrame(X1, columns=cols1)
corr = feature_frame.corr()
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corr, cmap="coolwarm", ax=ax)
ax.set_title("Feature Correlation Heatmap")
save_fig("correlation_heatmap.png")

def plot_poincare_disk(embeddings, labels, path):
    """Scatter 2-D embeddings inside a unit circle and save the figure.

    Args:
        embeddings: Array indexed as ``embeddings[rows, 0]`` and
            ``embeddings[rows, 1]`` for the x and y coordinates.
        labels: Cluster label per row; each distinct value is drawn as its own
            colour and legend entry.
        path: File name handed to ``save_fig``.

    NOTE(review): no call to this function appears in the visible script, so
    the Poincaré disk figure is presumably never written. Confirm.
    """
    fig, ax = plt.subplots(figsize=(6, 6))

    # Outline of the unit disk.
    boundary = plt.Circle((0, 0), 1, fill=False, color="black")
    ax.add_artist(boundary)

    palette = plt.get_cmap("tab10")
    for lab in np.unique(labels):
        members = np.where(labels == lab)[0]
        xs = embeddings[members, 0]
        ys = embeddings[members, 1]
        ax.scatter(
            xs,
            ys,
            c=[palette(lab)],
            s=30,
            edgecolor="k",
            label=f"Cluster {lab}",
        )

    # Small margin around the disk, equal scaling on both axes, no frame.
    ax.set_xlim(-1.05, 1.05)
    ax.set_ylim(-1.05, 1.05)
    ax.set_aspect("equal")
    ax.axis("off")
    ax.set_title("Poincaré Disk Embedding")
    ax.legend(title="Clusters", loc="best")
    save_fig(path)


# 12. SHAP Plot
# Explain a random forest fitted on the scaled df2 features (X2) against the
# "status" labels. The step is skipped when no labels were found, mirroring
# the guard used for the per-epoch classification metrics; fitting with
# y=None would otherwise abort the script after all other outputs were made.
if compute_classification_metrics:
    rf_final = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_final.fit(X2, y)
    explainer = shap.TreeExplainer(rf_final)
    shap_values = explainer.shap_values(X2)

    # Handle proper slicing
    if isinstance(shap_values, list):
        # Older SHAP releases return one 2-D array per class instead of a
        # single 3-D array; a list has no `.shape`, so handle it first.
        print("SHAP: Detected per-class list output, selecting class 1 contributions.")
        class_index = 1 if len(shap_values) > 1 else 0
        shap_values_class1 = np.asarray(shap_values[class_index])
    elif len(shap_values.shape) == 3:
        print("SHAP: Detected 3D output, selecting class 1 contributions.")
        shap_values_class1 = shap_values[:,:,1]
    else:
        shap_values_class1 = shap_values

    # Safety check
    if shap_values_class1.shape[1] > X2.shape[1]:
        print("SHAP: Detected extra column, slicing to match features...")
        shap_values_class1 = shap_values_class1[:, :-1]

    shap.summary_plot(
        shap_values_class1,
        pd.DataFrame(X2, columns=cols2),
        max_display=X2.shape[1],
        show=False
    )
    save_fig("shap_summary.png")
else:
    print("SHAP: no 'status' labels available, skipping SHAP summary plot.")


print("✅ All steps completed. Results saved in:", output_dir)


Common subjects: [1, 2, 4, 5, 6, 7, 8, 10, 13, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 31, 32, 33, 34, 35, 37, 39, 42]
Aligned df1 shape: (171, 22)
Aligned df2 shape: (171, 25)
Epoch 0: Loss=1.0996, Silhouette=0.7036
Epoch 50: Loss=0.3038, Silhouette=0.3899
Epoch 100: Loss=0.2171, Silhouette=0.4065
Epoch 150: Loss=0.1057, Silhouette=0.4740
Epoch 200: Loss=0.0641, Silhouette=0.4193
Epoch 250: Loss=0.0597, Silhouette=0.4308
Epoch 300: Loss=0.0593, Silhouette=0.4393
Epoch 350: Loss=0.0592, Silhouette=0.4651
Epoch 400: Loss=0.0592, Silhouette=0.4310
Epoch 450: Loss=0.0592, Silhouette=0.4379
Epoch 500: Loss=0.0592, Silhouette=0.4379
Epoch 550: Loss=0.0592, Silhouette=0.4307
SHAP: Detected 3D output, selecting class 1 contributions.
✅ All steps completed. Results saved in: /Volumes/SP_SAGHAR/Documents/University/Articles/Conference/5- ICBME/2/result
