# In [None]:
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Step 1: Load phenotype data from Excel
def load_phenotype_data_from_excel(file_path):
    """Read the phenotype table from an Excel workbook.

    The first column of the sheet becomes the row index, so the sheet is
    expected to hold one model per row and one phenotype per column.

    Args:
        file_path: Location of the Excel file, handed straight to
            ``pandas.read_excel``.

    Returns:
        The DataFrame produced by ``pandas.read_excel``.
    """
    phenotypes = pd.read_excel(file_path, index_col=0)
    n_models, n_phenotypes = phenotypes.shape
    print(f"Loaded data with {n_models} models and {n_phenotypes} phenotypes.")
    return phenotypes

# Step 2: Calculate the Jaccard similarity matrix from binary encoded phenotype data
def calculate_jaccard_similarity_matrix(df):
    """Compute pairwise Jaccard similarities between the rows of ``df``.

    Args:
        df: DataFrame of binary-encoded phenotypes; every row is compared
            against every other row.

    Returns:
        A square DataFrame whose index and columns are both ``df.index``.
        Each off-diagonal entry is ``1 - jaccard_distance`` for that pair of
        rows, and every diagonal entry (self-similarity) is 1.
    """
    jaccard_distances = pdist(df.values, metric='jaccard')

    # Expand to a square matrix *before* converting to similarity.
    # squareform() fills the diagonal with 0, which is the correct
    # self-distance. Converting first would leave a self-similarity of 0 and,
    # once turned back into distances downstream, a self-distance of 1 that
    # scipy and scikit-learn both reject as an invalid distance matrix.
    similarity = 1 - squareform(jaccard_distances)

    jaccard_matrix = pd.DataFrame(similarity, index=df.index, columns=df.index)
    print("Jaccard similarity matrix calculated.")
    return jaccard_matrix

# Step 3: Perform hierarchical clustering and return cluster labels
def hierarchical_clustering_at_distance(jaccard_matrix, method, distance_threshold, save_path=None):
    """Cluster models hierarchically and cut the tree at a fixed distance.

    Args:
        jaccard_matrix: Square similarity DataFrame; its index supplies the
            leaf labels of the dendrogram.
        method: Linkage method name, passed to ``scipy`` ``linkage``.
        distance_threshold: Height at which the dendrogram is cut, passed to
            ``fcluster`` with ``criterion='distance'``.
        save_path: Optional output path for the dendrogram. The figure is
            always written in PDF format, whatever the file extension.

    Returns:
        A ``(cluster_labels, distance_matrix)`` tuple: the flat cluster label
        of every row, and the ``1 - similarity`` distance DataFrame with its
        diagonal set to zero.
    """
    # Convert the similarity matrix to a distance matrix (1 - similarity).
    # Work on a float copy so the caller's matrix is never modified.
    distance_values = (1 - jaccard_matrix).to_numpy(dtype=float, copy=True)

    # squareform() raises unless the diagonal is exactly zero. A
    # self-distance is zero by definition, so enforce it here rather than
    # depending on how the similarity matrix filled its diagonal.
    np.fill_diagonal(distance_values, 0.0)
    distance_matrix = pd.DataFrame(
        distance_values,
        index=jaccard_matrix.index,
        columns=jaccard_matrix.columns,
    )

    # Perform hierarchical/agglomerative clustering on the condensed form
    linkage_matrix = linkage(squareform(distance_values), method=method)

    # Generate cluster labels by cutting the dendrogram at a specified distance threshold
    cluster_labels = fcluster(linkage_matrix, distance_threshold, criterion='distance')

    # Plot the dendrogram; labels are passed as a plain list so the call does
    # not depend on how scipy handles a pandas Index.
    fig = plt.figure(figsize=(10, 7))
    dendrogram(linkage_matrix, labels=list(jaccard_matrix.index), leaf_rotation=90)
    plt.title(f'Hierarchical Clustering Dendrogram ({method} linkage)')
    plt.xlabel('Models')
    plt.ylabel('Distance')

    # Save the dendrogram as a high-quality image if save_path is provided
    if save_path:
        plt.savefig(save_path, format='pdf', dpi=300, bbox_inches='tight')
        print(f"Dendrogram saved as {save_path}")

    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

    return cluster_labels, distance_matrix

# Step 4: Calculate the silhouette score
def calculate_silhouette_score(cluster_labels, distance_matrix):
    """Compute the mean silhouette score for a precomputed distance matrix.

    Args:
        cluster_labels: Flat cluster label for every sample, in the same
            order as the rows of ``distance_matrix``.
        distance_matrix: Square matrix of pairwise distances.

    Returns:
        The mean silhouette coefficient, or ``nan`` when the score is
        undefined because there are fewer than 2 clusters or every sample is
        its own cluster.
    """
    # np.array() copies, so the caller's matrix is never modified.
    distances = np.array(distance_matrix, dtype=float)

    # scikit-learn rejects a precomputed matrix with a non-zero diagonal;
    # a self-distance is zero by definition, so enforce it.
    np.fill_diagonal(distances, 0.0)

    n_samples = distances.shape[0]
    n_clusters = len(np.unique(cluster_labels))

    # silhouette_score() raises unless 2 <= n_clusters <= n_samples - 1.
    # A distance-threshold cut can easily give one big cluster or all
    # singletons, so report that instead of aborting the analysis.
    if not 2 <= n_clusters <= n_samples - 1:
        print(
            f"Silhouette score is undefined for {n_clusters} cluster(s) "
            f"across {n_samples} sample(s)."
        )
        return float('nan')

    silhouette_avg = silhouette_score(distances, cluster_labels, metric="precomputed")
    print(f"Silhouette score: {silhouette_avg}")
    return silhouette_avg

# Step 5: Display clusters
def show_clusters(df, cluster_labels):
    """Print the members of every cluster and report how many clusters exist.

    Args:
        df: Object whose ``index`` holds the model names, paired
            position-by-position with ``cluster_labels``.
        cluster_labels: Cluster label for each model.

    Returns:
        The number of distinct cluster labels encountered.
    """
    members_by_label = {}

    # Collect model names under their label, keeping first-seen label order
    for name, label in zip(df.index, cluster_labels):
        members_by_label.setdefault(label, []).append(name)

    # Emit one header per cluster followed by its members
    for label in members_by_label:
        print(f"\nCluster {label}:")
        for name in members_by_label[label]:
            print(f" - {name}")

    return len(members_by_label)

# Main function
if __name__ == '__main__':
    # Run configuration: clustering settings, output location and input file
    method = 'complete'
    distance_threshold = 0.6
    save_path = 'dendrogram.pdf'
    file_path = 'mouse_phenotypes.xlsx'  # Replace with your file

    # Binary-encoded phenotypes -> pairwise Jaccard similarities
    df = load_phenotype_data_from_excel(file_path)
    jaccard_matrix = calculate_jaccard_similarity_matrix(df)

    # Build the dendrogram and cut it into flat clusters
    cluster_labels, distance_matrix = hierarchical_clustering_at_distance(
        jaccard_matrix,
        method,
        distance_threshold,
        save_path=save_path,
    )

    # Score the partition
    silhouette_avg = calculate_silhouette_score(cluster_labels, distance_matrix)

    # List cluster membership and report the cluster count
    num_clusters = show_clusters(df, cluster_labels)
    print(f"\nNumber of clusters: {num_clusters}")
