### Clustering

In [1]:
import importlib
import functions
importlib.reload(functions)
functions.set_user('Kaja')
data = functions.call_data_clean(p_threshold=1.3)

ATAC_seq = data['ATAC_seq_T']

In [2]:

from sklearn.cluster import KMeans
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.metrics import pairwise_distances
import igraph as ig
import leidenalg
import networkx as nx

In [3]:
def clustering(data, method, n_clusters, is_correlation=False, plot=True):
    
    if isinstance(data, pd.DataFrame):
        df=data.copy()
    else: df=pd.DataFrame(data)

    if is_correlation:
        dist_matrix=1-df
        condensed=pairwise_distances(dist_matrix)
        data_for_clustering=condensed
    else: 
        data_for_clustering=StandardScaler().fit_transform(df)

    if method == 'kmeans':
        kmeans=KMeans(n_clusters=n_clusters, random_state=42)
        labels=kmeans.fit_predict(data_for_clustering)

    elif method=='dendogram':
        Z=linkage(data_for_clustering, method= 'ward')
        plt.figure(figsize=(8,5))
        dendrogram(Z, labels=df.index.to_list(), leaf_rotation=90)
        plt.title('Hirarchisches Clustering')
        plt.tight_layout()
        plt.show()
        from scipy.cluster.hierarchy import fcluster
        labels=fcluster(Z, n_clusters, criterion='maxclust')

    elif method == 'leiden':
        k=min(10, len(df)-1)
        knn_graph=kneighbors_graph(data_for_clustering, k, method='connectivity')
        nx_graph=nx.from_scipy_sparse_array(knn_graph)
        ig_graph=ig.Graph.from_networkx(nx_graph)
        partition=leidenalg.find_partition(ig_graph, leidenalg.RBConfigurationVertexPartition)
        labels=np.array(partition.membership)

    labels_series=pd.Series(labels, index=df.index, name='Cluster')

    if plot and df.shape[1]==2:
        plt.figure(figsize=(6, 5))
        sns.scatterplot(x=df.iloc[:, 0], y=df.iloc[:, 1], hue=labels_series, palette='tab10')
        plt.title(f"Cluster plot ({method})")
        plt.xlabel(df.columns[0])
        plt.ylabel(df.columns[1])
        plt.legend(title='Cluster')
        plt.tight_layout()
        plt.show()
    

In [4]:
umap_data = ATAC_seq.iloc[:, :]  

# Apply UMAP
umap_embedding = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='euclidean').fit_transform(umap_data.iloc[1:, :])

clustering(data=umap_embedding, method='kmeans', n_clusters=4, is_correlation=False, plot=True)

NameError: name 'umap' is not defined