This notebook generates cluster IDs from the data stored in the AnnData object produced by the adata_conversion notebook. The clusters are then projected onto the PHATE embedding, whose coordinates were added to that AnnData object by the PHATE_computing notebook.

In [None]:
import pickle
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn import preprocessing
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
import pandas as pd
import phate
import math
import random
import gc
import scprep
from datetime import datetime, time
from matplotlib.animation import ImageMagickWriter
import matplotlib.animation as animation
import zipfile
from urllib.request import urlopen
import scipy.stats as st
from scipy.stats import norm
from scipy.stats import gaussian_kde
from scipy.stats import kde
from scipy.stats import binned_statistic
from scipy.stats import f_oneway
from matplotlib.colors import LogNorm
from mpl_toolkits.axes_grid1 import make_axes_locatable
# Embed fonts in exported PDFs as TrueType (Type 42) rather than the default Type 3,
# which keeps text editable in vector-graphics editors.
plt.rcParams['pdf.fonttype'] = 42
# Print the installed seaborn version so the environment is visible in the notebook output.
print(sns.__version__)
from anndata import AnnData
import scanpy as sc
from delve import *
import anndata as ad
from sklearn.preprocessing import MinMaxScaler
from kh import sketch
from sklearn.cluster import KMeans
import umap
# Print the installed scanpy version so the environment is visible in the notebook output.
print(sc.__version__)
# Timestamp string formatted as MMDDYYYY-HHMM, captured when this cell runs.
# NOTE(review): not used by any cell shown in this notebook — presumably meant for tagging output files; confirm.
today = datetime.now().strftime("%m%d%Y-%H%M")

In [2]:
# Read back in the subsampled AnnData file.
# NOTE: the path below is a placeholder — point it at the .h5ad file to analyse.
adata_save_path = r'your/save/path/here.h5ad'
# Use the `ad` alias bound by `import anndata as ad`. The bare name `anndata` is never
# imported explicitly in this notebook, so `anndata.read_h5ad` only worked if the name
# happened to leak in through `from delve import *`.
standard_adata_sub_no_sparse = ad.read_h5ad(adata_save_path)

In [4]:
def laplacian_score_fs(adata = None,
                    k: int  = None,
                    n_jobs: int  = -1):
    """Rank the features of ``adata`` by Laplacian score.

    The helpers ``parse_input``, ``construct_affinity`` and ``laplacian_score``
    are not defined in this notebook; presumably they come from
    ``from delve import *`` (confirm).

    Parameters
    ----------
    adata
        Object handed to ``parse_input``, which must return the data matrix,
        the feature names and the observation names (in that order).
    k : int
        Forwarded unchanged to ``construct_affinity``.
    n_jobs : int
        Forwarded unchanged to ``construct_affinity``; defaults to -1.

    Returns
    -------
    pandas.DataFrame
        One row per feature name with a single ``'laplacian_score'`` column,
        sorted by that column in ascending order.
    """
    # Observation names are returned by parse_input but are not needed here.
    data_matrix, feature_names, _obs_names = parse_input(adata)
    affinity = construct_affinity(X = data_matrix, k = k, n_jobs = n_jobs)

    ranking = pd.DataFrame(
        laplacian_score(X = data_matrix, W = affinity),
        index = feature_names,
        columns = ['laplacian_score'],
    )
    return ranking.sort_values(by = 'laplacian_score', ascending = True)

In [5]:
# Score every feature of the subsampled AnnData; k = 100 is passed through to the affinity construction.
l_score_standard = laplacian_score_fs(standard_adata_sub_no_sparse, k = 100)

In [None]:
# Number of rows in the score table (one row per feature name).
len(l_score_standard)

In [None]:
# First 46 feature names in ascending Laplacian-score order.
# NOTE(review): 46 is a magic number — presumably the number of features to keep; confirm and name it as a constant.
l_score_standard.index[:46]

K-Means Clustering - 3D

In [None]:
# K-means is run directly on the PHATE coordinates stored in `.obsm['X_phate']`, once per K.
# `plt` and `KMeans` come from the import cell at the top of the notebook.
#
# NOTE(review): this cell used to read `standard_trimmed_noPSTAT5_adata_sub`, a name that is
# never defined in this notebook, so it failed with NameError on Restart & Run All. The only
# AnnData loaded here is `standard_adata_sub_no_sparse`; re-point `adata` if a different
# object was intended.
adata = standard_adata_sub_no_sparse

# Step 1: Retrieve the PHATE embeddings from the AnnData object
embedding = adata.obsm['X_phate']
# The 3D scatter below indexes columns 0, 1 and 2, so fail early with a clear message
# instead of an IndexError from inside the plotting loop.
if embedding.ndim != 2 or embedding.shape[1] < 3:
    raise ValueError(
        f"obsm['X_phate'] must have at least 3 columns for the 3D plots, got shape {embedding.shape}"
    )

# Define a list of K values to loop through
k_values = [4, 5, 6, 7, 8, 9, 10, 11, 12]
# Four (elevation, azimuth) views with 90-degree rotations in azimuthal angle.
# Loop-invariant, so defined once rather than on every iteration.
views = [(30, 30), (30, 120), (30, 210), (30, 300)]

for k in k_values:
    # Run K-means clustering for the current value of K using the precomputed PHATE embeddings.
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto' in 1.4,
    # so leaving it implicit gives different clusters on different library versions.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(embedding)

    # Assign the cluster labels back to the AnnData object (stored as strings, one column per K)
    cluster_label = f'kmeans_clusters_k{k}'
    adata.obs[cluster_label] = kmeans.labels_.astype(str)

    # Plot the clusters for the current value of K on the pre-computed PHATE coordinates
    fig = plt.figure(figsize=(20, 16))
    for view_idx, (elev, azim) in enumerate(views, start=1):
        ax = fig.add_subplot(2, 2, view_idx, projection='3d')
        scatter = ax.scatter(embedding[:, 0], embedding[:, 1], embedding[:, 2], c=kmeans.labels_, cmap='viridis')
        ax.view_init(elev=elev, azim=azim)
        ax.set_xlabel('PHATE 1')
        ax.set_ylabel('PHATE 2')
        ax.set_zlabel('PHATE 3')
        ax.set_title(f'View {view_idx} - Elev {elev}, Azim {azim}')

    # One shared colorbar for all four panels; ticks are pinned to the integer cluster labels
    # so the bar does not show meaningless fractional values.
    fig.colorbar(scatter, ax=fig.axes, orientation='horizontal', label='Cluster Label', ticks=list(range(k)))
    fig.suptitle(f'K-means Clusters Visualized with PHATE for K={k}')
    plt.show()
    # Release the figure before the next K; harmless if the backend already closed it.
    plt.close(fig)

    # Report sample_IDs present in each cluster and their counts
    for cluster_idx in range(k):
        in_cluster = adata.obs[cluster_label] == str(cluster_idx)
        cluster_sample_ids = adata.obs.loc[in_cluster, 'sample_ID']
        print(f'Cluster {cluster_idx} for K={k} analysis:')
        # Reporting unique sample_IDs
        unique_sample_ids = cluster_sample_ids.unique()
        print(f'Contains unique sample_IDs: {unique_sample_ids}')
        # Reporting counts of cells from each sample_ID in the cluster
        sample_id_counts = cluster_sample_ids.value_counts()
        print(f'Counts of cells from each sample_ID in the cluster:\n{sample_id_counts}\n')
