# Agglomerative Hierarchical Clustering of Shared Clonotypes and Euclidean Distance Calculation Between Dendrogram Leaves
- This notebook explains how agglomerative hierarchical clustering was performed for shared clonotypes with >=5 cells in each organ, as seen in Figure 6. Note: Figure 6C is a representative dendrogram (clonotype32).
- Much of the source code is borrowed from the scikit-learn example on plotting hierarchical clustering dendrograms, which can be found here: https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html
- Further, we show how Euclidean distances were calculated between dendrogram leaves for each shared clonotype with >=5 cells in each organ (as seen in Fig 6D).

In [None]:
import numpy as np #v1.26.4
import pandas as pd #v2.2.0
import anndata #v0.10.5.post1
import scanpy as sc #v1.9.8
import pydeseq2 #v0.4.10
import decoupler as dc #v1.6.0
import matplotlib.pyplot as plt #matplotlib v3.7.3
import random  

# One fixed seed shared by Python's `random` module and NumPy's global RNG.
c_iSeed = 6161904
random.seed(c_iSeed)     # seed the standard-library generator
np.random.seed(c_iSeed)  # seed NumPy's legacy global generator

In [None]:
# Import Hierarchical Clustering Packages
import scipy #v1.12.0
from scipy.cluster.hierarchy import dendrogram
from sklearn.datasets import load_iris #sklearn v.
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Load the filtered AnnData object from disk with anndata.read_h5ad.
# NOTE: `___` below is a placeholder, not a usable value; this cell will not
# run until it is replaced with the path to "02_final_filtered_object.hdf5".
final_filtered_object = anndata.read_h5ad(filename=___) #Replace ___ with path to file "02_final_filtered_object.hdf5"

## Agglomerative Hierarchical Clustering

In [None]:
# List clonotypes (per animal) that have at least 5 cells in Liver AND at least 5 in Lung.
# Steps: count cells per (clonotype, animal, organ), spread organs into columns,
# then keep only the rows that meet the threshold in both organ columns.
(
    final_filtered_object.obs
    .groupby(["clonotype_id","animal_id","organ"])
    .size()
    .rename("cells_in_clones")
    .reset_index()
    .pivot(index=["clonotype_id","animal_id"],columns=["organ"],values="cells_in_clones")
    .loc[lambda per_organ: (per_organ["Liver"] >= 5) & (per_organ["Lung"] >= 5)]
)

In [None]:
def plot_dendrogram(model, **kwargs):
    """Plot the dendrogram of a fitted hierarchical clustering model.

    Builds a SciPy-style linkage matrix (children, merge distance, number of
    samples under the merged node) from the model's merge history and passes
    it to ``scipy.cluster.hierarchy.dendrogram``.

    Parameters
    ----------
    model
        Fitted clustering model exposing ``children_``, ``distances_`` and
        ``labels_`` (e.g. ``AgglomerativeClustering`` fitted with
        ``compute_distances=True`` or a ``distance_threshold``).
    **kwargs
        Forwarded unchanged to ``scipy.cluster.hierarchy.dendrogram``
        (e.g. ``labels``, ``leaf_rotation``, ``truncate_mode``).

    Returns
    -------
    dict
        The value returned by ``dendrogram`` so callers can inspect the
        plotted tree (for example its leaf order).
    """
    # Count the samples under each merge node; the linkage matrix needs this
    # as its fourth column.
    n_samples = len(model.labels_)
    counts = np.zeros(model.children_.shape[0])
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                # Internal node: ids >= n_samples refer to earlier merges.
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the dendrogram and return SciPy's result instead of discarding it.
    return dendrogram(linkage_matrix, **kwargs)

# Filter by Clonotype
clonotype_data = final_filtered_object[(final_filtered_object.obs['clonotype_id'].isin(['clonotype32']))] # Insert desired clonotype here
X = clonotype_data.obsm["scVI_Latent_Space"]
# One organ label per cell, used as the dendrogram leaf labels.
organ_labels = clonotype_data.obs['organ']
organ_labels = organ_labels.to_numpy()

# setting distance_threshold=0 ensures we compute the full tree.
model = AgglomerativeClustering(distance_threshold=0, n_clusters=None, compute_distances=True)

model = model.fit(X)

# Draw the dendrogram once (the cell previously drew the identical tree twice
# onto the same axes); `tree` holds whatever plot_dendrogram returns.
tree = plot_dendrogram(model, truncate_mode=None, leaf_rotation=90, labels=organ_labels) # plot dendrogram

# Colour each leaf's tick label by the organ named in its text.
label_colors = {'Lung': 'b', 'Liver': 'k'}

ax = plt.gca()
xlbls = ax.get_xmajorticklabels()
for lbl in xlbls:
    lbl.set_color(label_colors[lbl.get_text()])

plt.show()

## Euclidean Distance Calculation

In [None]:
# Print Euclidean Distances for Each Category (Liver v Liver, Lung v Lung, Liver v Lung)

Final_Table = pd.DataFrame()  # not used further in this cell; kept in case other cells reference it

# Filter by Clonotype
# NOTE(review): every clonotype listed here is pooled into a single clustering
# tree below - confirm this is intended rather than one tree per clonotype.
clonotype_data = final_filtered_object[(final_filtered_object.obs['clonotype_id'].isin(['clonotype2','clonotype5','clonotype8','clonotype11','clonotype17','clonotype19','clonotype32','clonotype34','clonotype42','clonotype43','clonotype45','clonotype53']))]
X = clonotype_data.obsm["scVI_Latent_Space"]
organ_labels = clonotype_data.obs['organ']
organ_labels = organ_labels.to_numpy()

# setting distance_threshold=0 ensures we compute the full tree.
model = AgglomerativeClustering(distance_threshold=0, n_clusters=None, compute_distances=True)
model = model.fit(X)

# Generate Table

# Node ids 0..leaves are original cells (leaves); larger ids are merged clusters.
leaves = len(model.labels_)-1

# Two lookup tables mapping a leaf id to its organ, one per side of a merge.
Organ_Label_1 = pd.DataFrame(organ_labels)
Organ_Label_1.columns = ["Organ"]
Organ_Label_1["Node_x"]= Organ_Label_1.index

Organ_Label_2 = pd.DataFrame(organ_labels)
Organ_Label_2.columns = ["Organ"]
Organ_Label_2["Node_y"]= Organ_Label_2.index

# One row per merge: the two child node ids joined by that merge.
Euclidian_Children = pd.DataFrame(model.children_)
Euclidian_Children.columns = ["Node_x", "Node_y"]

# Left merges keep every merge row; children that are not leaves get NaN organs.
# Both lookups carry an "Organ" column, so pandas suffixes them Organ_x / Organ_y.
Euclidian_Children = Euclidian_Children.merge(Organ_Label_1, how='left', on='Node_x')
Euclidian_Children = Euclidian_Children.merge(Organ_Label_2, how='left', on='Node_y')

Euclidian_Distance = pd.DataFrame(model.distances_)
# Assign the distance array positionally (row i of children_ <-> distances_[i])
# instead of relying on index alignment with a one-column DataFrame.
Euclidian_Children['Distance'] = model.distances_

# Keep only merges that join two leaves. Take an explicit copy so adding the
# 'Category' column below modifies an independent frame rather than a slice of
# Euclidian_Children (avoids pandas' SettingWithCopyWarning).
Final_Table_All_Clones = Euclidian_Children[(Euclidian_Children['Node_x'] <= leaves) & (Euclidian_Children['Node_y'] <= leaves)].copy()

categories = [
    (Final_Table_All_Clones['Organ_x'] == 'Liver') & (Final_Table_All_Clones['Organ_y'] == 'Liver'),
    (Final_Table_All_Clones['Organ_x'] == 'Lung') & (Final_Table_All_Clones['Organ_y'] == 'Lung'),
    ((Final_Table_All_Clones['Organ_x'] == 'Liver') & (Final_Table_All_Clones['Organ_y'] == 'Lung')) | ((Final_Table_All_Clones['Organ_x'] == 'Lung') & (Final_Table_All_Clones['Organ_y'] == 'Liver'))
]

values = ['Liver vs Liver', 'Lung vs Lung', 'Liver vs Lung']

Final_Table_All_Clones['Category'] = np.select(categories, values)
# One column per category; each leaf-leaf merge contributes its distance to one column.
Final_Table_All_Clones = Final_Table_All_Clones.pivot(columns = 'Category', values = 'Distance').reset_index(drop=True)

print(Final_Table_All_Clones)