In [5]:
import pandas as pd
from sklearn.cluster import KMeans

# Load the data
df_raw = pd.read_csv("data/ImmGenATAC18_AllOCRsInfo.csv", header=0, quotechar='"', low_memory=False)

# Extract the NK and ILC columns (noted by the original author as columns 63-72).
nk_ilc_columns = [
    'NK.27+11b-.BM', 'NK.27+11b+.BM', 'NK.27-11b+.BM',
    'NK.27+11b-.Sp', 'NK.27+11b+.Sp', 'NK.27-11b+.Sp',
    'ILC2.SI', 'ILC3.NKp46-CCR6-.SI', 'ILC3.NKp46+.SI', 'ILC3.CCR6+.SI',
]
df_expr = df_raw[nk_ilc_columns]
# Transpose so that each cell type is a row, i.e. one sample for KMeans.
df_expr_T = df_expr.T

# K-Means clustering of the cell types
k = 3
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4 (and emits a FutureWarning in 1.2-1.3 when it is left
# unset), so relying on the default makes the cluster assignment depend on the
# installed version even with a fixed random_state.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(df_expr_T)

# Result: data frame pairing each cell type with its assigned cluster label.
# Cluster numbers are arbitrary identifiers; only co-membership is meaningful.
df_clusters = pd.DataFrame({
    "Celltype": df_expr_T.index,
    "Cluster": cluster_labels
})

# Save as CSV and show the assignment table
df_clusters.to_csv("celltype_kmeans_clusters.csv", index=False)
print(df_clusters)

              Celltype  Cluster
0        NK.27+11b-.BM        2
1        NK.27+11b+.BM        2
2        NK.27-11b+.BM        2
3        NK.27+11b-.Sp        0
4        NK.27+11b+.Sp        2
5        NK.27-11b+.Sp        2
6              ILC2.SI        1
7  ILC3.NKp46-CCR6-.SI        1
8       ILC3.NKp46+.SI        1
9        ILC3.CCR6+.SI        1




To further show that the cell types cluster according to their ATAC signal, we performed k-means clustering and computed a table listing each cell type together with the cluster it was assigned to. The ILC cell types all fall into the same cluster. With one exception (the NK.27+11b-.Sp cells), the NK cells also share a single cluster. The separate clustering of the NK.27+11b-.Sp cells could be explained by their less mature stage compared to the CD11b+ subtypes and by their different tissue of origin (spleen vs. bone marrow).