In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.manifold import TSNE

# 1. Lade den Datensatz und wähle die Zelltypen-Spalten (hier ILC + NK Beispiel)
celltype_cols = [
    "ILC2.SI", "ILC3.NKp46-CCR6-.SI", "ILC3.NKp46+.SI", "ILC3.CCR6+.SI",
    "NK.27+11b-.BM", "NK.27+11b+.BM", "NK.27-11b+.BM",
    "NK.27+11b-.Sp", "NK.27+11b+.Sp", "NK.27-11b+.Sp"
]

# 2. Nur die Spalten (Zelltypen) laden
data = pd.read_csv("data/ImmGenATAC18_AllOCRsInfo.csv", usecols=celltype_cols, encoding="latin1")

 
data_sampled = data.sample(n=1000, random_state=42)


# 3. Transponieren: Zeilen = Zelltypen, Spalten = OCRs
df_T = data_sampled.T
df_T.index.name = "CellType"



# 5. PCA zur Reduktion (optional, für Visualisierung nützlich)
pca = PCA(n_components=2)
pca_result = pca.fit_transform(df_T)
df_pca = pd.DataFrame(pca_result, columns=["PC1", "PC2"])
df_pca["CellType"] = df_T.index.name

#tsne = TSNE(n_components=2, perplexity=5, n_iter=500, learning_rate=200, verbose=1)
#tsne_result = tsne.fit_transform(df_pca)

# 6. Clustering (hier hierarchisch)
clustering = AgglomerativeClustering(n_clusters=10)  # Anzahl Cluster anpassen
df_pca["Cluster"] = clustering.fit_predict(df_pca)

# 7. Visualisierung
plt.figure(figsize=(10, 7))
sns.scatterplot(data=df_pca, x="PC1", y="PC2", hue="CellType", style="Cluster", palette="tab10", s=100)
plt.title("Clustering of Cell Types by Chromatin Accessibility")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

ValueError: could not convert string to float: 'CellType'