In [None]:

import pandas as pd
import scanpy as sc
import numpy as np
from anndata import AnnData
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# Input files
RNA_seq_path = r"C:\Users\helen\Downloads\datasets\mmc2.csv"
Cell_population_qc_path = r"C:\Users\helen\Downloads\datasets\mmc1.xlsx"

# Expression table: transpose so rows become the observations, coerce every
# column to numeric (unparseable entries become NaN), then log1p-transform.
RNA_seq = (
    pd.read_csv(RNA_seq_path, index_col=0)
    .T
    .apply(pd.to_numeric, errors='coerce')
    .pipe(np.log1p)
)
# QC metadata, indexed by its first column
qc_meta = pd.read_excel(Cell_population_qc_path, index_col=0)

# Remove every "#<digits>" fragment from the sample names of both tables
# so that their indexes can be compared.
for table in (qc_meta, RNA_seq):
    table.index = table.index.str.replace(r"#\d+", "", regex=True)

# Keep only NaN-free rows, then NaN-free columns, of the expression table
RNA_seq = RNA_seq.dropna(axis=0, how='any')
RNA_seq = RNA_seq.dropna(axis=1, how='any')
# Keep the first QC row for each (stripped) sample name
qc_meta = qc_meta.loc[~qc_meta.index.duplicated(keep='first')]

# Restrict both tables to the sample names they share
common_samples = RNA_seq.index.intersection(qc_meta.index)
RNA_seq, qc_meta = RNA_seq.loc[common_samples], qc_meta.loc[common_samples]

print(f"After matching: {len(common_samples)} samples")

# QC columns that should be numeric; only those present are converted
numeric_cols = [
    'InputCellNumber', 'PF.reads', '%chrM.mapped',
    'Paired.read.after.removing.PCR.duplication',
    '%fragment.1Kb_TSS', 'Replicate.cor'
]
present_cols = [name for name in numeric_cols if name in qc_meta.columns]
for col in present_cols:
    qc_meta[col] = pd.to_numeric(qc_meta[col], errors='coerce')

# Assemble the AnnData object: expression values as X, QC metadata as obs
adata = AnnData(X=RNA_seq.values)
adata.obs_names = RNA_seq.index
adata.var_names = RNA_seq.columns
adata.obs = qc_meta.copy()

# PCA, neighbour graph on the PCA embedding, UMAP
sc.pp.pca(adata, n_comps=50)
sc.pp.neighbors(adata, use_rep='X_pca')
sc.tl.umap(adata)

# Leiden clustering; labels are stored as plain strings
sc.tl.leiden(adata, resolution=3.2)
adata.obs['leiden'] = adata.obs['leiden'].astype(str)

# Clean plotting columns.
# Missing values must be replaced BEFORE the cast to str: casting first turns
# them into the literal text "nan", after which fillna() has nothing to fill
# and "Unknown" is never applied. Going through object dtype also keeps this
# working if a column happens to be categorical.
for col in ['CellType', 'Lineage', 'leiden']:
    labels = adata.obs[col].astype(object)
    adata.obs[col] = labels.where(labels.notna(), "Unknown").astype(str)

# UMAP plot
sc.pl.umap(adata, color=["Lineage", "leiden"], wspace=0.4, sort_order=False)

# Check that the label columns are clean
print(adata.obs["CellType"].unique())
print(adata.obs["Lineage"].unique())
print(adata.obs["leiden"].unique())

# Report how many Leiden clusters each resolution yields; every run is stored
# under its own obs key ("leiden_<resolution>"), leaving "leiden" untouched.
print("\nLeiden clustering summary:")
for resolution in (0.2, 0.6, 1.0, 2.0):
    obs_key = f'leiden_{resolution}'
    sc.tl.leiden(adata, resolution=resolution, key_added=obs_key)
    print(f"Resolution {resolution}: {adata.obs[obs_key].nunique()} clusters")

# KMeans (12 clusters) fitted on the UMAP coordinates; labels stored as strings
kmeans = KMeans(n_clusters=12, random_state=42)
kmeans_labels = kmeans.fit_predict(adata.obsm['X_umap'])
adata.obs['kmeans12'] = kmeans_labels.astype(str)

# Agreement of each clustering with the annotated CellType labels (ARI and NMI)
celltype_labels = adata.obs["CellType"]
ari, nmi = (
    score(celltype_labels, adata.obs["leiden"])
    for score in (adjusted_rand_score, normalized_mutual_info_score)
)
ari_kmeans, nmi_kmeans = (
    score(celltype_labels, adata.obs["kmeans12"])
    for score in (adjusted_rand_score, normalized_mutual_info_score)
)

print("\nClustering Evaluation:")
print(f"Leiden vs CellType — ARI: {ari:.3f}, NMI: {nmi:.3f}")
print(f"KMeans vs CellType — ARI: {ari_kmeans:.3f}, NMI: {nmi_kmeans:.3f}")

# Persist the annotated object
adata.write("adata_qc_integrated.h5ad")

# Heatmaps
def plot_heatmap(df, title, xlabel, ylabel, normalize=False):
    """Draw a table as a seaborn heatmap on a new 10x6 figure and show it.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to plot (the callers in this notebook pass ``pd.crosstab`` results).
    title, xlabel, ylabel : str
        Figure title and axis labels.
    normalize : bool, default False
        If true, every row is divided by its row sum before plotting and the
        cell values are written into the heatmap (``annot=True``).
    """
    table = df.div(df.sum(axis=1), axis=0) if normalize else df
    plt.figure(figsize=(10, 6))
    # With no ``ax`` argument seaborn draws on the current axes and returns it,
    # so labelling through the returned Axes targets the same plot.
    ax = sns.heatmap(table, cmap="crest", annot=normalize)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    plt.tight_layout()
    plt.show()

# Leiden cluster (rows) against annotated cell type (columns), raw counts
ct_cluster = pd.crosstab(index=adata.obs["leiden"], columns=adata.obs["CellType"])
plot_heatmap(
    ct_cluster,
    title="Leiden clusters vs Cell Type",
    xlabel="Cell Type",
    ylabel="Leiden Cluster",
)

# Leiden cluster (rows) against lineage (columns), raw counts
lineage_cluster = pd.crosstab(index=adata.obs["leiden"], columns=adata.obs["Lineage"])
plot_heatmap(
    lineage_cluster,
    title="Leiden clusters vs Lineage",
    xlabel="Lineage",
    ylabel="Leiden Cluster",
)

# Lineage (rows) against KMeans cluster (columns), each row scaled to sum to 1
kmeans_lineage = pd.crosstab(index=adata.obs["Lineage"], columns=adata.obs["kmeans12"])
plot_heatmap(
    kmeans_lineage,
    title="KMeans clusters vs Lineage (normalized)",
    xlabel="KMeans Cluster",
    ylabel="Lineage",
    normalize=True,
)

# Final UMAP: All clustering results
# sc.pl.umap(adata, color=["CellType", "Lineage", "leiden", "kmeans12"], wspace=0.4)


In [None]:


# correlate clusters to celltype
import pandas as pd
import scanpy as sc
import numpy as np
from anndata import AnnData
import umap.umap_ as umap
from sklearn.preprocessing import StandardScaler

# Load RNA-seq data and transpose it so that rows become the AnnData observations
RNA_seq_path = r"C:\Users\helen\Downloads\datasets\mmc2.csv"
RNA_seq = pd.read_csv(RNA_seq_path, index_col=0)
RNA_seq_T = RNA_seq.T
# All RNA-seq values must be numeric. Coerce BEFORE the log transform:
# np.log1p cannot process non-numeric entries, so coercing afterwards (as this
# cell used to) would fail before the coercion had a chance to turn them into NaN.
RNA_seq_T_log = np.log1p(RNA_seq_T.apply(pd.to_numeric, errors='coerce'))

# Load QC metadata
Cell_population_qc_path = r"C:\Users\helen\Downloads\datasets\mmc1.xlsx"
qc_meta = pd.read_excel(Cell_population_qc_path, index_col=0)

# Strip every "#<digits>" fragment from the sample names so both indexes can be matched
qc_meta.index = qc_meta.index.str.replace(r"#\d+", "", regex=True)
RNA_seq_T_log.index = RNA_seq_T_log.index.str.replace(r"#\d+", "", regex=True)

# Drop any rows, then any columns, that contain NaN values
RNA_seq_T_log = RNA_seq_T_log.dropna(axis=0, how='any')
RNA_seq_T_log = RNA_seq_T_log.dropna(axis=1, how='any')

# Ensure the numeric QC columns really are numeric (only those present are converted)
numeric_cols = ['InputCellNumber', 'PF.reads', '%chrM.mapped', 'Paired.read.after.removing.PCR.duplication', '%fragment.1Kb_TSS', 'Replicate.cor']
for col in numeric_cols:
    if col in qc_meta.columns:
        qc_meta[col] = pd.to_numeric(qc_meta[col], errors='coerce')

# Report sizes and how many RNA sample names have no QC counterpart
print("Before intersecting:")
print("RNA samples:", RNA_seq_T_log.shape[0])
print("QC samples:", qc_meta.shape[0])
unmatched = RNA_seq_T_log.index.difference(qc_meta.index)
print(f"Unmatched samples after stripping: {len(unmatched)}")

# Keep the first QC row per stripped name, then restrict both tables to shared names
qc_meta = qc_meta.loc[~qc_meta.index.duplicated(keep='first')]
common_samples = RNA_seq_T_log.index.intersection(qc_meta.index)
RNA_seq_T_log = RNA_seq_T_log.loc[common_samples]
qc_meta = qc_meta.loc[common_samples]

# Create AnnData: expression values as X, QC metadata as obs
adata = AnnData(X=RNA_seq_T_log.values)
adata.obs_names = RNA_seq_T_log.index
adata.var_names = RNA_seq_T_log.columns
adata.obs = qc_meta.copy()

# PCA, neighbour graph, Leiden clustering, UMAP
sc.pp.pca(adata, n_comps=50)

# Build the neighbour graph on the PCA embedding computed just above.
# This cell previously passed use_rep='X', which left the PCA unused and
# disagreed with the other cells of this notebook (they use 'X_pca').
sc.pp.neighbors(adata, use_rep='X_pca', n_neighbors=10)
sc.tl.leiden(adata, resolution=3.2)
adata.obs['leiden'] = adata.obs['leiden'].astype(str)

sc.tl.umap(adata)
sc.pl.umap(adata, color=["CellType", "Lineage", "leiden"], wspace=0.4, sort_order=False)

# Diagnostics on the label columns before cleaning
print(adata.obs["CellType"].value_counts())
print("Missing values:")
print(adata.obs[["CellType", "Lineage", "leiden"]].isnull().sum())
# checking for wrong types
print("\nUnique types per column:")
for col in ["CellType", "Lineage", "leiden"]:
    print(f"{col}:", adata.obs[col].dropna().map(type).value_counts())

# Clean up the label columns.
# Missing values are replaced BEFORE the cast to str: casting first turns them
# into the literal text "nan", so a later fillna("Unknown") never applies.
# Going through object dtype keeps this working for categorical columns too
# (a categorical rejects a fill value that is not one of its categories).
for col in ["CellType", "Lineage", "leiden"]:
    labels = adata.obs[col].astype(object)
    adata.obs[col] = labels.where(labels.notna(), "Unknown").astype(str)

sc.pl.umap(adata, color=["CellType", "Lineage", "leiden"], wspace=0.4, sort_order=False)

# Leiden clustering: cluster count per resolution, each stored under its own key
for r in [0.2, 0.6, 1.0, 2.0]:
    sc.tl.leiden(adata, resolution=r, key_added=f'leiden_{r}')
    print(f"Resolution {r}: {adata.obs[f'leiden_{r}'].nunique()} clusters")

# The neighbour graph and the resolution-3.2 labels in "leiden" are unchanged by
# the scan above (it writes to separate keys), so they are not recomputed here.
# Only the plain-string dtype of the labels is re-established for what follows.
adata.obs['leiden'] = adata.obs['leiden'].astype(str)

# KMeans (12 clusters) fitted on the UMAP coordinates; labels stored as strings
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=12, random_state=42)
kmeans_labels = kmeans.fit_predict(adata.obsm['X_umap'])
adata.obs['kmeans12'] = kmeans_labels.astype(str)

# Agreement of each clustering with the annotated CellType labels (ARI and NMI)
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
celltype_labels = adata.obs["CellType"]

leiden_labels = adata.obs["leiden"]
ari = adjusted_rand_score(celltype_labels, leiden_labels)
nmi = normalized_mutual_info_score(celltype_labels, leiden_labels)
print(f"Leiden Clustering -> CellType ARI: {ari:.3f}, NMI: {nmi:.3f}")

kmeans_assignments = adata.obs["kmeans12"]
ari_kmeans = adjusted_rand_score(celltype_labels, kmeans_assignments)
nmi_kmeans = normalized_mutual_info_score(celltype_labels, kmeans_assignments)
print(f"KMeans Clustering -> CellType ARI: {ari_kmeans:.3f}, NMI: {nmi_kmeans:.3f}")

# Persist the annotated object
adata.write("adata_qc_integrated.h5ad")

# Heatmaps of the cluster / annotation contingency tables, then a combined UMAP
import seaborn as sns
import matplotlib.pyplot as plt

# Leiden cluster (rows) vs CellType from the QC metadata (columns), raw counts
ct_cluster = pd.crosstab(adata.obs["leiden"], adata.obs["CellType"])
# Leiden cluster (rows) vs Lineage (columns), raw counts
leiden_lineage = pd.crosstab(adata.obs["leiden"], adata.obs["Lineage"])
# Lineage (rows) vs KMeans cluster (columns), each row divided by its row total
heatmap_data = pd.crosstab(adata.obs["Lineage"], adata.obs["kmeans12"])
heatmap_data_normalized = heatmap_data.div(heatmap_data.sum(axis=1), axis=0)

# (table, title, x label, y label) for each figure, drawn in this order
heatmap_panels = [
    (ct_cluster, "correlating leiden clusters to cell types", "Cell Type", "Leiden Cluster"),
    (leiden_lineage, "correlating leiden clusters to lineages", "Lineage", "Leiden Cluster"),
    (heatmap_data_normalized, "KMeans Clusters vs. Lineage", "KMeans Cluster", "Lineage"),
]
for table, panel_title, x_label, y_label in heatmap_panels:
    plt.figure(figsize=(10, 6))
    sns.heatmap(table, cmap="crest")
    plt.title(panel_title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.tight_layout()
    plt.show()

# UMAP coloured by the annotations and by both clusterings
sc.pl.umap(adata, color=["CellType", "Lineage", "leiden", "kmeans12"], wspace=0.4, sort_order=False)

#%pip install openpyxl
#import pandas as pd
#Cell_population_qc_path = r"C:\Users\helen\Downloads\datasets\mmc1.xlsx"
#qc_metadata = pd.read_excel(Cell_population_qc_path, index_col=0)

# match names
#qc_metadata.index = qc_metadata.index.str.replace(r"#\d+", "", regex=True)
#qc_metadata.index.name = "SampleName"

# check if index names match
#matched = qc_metadata.loc[qc_metadata.index.intersection(adata.obs_names)]

# adding columns to adata.obs
#adata.obs = adata.obs.join(matched[["CellType", "Lineage", "CellFamily", "Organ"]])

# majority cell type for cluster annotations
#cluster_annotations = (
 #   adata.obs.groupby("leiden")["CellType"]
  #  .agg(lambda x: x.value_counts().index[0])  # get most common
   # .to_dict()
#)

# Map annotating cell type to clusters
#adata.obs["leiden_annotated"] = adata.obs["leiden"].map(cluster_annotations)

#7. Plot annotated UMAP
#import scanpy as sc
#sc.pl.umap(adata, color="leiden_annotated", title="Leiden Cluster Annotations", legend_loc="on data")


In [None]:

import pandas as pd
import scanpy as sc
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from anndata import AnnData
from matplotlib.gridspec import GridSpec

rna_path = r"C:\Users\helen\Downloads\datasets\mmc2.csv"
qc_path = r"C:\Users\helen\Downloads\datasets\mmc1.xlsx"

# Expression table: transpose so rows become observations, coerce to numeric
# (unparseable entries become NaN), then log1p-transform.
rna = pd.read_csv(rna_path, index_col=0).T
rna = rna.apply(pd.to_numeric, errors="coerce")
rna = np.log1p(rna)

qc = pd.read_excel(qc_path, index_col=0)

# Clean sample names: strip every "#<digits>" fragment from both indexes
rna.index = rna.index.str.replace(r"#\d+", "", regex=True)
qc.index = qc.index.str.replace(r"#\d+", "", regex=True)

# Same clean-up the other cells of this notebook apply before matching:
# - drop rows, then columns, containing NaN so PCA does not receive NaN values;
# - keep one QC row per stripped name, otherwise qc.loc[common_samples] can
#   return more rows than the expression table and AnnData cannot be built.
rna = rna.dropna(axis=0, how="any").dropna(axis=1, how="any")
qc = qc.loc[~qc.index.duplicated(keep="first")]

# Match samples
common_samples = rna.index.intersection(qc.index)
rna_matched = rna.loc[common_samples].copy()
qc_matched = qc.loc[common_samples].copy()

# Confirm both have the same length
print(f"RNA matched: {rna_matched.shape}")
print(f"QC matched: {qc_matched.shape}")
if not rna_matched.index.equals(qc_matched.index):
    raise ValueError(
        "Expression and QC tables are not aligned one-to-one after matching "
        f"({rna_matched.shape[0]} vs {qc_matched.shape[0]} rows); "
        "check for duplicated sample names."
    )

# Create AnnData with obs/var supplied at construction
adata = AnnData(
    X=rna_matched.values,
    obs=qc_matched,
    var=pd.DataFrame(index=rna_matched.columns)
)

# Clean CellType and Lineage.
# A column that is absent is created as all-"Unknown". Missing values are
# replaced BEFORE the cast to str, because casting first turns them into the
# literal text "nan" and a later fillna("Unknown") never applies.
for col in ['Lineage', 'CellType']:
    if col in adata.obs.columns:
        labels = adata.obs[col].astype(object)
    else:
        labels = pd.Series(index=adata.obs_names, dtype=object)
    adata.obs[col] = labels.where(labels.notna(), "Unknown").astype(str)

# PCA -> neighbour graph on the PCA embedding -> UMAP -> Leiden (resolution 1.0)
sc.pp.pca(adata, n_comps=50)
sc.pp.neighbors(adata, use_rep='X_pca')
sc.tl.umap(adata)
sc.tl.leiden(adata, resolution=1.0)
# Store the cluster labels as plain strings
adata.obs['leiden'] = adata.obs['leiden'].astype(str)

# Contingency tables with Leiden clusters as rows; the *_norm versions divide
# every row by its row total, so each cluster's row sums to 1.
leiden_labels = adata.obs['leiden']

ct_lineage = pd.crosstab(leiden_labels, adata.obs['Lineage'])
lineage_row_totals = ct_lineage.sum(axis=1)
ct_lineage_norm = ct_lineage.div(lineage_row_totals, axis=0)

ct_celltype = pd.crosstab(leiden_labels, adata.obs['CellType'])
celltype_row_totals = ct_celltype.sum(axis=1)
ct_celltype_norm = ct_celltype.div(celltype_row_totals, axis=0)

# One figure, three panels side by side: UMAP | Leiden vs Lineage | Leiden vs CellType
fig = plt.figure(figsize=(22, 10))
gs = GridSpec(1, 3, width_ratios=[1, 1.2, 1.2], wspace=0.4)

# Panel 0: UMAP coloured by Leiden cluster
ax0 = fig.add_subplot(gs[0])
sc.pl.umap(
    adata,
    color='leiden',
    show=False,
    ax=ax0,
    legend_loc='right margin',
    title='UMAP: Leiden Clusters',
)

# Panels 1 and 2: row-normalised heatmaps.
# Each entry: (grid slot, table, title, x label, y label).
heatmap_panels = (
    (1, ct_lineage_norm, "Leiden Clusters vs Lineage", "Lineage", "Leiden Cluster"),
    (2, ct_celltype_norm, "Leiden Clusters vs Cell Type", "Cell Type", ""),
)
for slot, table, panel_title, x_label, y_label in heatmap_panels:
    panel_ax = fig.add_subplot(gs[slot])
    sns.heatmap(table, cmap="viridis", ax=panel_ax, cbar=True, linewidths=0.3)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()


In [None]:
# correlate clusters to celltype
import pandas as pd
import scanpy as sc
import numpy as np
from anndata import AnnData
import umap.umap_ as umap
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Gene expression data: read, transpose so rows become observations, log1p-transform
RNA_seq_path = r"C:\Users\helen\Downloads\datasets\mmc2.csv"
RNA_seq = pd.read_csv(RNA_seq_path, index_col=0)
RNA_seq_T = RNA_seq.transpose()
RNA_seq_log = RNA_seq_T.pipe(np.log1p)

# AnnData object carrying the transformed values, with row/column labels as names
adata = AnnData(X=RNA_seq_log.values)
adata.obs_names = RNA_seq_log.index
adata.var_names = RNA_seq_log.columns

# Copy the observation names into obs (as a categorical) and the variable names into var
adata.obs['cell_type'] = adata.obs_names.astype('category')
adata.var['gene_name'] = adata.var_names

# UMAP embedding computed with umap-learn on standardised values,
# then stored where scanpy's plotting functions look for it
scaler = StandardScaler()
reducer = umap.UMAP(random_state=42)
RNA_seq_scaled = scaler.fit_transform(RNA_seq_log)
umap_coords = reducer.fit_transform(RNA_seq_scaled)
adata.obsm['X_umap'] = umap_coords

# Leiden clusters, read from a CSV whose first column holds the sample names
leiden_clusters = pd.read_csv("leiden_clusters.csv", index_col=0)

# The cluster table must list exactly the observations of adata, in the same
# order. The length is checked first because comparing two indexes of different
# length raises an unrelated error instead of reporting the mismatch, and an
# explicit raise (unlike assert) is not skipped when Python runs with -O.
same_length = len(leiden_clusters.index) == adata.n_obs
if not (same_length and (leiden_clusters.index == adata.obs_names).all()):
    raise ValueError(
        "Leiden index mismatch! "
        f"leiden_clusters.csv has {len(leiden_clusters.index)} rows, "
        f"adata has {adata.n_obs} observations."
    )
adata.obs['leiden'] = leiden_clusters["Leiden_Cluster"].astype(str)

# Annotating clusters with cell type names
leiden_to_celltype = {
    '0': 'αβ/γδ T cells & NKT',
    '1': 'B cells & Dendritic cells',
    '2': 'Progenitors & stromal cells',
    '3': 'Myeloid cells (Mo, MF, GN)',
    '4': 'Cytotoxic T & NK cells',
    '5': 'Tregs & ILCs'
}

# Cluster labels without an entry in the mapping become missing values in the
# annotated column; report them instead of letting that happen silently.
unmapped_clusters = sorted(set(adata.obs['leiden']) - set(leiden_to_celltype))
if unmapped_clusters:
    print(f"Warning: no cell type annotation for Leiden clusters {unmapped_clusters}")

adata.obs['cell_type_annotated'] = adata.obs['leiden'].map(leiden_to_celltype).astype('category')

# Annotated UMAP
sc.pl.umap(adata, color='cell_type_annotated', size=20, title='UMAP with Cell Type Annotations', legend_loc='on data')

# Marker genes by cluster
marker_genes = [
    'Cd3d', 'Cd4', 'Cd8a', 'Trdc', 'Foxp3', 'Zbtb16', 'Ifng', 'Il17a',  # T/NKT/γδ
    'Cd19', 'Ms4a1', 'Cd79a', 'Cd74',                                   # B cells
    'Irf8', 'Itgax', 'Siglech',                                        # DCs
    'Kit', 'Ly6a', 'Flt3', 'Il7r', 'Rag1', 'Cd34', 'Pecam1',           # progenitors/stroma
    'Cd14', 'Lyz2', 'Adgre1', 'Cx3cr1', 'Trem2', 'Itgam',              # myeloid
    'Gzmb', 'Prf1', 'Nkg7', 'Klrb1c', 'Klrc1',                          # cytotoxic/NK
    'Gata3', 'Rorc', 'Il22'                                            # ILCs
]

# Plot only the markers that exist in this dataset: one gene name that is not
# in adata.var_names would otherwise abort the whole dotplot. List order is kept.
available_markers = [gene for gene in marker_genes if gene in adata.var_names]
missing_markers = [gene for gene in marker_genes if gene not in adata.var_names]
if missing_markers:
    print(f"Marker genes not found in the dataset (skipped): {missing_markers}")
if not available_markers:
    raise ValueError("None of the marker genes are present in adata.var_names.")

sc.pl.dotplot(adata, var_names=available_markers, groupby='leiden', standard_scale='var', figsize=(14, 6))

# Save the annotated object, then read it back to confirm the annotations were stored
adata.write("adataRNA.h5ad")
adata_loaded = sc.read("adataRNA.h5ad")
print(adata_loaded.obs[['leiden', 'cell_type_annotated']].head())
