In [None]:
import scanpy as sc
from pathlib import Path

# 1. Root directory that holds one 10x output sub-directory per sample.
DATA_DIR = Path("/Users/zacchan/Downloads/GSE261596_RAW/rawdata")

# 2. Collect every sample sub-directory (e.g. 6m_AD_rep1); sorted so the
#    sample order is the same on every run.
sample_dirs = sorted(p for p in DATA_DIR.iterdir() if p.is_dir())
if not sample_dirs:
    # Fail with a clear message instead of an IndexError on adatas[0] below.
    raise FileNotFoundError(f"No sample sub-directories found under {DATA_DIR}")

print(f"✔️  检测到 {len(sample_dirs)} 个样本： {[p.name for p in sample_dirs]}")

# 3. Read each 10x MTX directory and remember which sample it came from.
adatas = []
for p in sample_dirs:
    ad = sc.read_10x_mtx(p, var_names="gene_symbols", cache=True)
    ad.var_names_make_unique()   # duplicated gene symbols get a suffix
    ad.obs["sample"] = p.name    # per-cell sample label
    adatas.append(ad)

# 4. Merge all samples. The batch labels are taken directly from the
#    directory names (the same values stored in obs["sample"] above) rather
#    than from `a.obs['sample'][0]`, which relies on integer-positional
#    access to a barcode-indexed Series.
# NOTE(review): AnnData.concatenate is flagged as deprecated in recent
# anndata releases in favour of anndata.concat — consider migrating.
sample_names = [p.name for p in sample_dirs]
adata = adatas[0].concatenate(
    *adatas[1:],
    batch_key="sample",
    batch_categories=sample_names,
)

# 5. Sanity check: dimensions and the sample categories.
print(adata)
adata.obs["sample"] = adata.obs["sample"].astype("category")
adata.obs["sample"].cat.categories


In [None]:
# Persist the merged object in the data root directory, gzip-compressed.
adata.write_h5ad(DATA_DIR / "GSE261596.h5ad", compression="gzip")

In [None]:
import scanpy as sc

# 1. Keep only the 6-month samples (sample name contains "6m").
adata_6m = adata[adata.obs["sample"].str.contains("6m")].copy()
print(f"✔️ 提取 6 个月样本，共 {adata_6m.n_obs} 个细胞，{adata_6m.n_vars} 个基因")

# 2. Basic QC: drop near-empty cells and rarely detected genes, then flag
#    mitochondrial genes (mouse symbols start with "mt-") and compute metrics.
sc.pp.filter_cells(adata_6m, min_genes=200)
sc.pp.filter_genes(adata_6m, min_cells=3)
adata_6m.var["mt"] = adata_6m.var_names.str.startswith("mt-")

sc.pp.calculate_qc_metrics(adata_6m, qc_vars=["mt"], inplace=True)

# Optional QC filter: remove cells with >= 10 % mitochondrial counts.
adata_6m = adata_6m[adata_6m.obs["pct_counts_mt"] < 10].copy()

# 3. Normalise, log-transform and select highly variable genes.
sc.pp.normalize_total(adata_6m, target_sum=1e4)
sc.pp.log1p(adata_6m)
sc.pp.highly_variable_genes(adata_6m, min_mean=0.0125, max_mean=3, min_disp=0.5)

# Freeze the log-normalised, all-gene matrix in .raw before subsetting and
# scaling; otherwise only scaled HVG values remain available afterwards.
adata_6m.raw = adata_6m

# Materialise the HVG subset with .copy(): without it adata_6m is a view and
# sc.pp.scale would have to modify a view in place.
adata_6m = adata_6m[:, adata_6m.var.highly_variable].copy()
sc.pp.scale(adata_6m, max_value=10)
sc.tl.pca(adata_6m, svd_solver='arpack')

# 4. Neighbourhood graph, embedding and clustering.
sc.pp.neighbors(adata_6m, n_neighbors=10, n_pcs=40)
sc.tl.umap(adata_6m)
sc.tl.leiden(adata_6m, resolution=0.5)

# 5. Plot clusters and sample of origin side by side.
sc.pl.umap(adata_6m, color=["leiden", "sample"], wspace=0.4)


In [None]:
# Write the processed 6-month object next to the raw data (gzip-compressed).
save_path = DATA_DIR.joinpath("adata_6m.h5ad")
adata_6m.write_h5ad(save_path, compression="gzip")
print(f"✔️ 6 个月样本已保存到 {save_path}")


In [None]:
import celltypist
import scanpy as sc
from celltypist import models




In [None]:
# Fetch only the model this notebook uses, and only if it is not already
# present locally. The previous `force_update=True` re-downloaded every
# published model on each run, which makes Restart & Run All slow and
# network-dependent.
models.download_models(model='Mouse_Whole_Brain.pkl')
models.models_description()

In [None]:
# Load the mouse whole-brain model explicitly; per the original note, the
# `model` argument would otherwise default to `Immune_All_Low.pkl`.
model = models.Model.load('Mouse_Whole_Brain.pkl')

In [None]:
# Display the cell-type labels the loaded model can predict.
model.cell_types

In [None]:
# Normalisation for annotation: depth-normalise to 10,000 counts per cell
# first, then log1p-transform (the previous order, log1p before
# normalize_total, was reversed).
# sc.pp.log1p records a "log1p" entry in .uns, so use it as a guard: the
# preprocessing cell has normally already normalised and log-transformed
# adata_6m, and transforming a second time (on scaled values, which can be
# negative) would corrupt the matrix.
if "log1p" not in adata_6m.uns:
    sc.pp.normalize_total(adata_6m, target_sum=1e4)
    sc.pp.log1p(adata_6m)

In [None]:
#  先进行log1p转换，再normalize
import scanpy as sc
import numpy as np

# Work on a copy so that adata_6m itself is not modified by this experiment.
adata_copy = adata_6m.copy()

# Apply log1p first and normalize_total second, as this experiment intends.
# NOTE(review): the conventional order is normalize_total -> log1p; confirm
# this cell is only a diagnostic and its result is not used downstream.
sc.pp.log1p(adata_copy)
sc.pp.normalize_total(adata_copy)

# Report how many entries of the transformed matrix are NaN.
print(f"有NaN: {np.count_nonzero(np.isnan(adata_copy.X))}")

In [None]:
# Depth-normalise to 10,000 counts per cell, then log1p-transform.
# sc.pp.log1p records a "log1p" entry in .uns; checking it makes this cell
# safe to re-run without normalising / log-transforming a second time.
if "log1p" not in adata.uns:
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

# Snapshot of the normalised, log-transformed object. It is taken AFTER the
# transforms, so it is not a backup of the raw counts.
adata_copy = adata.copy()

In [None]:
# -----------------------------------------------------------
# 1️⃣  Automatic annotation with CellTypist
# -----------------------------------------------------------
# Predict a label per cell with the mouse whole-brain model and additionally
# request the majority-voting refinement.
predictions1 = celltypist.annotate(
    adata,
    model='Mouse_Whole_Brain.pkl',
    majority_voting=True,
)

In [None]:
# Display the per-cell prediction table returned by CellTypist.
predictions1.predicted_labels

In [None]:
# Keep the raw and majority-voted labels and give them short column names
# (pre1 / pre1_mv).
result_model1 = (
    predictions1.predicted_labels[['predicted_labels', 'majority_voting']]
    .rename(columns={'predicted_labels': 'pre1', 'majority_voting': 'pre1_mv'})
)

In [None]:
# Attach the CellTypist labels to the per-cell metadata (aligned on the obs
# index). Any columns left over from a previous run of this cell are dropped
# first; otherwise re-running it raises because the joined column names
# overlap.
adata.obs = (
    adata.obs
    .drop(columns=list(result_model1.columns), errors='ignore')
    .join(result_model1, how='left')
)

In [None]:
# Display the per-cell metadata table, now including the joined label columns.
adata.obs

In [None]:
# Persist the annotated object in the data root directory, gzip-compressed.
adata.write_h5ad(DATA_DIR / "adata_celltypist.h5ad", compression="gzip")

In [None]:

# -----------------------------------------------------------
# 2️⃣  Extract microglia and re-cluster them
# -----------------------------------------------------------
# The CellTypist labels were joined onto `adata.obs` (columns pre1 / pre1_mv),
# not onto `adata_6m`, and no 'cell_type' column is created anywhere earlier.
# adata_6m was cut from adata, so the two share cell barcodes: copy the
# majority-voted label across, aligned on the obs index.
# NOTE(review): pre1_mv (majority voting) is assumed to be the intended
# label source; switch to 'pre1' if per-cell predictions are wanted.
if 'cell_type' not in adata_6m.obs:
    adata_6m.obs['cell_type'] = adata.obs['pre1_mv'].reindex(adata_6m.obs_names)

# na=False: cells without a label are treated as "not microglia" instead of
# producing a NaN entry in the boolean mask.
is_microglia = adata_6m.obs['cell_type'].str.contains('Microglia', na=False)
micro = adata_6m[is_microglia, :].copy()
print(f"✔️  提取到 {micro.n_obs} 个小胶质细胞")
if micro.n_obs == 0:
    raise ValueError("No cells labelled 'Microglia' were found in adata_6m.obs['cell_type']")

# —— Standard preprocessing, repeated on the microglia subset only ——
# NOTE(review): micro.X is inherited from adata_6m, which has already been
# scaled; confirm that HVG selection on that matrix is intended.
sc.pp.highly_variable_genes(micro, min_mean=0.0125, max_mean=3, min_disp=0.5)
# .copy() so that scaling works on a real object rather than on a view.
micro = micro[:, micro.var.highly_variable].copy()
sc.pp.scale(micro, max_value=10)
sc.tl.pca(micro, svd_solver='arpack')

sc.pp.neighbors(micro, n_neighbors=10, n_pcs=30)
sc.tl.umap(micro)
sc.tl.leiden(micro, resolution=0.6)

# Visualise sub-clusters and sample of origin.
sc.pl.umap(micro, color=['leiden', 'sample'], wspace=0.4)

# -----------------------------------------------------------
# 3️⃣  Build the AD / WT group label
# -----------------------------------------------------------
def map_group(sample):
    """Return the group for a sample name.

    A name containing the substring '_AD_' maps to 'AD'; every other name
    maps to 'WT'.
    """
    if '_AD_' in sample:
        return 'AD'
    return 'WT'

# Derive the AD/WT label of every cell from its sample name and store it as
# a categorical column, then show how many cells fall into each group.
group_labels = micro.obs['sample'].map(map_group)
micro.obs['group'] = group_labels.astype('category')
print(micro.obs['group'].value_counts())

# -----------------------------------------------------------
# 4️⃣  Global AD vs WT differential expression (all microglia)
# -----------------------------------------------------------
# Wilcoxon test of AD against the WT reference; results are stored under
# the 'DE_all_micro' key.
# NOTE(review): if micro has no .raw, the test runs on the scaled matrix in
# micro.X — confirm that is the intended input for this comparison.
sc.tl.rank_genes_groups(
    micro,
    groupby='group', groups=['AD'], reference='WT',
    method='wilcoxon',
    key_added='DE_all_micro',
)
sc.pl.rank_genes_groups(
    micro,
    key='DE_all_micro',
    n_genes=20,
    sharey=False,
)

# -----------------------------------------------------------
# 5️⃣  AD vs WT inside each microglia sub-cluster
# -----------------------------------------------------------
for cl in micro.obs['leiden'].cat.categories:
    key = f'DE_micro_cl{cl}'

    # Materialise the cluster subset ONCE. Previously the test and the plot
    # each sliced `micro` separately, so the test result was written to a
    # throwaway object and the plot looked for `key` on a fresh slice that
    # did not contain it.
    micro_cl = micro[micro.obs['leiden'] == cl].copy()

    # The comparison needs both groups in this cluster; skip otherwise
    # instead of letting the test fail part-way through the loop.
    group_sizes = micro_cl.obs['group'].value_counts()
    if any(group_sizes.get(g, 0) < 2 for g in ('AD', 'WT')):
        print(f"⚠️ 跳过亚群 {cl}：AD 或 WT 细胞数不足（{group_sizes.to_dict()}）")
        continue

    sc.tl.rank_genes_groups(
        micro_cl,
        groupby='group',
        groups=['AD'],
        reference='WT',
        method='wilcoxon',
        key_added=key
    )
    # Keep the per-cluster result on `micro` as well so it stays available
    # after the loop under its own key.
    micro.uns[key] = micro_cl.uns[key]

    sc.pl.rank_genes_groups(
        micro_cl,
        key=key,
        n_genes=15,
        title=f'Cluster {cl}  AD vs WT',
        sharey=False
    )

# -----------------------------------------------------------
# 6️⃣  Export the differential-expression table (optional)
# -----------------------------------------------------------
# Example: export the top 50 genes of the global AD vs WT comparison.
# sc.get.rank_genes_groups_df takes the group to extract and has no
# `n_genes` argument, so request the 'AD' group and keep the first 50 rows
# (the table is expected to come back in ranked order).
de_all = sc.get.rank_genes_groups_df(micro, group='AD', key='DE_all_micro').head(50)
de_all.to_csv('DE_micro_AD_vs_WT_top50.csv', index=False)
print("📑 已保存差异基因表：DE_micro_AD_vs_WT_top50.csv")