In [1]:
import scanpy as sc
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns # 用于绘制更美观的折线图

In [2]:
# --- 1. 设置参数 ---

# 数据集列表
datasetlist = ['M-MG', 'R-MG', 'S-MG', 'R-AG', 'R-CG', 'S-AG']
# 数据文件所在目录
data_path = "D:/111/"
# 输出图表保存目录


# 模块 A: 乳腺泌乳基因
module_a_genes = ['Lalba', 'Elf5', 'Prlr']
module_a_name = 'Lactation_Score'

# 模块 B: RANKL, LIF/LIFR, TNF 相关基因 (小鼠)
module_b_genes = ['Wnt6','Lef1','Procr']
module_b_name = 'masc_Score'

# AnnData 对象中的列名
cell_type_key = 'newcelltype' # 细胞类型注释列名
stage_key = 'stage'          # 阶段注释列名


# 可视化参数
umap_cmap = 'viridis'
lineplot_palette = "tab10" # 为折线图选择一个颜色方案

In [3]:
# --- 2. 定义辅助函数 ---

def check_and_score_genes(adata, gene_list, module_name):
    """检查基因存在性并计算模块分数"""
    print(f"  计算模块: {module_name}")
    genes_in_adata = adata.var_names.tolist()
    genes_present = [gene for gene in gene_list if gene in genes_in_adata]
    genes_missing = [gene for gene in gene_list if gene not in genes_in_adata]
    adata.X=adata.layers['normalized']
    print(f"    存在于数据中的基因 ({len(genes_present)}): {genes_present}")
    if genes_missing:
        print(f"    警告: 以下基因不在数据中，将被忽略 ({len(genes_missing)}): {genes_missing}")

    if not genes_present:
        print(f"    错误: 模块 '{module_name}' 中没有任何基因存在于数据中，无法计算分数。")
        return False
    else:
        # 使用指定的层进行计算
        sc.tl.score_genes(
            adata,
            gene_list=genes_present,
            score_name=module_name,
            ctrl_size=min(len(genes_present), 50),
            n_bins=25,
            use_raw=False # 明确不使用 .raw
        )
        print(f"    分数已计算并添加到 adata.obs['{module_name}']")
        return True

#

In [4]:
# --- 3. 批量处理 ---

for dataset in datasetlist:
    print(f"\n======= 开始处理数据集: {dataset} =======")
    adata_file = f"{data_path}{dataset}_cleaned.h5ad"

    # --- 3.1 加载数据 ---
    try:
        print(f"加载数据: {adata_file}")
        adata = sc.read_h5ad(adata_file)
        print(f"数据加载完成: {adata}")
    except FileNotFoundError:
        print(f"错误: 文件未找到 {adata_file}，跳过此数据集。")
        continue
    if cell_type_key not in adata.obs.columns:
        print(f"错误: 细胞类型列 '{cell_type_key}' 在 {dataset} 中未找到。跳过此数据集。")
        continue
    if stage_key not in adata.obs.columns:
        print(f"错误: 阶段列 '{stage_key}' 在 {dataset} 中未找到。跳过此数据集。")
        continue
    if 'X_umap' not in adata.obsm:
        print(f"警告: {dataset} 中未找到 'X_umap' embedding。UMAP 图将无法绘制。")

    # 将 stage 列转为 Categorical 并排序（如果需要，假设 stage 有特定顺序）
    # 如果你的 stage 是 P1, P2 ... L1, L2 这种，可能需要自定义排序
    # 例如: stage_order = ['P1', 'P2', 'P3', 'L1', 'L2', 'L3']
    # adata.obs[stage_key] = pd.Categorical(adata.obs[stage_key], categories=stage_order, ordered=True)
    # 否则，它会按字母/数字顺序排序
    if not pd.api.types.is_categorical_dtype(adata.obs[stage_key]):
        adata.obs[stage_key] = adata.obs[stage_key].astype('category')
        # 如果需要特定顺序，在这里设置 categories 和 ordered=True
        # print(f"将 {stage_key} 列转换为 category 类型。当前顺序: {adata.obs[stage_key].cat.categories.tolist()}")


    # --- 3.3 计算模块分数 ---
    score_a_success = check_and_score_genes(adata, module_a_genes, module_a_name)
    score_b_success = check_and_score_genes(adata, module_b_genes, module_b_name)

    # --- 3.4 可视化 ---
    if 'X_umap' in adata.obsm:
        print("  绘制 UMAP 图...")
        sc.pl.umap(
            adata,
            color=[module_a_name, module_b_name] if score_a_success and score_b_success else \
                  ([module_a_name] if score_a_success else []) + ([module_b_name] if score_b_success else []),
            cmap=umap_cmap,
            title=[f'{dataset} - {module_a_name}', f'{dataset} - {module_b_name}'],
            save=f'_{dataset}_umap_scores.png',
            show=False,
            ncols=1 # 每个图占一行
        )

    if score_a_success or score_b_success:
        sc.pl.violin(
            adata,
            keys=[module_a_name] * score_a_success + [module_b_name] * score_b_success, # 只包含成功计算的分数
            groupby=cell_type_key,
            rotation=90,
            save=f'_{dataset}_violin_scores.png',
            show=False
        )

    print("  绘制阶段性折线图...")
    if score_a_success or score_b_success:
        # 提取需要的数据
        plot_data = adata.obs[[cell_type_key, stage_key] + \
                              ([module_a_name] if score_a_success else []) + \
                              ([module_b_name] if score_b_success else [])].copy()

        # 计算每个 stage 和 celltype 的平均分数
        mean_scores = plot_data.groupby([stage_key, cell_type_key]).mean().reset_index()

        # 获取 stage 的排序（如果已设置为 categorical ordered）
        stage_order = None
        if pd.api.types.is_categorical_dtype(mean_scores[stage_key]) and mean_scores[stage_key].cat.ordered:
            stage_order = mean_scores[stage_key].cat.categories.tolist()


        # 分别为每个模块绘制折线图
        plt.figure(figsize=(10, 6)) # 创建新的 figure
        if score_a_success:
            sns.lineplot(data=mean_scores, x=stage_key, y=module_a_name, hue=cell_type_key,
                         marker='o', palette=lineplot_palette, sort= stage_order is None) # 如果没有预设顺序，seaborn 会尝试排序
            plt.title(f'{dataset} - Average {module_a_name} by Stage and Cell Type')
            plt.ylabel(f'Average {module_a_name}')
            plt.xlabel('Stage')
            plt.xticks(rotation=45, ha='right')
            plt.tight_layout() # 调整布局防止标签重叠
            plt.savefig(f'{dataset}_lineplot_{module_a_name}.png', dpi=300)
            plt.close() # 关闭当前 figure，为下一个图做准备

        if score_b_success:
            plt.figure(figsize=(10, 6)) # 创建新的 figure
            sns.lineplot(data=mean_scores, x=stage_key, y=module_b_name, hue=cell_type_key,
                         marker='o', palette=lineplot_palette, sort= stage_order is None)
            plt.title(f'{dataset} - Average {module_b_name} by Stage and Cell Type')
            plt.ylabel(f'Average {module_b_name}')
            plt.xlabel('Stage')
            plt.xticks(rotation=45, ha='right')
            plt.tight_layout()
            plt.savefig(f'{dataset}_lineplot_{module_b_name}.png', dpi=300)
            plt.close() # 关闭当前 figure

    print(f"======= 数据集 {dataset} 处理完成 =======")

    # (可选) 释放内存
    del adata
    import gc
    gc.collect()


加载数据: D:/111/M-MG_cleaned.h5ad
数据加载完成: AnnData object with n_obs × n_vars = 10880 × 12088
    obs: 'sample', 'n_genes_by_counts', 'total_counts', 'doublet_score', 'predicted_doublet', 'gland', 'stage_old', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'total_counts_RIBO', 'pct_counts_RIBO', 'log1p_total_counts_RIBO', 'leiden', 'anno', 'stage_new', 'species', 'celltype', 'stage', 'cellid', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size', 'newcelltype', 'subtype'
    var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'dendrogram_leiden', 'gland_colors', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'newcelltype_colors', 'pca', 'rank_genes_groups', 'sample_colors', 'stage_colors', 'subtype_colors', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    layers: 'ambiguous', 'counts', 'matrix

  if not pd.api.types.is_categorical_dtype(adata.obs[stage_key]):


    分数已计算并添加到 adata.obs['Lactation_Score']
  计算模块: masc_Score
    存在于数据中的基因 (3): ['Wnt6', 'Lef1', 'Procr']
    分数已计算并添加到 adata.obs['masc_Score']
  绘制 UMAP 图...
  绘制阶段性折线图...


  mean_scores = plot_data.groupby([stage_key, cell_type_key]).mean().reset_index()
  if pd.api.types.is_categorical_dtype(mean_scores[stage_key]) and mean_scores[stage_key].cat.ordered:



加载数据: D:/111/R-MG_cleaned.h5ad
数据加载完成: AnnData object with n_obs × n_vars = 19235 × 12088
    obs: 'sample', 'n_genes_by_counts', 'total_counts', 'doublet_score', 'predicted_doublet', 'gland', 'stage_old', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'total_counts_RIBO', 'pct_counts_RIBO', 'log1p_total_counts_RIBO', 'leiden', 'anno', 'stage_new', 'species', 'celltype', 'stage', 'cellid', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size', 'newcelltype', 'subtype'
    var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'dendrogram_leiden', 'gland_colors', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'newcelltype_colors', 'pca', 'rank_genes_groups', 'sample_colors', 'stage_colors', 'subtype_colors', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    layers: 'ambiguous', 'counts', 'matrix

  if not pd.api.types.is_categorical_dtype(adata.obs[stage_key]):


    分数已计算并添加到 adata.obs['Lactation_Score']
  计算模块: masc_Score
    存在于数据中的基因 (3): ['Wnt6', 'Lef1', 'Procr']
    分数已计算并添加到 adata.obs['masc_Score']
  绘制 UMAP 图...
  绘制阶段性折线图...


  mean_scores = plot_data.groupby([stage_key, cell_type_key]).mean().reset_index()
  if pd.api.types.is_categorical_dtype(mean_scores[stage_key]) and mean_scores[stage_key].cat.ordered:



加载数据: D:/111/S-MG_cleaned.h5ad
数据加载完成: AnnData object with n_obs × n_vars = 22855 × 12088
    obs: 'sample', 'n_genes_by_counts', 'total_counts', 'doublet_score', 'predicted_doublet', 'gland', 'stage_old', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'total_counts_RIBO', 'pct_counts_RIBO', 'log1p_total_counts_RIBO', 'leiden', 'anno', 'stage_new', 'species', 'celltype', 'stage', 'cellid', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size', 'newcelltype', 'subtype'
    var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'dendrogram_leiden', 'gland_colors', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'newcelltype_colors', 'pca', 'rank_genes_groups', 'sample_colors', 'stage_colors', 'subtype_colors', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    layers: 'ambiguous', 'counts', 'matrix

  if not pd.api.types.is_categorical_dtype(adata.obs[stage_key]):


    分数已计算并添加到 adata.obs['Lactation_Score']
  计算模块: masc_Score
    存在于数据中的基因 (3): ['Wnt6', 'Lef1', 'Procr']
    分数已计算并添加到 adata.obs['masc_Score']
  绘制 UMAP 图...
  绘制阶段性折线图...


  mean_scores = plot_data.groupby([stage_key, cell_type_key]).mean().reset_index()
  if pd.api.types.is_categorical_dtype(mean_scores[stage_key]) and mean_scores[stage_key].cat.ordered:



加载数据: D:/111/R-AG_cleaned.h5ad
数据加载完成: AnnData object with n_obs × n_vars = 20422 × 12088
    obs: 'sample', 'n_genes_by_counts', 'total_counts', 'doublet_score', 'predicted_doublet', 'gland', 'stage_old', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'total_counts_RIBO', 'pct_counts_RIBO', 'log1p_total_counts_RIBO', 'leiden', 'anno', 'stage_new', 'species', 'celltype', 'stage', 'cellid', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size', 'newcelltype', 'subtype'
    var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'dendrogram_leiden', 'gland_colors', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'newcelltype_colors', 'pca', 'rank_genes_groups', 'sample_colors', 'stage_colors', 'subtype_colors', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    layers: 'ambiguous', 'counts', 'matrix

  if not pd.api.types.is_categorical_dtype(adata.obs[stage_key]):


    分数已计算并添加到 adata.obs['Lactation_Score']
  计算模块: masc_Score
    存在于数据中的基因 (3): ['Wnt6', 'Lef1', 'Procr']
    分数已计算并添加到 adata.obs['masc_Score']
  绘制 UMAP 图...
  绘制阶段性折线图...


  mean_scores = plot_data.groupby([stage_key, cell_type_key]).mean().reset_index()
  if pd.api.types.is_categorical_dtype(mean_scores[stage_key]) and mean_scores[stage_key].cat.ordered:



加载数据: D:/111/R-CG_cleaned.h5ad
数据加载完成: AnnData object with n_obs × n_vars = 37627 × 12088
    obs: 'sample', 'n_genes_by_counts', 'total_counts', 'doublet_score', 'predicted_doublet', 'gland', 'stage_old', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'total_counts_RIBO', 'pct_counts_RIBO', 'log1p_total_counts_RIBO', 'leiden', 'anno', 'stage_new', 'species', 'celltype', 'stage', 'cellid', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size', 'newcelltype', 'subtype'
    var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'dendrogram_leiden', 'gland_colors', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'newcelltype_colors', 'pca', 'rank_genes_groups', 'sample_colors', 'stage_colors', 'subtype_colors', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    layers: 'ambiguous', 'counts', 'matrix

  if not pd.api.types.is_categorical_dtype(adata.obs[stage_key]):


    分数已计算并添加到 adata.obs['Lactation_Score']
  计算模块: masc_Score
    存在于数据中的基因 (3): ['Wnt6', 'Lef1', 'Procr']
    分数已计算并添加到 adata.obs['masc_Score']
  绘制 UMAP 图...
  绘制阶段性折线图...


  mean_scores = plot_data.groupby([stage_key, cell_type_key]).mean().reset_index()
  if pd.api.types.is_categorical_dtype(mean_scores[stage_key]) and mean_scores[stage_key].cat.ordered:



加载数据: D:/111/S-AG_cleaned.h5ad
数据加载完成: AnnData object with n_obs × n_vars = 18859 × 12088
    obs: 'sample', 'n_genes_by_counts', 'total_counts', 'doublet_score', 'predicted_doublet', 'gland', 'stage_old', 'log1p_n_genes_by_counts', 'log1p_total_counts', 'total_counts_MT', 'pct_counts_MT', 'log1p_total_counts_MT', 'total_counts_RIBO', 'pct_counts_RIBO', 'log1p_total_counts_RIBO', 'leiden', 'anno', 'stage_new', 'species', 'celltype', 'stage', 'cellid', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size', 'newcelltype', 'subtype'
    var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'dendrogram_leiden', 'gland_colors', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'newcelltype_colors', 'pca', 'rank_genes_groups', 'sample_colors', 'stage_colors', 'subtype_colors', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    layers: 'ambiguous', 'counts', 'matrix

  if not pd.api.types.is_categorical_dtype(adata.obs[stage_key]):


    分数已计算并添加到 adata.obs['Lactation_Score']
  计算模块: masc_Score
    存在于数据中的基因 (3): ['Wnt6', 'Lef1', 'Procr']
    分数已计算并添加到 adata.obs['masc_Score']
  绘制 UMAP 图...
  绘制阶段性折线图...


  mean_scores = plot_data.groupby([stage_key, cell_type_key]).mean().reset_index()
  if pd.api.types.is_categorical_dtype(mean_scores[stage_key]) and mean_scores[stage_key].cat.ordered:




In [5]:
print("\n所有数据集处理完毕。")
print(f"结果图表已保存到: {output_dir}")


所有数据集处理完毕。


NameError: name 'output_dir' is not defined