In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
数据结构探索器 - 读取所有数据文件的前两行以理解数据结构
"""

import pandas as pd
import numpy as np
import os
import glob
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Absolute locations of the post-processed outputs and of the raw inputs.
output_dir = "/Users/heweilin/Desktop/P056_Code/Data_Post_Processed"
data_dir = "/Users/heweilin/Desktop/P056_Code/Data"

# Three-line banner: rule, title, rule.
print("=" * 100, "数据结构探索器 - 理解所有数据文件结构", "=" * 100, sep="\n")

def explore_file(file_path, file_type="csv"):
    """Print a two-row structural preview of a single data file.

    The file name, path and size on disk are printed first. CSV and TSV
    files are then read with pandas (first two data rows only) and the
    preview's shape, column names, dtypes and rows are printed. For Excel
    workbooks every sheet is previewed the same way, minus the dtype listing.

    Args:
        file_path: Path of the file to inspect.
        file_type: One of ``"csv"``, ``"tsv"`` or ``"xlsx"``. Any other
            value prints a warning and nothing is read.

    Returns:
        None. Everything is written to stdout; any exception raised while
        inspecting the file is caught and reported as a one-line message.
    """
    try:
        file_name = os.path.basename(file_path)
        print(f"\n📁 文件: {file_name}")
        print(f"📍 路径: {file_path}")

        # Size on disk
        file_size = os.path.getsize(file_path)
        print(f"📊 大小: {file_size:,} bytes")

        # Read according to the declared type
        if file_type == "csv":
            df = pd.read_csv(file_path, nrows=2)
        elif file_type == "tsv":
            df = pd.read_csv(file_path, sep='\t', nrows=2)
        elif file_type == "xlsx":
            # The context manager closes the workbook handle even if a sheet
            # fails to parse; parsing through the open ExcelFile avoids
            # re-opening the workbook from disk once per sheet.
            with pd.ExcelFile(file_path) as excel_file:
                print(f"📋 Excel工作表: {excel_file.sheet_names}")
                for sheet_name in excel_file.sheet_names:
                    print(f"\n   📄 工作表: {sheet_name}")
                    df = excel_file.parse(sheet_name, nrows=2)
                    print(f"   📐 形状: {df.shape}")
                    print(f"   🏷️  列名: {list(df.columns)}")
                    if len(df) > 0:
                        print(f"   📝 前两行数据:")
                        for i, row in df.iterrows():
                            print(f"      行{i+1}: {dict(row)}")
            return
        else:
            print(f"   ⚠️  未知文件类型: {file_type}")
            return

        print(f"📐 形状: {df.shape}")
        print(f"🏷️  列名: {list(df.columns)}")
        print(f"📝 数据类型:")
        for col, dtype in df.dtypes.items():
            print(f"   - {col}: {dtype}")

        if len(df) > 0:
            print(f"📝 前两行数据:")
            for i, row in df.iterrows():
                print(f"   行{i+1}: {dict(row)}")
        else:
            print("   ⚠️  文件为空")

    except Exception as e:
        # Best-effort explorer: one unreadable file must not abort the scan.
        print(f"   ❌ 读取错误: {e}")

def explore_directory(directory_path, title):
    """Walk a directory tree and print a preview of every data file in it.

    Files are grouped by suffix (``.csv``, ``.tsv``, ``.xlsx``, ``.txt``,
    everything else). Tabular files are handed to ``explore_file``; for text
    files the total line count and the first five lines are printed; all
    remaining files are listed with their size.

    Args:
        directory_path: Root directory to scan recursively.
        title: Heading printed above the report for this directory.

    Returns:
        None. The report is written to stdout. A missing directory is
        reported and the function returns early.
    """
    print(f"\n{'='*50}")
    print(f"{title}")
    print(f"目录: {directory_path}")
    print(f"{'='*50}")

    if not os.path.exists(directory_path):
        print(f"❌ 目录不存在: {directory_path}")
        return

    # Collect every file below the root
    all_files = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory_path)
        for name in names
    ]

    # Classify by suffix (case-sensitive, as before)
    known_suffixes = ('.csv', '.tsv', '.xlsx', '.txt')
    csv_files = [f for f in all_files if f.endswith('.csv')]
    tsv_files = [f for f in all_files if f.endswith('.tsv')]
    xlsx_files = [f for f in all_files if f.endswith('.xlsx')]
    txt_files = [f for f in all_files if f.endswith('.txt')]
    other_files = [f for f in all_files if not f.endswith(known_suffixes)]

    print(f"📊 文件统计:")
    print(f"   📄 CSV文件: {len(csv_files)} 个")
    print(f"   📄 TSV文件: {len(tsv_files)} 个")
    print(f"   📄 Excel文件: {len(xlsx_files)} 个")
    print(f"   📄 文本文件: {len(txt_files)} 个")
    print(f"   📄 其他文件: {len(other_files)} 个")

    # Tabular files are delegated to explore_file, one group at a time
    tabular_groups = (
        ("CSV", "csv", csv_files),
        ("TSV", "tsv", tsv_files),
        ("Excel", "xlsx", xlsx_files),
    )
    for label, file_type, paths in tabular_groups:
        if paths:
            print(f"\n🔍 探索{label}文件:")
            for file_path in sorted(paths):
                explore_file(file_path, file_type)

    # Plain-text files: line count plus the first five lines
    if txt_files:
        print(f"\n🔍 探索文本文件:")
        for file_path in sorted(txt_files):
            try:
                # Single streamed pass: counts every line while keeping only
                # the first five, and the handle is always closed.
                head_lines = []
                total_lines = 0
                with open(file_path, 'r', encoding='utf-8') as f:
                    for total_lines, line in enumerate(f, 1):
                        if total_lines <= 5:
                            head_lines.append(line)
                print(f"\n📁 文件: {os.path.basename(file_path)}")
                print(f"📍 路径: {file_path}")
                print(f"📊 总行数: {total_lines}")
                print(f"📝 前5行内容:")
                for i, line in enumerate(head_lines, 1):
                    print(f"   行{i}: {line.strip()}")
            except Exception as e:
                # Best-effort: report undecodable/unreadable files and go on.
                print(f"❌ 读取文本文件错误 {file_path}: {e}")

    # Everything else is only listed
    if other_files:
        print(f"\n📋 其他文件列表:")
        for file_path in sorted(other_files):
            print(f"   📄 {os.path.basename(file_path)} ({os.path.getsize(file_path):,} bytes)")

# 1-2. Explore the raw-data directory, then the post-processed directory
for directory, heading in (
    (data_dir, "📂 原始数据目录 (Data)"),
    (output_dir, "📂 处理后数据目录 (Data_Post_Processed)"),
):
    explore_directory(directory, heading)

# 3. Overall summary of what was found
print("\n" + "=" * 100)
print("📋 数据结构总结")
print("=" * 100)

# Count every file under both directories, bucketed by lower-cased extension
total_files = 0
file_types = {}

for directory in (data_dir, output_dir):
    if not os.path.exists(directory):
        continue
    for _root, _dirs, names in os.walk(directory):
        total_files += len(names)
        for name in names:
            ext = os.path.splitext(name)[1].lower()
            file_types.setdefault(ext, 0)
            file_types[ext] += 1

print(
    "🎯 总体统计:",
    f"   📊 总文件数: {total_files}",
    "   📁 目录数: 2 (Data + Data_Post_Processed)",
    "\n📄 文件类型分布:",
    sep="\n",
)
for ext, count in sorted(file_types.items()):
    # Files without an extension are reported under a dedicated label
    label = ext if ext else "无扩展名"
    print(f"   {label}: {count} 个文件")

print(
    "\n🔍 关键观察:",
    "   1. 原始数据文件命名规律: 数字前缀 + 描述",
    "   2. 处理后文件命名规律: 数字前缀 + 处理步骤描述",
    "   3. Excel文件包含多个工作表，需要分别处理",
    "   4. TSV文件使用制表符分隔",
    "   5. 某些目录可能包含子文件夹和多个相关文件",
    sep="\n",
)

print("\n" + "=" * 100, "✅ 数据结构探索完成!", "=" * 100, sep="\n")

数据结构探索器 - 理解所有数据文件结构

📂 原始数据目录 (Data)
目录: /Users/heweilin/Desktop/P056_Code/Data
📊 文件统计:
   📄 CSV文件: 8 个
   📄 TSV文件: 1 个
   📄 Excel文件: 4 个
   📄 文本文件: 52 个
   📄 其他文件: 3 个

🔍 探索CSV文件:

📁 文件: 1mRNA_DEGs_proteincoding.csv
📍 路径: /Users/heweilin/Desktop/P056_Code/Data/1mRNA_DEGs_proteincoding.csv
📊 大小: 2,605,398 bytes
📐 形状: (2, 13)
🏷️  列名: ['Row.names', 'baseMean', 'log2FoldChange', 'lfcSE', 'stat', 'pvalue', 'padj', 'Chromosome', 'Gene_start', 'Gene_end', 'Strand', 'Gene_type', 'SYMBOL']
📝 数据类型:
   - Row.names: object
   - baseMean: float64
   - log2FoldChange: float64
   - lfcSE: float64
   - stat: float64
   - pvalue: float64
   - padj: float64
   - Chromosome: int64
   - Gene_start: int64
   - Gene_end: int64
   - Strand: int64
   - Gene_type: object
   - SYMBOL: object
📝 前两行数据:
   行1: {'Row.names': 'ENSG00000181626', 'baseMean': 0.246272501, 'log2FoldChange': -30.0, 'lfcSE': 4.018502953, 'stat': -7.465466705, 'pvalue': 8.3e-14, 'padj': 2.79e-10, 'Chromosome': 18, 'Gene_start': 12093843,

In [110]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
4.3 通路富集分析（GO/KEGG）
分析维生素B12缺乏相关调控网络中涉及的主要生物过程和信号通路
"""

import pandas as pd
import numpy as np
import os
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Project directories (absolute paths on the author's machine)
data_dir = "/Users/heweilin/Desktop/P056_Code/Data"
output_dir = "/Users/heweilin/Desktop/P056_Code/Data_Post_Processed"
figure_dir = "/Users/heweilin/Desktop/P056_Code/Figure"

print("="*80)
print("4.3 通路富集分析（GO/KEGG）数据准备")
print("="*80)

# 1. Load the input tables
print("\n【步骤1】读取分析数据文件")
print("-"*50)

try:
    # Detailed miRNA -> mRNA relationships produced by an earlier step
    mirna_detailed = pd.read_csv(os.path.join(output_dir, "1_miRNA_mRNA_Detailed_Relationships.csv"))
    print(f"✅ miRNA调控详细关系: {mirna_detailed.shape[0]} 行")

    # Detailed DNA-methylation / DEG relationships produced by an earlier step
    dna_detailed = pd.read_csv(os.path.join(output_dir, "2_DNA_Methylation_DEG_Detailed.csv"))
    print(f"✅ DNA甲基化调控详细: {dna_detailed.shape[0]} 行")

    # Gene annotation table (tab-separated)
    gene_annotation = pd.read_csv(os.path.join(data_dir, "8rel_ENS_SYM_chr.tsv"), sep='\t')
    print(f"✅ 基因注释数据: {gene_annotation.shape[0]} 行")

except Exception as e:
    print(f"❌ 数据读取错误: {e}")
    # Nothing below can run without these tables. Raise SystemExit directly:
    # the exit() helper is injected by the `site` module for interactive use
    # and would also report a success (0) status for this failure.
    raise SystemExit(1) from e

# 2. Derive the gene sets that will be submitted to enrichment analysis
print("\n【步骤2】提取富集分析基因集")
print("-"*50)

# 2.1 DEGs under classic miRNA control
#     (miRNA up + target down, or miRNA down + target up)
print("🔬 筛选miRNA经典调控模式的DEGs:")

# Keep rows flagged as DEG whose relationship category is a classic pattern
mirna_is_deg = mirna_detailed['Is_DEG'] == 'Yes'
mirna_is_classic = mirna_detailed['Relationship_Category'].isin(
    ['Consistent_Inhibition', 'Consistent_Derepression']
)
classic_mirna_regulation = mirna_detailed.loc[mirna_is_deg & mirna_is_classic].copy()

print(f"   - 经典miRNA调控关系总数: {len(classic_mirna_regulation)}")
mirna_category_counts = classic_mirna_regulation['Relationship_Category'].value_counts().to_dict()
print(f"   - 调控类型分布: {mirna_category_counts}")

# Distinct target genes (missing symbols dropped)
mirna_regulated_genes = set(classic_mirna_regulation['Target_Gene'].dropna())
print(f"   - 被经典miRNA调控的去重DEGs: {len(mirna_regulated_genes)} 个")

# 2.2 DEGs under classic DMR control
#     (hypermethylation + gene down, or demethylation + gene up)
print("\n🔬 筛选DMR经典调控模式的DEGs:")

# Same filter as above, with the methylation category labels
dmr_is_deg = dna_detailed['Is_DEG'] == 'Yes'
dmr_is_classic = dna_detailed['Relationship_Category'].isin(
    ['Classic_Methylation_Repression', 'Classic_Demethylation_Activation']
)
classic_dmr_regulation = dna_detailed.loc[dmr_is_deg & dmr_is_classic].copy()

print(f"   - 经典DMR调控关系总数: {len(classic_dmr_regulation)}")
dmr_category_counts = classic_dmr_regulation['Relationship_Category'].value_counts().to_dict()
print(f"   - 调控类型分布: {dmr_category_counts}")

# Distinct gene symbols (missing symbols dropped)
dmr_regulated_genes = set(classic_dmr_regulation['Gene_Symbol'].dropna())
print(f"   - 被经典DMR调控的去重DEGs: {len(dmr_regulated_genes)} 个")

# 2.3 Genes present in both sets (classic miRNA control AND classic DMR control)
intersection_genes = mirna_regulated_genes & dmr_regulated_genes
print(f"\n🎯 交集关键DEGs: {len(intersection_genes)} 个")
if intersection_genes:
    # Show at most ten symbols, alphabetically, with an ellipsis when truncated
    preview = ', '.join(sorted(intersection_genes)[:10])
    ellipsis = '...' if len(intersection_genes) > 10 else ''
    print(f"   - 双重调控基因: {preview}{ellipsis}")

# 3. Per-gene regulation profiles
print("\n【步骤3】基因集合调控特征分析")
print("-"*50)


def _join_unique(values):
    """Join the distinct entries of *values* with '; ' (first-seen order)."""
    return '; '.join(values.unique())


# 3.1 One row per miRNA target gene
print("📊 miRNA调控基因特征:")
mirna_gene_stats = (
    classic_mirna_regulation
    .groupby('Target_Gene')
    .agg({
        'miRNA': 'count',
        'miRNA_Direction': _join_unique,
        'Target_Direction': 'first',
        'Target_log2FC': 'first',
        'Relationship_Category': _join_unique,
        'Evidence_Type': _join_unique,
    })
    .reset_index()
)

# Rename positionally: group key first, then the aggregated columns in order
mirna_gene_stats.columns = [
    'Gene_Symbol', 'Regulating_miRNA_Count', 'miRNA_Directions',
    'Gene_Direction', 'Gene_LogFC', 'Regulation_Types', 'Evidence_Types',
]
mirna_gene_stats = mirna_gene_stats.sort_values('Regulating_miRNA_Count', ascending=False)

print(f"   - Top 10 受多个miRNA调控的基因:")
top_mirna_targets = mirna_gene_stats.head(10)
for symbol, n_regulators, direction, log_fc in zip(
    top_mirna_targets['Gene_Symbol'],
    top_mirna_targets['Regulating_miRNA_Count'],
    top_mirna_targets['Gene_Direction'],
    top_mirna_targets['Gene_LogFC'],
):
    print(f"     * {symbol}: {n_regulators} 个miRNA调控 ({direction}, FC={log_fc:.3f})")

# 3.2 One row per DMR-associated gene
print("\n📊 DMR调控基因特征:")
dmr_gene_stats = (
    classic_dmr_regulation
    .groupby('Gene_Symbol')
    .agg({
        'DMR_Index': 'count',
        'Methylation_Direction': _join_unique,
        'Expression_Direction': 'first',
        'DEG_log2FC': 'first',
        'Relationship_Category': _join_unique,
        'Methylation_Value': 'mean',
    })
    .reset_index()
)

dmr_gene_stats.columns = [
    'Gene_Symbol', 'Regulating_DMR_Count', 'Methylation_Directions',
    'Gene_Direction', 'Gene_LogFC', 'Regulation_Types', 'Mean_Methylation_Change',
]
dmr_gene_stats = dmr_gene_stats.sort_values('Regulating_DMR_Count', ascending=False)

print(f"   - Top 10 受多个DMR调控的基因:")
top_dmr_targets = dmr_gene_stats.head(10)
for symbol, n_regulators, direction, log_fc in zip(
    top_dmr_targets['Gene_Symbol'],
    top_dmr_targets['Regulating_DMR_Count'],
    top_dmr_targets['Gene_Direction'],
    top_dmr_targets['Gene_LogFC'],
):
    print(f"     * {symbol}: {n_regulators} 个DMR调控 ({direction}, FC={log_fc:.3f})")

# 4. Assemble the gene lists for enrichment analysis
print("\n【步骤4】准备富集分析数据")
print("-"*50)

# One entry per gene set: the symbols, a description, and the source tables
enrichment_gene_sets = {
    'miRNA_Classic_Regulated_DEGs': {
        'genes': list(mirna_regulated_genes),
        'description': '被差异miRNA经典调控的DEGs（抑制/去抑制模式）',
        'source_data': mirna_gene_stats,
        'regulation_details': classic_mirna_regulation
    },
    'DMR_Classic_Regulated_DEGs': {
        'genes': list(dmr_regulated_genes),
        'description': '被DMR经典调控的DEGs（甲基化抑制/去甲基化激活模式）',
        'source_data': dmr_gene_stats,
        'regulation_details': classic_dmr_regulation
    },
    'Dual_Regulated_DEGs': {
        'genes': list(intersection_genes),
        'description': '双重调控关键DEGs（同时受miRNA和DMR经典调控）',
        'source_data': None,
        'regulation_details': None
    }
}

# 5. Gene-set statistics and validation against the annotation table
print("\n【步骤5】基因集合统计验证")
print("-"*50)

# Symbols known to the annotation table. This does not depend on the gene
# set, so it is built once instead of once per loop iteration.
annotated_genes = set(gene_annotation['SYMBOL'].dropna().unique())

# Explicit column order so the frame (and the CSV written from it later)
# keeps its header even when every gene set is empty.
summary_columns = [
    'Gene_Set', 'Description', 'Total_Gene_Count', 'Annotated_Gene_Count',
    'Annotation_Rate', 'Gene_List', 'Sample_Genes',
]

gene_sets_summary = []
for set_name, set_info in enrichment_gene_sets.items():
    gene_count = len(set_info['genes'])
    if gene_count == 0:
        # Empty sets are left out of the summary
        continue

    # Keep only symbols that appear in the annotation table
    valid_genes = [g for g in set_info['genes'] if g in annotated_genes]
    sorted_valid_genes = sorted(valid_genes)

    gene_sets_summary.append({
        'Gene_Set': set_name,
        'Description': set_info['description'],
        'Total_Gene_Count': gene_count,
        'Annotated_Gene_Count': len(valid_genes),
        'Annotation_Rate': len(valid_genes) / gene_count * 100,
        'Gene_List': '; '.join(sorted_valid_genes),
        'Sample_Genes': '; '.join(sorted_valid_genes[:10]) + ('...' if len(valid_genes) > 10 else '')
    })

gene_sets_df = pd.DataFrame(gene_sets_summary, columns=summary_columns)
print("📋 基因集合汇总:")
for _, row in gene_sets_df.iterrows():
    print(f"   🎯 {row['Gene_Set']}: {row['Annotated_Gene_Count']}/{row['Total_Gene_Count']} 个基因 ({row['Annotation_Rate']:.1f}% 注释率)")
    print(f"      {row['Description']}")
    if row['Annotated_Gene_Count'] > 0:
        print(f"      示例基因: {row['Sample_Genes']}")

# 6. Build the detailed enrichment input tables
print("\n【步骤6】生成富集分析输入文件")
print("-"*50)

# 6.1 miRNA-regulated genes, with a summary string and a functional category.
# The new columns are built as plain lists and assigned positionally: a
# row-wise DataFrame.apply(axis=1) hands back a DataFrame instead of a Series
# when the frame has no rows, which can make the column assignment fail.
mirna_enrichment_input = mirna_gene_stats.copy()
mirna_enrichment_input['Regulation_Summary'] = [
    f"{n_mirna} miRNAs → {direction} (FC: {log_fc:.3f})"
    for n_mirna, direction, log_fc in zip(
        mirna_enrichment_input['Regulating_miRNA_Count'],
        mirna_enrichment_input['Gene_Direction'],
        mirna_enrichment_input['Gene_LogFC'],
    )
]
# A gene whose regulation types include 'Consistent_Inhibition' is labelled
# 'Inhibition'; every other gene is labelled 'Derepression'.
mirna_enrichment_input['Functional_Category'] = [
    'Inhibition' if 'Consistent_Inhibition' in regulation_types else 'Derepression'
    for regulation_types in mirna_enrichment_input['Regulation_Types']
]

print(f"📤 miRNA调控基因富集分析输入: {mirna_enrichment_input.shape[0]} 个基因")
print("   - 功能分类分布:")
func_dist = mirna_enrichment_input['Functional_Category'].value_counts()
for category, count in func_dist.items():
    print(f"     * {category}: {count} 个基因")

# 6.2 DMR-regulated genes, with a summary string and an epigenetic category
dmr_enrichment_input = dmr_gene_stats.copy()
dmr_enrichment_input['Regulation_Summary'] = [
    f"{n_dmr} DMRs → {direction} (FC: {log_fc:.3f})"
    for n_dmr, direction, log_fc in zip(
        dmr_enrichment_input['Regulating_DMR_Count'],
        dmr_enrichment_input['Gene_Direction'],
        dmr_enrichment_input['Gene_LogFC'],
    )
]
# Same precedence rule as above: repression wins when both types are present.
dmr_enrichment_input['Epigenetic_Category'] = [
    'Methylation_Repression' if 'Classic_Methylation_Repression' in regulation_types
    else 'Demethylation_Activation'
    for regulation_types in dmr_enrichment_input['Regulation_Types']
]

print(f"📤 DMR调控基因富集分析输入: {dmr_enrichment_input.shape[0]} 个基因")
print("   - 表观遗传分类分布:")
epi_dist = dmr_enrichment_input['Epigenetic_Category'].value_counts()
for category, count in epi_dist.items():
    print(f"     * {category}: {count} 个基因")

# 7. Dual-regulation profile of the genes found in both sets
if len(intersection_genes) > 0:
    print("\n【步骤7】交集基因双重调控机制分析")
    print("-"*50)

    # Explicit column order so an empty result still has the sort column
    intersection_columns = [
        'Gene_Symbol', 'miRNA_Regulation_Count', 'miRNA_Functional_Category',
        'DMR_Regulation_Count', 'DMR_Epigenetic_Category',
        'Gene_Expression_Direction', 'Gene_LogFC',
        'Dual_Regulation_Pattern', 'Regulatory_Complexity',
    ]

    intersection_analysis = []
    # Iterate in sorted order: a set of strings has no reproducible order
    # between interpreter runs, which would reshuffle tied rows in the output.
    for gene in sorted(intersection_genes):
        # Per-gene rows from the miRNA and DMR summary tables
        mirna_info = mirna_gene_stats[mirna_gene_stats['Gene_Symbol'] == gene]
        dmr_info = dmr_gene_stats[dmr_gene_stats['Gene_Symbol'] == gene]

        if len(mirna_info) > 0 and len(dmr_info) > 0:
            intersection_analysis.append({
                'Gene_Symbol': gene,
                'miRNA_Regulation_Count': mirna_info['Regulating_miRNA_Count'].iloc[0],
                'miRNA_Functional_Category': mirna_info['Regulation_Types'].iloc[0],
                'DMR_Regulation_Count': dmr_info['Regulating_DMR_Count'].iloc[0],
                'DMR_Epigenetic_Category': dmr_info['Regulation_Types'].iloc[0],
                # Expression direction and fold change are taken from the miRNA table
                'Gene_Expression_Direction': mirna_info['Gene_Direction'].iloc[0],
                'Gene_LogFC': mirna_info['Gene_LogFC'].iloc[0],
                'Dual_Regulation_Pattern': f"miRNA + DMR → {mirna_info['Gene_Direction'].iloc[0]}",
                # Complexity = number of regulating miRNAs + number of regulating DMRs
                'Regulatory_Complexity': mirna_info['Regulating_miRNA_Count'].iloc[0] + dmr_info['Regulating_DMR_Count'].iloc[0]
            })

    intersection_df = pd.DataFrame(intersection_analysis, columns=intersection_columns)
    # Stable sort: ties keep the alphabetical order established above
    intersection_df = intersection_df.sort_values(
        'Regulatory_Complexity', ascending=False, kind='mergesort'
    )

    print(f"🔍 交集基因双重调控机制分析:")
    for _, row in intersection_df.head(10).iterrows():
        print(f"   🎯 {row['Gene_Symbol']} (复杂度: {row['Regulatory_Complexity']}):")
        print(f"      - miRNA调控: {row['miRNA_Regulation_Count']} 个调控因子")
        print(f"      - DMR调控: {row['DMR_Regulation_Count']} 个调控区域")
        print(f"      - 最终表达: {row['Gene_Expression_Direction']} (FC: {row['Gene_LogFC']:.3f})")

# 8. Persist the enrichment inputs
print("\n【步骤8】保存富集分析输入文件")
print("-"*50)


def _write_gene_list(path, genes):
    """Write *genes* to *path* as UTF-8 text, one symbol per line, sorted."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(f"{gene}\n" for gene in sorted(genes))


try:
    # 8.1 Gene-set summary table
    gene_sets_output = os.path.join(output_dir, "4_Pathway_Enrichment_Gene_Sets_Summary.csv")
    gene_sets_df.to_csv(gene_sets_output, index=False, encoding='utf-8-sig')
    print(f"✅ 基因集合汇总: {gene_sets_output}")

    # 8.2 miRNA-regulated gene table
    mirna_output = os.path.join(output_dir, "4_miRNA_Classic_Regulated_DEGs_Enrichment.csv")
    mirna_enrichment_input.to_csv(mirna_output, index=False, encoding='utf-8-sig')
    print(f"✅ miRNA经典调控基因: {mirna_output}")

    # 8.3 DMR-regulated gene table
    dmr_output = os.path.join(output_dir, "4_DMR_Classic_Regulated_DEGs_Enrichment.csv")
    dmr_enrichment_input.to_csv(dmr_output, index=False, encoding='utf-8-sig')
    print(f"✅ DMR经典调控基因: {dmr_output}")

    # 8.4 Dual-regulation table (only exists when the intersection is non-empty)
    if len(intersection_genes) > 0:
        intersection_output = os.path.join(output_dir, "4_Dual_Regulated_DEGs_Analysis.csv")
        intersection_df.to_csv(intersection_output, index=False, encoding='utf-8-sig')
        print(f"✅ 双重调控基因分析: {intersection_output}")

    # 8.5 Plain gene-list files, one per gene set, annotated symbols only.
    # The annotation lookup does not depend on the set, so build it once.
    annotated_genes = set(gene_annotation['SYMBOL'].dropna().unique())
    for set_name, set_info in enrichment_gene_sets.items():
        valid_genes = [g for g in set_info['genes'] if g in annotated_genes]
        if not valid_genes:
            # Empty set, or no symbol survived the annotation filter
            continue
        gene_list_file = os.path.join(output_dir, f"4_{set_name}_gene_list.txt")
        _write_gene_list(gene_list_file, valid_genes)
        print(f"✅ {set_name}基因列表 ({len(valid_genes)}个): {gene_list_file}")

    # 8.6 Gene lists split by regulatory mechanism
    print("\n📋 按调控机制分类的基因列表:")

    inhibition_genes = mirna_enrichment_input.loc[
        mirna_enrichment_input['Functional_Category'] == 'Inhibition', 'Gene_Symbol'
    ].tolist()
    derepression_genes = mirna_enrichment_input.loc[
        mirna_enrichment_input['Functional_Category'] == 'Derepression', 'Gene_Symbol'
    ].tolist()
    meth_repression_genes = dmr_enrichment_input.loc[
        dmr_enrichment_input['Epigenetic_Category'] == 'Methylation_Repression', 'Gene_Symbol'
    ].tolist()
    demeth_activation_genes = dmr_enrichment_input.loc[
        dmr_enrichment_input['Epigenetic_Category'] == 'Demethylation_Activation', 'Gene_Symbol'
    ].tolist()

    # (label shown in the log, genes, output file name); empty lists are
    # skipped so no empty file is created.
    mechanism_lists = (
        ("miRNA抑制调控基因", inhibition_genes, "4_miRNA_Inhibition_genes.txt"),
        ("miRNA去抑制调控基因", derepression_genes, "4_miRNA_Derepression_genes.txt"),
        ("甲基化抑制基因", meth_repression_genes, "4_Methylation_Repression_genes.txt"),
        ("去甲基化激活基因", demeth_activation_genes, "4_Demethylation_Activation_genes.txt"),
    )
    for label, genes, filename in mechanism_lists:
        if genes:
            _write_gene_list(os.path.join(output_dir, filename), genes)
            print(f"✅ {label} ({len(genes)}个): {filename}")

except Exception as e:
    # Saving is best-effort: report the failure and let the summary run.
    print(f"❌ 文件保存错误: {e}")

# 9. Name-based pre-screen for vitamin B12 metabolism genes
print("\n【步骤9】维生素B12代谢相关基因预筛选")
print("-"*50)

# Keywords associated with vitamin B12 metabolism
b12_keywords = ['B12', 'cobalamin', 'methionine', 'homocysteine', 'folate', 'folic',
                'MTR', 'MTRR', 'CBS', 'BHMT', 'DNMT', 'methyltransferase',
                'one carbon', 'methylation', 'SAM', 'SAH', 'SAHH']

# A gene is kept when any keyword occurs, case-insensitively, as a substring
# of its symbol; each gene is kept at most once.
all_regulated_genes = list(mirna_regulated_genes.union(dmr_regulated_genes))
b12_related_genes = [
    gene
    for gene in all_regulated_genes
    if any(keyword.upper() in gene.upper() for keyword in b12_keywords)
]

if not b12_related_genes:
    print("ℹ️  未发现明显的维生素B12代谢相关基因（基于基因名称）")
else:
    print(f"🔍 发现可能的维生素B12相关基因: {len(b12_related_genes)} 个")
    for gene in sorted(b12_related_genes):
        # Report which regulated set(s) the gene belongs to
        regulation_type = [
            source
            for source, members in (("miRNA", mirna_regulated_genes), ("DMR", dmr_regulated_genes))
            if gene in members
        ]
        print(f"   * {gene}: {', '.join(regulation_type)} 调控")

    # Save the matches, one sorted symbol per line
    b12_file = os.path.join(output_dir, "4_VitaminB12_Related_genes.txt")
    with open(b12_file, 'w', encoding='utf-8') as f:
        f.writelines(f"{gene}\n" for gene in sorted(b12_related_genes))
    print(f"✅ 维生素B12相关基因列表: 4_VitaminB12_Related_genes.txt")

# 10. Final summary
print("\n【步骤10】通路富集分析总结")
print("="*80)
print("🎯 经典调控模式基因集合准备完成!")

# Category labels as plain lists so list.count() gives the per-category totals
mirna_categories = mirna_enrichment_input['Functional_Category'].tolist()
dmr_categories = dmr_enrichment_input['Epigenetic_Category'].tolist()
has_dual_genes = len(intersection_genes) > 0

print(
    "\n📊 数据统计:",
    f"   - miRNA经典调控DEGs: {len(mirna_regulated_genes)} 个",
    f"     * 抑制模式: {mirna_categories.count('Inhibition')} 个",
    f"     * 去抑制模式: {mirna_categories.count('Derepression')} 个",
    f"   - DMR经典调控DEGs: {len(dmr_regulated_genes)} 个",
    f"     * 甲基化抑制: {dmr_categories.count('Methylation_Repression')} 个",
    f"     * 去甲基化激活: {dmr_categories.count('Demethylation_Activation')} 个",
    f"   - 双重调控关键DEGs: {len(intersection_genes)} 个",
    sep="\n",
)

# Overview of the files the previous steps are expected to have written
print(
    "\n📁 输出文件总结:",
    "   📋 基础分析文件:",
    "     - 基因集合汇总: 4_Pathway_Enrichment_Gene_Sets_Summary.csv",
    "     - miRNA调控基因详情: 4_miRNA_Classic_Regulated_DEGs_Enrichment.csv",
    "     - DMR调控基因详情: 4_DMR_Classic_Regulated_DEGs_Enrichment.csv",
    sep="\n",
)
if has_dual_genes:
    print("     - 双重调控分析: 4_Dual_Regulated_DEGs_Analysis.csv")

print(
    "\n   📝 富集分析基因列表:",
    "     - miRNA调控基因: 4_miRNA_Classic_Regulated_DEGs_gene_list.txt",
    "     - DMR调控基因: 4_DMR_Classic_Regulated_DEGs_gene_list.txt",
    sep="\n",
)
if has_dual_genes:
    print("     - 双重调控基因: 4_Dual_Regulated_DEGs_gene_list.txt")

print(
    "\n   🎯 调控机制分类:",
    "     - miRNA抑制基因: 4_miRNA_Inhibition_genes.txt",
    "     - miRNA去抑制基因: 4_miRNA_Derepression_genes.txt",
    "     - 甲基化抑制基因: 4_Methylation_Repression_genes.txt",
    "     - 去甲基化激活基因: 4_Demethylation_Activation_genes.txt",
    sep="\n",
)

if len(b12_related_genes) > 0:
    print("     - 维生素B12相关基因: 4_VitaminB12_Related_genes.txt")

# Suggested follow-up analyses
print(
    "\n🔬 富集分析建议:",
    "   1. 使用各基因列表进行GO生物过程富集分析",
    "   2. 进行KEGG信号通路富集分析",
    "   3. 比较不同调控机制涉及的功能差异:",
    "      - miRNA抑制 vs 去抑制的功能偏向",
    "      - 甲基化抑制 vs 去甲基化激活的通路差异",
    "      - 双重调控基因的核心功能网络",
    "   4. 特别关注维生素B12代谢、一碳单位代谢相关通路",
    "   5. 分析神经发育、造血、DNA合成等B12缺乏相关功能",
    sep="\n",
)

print("\n" + "=" * 80, "✅ 4.3 通路富集分析数据准备完成!", "=" * 80, sep="\n")

4.3 通路富集分析（GO/KEGG）数据准备

【步骤1】读取分析数据文件
--------------------------------------------------
✅ miRNA调控详细关系: 2238 行
✅ DNA甲基化调控详细: 2777 行
✅ 基因注释数据: 69297 行

【步骤2】提取富集分析基因集
--------------------------------------------------
🔬 筛选miRNA经典调控模式的DEGs:
   - 经典miRNA调控关系总数: 1087
   - 调控类型分布: {'Consistent_Derepression': 546, 'Consistent_Inhibition': 541}
   - 被经典miRNA调控的去重DEGs: 186 个

🔬 筛选DMR经典调控模式的DEGs:
   - 经典DMR调控关系总数: 1330
   - 调控类型分布: {'Classic_Demethylation_Activation': 722, 'Classic_Methylation_Repression': 608}
   - 被经典DMR调控的去重DEGs: 66 个

🎯 交集关键DEGs: 65 个
   - 双重调控基因: ADAMTS15, AFF1, AGPAT5, ARID5A, ARRDC2, ATOH8, C2CD3, C4orf50, CCDC171, CGB3...

【步骤3】基因集合调控特征分析
--------------------------------------------------
📊 miRNA调控基因特征:
   - Top 10 受多个miRNA调控的基因:
     * RABGAP1L: 20 个miRNA调控 (Down-regulated, FC=-0.665)
     * RIPOR2: 20 个miRNA调控 (Down-regulated, FC=-1.033)
     * SLC38A1: 19 个miRNA调控 (Down-regulated, FC=-0.883)
     * INPP5F: 16 个miRNA调控 (Down-regulated, FC=-0.529)
     * KIAA0319: 15 

In [112]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
4.3 GO/KEGG Pathway Enrichment Analysis and Visualization
Real data-based pathway enrichment analysis for different regulatory mechanisms
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from collections import defaultdict
import requests
import json
import time
import warnings
warnings.filterwarnings('ignore')

# Matplotlib defaults: preferred sans-serif fonts, ASCII minus sign on axes
plt.rcParams.update({
    'font.sans-serif': ['Arial', 'DejaVu Sans'],
    'axes.unicode_minus': False,
})

# Project directories (absolute paths on the author's machine)
data_dir = "/Users/heweilin/Desktop/P056_Code/Data"
output_dir = "/Users/heweilin/Desktop/P056_Code/Data_Post_Processed"
figure_dir = "/Users/heweilin/Desktop/P056_Code/Figure"

# Banner: rule, title, rule
print("=" * 80, "4.3 GO/KEGG Pathway Enrichment Analysis (Real Data)", "=" * 80, sep="\n")

# 1. Read the gene-list files
print("\n[Step 1] Reading gene set files")
print("-"*50)

# (file name, short name, description, plot colour) for every gene set
_gene_set_specs = [
    ('4_miRNA_Inhibition_genes.txt', 'miRNA_Inhibition',
     'miRNA inhibition regulated genes', '#FF6B6B'),
    ('4_miRNA_Derepression_genes.txt', 'miRNA_Derepression',
     'miRNA derepression regulated genes', '#4ECDC4'),
    ('4_Methylation_Repression_genes.txt', 'Methylation_Repression',
     'Methylation repression genes', '#45B7D1'),
    ('4_Demethylation_Activation_genes.txt', 'Demethylation_Activation',
     'Demethylation activation genes', '#96CEB4'),
    ('4_Dual_Regulated_DEGs_gene_list.txt', 'Dual_Regulated',
     'Dual regulated key genes', '#FFEAA7'),
    ('4_VitaminB12_Related_genes.txt', 'VitaminB12_Related',
     'Vitamin B12 related genes', '#DDA0DD'),
]

# Keyed by file name, in the order listed above
gene_sets = {
    filename: {'name': name, 'description': description, 'color': color}
    for filename, name, description, color in _gene_set_specs
}

# Load every file that exists and is non-empty; keyed by the short name
loaded_gene_sets = {}

for filename, info in gene_sets.items():
    filepath = os.path.join(output_dir, filename)

    if not os.path.exists(filepath):
        print(f"⚠️  File not found: {filename}")
        continue

    try:
        # One symbol per line; blank lines are ignored
        with open(filepath, 'r', encoding='utf-8') as f:
            genes = [symbol for symbol in map(str.strip, f) if symbol]

        if not genes:
            print(f"⚠️  {info['name']}: Empty file")
        else:
            loaded_gene_sets[info['name']] = {
                'genes': genes,
                'description': info['description'],
                'color': info['color'],
                'filename': filename,
                'count': len(genes),
            }
            print(f"✅ {info['name']}: {len(genes)} genes")
    except Exception as e:
        print(f"❌ Error reading {filename}: {e}")

print(f"\n📊 Successfully loaded {len(loaded_gene_sets)} gene sets")

# 2. Enrichment analysis through the Enrichr web API
print("\n[Step 2] Performing enrichment analysis using Enrichr API")
print("-"*50)

def enrichr_analysis(gene_list, gene_set_library):
    """Run one Enrichr enrichment query and return the terms as a DataFrame.

    The gene list is uploaded to the ``addList`` endpoint, then the
    ``enrich`` endpoint is queried for ``gene_set_library`` using the
    returned list id.

    Args:
        gene_list: Iterable of gene symbols (strings).
        gene_set_library: Enrichr library name, e.g. ``'KEGG_2021_Human'``.

    Returns:
        A DataFrame with the columns ``Term``, ``P_value``,
        ``Adjusted_P_value``, ``Combined_Score``, ``Genes`` and
        ``Gene_Count`` (one row per returned term that has at least nine
        fields), or ``None`` when a request is rejected or any error occurs.
        Errors are printed, never raised.
    """
    base_url = 'https://maayanlab.cloud/Enrichr'
    # Without a timeout a stalled connection would block the notebook forever.
    request_timeout = 30  # seconds, applied to each request
    result_columns = ['Term', 'P_value', 'Adjusted_P_value',
                      'Combined_Score', 'Genes', 'Gene_Count']

    # Multipart form fields; the (None, value) form sends a plain field
    payload = {
        'list': (None, '\n'.join(gene_list)),
        'description': (None, 'Gene_list')
    }

    try:
        # Submit the gene list
        response = requests.post(f'{base_url}/addList', files=payload,
                                 timeout=request_timeout)
        if not response.ok:
            return None

        user_list_id = response.json()['userListId']

        # Fetch the enrichment results; `params` takes care of URL-encoding
        response = requests.get(
            f'{base_url}/enrich',
            params={'userListId': user_list_id, 'backgroundType': gene_set_library},
            timeout=request_timeout,
        )
        if not response.ok:
            return None

        data = response.json()

        # Each term is a positional record; entries shorter than nine fields
        # are skipped. Indices used: 1 term, 2 p-value, 4 combined score,
        # 5 overlapping genes, 6 adjusted p-value.
        results = [
            {
                'Term': term_data[1],
                'P_value': term_data[2],
                'Adjusted_P_value': term_data[6],
                'Combined_Score': term_data[4],
                'Genes': term_data[5],
                'Gene_Count': len(term_data[5]) if term_data[5] else 0
            }
            for term_data in data[gene_set_library]
            if len(term_data) >= 9
        ]

        # Explicit columns keep the schema stable when no term was returned
        return pd.DataFrame(results, columns=result_columns)

    except Exception as e:
        # Best-effort boundary: network, JSON and schema problems all end up
        # here so that one failed query does not stop the remaining ones.
        print(f"❌ Enrichr API error: {e}")
        return None

# Enrichr library name -> short label used in logs and result tables
enrichment_libraries = {
    'GO_Biological_Process_2023': 'GO BP',
    'KEGG_2021_Human': 'KEGG',
    'WikiPathway_2023_Human': 'WikiPathways'
}

# enrichment_results[gene set][short label] -> table of significant terms
enrichment_results = {}

for gene_set_name, gene_set_info in loaded_gene_sets.items():
    print(f"\n🔬 Analyzing {gene_set_name} ({gene_set_info['count']} genes):")

    enrichment_results[gene_set_name] = {}

    for library, short_name in enrichment_libraries.items():
        print(f"   - Running {short_name} enrichment...")

        result = enrichr_analysis(gene_set_info['genes'], library)

        if result is None or len(result) == 0:
            print(f"     ❌ Analysis failed")
        else:
            # Terms with adjusted p < 0.05, best twenty by adjusted p-value
            top_hits = (
                result.loc[result['Adjusted_P_value'] < 0.05]
                .copy()
                .sort_values('Adjusted_P_value')
                .head(20)
            )

            if len(top_hits) == 0:
                print(f"     ⚠️  No significant terms found")
            else:
                # The tiny offset keeps log10 finite for an adjusted p of 0
                top_hits = top_hits.assign(**{
                    'Gene_Set': gene_set_name,
                    'Library': short_name,
                    '-log10(adj_p)': -np.log10(top_hits['Adjusted_P_value'] + 1e-300),
                })

                enrichment_results[gene_set_name][short_name] = top_hits
                print(f"     ✅ Found {len(top_hits)} significant terms")

        # Pause between requests to avoid rate limiting
        time.sleep(1)

# 3. Visualization helpers
print("\n[Step 3] Creating visualization functions")
print("-"*50)

def create_barplot(enrichment_data, gene_set_name, library_type, top_n=10):
    """
    Draw a horizontal bar chart of the top enriched terms for one gene set.

    Args:
        enrichment_data: Nested mapping ``{gene_set_name: {library_type: DataFrame}}``;
            the DataFrame must provide ``Term`` and ``-log10(adj_p)`` columns.
        gene_set_name: Gene set to plot; also used to look up the bar colour
            and title description in the module-level ``loaded_gene_sets``.
        library_type: Library label selecting the result table (e.g. ``'KEGG'``).
        top_n: Maximum number of rows taken from the top of the table.

    Returns:
        The matplotlib figure, or ``None`` when no table exists for the
        requested gene set / library or the table has no rows.
    """
    # Guard clauses: nothing to draw without a stored, non-empty table
    if gene_set_name not in enrichment_data:
        return None
    library_tables = enrichment_data[gene_set_name]
    if library_type not in library_tables:
        return None

    top_terms = library_tables[library_type].head(top_n).copy()
    if len(top_terms) == 0:
        return None

    fig, ax = plt.subplots(figsize=(12, 8))

    gene_set_meta = loaded_gene_sets[gene_set_name]
    scores = top_terms['-log10(adj_p)']
    y_positions = range(len(top_terms))

    # One bar per term, drawn in table order
    bars = ax.barh(y_positions, scores, color=gene_set_meta['color'], alpha=0.7)

    # Term names longer than 60 characters are cut and marked with an ellipsis
    tick_labels = []
    for term in top_terms['Term']:
        tick_labels.append(term[:60] + '...' if len(term) > 60 else term)

    ax.set_yticks(y_positions)
    ax.set_yticklabels(tick_labels, fontsize=10)
    ax.set_xlabel('-log10(Adjusted P-value)', fontsize=12)
    description = gene_set_meta["description"]
    ax.set_title(f'{library_type} Enrichment: {description}',
                 fontsize=14, fontweight='bold')

    # Print each bar's value just past its right edge
    for bar, value in zip(bars, scores):
        label_x = bar.get_width() + 0.1
        label_y = bar.get_y() + bar.get_height()/2
        ax.text(label_x, label_y, f'{value:.1f}', va='center', fontsize=9)

    # Dashed reference line at the 0.05 significance threshold
    threshold = -np.log10(0.05)
    ax.axvline(x=threshold, color='red', linestyle='--', alpha=0.5, label='p=0.05')
    ax.legend()

    plt.tight_layout()
    return fig

def create_dotplot(enrichment_data, gene_set_name, library_type, top_n=15):
    """
    Create an enrichment dotplot for one gene set and one library.

    Each of the first ``top_n`` rows of the stored result table becomes a dot:
    the x position is ``-log10(adj_p)``, the marker area is ``Gene_Count * 20``
    and the colour encodes ``Combined_Score``.

    Args:
        enrichment_data: Nested mapping ``{gene_set_name: {library_type: DataFrame}}``.
            The DataFrame must provide the columns ``Term``, ``-log10(adj_p)``,
            ``Gene_Count`` and ``Combined_Score``.
        gene_set_name: Gene set to plot; also looked up in the module-level
            ``loaded_gene_sets`` for the title description.
        library_type: Library label selecting the result table (e.g. ``'GO BP'``).
        top_n: Maximum number of rows taken from the top of the table.

    Returns:
        The matplotlib figure, or ``None`` when no table exists for the
        requested gene set / library or the table has no rows.
    """
    if gene_set_name not in enrichment_data or library_type not in enrichment_data[gene_set_name]:
        return None

    df = enrichment_data[gene_set_name][library_type].head(top_n).copy()

    if len(df) == 0:
        return None

    # Create figure
    fig, ax = plt.subplots(figsize=(10, 8))

    # Create scatter plot
    scatter = ax.scatter(df['-log10(adj_p)'], range(len(df)),
                         s=df['Gene_Count']*20,
                         c=df['Combined_Score'],
                         cmap='viridis', alpha=0.7, edgecolors='black', linewidth=0.5)

    # Customize plot
    ax.set_yticks(range(len(df)))
    ax.set_yticklabels([term[:50] + '...' if len(term) > 50 else term
                        for term in df['Term']], fontsize=10)
    ax.set_xlabel('-log10(Adjusted P-value)', fontsize=12)
    ax.set_title(f'{library_type} Enrichment: {loaded_gene_sets[gene_set_name]["description"]}',
                 fontsize=14, fontweight='bold')

    # Add colorbar
    cbar = plt.colorbar(scatter, ax=ax)
    cbar.set_label('Combined Score', fontsize=10)

    # Add legend for dot size. The empty scatters are proxy artists drawn with
    # the same size scaling (count * 20) as the data. Their handles are passed
    # to legend() explicitly: with labels alone, matplotlib pairs labels with
    # the axes' artists in creation order, so the data scatter would take the
    # first label and every size entry would be shifted by one.
    sizes = [5, 10, 20]
    size_handles = [
        ax.scatter([], [], s=size*20, c='gray', alpha=0.6, edgecolors='black')
        for size in sizes
    ]
    ax.legend(size_handles, [f'{size} genes' for size in sizes],
              title='Gene Count', loc='lower right', frameon=True)

    # Add significance line (drawn after the legend, so it has no legend entry)
    ax.axvline(x=-np.log10(0.05), color='red', linestyle='--', alpha=0.5)

    plt.tight_layout()
    return fig

def create_comparison_heatmap(enrichment_data, library_type, top_n=10):
    """
    Create comparison heatmap across gene sets.

    Rows are the union of the first ``top_n`` terms of every gene set that has
    a result table for ``library_type``; columns are all gene sets present in
    ``enrichment_data``. A cell holds the term's ``-log10(adj_p)`` for that
    gene set, or 0 when the term is not among that gene set's top terms.

    Rows appear in first-seen order (gene sets in mapping order, terms in
    table order), so repeated runs on the same data give the same figure.

    Args:
        enrichment_data: Nested mapping ``{gene_set_name: {library_type: DataFrame}}``;
            each DataFrame must provide ``Term`` and ``-log10(adj_p)`` columns.
        library_type: Library label selecting the tables to compare.
        top_n: Maximum number of rows taken from the top of each table.

    Returns:
        The matplotlib figure, or ``None`` when no gene set contributes any
        term for ``library_type``.
    """
    # A dict serves as an insertion-ordered set here: iterating a plain set of
    # strings yields an order that depends on per-process hash randomization,
    # which would shuffle the heatmap rows between runs.
    ordered_terms = {}
    gene_set_scores = {}

    for gene_set_name, library_tables in enrichment_data.items():
        if library_type not in library_tables:
            continue
        df = library_tables[library_type].head(top_n)
        ordered_terms.update(dict.fromkeys(df['Term'].tolist()))
        gene_set_scores[gene_set_name] = dict(zip(df['Term'], df['-log10(adj_p)']))

    if not ordered_terms:
        return None

    term_list = list(ordered_terms)

    # Create matrix: one row per term, one column per gene set, 0 where the
    # gene set has no score for the term
    matrix_data = [
        [gene_set_scores.get(gene_set_name, {}).get(term, 0)
         for gene_set_name in enrichment_data]
        for term in term_list
    ]

    # Create DataFrame; long term names are cut to 40 characters for the axis
    heatmap_df = pd.DataFrame(matrix_data,
                              index=[term[:40] + '...' if len(term) > 40 else term for term in term_list],
                              columns=[loaded_gene_sets[name]['description'] for name in enrichment_data])

    # Create heatmap; figure height grows with the number of terms
    fig, ax = plt.subplots(figsize=(12, max(8, len(term_list)*0.3)))

    sns.heatmap(heatmap_df, annot=False, cmap='YlOrRd',
                cbar_kws={'label': '-log10(Adjusted P-value)'},
                ax=ax, linewidths=0.5)

    ax.set_title(f'{library_type} Enrichment Comparison Across Gene Sets',
                 fontsize=14, fontweight='bold')
    ax.set_xlabel('Gene Sets', fontsize=12)
    ax.set_ylabel('Enriched Terms', fontsize=12)

    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()

    return fig

# 4. Generate visualizations
print("\n[Step 4] Generating visualizations")
print("-"*50)

# Make sure the figure directory exists before anything is written into it
os.makedirs(figure_dir, exist_ok=True)

# One barplot and one dotplot per (gene set, library) pair that has results.
# Only the GO BP and KEGG libraries are plotted here.
for gene_set_name in loaded_gene_sets:
    for library_short in ('GO BP', 'KEGG'):
        if library_short not in enrichment_results.get(gene_set_name, {}):
            continue

        # Library label with spaces replaced so it is safe in a file name
        library_tag = library_short.replace(' ', '_')

        # Barplot
        fig_bar = create_barplot(enrichment_results, gene_set_name, library_short)
        if fig_bar is not None:
            filename = f"1_{gene_set_name}_{library_tag}_barplot.png"
            fig_bar.savefig(os.path.join(figure_dir, filename), dpi=300, bbox_inches='tight')
            plt.close(fig_bar)
            print(f"✅ Saved: {filename}")

        # Dotplot
        fig_dot = create_dotplot(enrichment_results, gene_set_name, library_short)
        if fig_dot is not None:
            filename = f"1_{gene_set_name}_{library_tag}_dotplot.png"
            fig_dot.savefig(os.path.join(figure_dir, filename), dpi=300, bbox_inches='tight')
            plt.close(fig_dot)
            print(f"✅ Saved: {filename}")

# One cross-gene-set comparison heatmap per library
for library_short in ('GO BP', 'KEGG'):
    fig_heatmap = create_comparison_heatmap(enrichment_results, library_short)
    if fig_heatmap is None:
        continue
    library_tag = library_short.replace(' ', '_')
    filename = f"1_{library_tag}_comparison_heatmap.png"
    fig_heatmap.savefig(os.path.join(figure_dir, filename), dpi=300, bbox_inches='tight')
    plt.close(fig_heatmap)
    print(f"✅ Saved: {filename}")

# 5. Save enrichment results to files
print("\n[Step 5] Saving enrichment results")
print("-"*50)

# Gather the per-gene-set tables into one list per exported library.
# Tables of any other library (e.g. WikiPathways) are not exported here.
all_go_results = []
all_kegg_results = []
results_by_library = {'GO BP': all_go_results, 'KEGG': all_kegg_results}

for gene_set_name, libraries in enrichment_results.items():
    for library_name, results_df in libraries.items():
        bucket = results_by_library.get(library_name)
        if bucket is not None:
            bucket.append(results_df)

# GO BP: stack all gene sets into one CSV (skipped when nothing was found)
if len(all_go_results) > 0:
    go_output_file = os.path.join(output_dir, "4_GO_BP_Enrichment_Results.csv")
    combined_go = pd.concat(all_go_results, ignore_index=True)
    combined_go.to_csv(go_output_file, index=False, encoding='utf-8-sig')
    print(f"✅ GO BP results saved: {go_output_file}")

# KEGG: same treatment
if len(all_kegg_results) > 0:
    kegg_output_file = os.path.join(output_dir, "4_KEGG_Pathway_Enrichment_Results.csv")
    combined_kegg = pd.concat(all_kegg_results, ignore_index=True)
    combined_kegg.to_csv(kegg_output_file, index=False, encoding='utf-8-sig')
    print(f"✅ KEGG results saved: {kegg_output_file}")

# 6. Create summary report
print("\n[Step 6] Creating summary report")
print("-"*50)

# One row per loaded gene set: term counts per library plus the first
# (top-ranked) term of each stored table, or '' when there is none
summary_report = []

for gene_set_name, gene_set_info in loaded_gene_sets.items():
    set_results = enrichment_results.get(gene_set_name, {})
    go_table = set_results.get('GO BP', [])
    kegg_table = set_results.get('KEGG', [])

    top_go_term = go_table.iloc[0]['Term'] if len(go_table) > 0 else ''
    top_kegg_pathway = kegg_table.iloc[0]['Term'] if len(kegg_table) > 0 else ''

    summary_entry = {
        'Gene_Set': gene_set_name,
        'Description': gene_set_info['description'],
        'Gene_Count': gene_set_info['count'],
        'GO_BP_Terms': len(go_table),
        'KEGG_Pathways': len(kegg_table),
        'Top_GO_Term': top_go_term,
        'Top_KEGG_Pathway': top_kegg_pathway
    }
    summary_report.append(summary_entry)

# Write the summary table next to the other result files
summary_output_file = os.path.join(output_dir, "4_Enrichment_Analysis_Summary.csv")
summary_df = pd.DataFrame(summary_report)
summary_df.to_csv(summary_output_file, index=False, encoding='utf-8-sig')
print(f"✅ Summary report saved: {summary_output_file}")

# 7. Final summary
print("\n[Step 7] Analysis Summary")
print("="*80)
print("🎯 Pathway Enrichment Analysis Completed!")

# Totals are summed over every gene set's stored (significant) tables
print("\n📊 Analysis Statistics:")
total_go_terms = sum(len(libraries.get('GO BP', [])) for libraries in enrichment_results.values())
total_kegg_pathways = sum(len(libraries.get('KEGG', [])) for libraries in enrichment_results.values())

print(f"   - Gene sets analyzed: {len(loaded_gene_sets)}")
print(f"   - Total GO BP terms identified: {total_go_terms}")
print(f"   - Total KEGG pathways identified: {total_kegg_pathways}")

# Fixed list of the data files written in the earlier steps
for line in (
    "\n📁 Output Files:",
    "   📋 Data files:",
    "     - GO BP results: 4_GO_BP_Enrichment_Results.csv",
    "     - KEGG results: 4_KEGG_Pathway_Enrichment_Results.csv",
    "     - Summary report: 4_Enrichment_Analysis_Summary.csv",
):
    print(line)

# Figure names are listed for every (gene set, library) pair with results
print("\n   📊 Visualization files (in Figure directory):")
for gene_set_name in loaded_gene_sets:
    for library_short in ('GO BP', 'KEGG'):
        if library_short not in enrichment_results.get(gene_set_name, {}):
            continue
        library_tag = library_short.replace(' ', '_')
        print(f"     - 1_{gene_set_name}_{library_tag}_barplot.png")
        print(f"     - 1_{gene_set_name}_{library_tag}_dotplot.png")

# The two heatmap names are always listed, without checking they were written
print("     - 1_GO_BP_comparison_heatmap.png")
print("     - 1_KEGG_comparison_heatmap.png")

# Per-gene-set counts of significant terms
print("\n🔬 Key Findings:")
for gene_set_name, gene_set_info in loaded_gene_sets.items():
    if gene_set_name not in enrichment_results:
        continue
    set_results = enrichment_results[gene_set_name]
    go_count = len(set_results.get('GO BP', []))
    kegg_count = len(set_results.get('KEGG', []))
    print(f"   - {gene_set_info['description']}: {go_count} GO terms, {kegg_count} KEGG pathways")

for line in (
    "\n💡 Next Steps:",
    "   1. Review the enrichment results in the CSV files",
    "   2. Examine the visualization plots for biological insights",
    "   3. Focus on vitamin B12 metabolism-related pathways",
    "   4. Compare enrichment patterns between different regulatory mechanisms",
    "   5. Validate key findings with literature review",
):
    print(line)

print("\n" + "="*80)
print("✅ 4.3 GO/KEGG Pathway Enrichment Analysis Completed!")
print("="*80)

4.3 GO/KEGG Pathway Enrichment Analysis (Real Data)

[Step 1] Reading gene set files
--------------------------------------------------
✅ miRNA_Inhibition: 64 genes
✅ miRNA_Derepression: 122 genes
✅ Methylation_Repression: 30 genes
✅ Demethylation_Activation: 36 genes
✅ Dual_Regulated: 65 genes
✅ VitaminB12_Related: 1 genes

📊 Successfully loaded 6 gene sets

[Step 2] Performing enrichment analysis using Enrichr API
--------------------------------------------------

🔬 Analyzing miRNA_Inhibition (64 genes):
   - Running GO BP enrichment...
     ⚠️  No significant terms found
   - Running KEGG enrichment...
     ✅ Found 1 significant terms
   - Running WikiPathways enrichment...
     ⚠️  No significant terms found

🔬 Analyzing miRNA_Derepression (122 genes):
   - Running GO BP enrichment...
     ✅ Found 3 significant terms
   - Running KEGG enrichment...
     ✅ Found 3 significant terms
   - Running WikiPathways enrichment...
     ✅ Found 20 significant terms

🔬 Analyzing Methylation_Re

findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberati


[Step 3] Creating visualization functions
--------------------------------------------------

[Step 4] Generating visualizations
--------------------------------------------------


findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberati

✅ Saved: 1_miRNA_Inhibition_KEGG_barplot.png


findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberati

✅ Saved: 1_miRNA_Inhibition_KEGG_dotplot.png


findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberati

✅ Saved: 1_miRNA_Derepression_GO_BP_barplot.png


findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberati

✅ Saved: 1_miRNA_Derepression_GO_BP_dotplot.png


findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberati

✅ Saved: 1_miRNA_Derepression_KEGG_barplot.png


findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberati

✅ Saved: 1_miRNA_Derepression_KEGG_dotplot.png


findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberati

✅ Saved: 1_Demethylation_Activation_GO_BP_barplot.png


findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberati

✅ Saved: 1_Demethylation_Activation_GO_BP_dotplot.png


findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberati

✅ Saved: 1_Demethylation_Activation_KEGG_barplot.png


findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberati

✅ Saved: 1_Demethylation_Activation_KEGG_dotplot.png


findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberati

✅ Saved: 1_Dual_Regulated_GO_BP_barplot.png


findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberati

✅ Saved: 1_Dual_Regulated_GO_BP_dotplot.png


findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberation Sans' not found.
findfont: Font family 'Liberati

✅ Saved: 1_GO_BP_comparison_heatmap.png
✅ Saved: 1_KEGG_comparison_heatmap.png

[Step 5] Saving enrichment results
--------------------------------------------------
✅ GO BP results saved: /Users/heweilin/Desktop/P056_Code/Data_Post_Processed/4_GO_BP_Enrichment_Results.csv
✅ KEGG results saved: /Users/heweilin/Desktop/P056_Code/Data_Post_Processed/4_KEGG_Pathway_Enrichment_Results.csv

[Step 6] Creating summary report
--------------------------------------------------
✅ Summary report saved: /Users/heweilin/Desktop/P056_Code/Data_Post_Processed/4_Enrichment_Analysis_Summary.csv

[Step 7] Analysis Summary
🎯 Pathway Enrichment Analysis Completed!

📊 Analysis Statistics:
   - Gene sets analyzed: 6
   - Total GO BP terms identified: 22
   - Total KEGG pathways identified: 5

📁 Output Files:
   📋 Data files:
     - GO BP results: 4_GO_BP_Enrichment_Results.csv
     - KEGG results: 4_KEGG_Pathway_Enrichment_Results.csv
     - Summary report: 4_Enrichment_Analysis_Summary.csv

   📊 Visualizat