In [1]:
import pandas as pd
import os

def extract_species_level_data(file_path, domain_name):
    """
    Extract the species-level rows from an abundance table.

    The CSV is read with ``pd.read_csv`` and its first column is treated as
    the taxonomy lineage string. A row is kept when the lineage ends with a
    ``|s__<name>`` segment, i.e. ``s__`` is the last rank in the string and
    no further ``|``-separated level follows it. Progress information is
    printed along the way.

    Args:
        file_path: Path of the CSV file to read.
        domain_name: Label that is only used in the printed messages.

    Returns:
        A DataFrame with the matching rows (empty when nothing matches or
        when the table has no data rows), or ``None`` if reading or
        filtering the file raised an exception.
    """
    try:
        # Load the table
        df = pd.read_csv(file_path)
        total_rows = len(df)

        print(f"\n=== 处理 {domain_name} 数据 ===")
        print(f"原始数据行数: {total_rows}")

        # Raw string: ``\|`` must reach the regex engine as an escaped pipe;
        # in a normal string literal it is an invalid escape sequence.
        # na=False treats missing lineages as non-matching, and astype(bool)
        # keeps the mask strictly boolean.
        species_mask = (
            df.iloc[:, 0].str.contains(r'\|s__[^|]*$', na=False).astype(bool)
        )
        species_df = df[species_mask]
        species_rows = len(species_df)

        print(f"物种水平数据行数: {species_rows}")
        # A header-only table has zero rows; report 0% instead of dividing
        # by zero (which used to be swallowed below and reported as an error).
        ratio = species_rows / total_rows if total_rows else 0.0
        print(f"筛选比例: {ratio:.2%}")

        # Show the first few species identifiers
        if species_rows > 0:
            print("\n前5个物种示例:")
            for i, species_id in enumerate(species_df.iloc[:, 0].head()):
                print(f"{i+1}. {species_id}")
        else:
            print("警告: 没有找到物种水平的数据!")

        return species_df

    except Exception as e:
        # Best-effort contract: report the problem and let the caller skip
        # this file instead of aborting the whole run.
        print(f"处理文件 {file_path} 时出错: {e}")
        return None

# Input tables (1%-filtered absolute abundance CSVs), keyed by domain
file_paths = {
    "archaea": "E:/Python/MI_Analysis/metagenome/Absolute_abundance_analysis/filtered_data_1percent/心梗组_古菌_filtered_1percent.csv",
    "bacteria": "E:/Python/MI_Analysis/metagenome/Absolute_abundance_analysis/filtered_data_1percent/心梗组_细菌_filtered_1percent.csv",
    "fungi": "E:/Python/MI_Analysis/metagenome/Absolute_abundance_analysis/filtered_data_1percent/心梗组_真菌_filtered_1percent.csv",
    "virus": "E:/Python/MI_Analysis/metagenome/Absolute_abundance_analysis/filtered_data_1percent/心梗组_病毒_filtered_1percent.csv",
}

# Make sure the destination folder exists before anything is written
output_dir = "E:/Python/MI_Analysis/metagenome/Absolute_abundance_analysis/species_level_data"
os.makedirs(output_dir, exist_ok=True)

# Species-level tables that were extracted and saved, keyed by domain
results = {}

for domain, file_path in file_paths.items():
    # Guard: the input file is missing
    if not os.path.exists(file_path):
        print(f"文件不存在: {file_path}")
        continue

    species_data = extract_species_level_data(file_path, domain)

    # Guard: extraction failed (None) or produced no rows
    if species_data is None or len(species_data) == 0:
        print(f"跳过 {domain} - 无有效数据")
        continue

    # Write the filtered table and remember it for the summary
    output_path = os.path.join(output_dir, f"心梗组_{domain}_species_level.csv")
    species_data.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"结果已保存至: {output_path}")
    results[domain] = species_data

# Summary: one line per domain that produced species-level rows
print("\n" + "="*50, "筛选汇总:", "="*50, sep="\n")
for domain, df in results.items():
    print(f"{domain}: {len(df)} 个物种")

print(f"\n所有结果已保存到: {output_dir}")


=== 处理 archaea 数据 ===
原始数据行数: 3860
物种水平数据行数: 2272
筛选比例: 58.86%

前5个物种示例:
1. k__Archaea|p__Methanobacteriota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter_A|s__Methanobrevibacter_A_sp900766745
2. k__Archaea|p__Thermoplasmatota|c__Thermoplasmata|o__Methanomassiliicoccales|f__Methanomassiliicoccaceae|g__Methanomassiliicoccus_A|s__Methanomassiliicoccus_A_intestinalis
3. k__Archaea|p__Nanoarchaeota|c__Nanoarchaeia|o__SCGC-AAA011-G17|f__GW2011-AR18|g__JAGWAD01|s__JAGWAD01_sp018302105
4. k__Archaea|p__Nanoarchaeota|c__Nanoarchaeia|o__Pacearchaeales|f__JAHJTO01|g__JAHJTO01|s__JAHJTO01_sp018829845
5. k__Archaea|p__Asgardarchaeota|c__Hermodarchaeia|o__Hermodarchaeales|f__Hermodarchaeaceae|g__Hermodarchaeum|s__Hermodarchaeum_sp016550425
结果已保存至: E:/Python/MI_Analysis/metagenome/Absolute_abundance_analysis/species_level_data\心梗组_archaea_species_level.csv

=== 处理 bacteria 数据 ===
原始数据行数: 799
物种水平数据行数: 516
筛选比例: 64.58%

前5个物种示例:
1. k__Bacteria|p__Actinobacteria