# 一、下载NCBI序列和信息

## 并行版本

In [2]:
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

from Bio import Entrez, SeqIO

# ——————— Configuration ———————
# Contact address set on Biopython's Entrez module for the requests made below.
Entrez.email         = "giantlinlinlin@gmail.com"
# Output folder: FASTA files, both log files and the info table are written here.
save_directory       = r"/mnt/f/OneDrive/文档（科研）/脚本/Download/9-My-Toolskit/1-下载数据/download"
# Create the output folder up front; exist_ok makes re-running this cell harmless.
os.makedirs(save_directory, exist_ok=True)
# One accession per line for every sequence that was processed successfully.
success_log_file     = os.path.join(save_directory, "success_log.txt")
# "<accession>\t<error message>" for every sequence that failed.
failure_log_file     = os.path.join(save_directory, "failure_log.txt")
# Tab-separated table: accession, country, isolate, lat_lon, reference titles.
basic_info_file      = os.path.join(save_directory, "基本信息.txt")
# Input list of accessions; lives in a "conf" folder next to the output folder.
conf_file            = os.path.join(os.path.dirname(save_directory), "conf", "下载NCBI.txt")


# This function is executed by several worker threads at once and all of them
# append to the same three files, so appends are serialised through one lock.
_log_lock = threading.Lock()


def _append_line(path, line):
    """Append ``line`` plus a newline to ``path`` while holding the shared lock."""
    with _log_lock:
        with open(path, "a", encoding="utf-8") as fh:
            fh.write(line + "\n")


def _fetch_records(seq_id, rettype, fmt):
    """Fetch ``seq_id`` from the nucleotide database and parse it with ``fmt``.

    The Entrez handle is always closed, even when parsing raises.
    """
    handle = Entrez.efetch(db="nucleotide",
                           id=seq_id,
                           rettype=rettype,
                           retmode="text")
    try:
        return list(SeqIO.parse(handle, fmt))
    finally:
        handle.close()


def download_and_process_sequence(seq_id):
    """Download one nucleotide record and log its basic metadata.

    Steps:
      1. Fetch the record as FASTA and save it as ``<seq_id>.fasta`` in
         ``save_directory``.
      2. Fetch the record as GenBank and read the ``source`` feature
         (country / isolate / lat_lon) and the reference titles.
      3. Append one tab-separated line to ``basic_info_file`` and the ID to
         ``success_log_file``.

    Args:
        seq_id: Accession / identifier passed to ``Entrez.efetch``.

    Returns:
        None. Any exception is caught, printed and appended to
        ``failure_log_file`` as ``"<seq_id>\\t<error>"``; nothing is re-raised.
    """
    try:
        # —— Download and parse FASTA ——
        # "fasta-pearson" tolerates comment lines before the first record.
        fasta_records = _fetch_records(seq_id, "fasta", "fasta-pearson")
        if not fasta_records:
            raise ValueError("No FASTA record found")
        record_fasta = fasta_records[0]

        # Save the FASTA file
        fasta_path = os.path.join(save_directory, f"{seq_id}.fasta")
        SeqIO.write(record_fasta, fasta_path, "fasta")
        print(f"[FASTA] 已保存：{fasta_path}")

        # —— Download and parse GenBank ——
        gb_records = _fetch_records(seq_id, "gb", "genbank")
        if not gb_records:
            raise ValueError("No GenBank record found")
        record_gb = gb_records[0]

        # Extract the source-feature qualifiers
        country = isolate = lat_lon = "Not Available"
        for feat in record_gb.features:
            if feat.type == "source":
                qs = feat.qualifiers
                # INSDC renamed the /country qualifier to /geo_loc_name, so
                # newer records only carry the latter; accept either one.
                country  = (qs.get("country")
                            or qs.get("geo_loc_name")
                            or ["Not Available"])[0]
                isolate  = qs.get("isolate",  ["Not Available"])[0]
                lat_lon  = qs.get("lat_lon",  ["Not Available"])[0]
                break

        # Collect the non-empty reference titles
        refs = record_gb.annotations.get("references", [])
        titles = [r.title for r in refs if r.title]
        titles_str = ", ".join(titles) if titles else "None"

        # Record the metadata line
        _append_line(basic_info_file,
                     f"{seq_id}\t{country}\t{isolate}\t{lat_lon}\t{titles_str}")
        print(f"[INFO] {seq_id} 已追加至基本信息文件")

        # Success log
        _append_line(success_log_file, seq_id)

    except Exception as e:
        # Report the error and add it to the failure log
        print(f"[ERROR] {seq_id}: {e}")
        _append_line(failure_log_file, f"{seq_id}\t{e}")


if __name__ == "__main__":
    # Read the seq_id list: one ID per line, blank lines ignored.
    with open(conf_file, "r", encoding="utf-8") as f:
        raw_ids = [line.strip() for line in f if line.strip()]

    # Drop repeated IDs (first occurrence wins). Two workers handling the same
    # ID would write the same FASTA file at the same time.
    id_list = list(dict.fromkeys(raw_ids))

    # Download and process in parallel
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {
            executor.submit(download_and_process_sequence, sid): sid
            for sid in id_list
        }
        for future in as_completed(futures):
            try:
                future.result()
            except Exception as exc:
                # The worker already handles its own errors; anything that
                # still reaches this point is unexpected, so report it
                # instead of dropping it silently.
                print(f"[ERROR] {futures[future]}: unexpected worker failure: {exc!r}")

    print("所有序列处理完成。")


[FASTA] 已保存：/mnt/f/OneDrive/文档（科研）/脚本/Download/9-My-Toolskit/1-下载数据/download/PQ827212.fasta
[FASTA] 已保存：/mnt/f/OneDrive/文档（科研）/脚本/Download/9-My-Toolskit/1-下载数据/download/PQ827205.fasta
[FASTA] 已保存：/mnt/f/OneDrive/文档（科研）/脚本/Download/9-My-Toolskit/1-下载数据/download/PQ827284.fasta
[FASTA] 已保存：/mnt/f/OneDrive/文档（科研）/脚本/Download/9-My-Toolskit/1-下载数据/download/PQ827225.fasta
[INFO] PQ827212 已追加至基本信息文件
[FASTA] 已保存：/mnt/f/OneDrive/文档（科研）/脚本/Download/9-My-Toolskit/1-下载数据/download/PQ827249.fasta
[INFO] PQ827225 已追加至基本信息文件
[INFO] PQ827284 已追加至基本信息文件
[INFO] PQ827205 已追加至基本信息文件
[FASTA] 已保存：/mnt/f/OneDrive/文档（科研）/脚本/Download/9-My-Toolskit/1-下载数据/download/PQ827299.fasta
[INFO] PQ827249 已追加至基本信息文件
[FASTA] 已保存：/mnt/f/OneDrive/文档（科研）/脚本/Download/9-My-Toolskit/1-下载数据/download/PQ827316.fasta
[INFO] PQ827299 已追加至基本信息文件
[INFO] PQ827316 已追加至基本信息文件
所有序列处理完成。


# 二、处理下载序列

In [3]:
import os
import re
from Bio import SeqIO

# Folder holding the downloaded FASTA files
folder_path = r"/mnt/f/OneDrive/文档（科研）/脚本/Download/9-My-Toolskit/1-下载数据/download"

# Visit every FASTA file in the folder; sorted so the progress output is
# the same on every run (os.listdir order is arbitrary).
for filename in sorted(os.listdir(folder_path)):
    if not filename.lower().endswith((".fasta", ".fa")):
        continue
    file_path = os.path.join(folder_path, filename)

    # Records with rewritten headers, collected before anything is written
    modified_records = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            # Strip a trailing version suffix such as ".1" / ".2" from the ID
            cleaned_id = re.sub(r"\.\d+$", "", record.id)
            record.id = cleaned_id
            record.name = cleaned_id          # keep name in sync with id
            record.description = cleaned_id   # header keeps the ID only
            modified_records.append(record)

    # Write to a temporary sibling first and swap it in afterwards, so a
    # failure while writing cannot leave the original file truncated.
    tmp_path = file_path + ".tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as output_handle:
            SeqIO.write(modified_records, output_handle, "fasta")
        os.replace(tmp_path, file_path)
    except Exception:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

    print(f"处理完成: {filename}")


处理完成: PQ827204.fasta
处理完成: PQ827205.fasta
处理完成: PQ827206.fasta
处理完成: PQ827207.fasta
处理完成: PQ827208.fasta
处理完成: PQ827209.fasta
处理完成: PQ827210.fasta
处理完成: PQ827211.fasta
处理完成: PQ827212.fasta
处理完成: PQ827213.fasta
处理完成: PQ827214.fasta
处理完成: PQ827215.fasta
处理完成: PQ827216.fasta
处理完成: PQ827217.fasta
处理完成: PQ827218.fasta
处理完成: PQ827219.fasta
处理完成: PQ827220.fasta
处理完成: PQ827221.fasta
处理完成: PQ827222.fasta
处理完成: PQ827223.fasta
处理完成: PQ827224.fasta
处理完成: PQ827225.fasta
处理完成: PQ827226.fasta
处理完成: PQ827227.fasta
处理完成: PQ827228.fasta
处理完成: PQ827229.fasta
处理完成: PQ827230.fasta
处理完成: PQ827231.fasta
处理完成: PQ827232.fasta
处理完成: PQ827233.fasta
处理完成: PQ827234.fasta
处理完成: PQ827235.fasta
处理完成: PQ827236.fasta
处理完成: PQ827237.fasta
处理完成: PQ827238.fasta
处理完成: PQ827239.fasta
处理完成: PQ827240.fasta
处理完成: PQ827241.fasta
处理完成: PQ827242.fasta
处理完成: PQ827243.fasta
处理完成: PQ827244.fasta
处理完成: PQ827245.fasta
处理完成: PQ827246.fasta
处理完成: PQ827247.fasta
处理完成: PQ827248.fasta
处理完成: PQ827249.fasta
处理完成: PQ827250.fasta
处理完成: PQ82725

In [4]:
# Append every .fasta file in the download folder to one merged file.
# A single `cat` over the glob avoids the `$file` loop variable, which was
# unquoted and would be replaced by IPython if a Python variable `file` existed.
# NOTE: `>>` appends, so running this cell twice duplicates the sequences.
!cat /mnt/f/OneDrive/文档（科研）/脚本/Download/9-My-Toolskit/1-下载数据/download/*.fasta >> /mnt/c/Users/Administrator/Desktop/merge.fasta

In [5]:
# Destructive: permanently deletes every .fasta file in the download folder.
# Run only after checking that /mnt/c/Users/Administrator/Desktop/merge.fasta is complete.
!rm /mnt/f/OneDrive/文档（科研）/脚本/Download/9-My-Toolskit/1-下载数据/download/*.fasta