In [2]:
from Bio import SeqIO
import pandas as pd
import os

In [4]:
# === Main paths ===
# This cell renames every sequence of each filtered FASTA to a short
# "<gene>_<n>" code, writes the renamed FASTA, and saves a per-gene CSV that
# maps each original ID to its new ID.
fastas_folder = "filtered_fastas_nr"        # folder with the FASTA files after filtering and redundancy removal
codes_folder = "final_codes"                # where to save the relationship CSVs
output_fastas_folder = "final_fastas"       # where to save the renamed FASTAs

# Create output folders if they do not exist
os.makedirs(codes_folder, exist_ok=True)
os.makedirs(output_fastas_folder, exist_ok=True)

# === Collect the FASTA files ===
# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and therefore the log) reproducible between runs.
fasta_files = sorted(f for f in os.listdir(fastas_folder) if f.endswith(".fasta"))

# Gene name = everything before the first "_" of the file name *without* its
# extension, so an underscore-free "neuA.fasta" yields "neuA" rather than
# "neuA.fasta".
files_by_gene = {}
for file in fasta_files:
    gene = os.path.splitext(file)[0].split("_")[0]
    files_by_gene.setdefault(gene, []).append(file)

# Two inputs that resolve to the same gene would write to the same output
# FASTA/CSV, and the later one would silently overwrite the earlier one.
# Stop before writing anything instead of losing data.
clashes = {g: names for g, names in files_by_gene.items() if len(names) > 1}
if clashes:
    details = "; ".join(f"{g}: {', '.join(names)}" for g, names in clashes.items())
    raise ValueError(
        f"Several FASTA files map to the same gene name and would overwrite "
        f"each other's outputs ({details})"
    )

# === Iterate over all FASTA files in the folder ===
for file in fasta_files:
    fasta_path = os.path.join(fastas_folder, file)
    gene = os.path.splitext(file)[0].split("_")[0]

    # Define output names
    output_fasta = os.path.join(output_fastas_folder, f"{gene}_final.fasta")
    relation_file = os.path.join(codes_folder, f"{gene}_codes.csv")

    # Read original FASTA
    records = list(SeqIO.parse(fasta_path, "fasta"))
    relations = []

    # Rename sequences: IDs are numbered from 1 in file order
    for i, rec in enumerate(records, start=1):
        original_id = rec.id
        new_id = f"{gene}_{i}"
        rec.id = new_id
        # Clear the description so the header line contains only the new ID
        rec.description = ""
        relations.append({"original_id": original_id, "new_id": new_id})

    # Save new FASTA
    SeqIO.write(records, output_fasta, "fasta")

    # Save CSV with relationship. The columns are given explicitly so that an
    # empty FASTA still produces a CSV with the expected header row.
    pd.DataFrame(relations, columns=["original_id", "new_id"]).to_csv(
        relation_file, index=False
    )

    print(f"✅ {file}: {len(records)} sequences renamed → {output_fasta}")

print("\n🎉 Process completed! FASTAs renamed and codes successfully saved.")

✅ lst_filtered_nr.fasta: 2266 sequences renamed → final_fastas/lst_final.fasta
✅ neuS_filtered_nr.fasta: 141 sequences renamed → final_fastas/neuS_final.fasta
✅ kpsT_filtered_nr.fasta: 9575 sequences renamed → final_fastas/kpsT_final.fasta
✅ kpsM_filtered_nr.fasta: 211 sequences renamed → final_fastas/kpsM_final.fasta
✅ neuA_filtered_nr.fasta: 23111 sequences renamed → final_fastas/neuA_final.fasta
✅ pm0188_filtered_nr.fasta: 40 sequences renamed → final_fastas/pm0188_final.fasta
✅ lic3X_filtered_nr.fasta: 1424 sequences renamed → final_fastas/lic3X_final.fasta
✅ neuD_filtered_nr.fasta: 4733 sequences renamed → final_fastas/neuD_final.fasta
✅ kpsD_filtered_nr.fasta: 6670 sequences renamed → final_fastas/kpsD_final.fasta
✅ neuO_filtered_nr.fasta: 148 sequences renamed → final_fastas/neuO_final.fasta

🎉 Process completed! FASTAs renamed and codes successfully saved.
