## Renaming genome FASTA files

Genome FASTA files were programmatically renamed using a metadata-driven script to ensure consistent sample identifiers across all downstream analyses.

In [6]:
import os
import pandas as pd
import glob
import shutil
import re
from pathlib import Path


In [9]:
# Gather every .fna file from the NCBI dataset download into one flat folder.
src = "data/genomes/ncbi_dataset/data/**/*.fna"
dst = "data/genomes/fasta/"

# Create the destination first; exist_ok keeps the cell safe to re-run.
os.makedirs(dst, exist_ok=True)

# recursive=True lets "**" match any depth of sub-directories under the dataset folder.
for f in glob.glob(src, recursive=True):
    # basename honours the platform's path separator, unlike splitting on "/".
    filename = os.path.basename(f)
    shutil.copy(f, os.path.join(dst, filename))

print("FASTA copied to data/genomes/fasta/")

FASTA copiados a data/genomes/fasta/


In [11]:
# Load the genome metadata table; the renaming loop reads its
# region, name and accession_no columns.
df = pd.read_csv(filepath_or_buffer='/Notebooks/metadata_gcf.txt')




0    GCF_000624315
1    GCF_000625755
2    GCF_000624295
3    GCF_000624995
4    GCF_000624335
5    GCF_000626555
6    GCF_000625555
7    GCF_000625535
8    GCF_000625495
9    GCF_000624155
Name: accession_no, dtype: object
             region        name   accession_no
0           Alberta  EC20111515  GCF_000624315
1           Alberta  EC20120677  GCF_000625755
2           Alberta  EC20111514  GCF_000624295
3  British Columbia  EC20120765  GCF_000624995
4  British Columbia  EC20111554  GCF_000624335
5  British Columbia  EC20111510  GCF_000626555
6  British Columbia  EC20120686  GCF_000625555
7  British Columbia  EC20120685  GCF_000625535
8       Nova Scotia  EC20120590  GCF_000625495
9           Ontario  EC20120916  GCF_000624155


In [13]:
# Copy each FASTA listed in `df` into fasta_ready/ under the name
# <region>_<name>_<accession_no>.fna. Source files are left in place.
# NOTE(review): the recorded output shows these folders under /home/mliva/ -
# confirm that the two paths below point at the intended location.
src_folder = "/data/genomes/fasta/"
dst_folder = "/data/genomes/fasta_ready/"
os.makedirs(dst_folder, exist_ok=True)

renamed = []

# Plain tuples in a fixed column order avoid the per-row Series that iterrows() builds.
for acc, region, name in df[["accession_no", "region", "name"]].itertuples(index=False, name=None):
    src_pattern = os.path.join(src_folder, f"{acc}*.fna")
    # glob returns matches in arbitrary order; sorting makes the chosen file
    # the same on every run.
    files = sorted(glob.glob(src_pattern))

    if not files:
        print(f"[WARN] No FASTA found for {acc}")
        continue

    if len(files) > 1:
        # Report the ambiguity instead of silently picking one of several files.
        print(f"[WARN] {len(files)} FASTA files match {acc}; using {files[0]}")

    src_file = files[0]

    # The region is used verbatim, so a value such as "British Columbia"
    # puts a space in the destination file name.
    dst_file = os.path.join(dst_folder, f"{region}_{name}_{acc}.fna")

    shutil.copy(src_file, dst_file)
    renamed.append(dst_file)
    print(f"[OK] {src_file} → {dst_file}")

print("\nTotal FASTA renamed:", len(renamed))


[OK] /home/mliva/data/genomes/fasta/GCF_000624315.2_ASM62431v2_genomic.fna → /home/mliva/data/genomes/fasta_ready/Alberta_EC20111515_GCF_000624315.fna
[OK] /home/mliva/data/genomes/fasta/GCF_000625755.1_ASM62575v2_genomic.fna → /home/mliva/data/genomes/fasta_ready/Alberta_EC20120677_GCF_000625755.fna
[OK] /home/mliva/data/genomes/fasta/GCF_000624295.2_ASM62429v2_genomic.fna → /home/mliva/data/genomes/fasta_ready/Alberta_EC20111514_GCF_000624295.fna
[OK] /home/mliva/data/genomes/fasta/GCF_000624995.2_ASM62499v2_genomic.fna → /home/mliva/data/genomes/fasta_ready/British Columbia_EC20120765_GCF_000624995.fna
[OK] /home/mliva/data/genomes/fasta/GCF_000624335.2_ASM62433v2_genomic.fna → /home/mliva/data/genomes/fasta_ready/British Columbia_EC20111554_GCF_000624335.fna
[OK] /home/mliva/data/genomes/fasta/GCF_000626555.2_ASM62655v2_genomic.fna → /home/mliva/data/genomes/fasta_ready/British Columbia_EC20111510_GCF_000626555.fna
[OK] /home/mliva/data/genomes/fasta/GCF_000625555.2_ASM62555v2_geno

In [18]:
# Decompress the EnteroBase FASTA archives (*.gz) in the enterobase_fasta folder.
# NOTE(review): once the archives are decompressed the *.gz pattern presumably
# matches nothing, so re-running this cell would report an error - confirm.
!gunzip /home/mliva/data/genomes/enterobase_fasta/*.gz



In [22]:
# Read the EnteroBase sample metadata and print it for inspection.
df_enterobase = pd.read_csv(filepath_or_buffer="/Notebooks/metadata_srr.txt")
print(df_enterobase)

           region        name accession_no assembly_barcode
0         Alberta  EC20090641   SRR1183981  SAL_KA3404AA_AS
1         Alberta  EC20090698   SRR1183982  SAL_KA3468AA_AS
2   New Brunswick   N13-02934   SRR5241846  SAL_QA6543AA_AS
3   New Brunswick   N13-02944   SRR5241820  SAL_QA5373AA_AS
4   New Brunswick   N13-02946   SRR5241852  SAL_QA6290AA_AS
5         Ontario  EC20090332   SRR1183990  SAL_KA3477AA_AS
6         Ontario  EC20090193   SRR1183988  SAL_KA3478AA_AS
7         Ontario  EC20100130   SRR1183993  SAL_KA3433AA_AS
8         Ontario  EC20090135   SRR1183989  SAL_KA3479AA_AS
9          Quebec   N13-01312   SRR5239201  SAL_QA5780AA_AS
10         Quebec   N13-01311   SRR5239213  SAL_QA6400AA_AS
11         Quebec   N13-01348   SRR5241832  SAL_QA5385AA_AS
12         Quebec   N13-01330   SRR5241836  SAL_QA5389AA_AS
13         Quebec   N13-01323   SRR5241839  SAL_QA5815AA_AS


In [23]:
# Copy the EnteroBase assemblies stored as *.fna into fasta_ready/, renamed to
# <region>_<name>_<accession_no>.fna. The source file is looked up by its
# assembly barcode; assemblies stored under another extension are not matched here.

src_folder = "/data/genomes/enterobase_fasta/"
dst_folder = "/data/genomes/fasta_ready/"
# Do not rely on an earlier cell having created the destination folder.
os.makedirs(dst_folder, exist_ok=True)

renamed = []

for acc, region, name, barcode in df_enterobase[
    ["accession_no", "region", "name", "assembly_barcode"]
].itertuples(index=False, name=None):
    src_pattern = os.path.join(src_folder, f"{barcode}*.fna")
    # glob returns matches in arbitrary order; sorting makes the chosen file
    # the same on every run.
    files = sorted(glob.glob(src_pattern))

    if not files:
        # Name the extension searched: the assembly may exist in another format.
        print(f"[WARN] No *.fna file found for {barcode}")
        continue

    if len(files) > 1:
        # Report the ambiguity instead of silently picking one of several files.
        print(f"[WARN] {len(files)} *.fna files match {barcode}; using {files[0]}")

    src_file = files[0]

    dst_file = os.path.join(dst_folder, f"{region}_{name}_{acc}.fna")

    shutil.copy(src_file, dst_file)
    renamed.append(dst_file)
    print(f"[OK] {src_file} → {dst_file}")

print("\nTotal FASTA renamed:", len(renamed))


[OK] /home/mliva/data/genomes/enterobase_fasta/SAL_KA3404AA_AS_genomic.fna → /home/mliva/data/genomes/fasta_ready/Alberta_EC20090641_SRR1183981.fna
[OK] /home/mliva/data/genomes/enterobase_fasta/SAL_KA3468AA_AS_genomic.fna → /home/mliva/data/genomes/fasta_ready/Alberta_EC20090698_SRR1183982.fna
[WARN] No FASTA found for SAL_QA6543AA_AS
[WARN] No FASTA found for SAL_QA5373AA_AS
[WARN] No FASTA found for SAL_QA6290AA_AS
[OK] /home/mliva/data/genomes/enterobase_fasta/SAL_KA3477AA_AS_genomic.fna → /home/mliva/data/genomes/fasta_ready/Ontario_EC20090332_SRR1183990.fna
[OK] /home/mliva/data/genomes/enterobase_fasta/SAL_KA3478AA_AS_genomic.fna → /home/mliva/data/genomes/fasta_ready/Ontario_EC20090193_SRR1183988.fna
[OK] /home/mliva/data/genomes/enterobase_fasta/SAL_KA3433AA_AS_genomic.fna → /home/mliva/data/genomes/fasta_ready/Ontario_EC20100130_SRR1183993.fna
[OK] /home/mliva/data/genomes/enterobase_fasta/SAL_KA3479AA_AS_genomic.fna → /home/mliva/data/genomes/fasta_ready/Ontario_EC20090135_S

In [24]:
# Copy the EnteroBase assemblies stored as *.fasta into fasta_ready/, renamed to
# <region>_<name>_<accession_no>.fna. The source file is looked up by its
# assembly barcode, and the copy gets the .fna extension used by the other
# files in fasta_ready/.

src_folder = "/data/genomes/enterobase_fasta/"
dst_folder = "/data/genomes/fasta_ready/"
# Do not rely on an earlier cell having created the destination folder.
os.makedirs(dst_folder, exist_ok=True)

renamed = []

for acc, region, name, barcode in df_enterobase[
    ["accession_no", "region", "name", "assembly_barcode"]
].itertuples(index=False, name=None):
    src_pattern = os.path.join(src_folder, f"{barcode}*.fasta")
    # glob returns matches in arbitrary order; sorting makes the chosen file
    # the same on every run.
    files = sorted(glob.glob(src_pattern))

    if not files:
        # Name the extension searched: the assembly may exist in another format.
        print(f"[WARN] No *.fasta file found for {barcode}")
        continue

    if len(files) > 1:
        # Report the ambiguity instead of silently picking one of several files.
        print(f"[WARN] {len(files)} *.fasta files match {barcode}; using {files[0]}")

    src_file = files[0]

    dst_file = os.path.join(dst_folder, f"{region}_{name}_{acc}.fna")

    shutil.copy(src_file, dst_file)
    renamed.append(dst_file)
    print(f"[OK] {src_file} → {dst_file}")

print("\nTotal FASTA renamed:", len(renamed))


[WARN] No FASTA found for SAL_KA3404AA_AS
[WARN] No FASTA found for SAL_KA3468AA_AS
[OK] /home/mliva/data/genomes/enterobase_fasta/SAL_QA6543AA_AS.scaffold.fasta → /home/mliva/data/genomes/fasta_ready/New Brunswick_N13-02934_SRR5241846.fna
[OK] /home/mliva/data/genomes/enterobase_fasta/SAL_QA5373AA_AS.scaffold.fasta → /home/mliva/data/genomes/fasta_ready/New Brunswick_N13-02944_SRR5241820.fna
[OK] /home/mliva/data/genomes/enterobase_fasta/SAL_QA6290AA_AS.scaffold.fasta → /home/mliva/data/genomes/fasta_ready/New Brunswick_N13-02946_SRR5241852.fna
[WARN] No FASTA found for SAL_KA3477AA_AS
[WARN] No FASTA found for SAL_KA3478AA_AS
[WARN] No FASTA found for SAL_KA3433AA_AS
[WARN] No FASTA found for SAL_KA3479AA_AS
[OK] /home/mliva/data/genomes/enterobase_fasta/SAL_QA5780AA_AS.scaffold.fasta → /home/mliva/data/genomes/fasta_ready/Quebec_N13-01312_SRR5239201.fna
[OK] /home/mliva/data/genomes/enterobase_fasta/SAL_QA6400AA_AS.scaffold.fasta → /home/mliva/data/genomes/fasta_ready/Quebec_N13-013

In [26]:
# Count the *.fna files sitting directly in the fasta_ready folder.

file = Path("/home/mliva/data/genomes/fasta_ready/")
# Glob on the Path bound just above, so the cell does not depend on a name
# left over from an earlier kernel session.
file_counts = len(list(file.glob("*.fna")))
print(file_counts)

24
