In [18]:
import pandas as pd
import os
import glob
from Bio import SeqIO

### Filter 1 - Selecting sequences that have the essential signatures

In [None]:
essential_signatures = {
    "SIAE": "PF03629",
    "kpsD": "PF02563",
    "kpsE": "PTHR32309",
    "kpsM": "PF01061",
    "kpsT": "cd03220",
    "lic3A": "PF06002",
    "lic3B": "PF06002",
    "lst": "PF07922",
    "cpsK": "PF07922",
    "nagA": "cd00854",
    "nagB": "cd01399",
    "nanA": "PTHR42849",
    "nanC": "PF06178",
    "nanE": "PF04131",
    "nanH": "cd15482",
    "nanK": "PTHR18964",
    "nanM": "PF24996",
    "nanQ": "PF04074",
    "nanR": "cd07377",
    "nanT": "cd17316",
    "nanU": "cd08977",
    "neuA": "cd02513",
    "neuB": "cd11615",
    "neuC": "TIGR03568",
    "neuD": "cd03360",
    "neuE": "PF20471",
    "neuO": "cd04647",
    "neuS": "PF07388",
    "ompC": "PF00267",
    "ompF": "PF00267",
    "satA": "PTHR30290",
    "satB": "PTHR43163",
    "satC": "PTHR43297",
    "satD": "PTHR43776",
    "siaM": "PF06808",
    "siaT": "PF06808",
    "siaP": "PF03480",
    "siaQ": "PF04290"  
}

In [None]:
# Function to parse InterProScan output

def read_interproscan_tsv(file):
    columns = [
        "seq_id", "md5", "length", "analysis", "signature_accession",
        "signature_desc", "start", "end", "evalue", "status",
        "date", "ipr", "ipr_desc", "go", "pathway"
    ]
    try:
        df = pd.read_csv(file, sep="\t", header=None, names=columns, usecols=[0, 4])
        return df
    except Exception as e:
        print(f"Error reading {file}: {e}")
        return pd.DataFrame(columns=["seq_id", "signature_accession"])

In [None]:
# Identifying IDs that contain the essential signature

interpro_folder = "./interpro_outputs"
output_folder = "./IDs_filter1"

os.makedirs(output_folder, exist_ok=True)

for gene, signature in essential_signatures.items():
    print(f"\n=== Processing {gene} (signature: {signature}) ===")

    interpro_file = f"{gene}_interproscan.tsv"
    interpro_path = os.path.join(interpro_folder, interpro_file)

    if not os.path.exists(interpro_path):
        print(f"[ERROR] File not found: {interpro_path}. Skipping...")
        continue

    df = read_interproscan_tsv(interpro_path)

    if df.empty:
        print(f"[WARNING] Empty file for {gene}.")
        continue

    # --- FILTER BY SIGNATURE ---
    df_filtered = df[df["signature_accession"] == signature]

    # --- REMOVE DUPLICATES ---
    # Deduplicate only by seq_id
    df_filtered = df_filtered.drop_duplicates(subset=["seq_id"])

    # Output file name
    output_file = os.path.join(
        output_folder,
        f"{gene}_IDs_filter1.tsv"
    )

    df_filtered[["seq_id"]].to_csv(output_file, sep="\t", index=False, header=True)

    print(f"Rows found (after deduplication): {len(df_filtered)}")
    print(f"File saved to: {output_file}")


=== Processando SIAE (assinatura: PF03629) ===
Linhas encontradas (ap√≥s deduplica√ß√£o): 113516
Arquivo salvo em: ./IDs_filtro1/SIAE_IDs_filtro1.tsv

=== Processando kpsD (assinatura: PF02563) ===
Linhas encontradas (ap√≥s deduplica√ß√£o): 12824
Arquivo salvo em: ./IDs_filtro1/kpsD_IDs_filtro1.tsv

=== Processando kpsE (assinatura: PTHR32309) ===
Linhas encontradas (ap√≥s deduplica√ß√£o): 10297
Arquivo salvo em: ./IDs_filtro1/kpsE_IDs_filtro1.tsv

=== Processando kpsM (assinatura: PF01061) ===
Linhas encontradas (ap√≥s deduplica√ß√£o): 1045
Arquivo salvo em: ./IDs_filtro1/kpsM_IDs_filtro1.tsv

=== Processando kpsT (assinatura: cd03220) ===
Linhas encontradas (ap√≥s deduplica√ß√£o): 12389
Arquivo salvo em: ./IDs_filtro1/kpsT_IDs_filtro1.tsv

=== Processando lic3A (assinatura: PF06002) ===
Linhas encontradas (ap√≥s deduplica√ß√£o): 1394
Arquivo salvo em: ./IDs_filtro1/lic3A_IDs_filtro1.tsv

=== Processando lic3B (assinatura: PF06002) ===
Linhas encontradas (ap√≥s deduplica√ß√£o): 3
Arq

In [None]:
# Retrieving only InterProScan output entries corresponding to sequences
# that contain the essential signature
# Test code for a single protein 
# Optional

# file paths
ids_file = "./interpro_with_essential_signature/cpsK_essential_signature.tsv"     # left
data_file = "./interpro_outputs/cpsK_interproscan.tsv"                            # right
output_file = "cpsK_interpro_filter1.tsv"

# load tables
df_ids = pd.read_csv(ids_file, sep="\t")                    # has header
df_data = pd.read_csv(data_file, sep="\t", header=None)     # no header

# name of the 1st column of df_ids
ids_column = df_ids.columns[0]

# use the first column of df_data (column 0)
df_filtered = df_data[df_data[0].isin(df_ids[ids_column])]

# save
df_filtered.to_csv(output_file, sep="\t", index=False, header=False)

In [None]:
# Retrieving only InterProScan output entries corresponding to sequences
# that contain the essential signature
# Expanded code for all proteins

# === DIRECTORIES ===
signatures_folder = "./IDs_filter1"
interpro_folder = "./interpro_outputs"        # where *_interproscan.tsv are stored
output_folder = "./interpro_outputs_filter1"

os.makedirs(output_folder, exist_ok=True)

# === LOOP FOR ALL SIGNATURE FILES ===
for sig_file in glob.glob(os.path.join(signatures_folder, "*_IDs_filter1.tsv")):
    
    # extract gene name from filename
    gene = os.path.basename(sig_file).replace("_IDs_filter1.tsv", "")
    
    # corresponding InterProScan file
    interpro_file = os.path.join(interpro_folder, f"{gene}_interproscan.tsv")
    
    # if missing, skip
    if not os.path.exists(interpro_file):
        print(f"[WARNING] InterProScan file not found for {gene}")
        continue
    
    print(f"Processing {gene}...")
    
    # load files
    df_ids = pd.read_csv(sig_file, sep="\t")                    # has header
    df_inter = pd.read_csv(interpro_file, sep="\t", header=None)  # no header
    
    # first column of ID file
    ids_column = df_ids.columns[0]
    
    # filtering
    df_filtered = df_inter[df_inter[0].isin(df_ids[ids_column])]
    
    # save
    out = os.path.join(output_folder, f"{gene}_interproscan_filter1.tsv")
    df_filtered.to_csv(out, sep="\t", index=False, header=False)
    
print("\n### PROCESSING COMPLETED ###")

Processando neuE...
Processando nanU...
Processando neuB...
Processando nanM...
Processando satA...
Processando neuO...
Processando nanT...
Processando satB...
Processando siaP...
Processando kpsT...
Processando ompC...
Processando siaQ...
Processando kpsE...
Processando nagA...
Processando kpsD...
Processando siaM...
Processando nanR...
Processando nanE...
Processando nagB...
Processando lic3B...
Processando satC...
Processando lst...
Processando neuS...
Processando nanH...
Processando neuD...
Processando kpsM...
Processando ompF...
Processando SIAE...
Processando nanA...
Processando satD...
Processando neuC...
Processando lic3A...
Processando siaT...
Processando nanQ...
Processando cpsK...
Processando nanK...
Processando nanC...
Processando neuA...

### PROCESSAMENTO CONCLU√çDO ###


### Filter 2 - Selecting IDs that have the essential signature and no extra signatures 
Extra signatures are understood to be those that are not present for any of the references

#### Assembling and verifying the code

In [None]:
# ----------------------
# Main comparison function
# ----------------------
def compare_with_reference(gene_file, ref_file):

    # 1) Get gene name from file name (e.g., cpsK_interproscan_filtro1.tsv ‚Üí cpsK)
    filename = os.path.basename(gene_file)
    gene = filename.split("_")[0]   # assumes format "gene_interproscan_filtro1.tsv"
    print(f"Processing gene: {gene}")

    # 2) Read gene InterProScan file
    df_gene = read_interproscan_tsv(gene_file)

    # 3) Read reference file
    df_ref = pd.read_csv(ref_file)

    # 4) Filter references for this gene
    df_ref_gene = df_ref[df_ref["Gene"] == gene]

    if df_ref_gene.empty:
        print(f"No reference found for {gene}.")
        return None

    # 5) Collect all signature_accession used in references
    ref_signatures = set(df_ref_gene["signature_accession"].dropna().unique())
    print(f"Reference signatures ({gene}): {ref_signatures}")

    # 6) Find suspicious signatures (not present in references)
    suspicious_ids = (
        df_gene[
            ~df_gene["signature_accession"].isin(ref_signatures)
        ]["seq_id"]
        .unique()
    )

    print(f"Suspicious IDs: {suspicious_ids}")

    # 7) Create output dataframe
    df_out = df_gene[df_gene["seq_id"].isin(suspicious_ids)]

    # 8) Save
    out = f"{gene}_suspicious_IDs.csv"
    df_out.to_csv(out, index=False)
    print(f"Generated file: {out}")

    return df_out

In [None]:
compare_with_reference(
    "interpro_outputs_filter1/cpsK_interproscan_filter1.tsv",
    "interproscan_ref_summary.csv"
)

Processando gene: cpsK
Signatures de refer√™ncia (cpsK): {'G3DSA:3.40.50.11110', 'PF07922', 'G3DSA:3.30.370.20'}
IDs suspeitos: ['cpsK_50_uniprot' 'cpsK_57_uniprot']
Arquivo gerado: cpsK_IDs_suspeitos.csv


Unnamed: 0,seq_id,signature_accession
16,cpsK_50_uniprot,G3DSA:3.40.50.11110
17,cpsK_50_uniprot,PF07922
18,cpsK_50_uniprot,Coil
31,cpsK_57_uniprot,PF07922
32,cpsK_57_uniprot,Coil
33,cpsK_57_uniprot,G3DSA:3.40.50.11110


In [None]:
# cpsK - identified sequences with unexpected disordered regions

#### Expanded code for all proteins

In [None]:
interpro_folder = "./interpro_outputs_filter1"
ref_file = "./interproscan_ref_summary.csv"
output_folder = "./suspect_IDs"

os.makedirs(output_folder, exist_ok=True)

# Read reference file
df_ref = pd.read_csv(ref_file)

# Process all .tsv files
files = [f for f in os.listdir(interpro_folder) if f.endswith(".tsv")]

print(f"Found {len(files)} files to process.")

for file in files:
    path = os.path.join(interpro_folder, file)
    gene = file.split("_")[0]

    print(f"\n=== Processing gene: {gene} ===")

    df_gene = read_interproscan_tsv(path)
    df_ref_gene = df_ref[df_ref["Gene"] == gene]

    if df_ref_gene.empty:
        print(f"No reference found for {gene}. Skipping...")
        continue

    signatures_ref = set(df_ref_gene["signature_accession"].dropna().astype(str).unique())

    df_gene["signature_accession"] = df_gene["signature_accession"].astype(str)

    suspect_ids = df_gene[
        ~df_gene["signature_accession"].isin(signatures_ref)
    ]["seq_id"].unique()

    df_output = df_gene[df_gene["seq_id"].isin(suspect_ids)]

    outpath = os.path.join(output_folder, f"{gene}_suspect_IDs.tsv")
    df_output[["seq_id"]].drop_duplicates().to_csv(
        outpath,
        sep="\t",
        index=False
    )

    print(f"File generated: {outpath} ({len(df_output)} rows)")

print("\nProcessing completed! üëç")

Encontrados 38 arquivos para processar.

=== Processando gene: siaQ ===
Arquivo gerado: ./IDs_suspeitos/siaQ_IDs_suspeitos.tsv (31541 linhas)

=== Processando gene: lst ===
Arquivo gerado: ./IDs_suspeitos/lst_IDs_suspeitos.tsv (112 linhas)

=== Processando gene: ompF ===
Arquivo gerado: ./IDs_suspeitos/ompF_IDs_suspeitos.tsv (9383 linhas)

=== Processando gene: neuC ===
Arquivo gerado: ./IDs_suspeitos/neuC_IDs_suspeitos.tsv (11942 linhas)

=== Processando gene: neuE ===
Arquivo gerado: ./IDs_suspeitos/neuE_IDs_suspeitos.tsv (2 linhas)

=== Processando gene: nanC ===
Arquivo gerado: ./IDs_suspeitos/nanC_IDs_suspeitos.tsv (10536 linhas)

=== Processando gene: siaT ===
Arquivo gerado: ./IDs_suspeitos/siaT_IDs_suspeitos.tsv (25967 linhas)

=== Processando gene: satC ===
Arquivo gerado: ./IDs_suspeitos/satC_IDs_suspeitos.tsv (9614 linhas)

=== Processando gene: neuB ===
Arquivo gerado: ./IDs_suspeitos/neuB_IDs_suspeitos.tsv (269311 linhas)

=== Processando gene: nanA ===
Arquivo gerado: ./I

In [None]:
# S√≥ contar quantos IDs tem em cada arquivo de IDs_suspeitos

def contar_ids_unicos(pasta):
    arquivos = [f for f in os.listdir(pasta) if f.endswith(".tsv")]

    resultados = []

    for arq in arquivos:
        caminho = os.path.join(pasta, arq)

        # L√™ o CSV
        df = pd.read_csv(caminho)

        # Conta IDs √∫nicos
        ids_unicos = df["seq_id"].nunique()

        resultados.append({
            "arquivo": arq,
            "ids_unicos": ids_unicos
        })

    # Retorna resultado como DataFrame
    return pd.DataFrame(resultados)


# ===== Executar =====
resultado = contar_ids_unicos("./IDs_suspeitos")
print(resultado)

                    arquivo  ids_unicos
0    siaP_IDs_suspeitos.tsv      230859
1    kpsT_IDs_suspeitos.tsv         944
2   lic3A_IDs_suspeitos.tsv         116
3    nagB_IDs_suspeitos.tsv       55208
4    nanT_IDs_suspeitos.tsv        4038
5    nanE_IDs_suspeitos.tsv        1959
6    satB_IDs_suspeitos.tsv          79
7   lic3B_IDs_suspeitos.tsv           0
8    siaM_IDs_suspeitos.tsv       17296
9    neuC_IDs_suspeitos.tsv        1513
10   nanH_IDs_suspeitos.tsv       33477
11   nanC_IDs_suspeitos.tsv        2398
12   nanM_IDs_suspeitos.tsv        1053
13   kpsD_IDs_suspeitos.tsv        9886
14   neuO_IDs_suspeitos.tsv          12
15   ompF_IDs_suspeitos.tsv         596
16   neuS_IDs_suspeitos.tsv           0
17   nanQ_IDs_suspeitos.tsv        1023
18   ompC_IDs_suspeitos.tsv       12179
19   nanA_IDs_suspeitos.tsv       17160
20   nagA_IDs_suspeitos.tsv        8986
21   SIAE_IDs_suspeitos.tsv       53340
22   nanU_IDs_suspeitos.tsv      123177
23   satA_IDs_suspeitos.tsv        3816


### Compiling FASTA files

In [None]:
# Code to identify which IDs are present in IDs_filter1 but not in suspect_IDs
# In other words, retrieve IDs of sequences that contain the essential signature
# and remove IDs of sequences that contain additional signatures

# Writing the code for a single protein set

# Read files (automatically detects separator)
df_filter1 = pd.read_csv("./IDs_filter1/cpsK_IDs_filter1.tsv", sep=None, engine="python")
df_suspects = pd.read_csv("./suspect_IDs/cpsK_suspect_IDs.csv", sep=None, engine="python")

# Extract ID lists
ids_filter1 = set(df_filter1["seq_id"])
ids_suspects = set(df_suspects["seq_id"])

# IDs present in filter1 but not in suspects
unique_ids = sorted(ids_filter1 - ids_suspects)

# Save as txt (one ID per line)
with open("cpsK_exclusive_IDs.txt", "w") as f:
    for seqid in unique_ids:
        f.write(seqid + "\n")

print("File generated: cpsK_exclusive_IDs.txt")

Arquivo gerado: cpsK_IDs_exclusivos.txt


In [None]:
# Expanded code to process all files to be curated

dir_filter1 = "IDs_filter1"
dir_suspects = "IDs_suspects"
dir_output = "IDs_curated"

os.makedirs(dir_output, exist_ok=True)

# List files in directories
files_filter1 = {f.split("_")[0]: f for f in os.listdir(dir_filter1)}
files_suspects = {f.split("_")[0]: f for f in os.listdir(dir_suspects)}

genes = sorted(set(files_filter1.keys()) & set(files_suspects.keys()))

for gene in genes:
    file_filter1 = os.path.join(dir_filter1, files_filter1[gene])
    file_suspects = os.path.join(dir_suspects, files_suspects[gene])

    # Read files (automatic separator detection)
    df_filter1 = pd.read_csv(file_filter1, engine="python")
    df_suspects = pd.read_csv(file_suspects, engine="python")

    # Extract IDs
    ids_filter1 = set(df_filter1["seq_id"])
    ids_suspects = set(df_suspects["seq_id"])

    # Difference
    unique_ids = sorted(ids_filter1 - ids_suspects)

    # Save output file
    out_file = os.path.join(dir_output, f"{gene}_curated_IDs.tsv")
    df_out = pd.DataFrame({"seq_id": unique_ids})
    df_out.to_csv(out_file, sep="\t", index=False, header=True)

    print(f"{gene}: {len(unique_ids)} curated IDs saved.")

print("\n‚úîÔ∏è Process completed!")

SIAE: 60176 IDs curados salvos.
cpsK: 31 IDs curados salvos.
kpsD: 2938 IDs curados salvos.
kpsE: 6598 IDs curados salvos.
kpsM: 1012 IDs curados salvos.
kpsT: 11445 IDs curados salvos.
lic3A: 1278 IDs curados salvos.
lic3B: 3 IDs curados salvos.
lst: 2023 IDs curados salvos.
nagA: 114823 IDs curados salvos.
nagB: 85856 IDs curados salvos.
nanA: 14001 IDs curados salvos.
nanC: 1029 IDs curados salvos.
nanE: 36663 IDs curados salvos.
nanH: 22950 IDs curados salvos.
nanK: 11298 IDs curados salvos.
nanM: 4458 IDs curados salvos.
nanQ: 11433 IDs curados salvos.
nanR: 10517 IDs curados salvos.
nanT: 4172 IDs curados salvos.
nanU: 27378 IDs curados salvos.
neuA: 22064 IDs curados salvos.
neuB: 31211 IDs curados salvos.
neuC: 19190 IDs curados salvos.
neuD: 13279 IDs curados salvos.
neuE: 11 IDs curados salvos.
neuO: 114 IDs curados salvos.
neuS: 22 IDs curados salvos.
ompC: 5201 IDs curados salvos.
ompF: 10057 IDs curados salvos.
satA: 922 IDs curados salvos.
satB: 4631 IDs curados salvos.
s

In [None]:
# Extracting FASTA sequences with essential signatures

dir_fastas = "merged_nr_fastas"                  # Original FASTA files
dir_ids = "IDs_filter1"                          # TSV files with approved seq_id
dir_output = "FASTAS_with_essential_signature"   # Final FASTA files

os.makedirs(dir_output, exist_ok=True)

# Map files by gene (before the first "_")
fastas = {f.split("_")[0]: f for f in os.listdir(dir_fastas) if f.endswith(".fasta")}
idfiles = {f.split("_")[0]: f for f in os.listdir(dir_ids) if f.endswith(".tsv")}

# Genes in common
genes = sorted(set(fastas.keys()) & set(idfiles.keys()))

print(f"Processing {len(genes)} genes...\n")

for gene in genes:

    fasta_path = os.path.join(dir_fastas, fastas[gene])
    ids_path = os.path.join(dir_ids, idfiles[gene])

    print(f"‚Üí {gene}")

    # Read approved IDs
    df = pd.read_csv(ids_path, sep="\t")
    approved_ids = set(df["seq_id"].astype(str))

    # Read original FASTA
    records = list(SeqIO.parse(fasta_path, "fasta"))

    # Filter sequences whose header is in the list
    filtered = [
        r for r in records
        if r.id in approved_ids
    ]

    # Output file
    out_fasta = os.path.join(dir_output, f"{gene}_with_essential_signature.fasta")

    SeqIO.write(filtered, out_fasta, "fasta")

    print(f"   ‚úî {len(filtered)} sequences saved to {out_fasta}")

print("\n‚úî Finished!")

Processando 38 genes...

‚Üí SIAE
   ‚úî 113516 sequ√™ncias salvas em FASTAS_with_essencial_signature/SIAE_with_essencial_signature.fasta
‚Üí cpsK
   ‚úî 33 sequ√™ncias salvas em FASTAS_with_essencial_signature/cpsK_with_essencial_signature.fasta
‚Üí kpsD
   ‚úî 12824 sequ√™ncias salvas em FASTAS_with_essencial_signature/kpsD_with_essencial_signature.fasta
‚Üí kpsE
   ‚úî 10297 sequ√™ncias salvas em FASTAS_with_essencial_signature/kpsE_with_essencial_signature.fasta
‚Üí kpsM
   ‚úî 1045 sequ√™ncias salvas em FASTAS_with_essencial_signature/kpsM_with_essencial_signature.fasta
‚Üí kpsT
   ‚úî 12389 sequ√™ncias salvas em FASTAS_with_essencial_signature/kpsT_with_essencial_signature.fasta
‚Üí lic3A
   ‚úî 1394 sequ√™ncias salvas em FASTAS_with_essencial_signature/lic3A_with_essencial_signature.fasta
‚Üí lic3B
   ‚úî 3 sequ√™ncias salvas em FASTAS_with_essencial_signature/lic3B_with_essencial_signature.fasta
‚Üí lst
   ‚úî 2057 sequ√™ncias salvas em FASTAS_with_essencial_signature/lst_with_

In [None]:
# Extracting FASTA sequences with essential signatures and without any extra signatures

dir_fastas = "merged_nr_fastas"        # Original FASTA files
dir_ids = "IDs_cured"                  # TSV files with approved seq_id
dir_output = "FASTAS_cured"            # Final FASTA files

os.makedirs(dir_output, exist_ok=True)

# Map files by gene (before the first "_")
fastas = {f.split("_")[0]: f for f in os.listdir(dir_fastas) if f.endswith(".fasta")}
idfiles = {f.split("_")[0]: f for f in os.listdir(dir_ids) if f.endswith(".tsv")}

# Genes in common
genes = sorted(set(fastas.keys()) & set(idfiles.keys()))

print(f"Processing {len(genes)} genes...\n")

for gene in genes:

    fasta_path = os.path.join(dir_fastas, fastas[gene])
    ids_path = os.path.join(dir_ids, idfiles[gene])

    print(f"‚Üí {gene}")

    # Read approved IDs
    df = pd.read_csv(ids_path, sep="\t")
    approved_ids = set(df["seq_id"].astype(str))

    # Read original FASTA
    records = list(SeqIO.parse(fasta_path, "fasta"))

    # Filter sequences whose header is in the list
    filtered = [
        r for r in records
        if r.id in approved_ids
    ]

    # Output file
    out_fasta = os.path.join(dir_output, f"{gene}_cured.fasta")

    SeqIO.write(filtered, out_fasta, "fasta")

    print(f"   ‚úî {len(filtered)} sequences saved to {out_fasta}")

print("\n‚úî Finished!")

Processando 38 genes...

‚Üí SIAE
   ‚úî 60176 sequ√™ncias salvas em FASTAS_curados/SIAE_curado.fasta
‚Üí cpsK
   ‚úî 31 sequ√™ncias salvas em FASTAS_curados/cpsK_curado.fasta
‚Üí kpsD
   ‚úî 2938 sequ√™ncias salvas em FASTAS_curados/kpsD_curado.fasta
‚Üí kpsE
   ‚úî 6598 sequ√™ncias salvas em FASTAS_curados/kpsE_curado.fasta
‚Üí kpsM
   ‚úî 1012 sequ√™ncias salvas em FASTAS_curados/kpsM_curado.fasta
‚Üí kpsT
   ‚úî 11445 sequ√™ncias salvas em FASTAS_curados/kpsT_curado.fasta
‚Üí lic3A
   ‚úî 1278 sequ√™ncias salvas em FASTAS_curados/lic3A_curado.fasta
‚Üí lic3B
   ‚úî 3 sequ√™ncias salvas em FASTAS_curados/lic3B_curado.fasta
‚Üí lst
   ‚úî 2023 sequ√™ncias salvas em FASTAS_curados/lst_curado.fasta
‚Üí nagA
   ‚úî 114823 sequ√™ncias salvas em FASTAS_curados/nagA_curado.fasta
‚Üí nagB
   ‚úî 85856 sequ√™ncias salvas em FASTAS_curados/nagB_curado.fasta
‚Üí nanA
   ‚úî 14001 sequ√™ncias salvas em FASTAS_curados/nanA_curado.fasta
‚Üí nanC
   ‚úî 1029 sequ√™ncias salvas em FASTAS_curados/na