<a href="https://colab.research.google.com/github/Palaeoprot/Bear/blob/main/Compile_FASTAs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook uses the EMBL-EBI Job Dispatcher, described in the paper "The EMBL-EBI Job Dispatcher sequence analysis tools framework in 2024":

Madeira F et al
Nucleic Acids Research, 01 Jul 2024, 52(W1):W521-W525
https://doi.org/10.1093/nar/gkae241


The **EMBL-EBI Job Dispatcher** sequence analysis tools framework (https://www.ebi.ac.uk/jdispatcher) enables the scientific community to perform a diverse range of sequence analyses using popular bioinformatics applications. Free access to the tools and required sequence datasets is provided through user-friendly web applications, as well as via RESTful and SOAP-based APIs. These are integrated into popular EMBL-EBI resources such as UniProt, InterPro, ENA and Ensembl Genomes. This paper overviews recent improvements to Job Dispatcher, including its brand new website and documentation, enhanced visualisations, improved job management, and a rising trend of user reliance on the service from low- and middle-income regions.

Documentation: https://www.uniprot.org/help/api
WEBSITE_API = "https://rest.uniprot.org/"

Documentation: https://www.ebi.ac.uk/proteins/api/doc/
PROTEINS_API = "https://www.ebi.ac.uk/proteins/api"

In [None]:
!pip install biopython  # Install the Biopython library

In [None]:
# Mount Google Drive at /content/drive; the output folders used by this
# notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

import requests, sys, json, os, time
import pandas as pd
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets

# Base URL of the UniProt REST API.
# Documentation: https://www.uniprot.org/help/api
# NOTE: the value ends with "/", so appending a path that itself starts with
# "/" produces a double slash in the request URL.
WEBSITE_API = "https://rest.uniprot.org/"

# Collect sequences
The cells below fetch data, isolate chains, remove duplicates, submit alignment jobs, check whether aligned files already exist, and handle genes with only a single sequence.

 [NCBI Taxonomy Browser](https://www.ncbi.nlm.nih.gov/guide/taxonomy/)



In [None]:
#@title ##Select taxonomy and genes
import os
import requests
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML

# Define the taxonomy IDs and their names
# The numeric IDs are inserted into the NCBI Taxonomy Browser URL and into the
# UniProt `taxonomy_id:` query; the names label the selection buttons and
# become part of the output folder name ("Fasta_<name>").
taxonomy_dict = {
    33554: "Carnivora",
    379584: "Caniformia",
    379583: "Feliformia",
    9632: "Ursidae",
    9615 : "Canis lupus familiaris"
}

# Global variable to store selected taxonomy ID
# Stays None until a taxonomy button is clicked; the later cells raise a
# ValueError while it is still None.
selected_taxonomy_id = None

# List of gene names
# One checkbox is created per entry (ticked by default); only the entries that
# are still ticked when the fetch cell runs are queried.
gene_names = [
    "ALB", "AEBP1", "AHSG", "ALPL", "APOA1", "APOA4", "APOE", "APOC1", "APP", "ASPN",
    "BGLAP", "BGN", "C3", "C8B", "C9", "CFH", "CHAD", "CHGA", "CLEC11A", "CLEC3B",
    "CLU", "COL10A1", "COL11A1", "COL11A2", "COL12A1", "COL16A1", "COL1A1", "COL1A2",
    "COL21A1", "COL22A1", "COL24A1", "COL2A1", "COL3A1", "COL4A3", "COL4A4", "COL4A5",
    "COL5A1", "COL5A2", "COL5A3", "COL6A1", "COL6A3", "COL8A1", "CRP", "DCN", "DPT",
    "EEF1A1", "EMILIN1", "EZR", "F10", "F2", "F7", "F9", "FGL2", "FMOD", "FN1",
    "GAPDH", "GAS6", "GC", "HAPLN3", "HSP90B1", "HSPA5", "HTRA1", "IBSP", "IGF1",
    "IGF2", "IGFALS", "IGFBP1", "KNG1", "KRT2", "LOX", "LRRC15", "LUM", "MGP",
    "MMP2", "MSN", "MYO1B", "NUCB1", "NUCB2", "OGN", "OLFML1", "OLFML3", "OMD",
    "P4HB", "PAM", "PCOLCE", "PHOSPHO1", "POSTN", "PROC", "PROS1", "PRSS2",
    "SERPINC1", "SERPIND1", "SERPINF1", "SLC8A3", "SPARC", "SPARCL1", "SPP1",
    "SPP2", "TGFB1", "THBS1", "TNC", "TPP1", "TUBA1B", "VCAN", "VIT", "VTN"
]

# Function to create the output directory based on taxonomy name
def create_output_directory(
    taxonomy_dict,
    taxonomy_id,
    base_dir="/content/drive/MyDrive/7Papers/26_Ursus_abstrusus/Ancient_Bear/Ancient_Bear_Analysis/FASTAs/Computational_FASTAs",
):
    """Create (if needed) and return the output folder for one taxonomy.

    Args:
        taxonomy_dict: Mapping of taxonomy ID to taxonomy name.
        taxonomy_id: ID to look up. An ID missing from the mapping uses the
            folder name "Fasta_Unknown_Taxonomy".
        base_dir: Parent folder that receives the "Fasta_<name>" folder.
            Defaults to the project's Google Drive location.

    Returns:
        The folder path as a string, ending with a path separator.
    """
    name = taxonomy_dict.get(taxonomy_id, "Unknown_Taxonomy")
    # The empty last component makes os.path.join append the trailing
    # separator that callers of this function have always received.
    output_dir = os.path.join(base_dir, f"Fasta_{name}", "")
    os.makedirs(output_dir, exist_ok=True)
    return output_dir

# Function to create checkboxes for gene names
def create_gene_checkboxes(gene_names):
    """Return a dict mapping each gene name to a new, initially ticked Checkbox."""
    checkbox_dict = {}
    for gene in gene_names:
        # The gene name doubles as the checkbox label
        checkbox_dict[gene] = widgets.Checkbox(value=True, description=gene)
    return checkbox_dict

# Function to select or deselect all checkboxes
def select_all(checkbox_dict, value):
    """Set the ``value`` attribute of every checkbox in ``checkbox_dict`` to ``value``."""
    for gene in checkbox_dict:
        checkbox_dict[gene].value = value
# Create a function to generate a button for each taxonomy ID
def create_buttons(taxonomy_dict):
    """Build one selection button per taxonomy entry.

    Clicking a button stores its taxonomy ID in the global
    ``selected_taxonomy_id``, writes ``taxonomy_id.txt`` into that taxonomy's
    output folder and redraws the cell output with the clicked button
    highlighted.

    The click handler reads the module-level ``hbox``, ``select_all_button``,
    ``deselect_all_button`` and ``gene_grid``, so these must exist before a
    button is clicked.

    Returns:
        A list of ``widgets.Button`` in the iteration order of ``taxonomy_dict``.
    """
    buttons = []

    def make_click_handler(tax_id, name, button):
        # The factory gives every handler its own tax_id / name / button.
        def handle_click(_clicked):
            global selected_taxonomy_id
            selected_taxonomy_id = tax_id  # Remembered for the later cells
            output_dir = create_output_directory(taxonomy_dict, tax_id)
            url = f"https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id={tax_id}"
            with open(f"{output_dir}/taxonomy_id.txt", "w") as f:
                f.write(f"Taxonomy ID: {tax_id}\nName: {name}\nURL: {url}")

            # Redraw the whole cell output from scratch
            clear_output()
            display(HTML(f"<a href='{url}' target='_blank'>Open NCBI Taxonomy Browser to check {name} ({tax_id})</a>"))
            # Only the clicked button keeps the 'success' highlight
            for other in buttons:
                other.button_style = ''
            button.button_style = 'success'
            display(hbox)
            display(HTML("<br><h3>Gene Selection</h3>"))
            display(select_all_button, deselect_all_button)
            display(gene_grid)
            print(f"Output directory: {output_dir}")

        return handle_click

    for tax_id, name in taxonomy_dict.items():
        button = widgets.Button(
            description=name,
            button_style='',  # No highlight until the button is clicked
            tooltip=f"Open {name}",
            style={'description_width': 'initial'}
        )
        button.on_click(make_click_handler(tax_id, name, button))
        buttons.append(button)
    return buttons

# Build every widget first, then render them top to bottom.

# Taxonomy buttons, laid out in one row
buttons = create_buttons(taxonomy_dict)
hbox = widgets.HBox(buttons)

# One checkbox per gene name
gene_checkboxes = create_gene_checkboxes(gene_names)
checkbox_widgets = list(gene_checkboxes.values())

# Checkbox grid with n_cols equal-width columns
n_cols = 5
n_rows = -(-len(checkbox_widgets) // n_cols)  # ceiling division
grid_layout = widgets.Layout(grid_template_columns=f'repeat({n_cols}, 1fr)')
gene_grid = widgets.GridBox(checkbox_widgets, layout=grid_layout)

# Buttons that tick or untick every gene checkbox at once
select_all_button = widgets.Button(description="Select All", button_style='success')
deselect_all_button = widgets.Button(description="Deselect All", button_style='warning')
select_all_button.on_click(lambda _button: select_all(gene_checkboxes, True))
deselect_all_button.on_click(lambda _button: select_all(gene_checkboxes, False))

# Render: taxonomy row, heading, bulk buttons, checkbox grid
display(hbox)
display(HTML("<br><h3>Gene Selection</h3>"))
display(select_all_button, deselect_all_button)
display(gene_grid)


In [None]:
# Ensure a taxonomy has been selected before running this cell.
# globals().get() also covers the case where the selection cell was never run,
# which would otherwise surface as a NameError instead of this message.
if globals().get("selected_taxonomy_id") is None:
    raise ValueError("Please select a taxonomy ID before running this cell.")

# Helper function to download data
def get_url(url, **kwargs):
    """GET ``url`` and return the response, raising on an HTTP error status.

    Args:
        url: Address to request.
        **kwargs: Passed through to ``requests.get``. A 60-second ``timeout``
            is applied unless the caller supplies one.

    Returns:
        The ``requests.Response`` of a successful request.

    Raises:
        requests.HTTPError: For an error status; the response body is printed
            first.
        requests.RequestException: For connection failures and timeouts.
    """
    # Without a timeout requests can wait forever and leave the cell hanging
    kwargs.setdefault("timeout", 60)
    response = requests.get(url, **kwargs)
    if not response.ok:
        print(response.text)
        # Always raises when `ok` is False, so nothing after it would run
        response.raise_for_status()
    return response

# Output directory
output_dir = create_output_directory(taxonomy_dict, selected_taxonomy_id)
print(f"Output directory: {output_dir}")

# DataFrame to store the results
result_columns = ["Gene Names", "Organism", "Organism (ID)", "Taxonomic lineage", "Length", "Sequence", "Chain", "Aligned Sequence"]
df = pd.DataFrame(columns=result_columns)

# Check existing files in the output directory
existing_files = set(os.listdir(output_dir))

# Error log file
error_log_file = os.path.join(output_dir, "error_log.txt")

# Get selected gene names from checkboxes
selected_gene_names = [gene for gene, checkbox in gene_checkboxes.items() if checkbox.value]

# WEBSITE_API may end with "/"; strip it so the path is joined by one slash
search_endpoint = f"{WEBSITE_API.rstrip('/')}/uniprotkb/search"

# Rows are collected in a list and turned into a DataFrame once at the end;
# growing the DataFrame with pd.concat per row copies it on every iteration.
rows = []

# Iterate over each gene name and fetch the data
for gene_name in selected_gene_names:
    aligned_file = f"{gene_name}_aligned.fasta"
    if aligned_file in existing_files:
        print(f"Skipping {gene_name} as it is already aligned.")
        continue

    print(f"Fetching data for gene: {gene_name}")
    # NOTE(review): the gene name is a free-text search term here, so entries
    # that merely mention it may match too; a field-qualified query may be
    # what is intended — confirm.
    # The search endpoint is paginated: ask for up to 500 results per page and
    # follow the "next" link until the last page has been read.
    page_url = f"{search_endpoint}?query={gene_name} AND (taxonomy_id:{selected_taxonomy_id})&fields=gene_names,organism_name,organism_id,lineage,length,sequence,ft_chain&format=tsv&size=500"
    while page_url:
        response = get_url(page_url)

        # Strip line breaks only: a plain strip() would also remove the
        # trailing tab of a last row whose chain column is empty, leaving it
        # with 6 fields so that it is skipped. Every page starts with a header.
        lines = response.text.strip('\r\n').split('\n')[1:]
        for line in lines:
            parts = line.rstrip('\r').split('\t')
            if len(parts) != 7:
                continue
            gene, organism, organism_id, lineage, length, sequence, chain_info = parts
            # Fields shared by every row created from this entry. "Length" is
            # the value reported by UniProt, not the length of a chain slice.
            common = {
                "Gene Names": gene,
                "Organism": organism,
                "Organism (ID)": organism_id,
                "Taxonomic lineage": lineage,
                "Length": length,
                "Aligned Sequence": ""
            }
            # Check if chain_info contains the expected "CHAIN" substring
            if "CHAIN" in chain_info:
                try:
                    # One row per annotated chain; positions are 1-based inclusive
                    for chain in chain_info.split('CHAIN ')[1:]:
                        chain_start, chain_end = chain.split(';')[0].split('..')
                        truncated_sequence = sequence[int(chain_start)-1:int(chain_end)]
                        rows.append({**common, "Sequence": truncated_sequence, "Chain": f"{chain_start}..{chain_end}"})
                except ValueError:
                    # Non-numeric or missing positions: log and move on. Chains
                    # parsed before the failing one are kept.
                    with open(error_log_file, 'a') as log:
                        log.write(f"Error parsing chain_info for {gene_name}: {chain_info}\n")
                    print(f"Error parsing chain_info for {gene_name}: {chain_info}")
            else:
                # Use the whole sequence if no valid chain is found
                rows.append({**common, "Sequence": sequence, "Chain": "full_sequence"})
                print(f"Using full sequence for {gene_name}: {sequence}")

        # requests parses the Link header into response.links; no "next"
        # entry means this was the last page.
        page_url = response.links.get("next", {}).get("url")

df = pd.DataFrame(rows, columns=result_columns)

# Drop rows whose sequence already occurred earlier in the table
unique_df = df.drop_duplicates(subset=["Sequence"])

# Keep only the genes that still have at least two rows
multi_df = unique_df.groupby('Gene Names').filter(lambda group: len(group) > 1)

# Keep rows of 100+ residues, plus every row that holds a whole sequence
long_enough = multi_df["Sequence"].str.len() >= 100
whole_sequence = multi_df["Chain"] == "full_sequence"
df = multi_df[long_enough | whole_sequence]

# Write the filtered table into the output folder
csv_path = os.path.join(output_dir, f"uniprot_{selected_taxonomy_id}_proteins_truncated.csv")
df.to_csv(csv_path, index=False)

# Show the table in the notebook
display(df)

# Function to create a unique identifier with underscores replacing spaces
def create_unique_id(row):
    """Return "<gene names>_<organism>_<organism id>_<row label>" with spaces turned into underscores."""
    pieces = (row['Gene Names'], row['Organism'], row['Organism (ID)'], row.name)
    joined = "_".join(str(piece) for piece in pieces)
    return joined.replace(" ", "_")

# Create FASTA files for each gene, ensuring unique identifiers
clustalo_run_url = "https://www.ebi.ac.uk/Tools/services/rest/clustalo/run"
job_ids = []
for gene_name in df["Gene Names"].unique():
    gene_df = df[df["Gene Names"] == gene_name].copy()
    gene_df['Unique_ID'] = gene_df.apply(create_unique_id, axis=1)

    # One ">header\nsequence\n" record per row, joined in a single pass
    fasta_content = "".join(
        f">{row['Unique_ID']}|{row['Organism'].replace(' ', '_')}|{row['Organism (ID)']}|{row['Chain']}\n{row['Sequence']}\n"
        for _, row in gene_df.iterrows()
    )

    fasta_file = os.path.join(output_dir, f"{gene_name}.fasta")
    with open(fasta_file, 'w') as file:
        file.write(fasta_content)

    # Print the FASTA content for debugging
    print(f"FASTA content for {gene_name}:\n{fasta_content}")

    if len(gene_df) == 1:
        # If there is only one sequence, rename the file to indicate it is aligned
        os.rename(fasta_file, os.path.join(output_dir, f"{gene_name}_aligned.fasta"))
        print(f"Only one sequence for {gene_name}, marked as aligned.")
        continue

    # Submit alignment job using Clustal Omega
    # NOTE(review): "example@example.com" looks like a placeholder — confirm
    # whether a real contact address should be supplied.
    try:
        r = requests.post(clustalo_run_url, data={
            "email": "example@example.com",
            "iterations": 0,
            "outfmt": "fa",  # Using FASTA format
            "order": "aligned",
            "title": gene_name,  # Naming the job with the gene name
            "sequence": fasta_content
        }, timeout=60)
    except requests.RequestException as exc:
        # A network failure for one gene must not abort the loop: the job IDs
        # collected so far are only written to disk after the loop.
        with open(error_log_file, 'a') as log:
            log.write(f"Error submitting job for {gene_name}: {exc}\n")
        print(f"Error submitting job for {gene_name}: {exc}")
        continue

    if r.status_code != 200:
        # Log errors during job submission
        with open(error_log_file, 'a') as log:
            log.write(f"Error submitting job for {gene_name}: {r.text}\n")
        print(f"Error submitting job for {gene_name}: {r.text}")
        continue

    # Strip stray whitespace so the tab-separated job file stays one line per job
    job_id = r.text.strip()
    print(f"Job ID for {gene_name}: {job_id}")
    job_ids.append((gene_name, job_id))

# Save job IDs to a file
job_ids_file = os.path.join(output_dir, "job_ids.txt")
with open(job_ids_file, 'w') as f:
    for gene_name, job_id in job_ids:
        f.write(f"{gene_name}\t{job_id}\n")

print(f"Job IDs saved to {job_ids_file}")


## Assembling Results into a DataFrame and Saving to a TSV File
This script will check the status of the alignment jobs, retrieve the results, update the DataFrame, and save the results to a TSV file.

In [None]:
import os
import time
import requests
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# A taxonomy must have been picked: both None and "" count as "not selected".
if selected_taxonomy_id in (None, ""):
    raise ValueError("Please select a valid taxonomy ID before running this cell.")

# Helper function to download data
def get_url(url, **kwargs):
    """GET ``url`` (extra keyword arguments go to ``requests.get``) and return the response.

    For an error status the response body is printed and the HTTP error from
    ``raise_for_status`` is raised.
    """
    response = requests.get(url, **kwargs)
    if response.ok:
        return response
    # Error path: show what the server said, then raise
    print(response.text)
    response.raise_for_status()
    return response

# Output directory
output_dir = create_output_directory(taxonomy_dict, selected_taxonomy_id)
print(f"Output directory: {output_dir}")

# DataFrame to store the results
result_columns = ["Gene Names", "Organism", "Organism (ID)", "Taxonomic lineage", "Length", "Sequence", "Chain", "Aligned Sequence"]
df = pd.DataFrame(columns=result_columns)

# Check existing files in the output directory
existing_files = set(os.listdir(output_dir))

# Error log file
error_log_file = os.path.join(output_dir, "error_log.txt")

# Get selected gene names from checkboxes
selected_gene_names = [gene for gene, checkbox in gene_checkboxes.items() if checkbox.value]

# Rows are collected in a list and turned into a DataFrame once at the end;
# growing the DataFrame with pd.concat per row copies it on every iteration.
rows = []

# Iterate over each gene name and fetch the data
for gene_name in selected_gene_names:
    aligned_file = f"{gene_name}_aligned.fasta"
    if aligned_file in existing_files:
        print(f"Skipping {gene_name} as it is already aligned.")
        continue

    print(f"Fetching data for gene: {gene_name}")
    # NOTE(review): the parser below expects a header line followed by
    # 7-column tab-separated rows. Whether this Proteins API endpoint honours
    # `format=tsv` and returns those columns is not visible here — confirm
    # against the API documentation; any other response yields no rows.
    url = f"https://www.ebi.ac.uk/proteins/api/proteins?gene={gene_name}&taxid={selected_taxonomy_id}&format=tsv"
    response = get_url(url)

    # Strip line breaks only: a plain strip() would also remove the trailing
    # tab of a last row whose chain column is empty, leaving it with 6 fields
    # so that it is skipped. The first line is the header.
    lines = response.text.strip('\r\n').split('\n')[1:]
    for line in lines:
        parts = line.rstrip('\r').split('\t')
        if len(parts) != 7:
            continue
        gene, organism, organism_id, lineage, length, sequence, chain_info = parts
        # Fields shared by every row created from this entry. "Length" is the
        # value reported by the service, not the length of a chain slice.
        common = {
            "Gene Names": gene,
            "Organism": organism,
            "Organism (ID)": organism_id,
            "Taxonomic lineage": lineage,
            "Length": length,
            "Aligned Sequence": ""
        }
        # Check if chain_info contains the expected "CHAIN" substring
        if "CHAIN" in chain_info:
            try:
                # One row per annotated chain; positions are 1-based inclusive
                for chain in chain_info.split('CHAIN ')[1:]:
                    chain_start, chain_end = chain.split(';')[0].split('..')
                    truncated_sequence = sequence[int(chain_start)-1:int(chain_end)]
                    rows.append({**common, "Sequence": truncated_sequence, "Chain": f"{chain_start}..{chain_end}"})
            except ValueError:
                # Non-numeric or missing positions: log and move on. Chains
                # parsed before the failing one are kept.
                with open(error_log_file, 'a') as log:
                    log.write(f"Error parsing chain_info for {gene_name}: {chain_info}\n")
                print(f"Error parsing chain_info for {gene_name}: {chain_info}")
        else:
            # Use the whole sequence if no valid chain is found
            rows.append({**common, "Sequence": sequence, "Chain": "full_sequence"})
            print(f"Using full sequence for {gene_name}: {sequence}")

df = pd.DataFrame(rows, columns=result_columns)

# Drop rows whose sequence already occurred earlier in the table
unique_df = df.drop_duplicates(subset=["Sequence"])

# Keep only the genes that still have at least two rows
multi_df = unique_df.groupby('Gene Names').filter(lambda group: len(group) > 1)

# Keep rows of 100+ residues, plus every row that holds a whole sequence
long_enough = multi_df["Sequence"].str.len() >= 100
whole_sequence = multi_df["Chain"] == "full_sequence"
df = multi_df[long_enough | whole_sequence]

# Write the filtered table into the output folder
csv_path = os.path.join(output_dir, f"uniprot_{selected_taxonomy_id}_proteins_truncated.csv")
df.to_csv(csv_path, index=False)

# Show the table in the notebook
display(df)

# Function to create a unique identifier with underscores replacing spaces
def create_unique_id(row):
    """Return "<gene names>_<organism>_<organism id>_<row label>" with spaces turned into underscores."""
    pieces = (row['Gene Names'], row['Organism'], row['Organism (ID)'], row.name)
    joined = "_".join(str(piece) for piece in pieces)
    return joined.replace(" ", "_")

# Create FASTA files for each gene, ensuring unique identifiers
clustalo_run_url = "https://www.ebi.ac.uk/Tools/services/rest/clustalo/run"
job_ids = []
for gene_name in df["Gene Names"].unique():
    gene_df = df[df["Gene Names"] == gene_name].copy()
    gene_df['Unique_ID'] = gene_df.apply(create_unique_id, axis=1)

    # One ">header\nsequence\n" record per row, joined in a single pass
    fasta_content = "".join(
        f">{row['Unique_ID']}|{row['Organism'].replace(' ', '_')}|{row['Organism (ID)']}|{row['Chain']}\n{row['Sequence']}\n"
        for _, row in gene_df.iterrows()
    )

    fasta_file = os.path.join(output_dir, f"{gene_name}.fasta")
    with open(fasta_file, 'w') as file:
        file.write(fasta_content)

    # Print the FASTA content for debugging
    print(f"FASTA content for {gene_name}:\n{fasta_content}")

    if len(gene_df) == 1:
        # If there is only one sequence, rename the file to indicate it is aligned
        os.rename(fasta_file, os.path.join(output_dir, f"{gene_name}_aligned.fasta"))
        print(f"Only one sequence for {gene_name}, marked as aligned.")
        continue

    # Submit alignment job using Clustal Omega
    # NOTE(review): "example@example.com" looks like a placeholder — confirm
    # whether a real contact address should be supplied.
    try:
        r = requests.post(clustalo_run_url, data={
            "email": "example@example.com",
            "iterations": 0,
            "outfmt": "fa",  # Using FASTA format
            "order": "aligned",
            "title": gene_name,  # Naming the job with the gene name
            "sequence": fasta_content
        }, timeout=60)
    except requests.RequestException as exc:
        # A network failure for one gene must not abort the loop: the job IDs
        # collected so far are only written to disk after the loop.
        with open(error_log_file, 'a') as log:
            log.write(f"Error submitting job for {gene_name}: {exc}\n")
        print(f"Error submitting job for {gene_name}: {exc}")
        continue

    if r.status_code != 200:
        # Log errors during job submission
        with open(error_log_file, 'a') as log:
            log.write(f"Error submitting job for {gene_name}: {r.text}\n")
        print(f"Error submitting job for {gene_name}: {r.text}")
        continue

    # Strip stray whitespace so the tab-separated job file stays one line per job
    job_id = r.text.strip()
    print(f"Job ID for {gene_name}: {job_id}")
    job_ids.append((gene_name, job_id))

# Save job IDs to a file
job_ids_file = os.path.join(output_dir, "job_ids.txt")
with open(job_ids_file, 'w') as f:
    for gene_name, job_id in job_ids:
        f.write(f"{gene_name}\t{job_id}\n")

print(f"Job IDs saved to {job_ids_file}")


In [None]:
# import time  # Import the time module
# # Set directories
# selected_taxonomy_name = taxonomy_dict.get(selected_taxonomy_id, "Unknown_Taxonomy") # Get name from taxonomy ID
# input_directory = f"/content/drive/MyDrive/7Papers/26_Ursus_abstrusus/Ancient_Bear/Ancient_Bear_Analysis/FASTAs/Computational_FASTAs/Fasta_{selected_taxonomy_name}/"
# job_ids_file = os.path.join(input_directory, "job_ids.txt")
# output_file = os.path.join(input_directory, "uniprot_bear_proteins_aligned.csv")
# fasta_output_file = os.path.join(input_directory, "combined_bear_proteins.fasta")
# error_log_file = os.path.join(input_directory, "error_log.txt")


# # Helper function to download data
# def get_url(url, **kwargs):
#     response = requests.get(url, **kwargs)
#     if not response.ok:
#         print(response.text)
#         response.raise_for_status()
#     return response

# # Read job IDs from the file, handling potential formatting issues
# job_ids = []
# with open(job_ids_file, 'r') as f:
#     for line in f:
#         parts = line.strip().split('\t')
#         if len(parts) == 2:  # Check if the line has the expected two parts
#             gene_name, job_id = parts
#             job_ids.append((gene_name, job_id))
#         else:
#             print(f"Skipping malformed line: {line.strip()}")  # Log or handle malformed lines

# # Initialize DataFrame to store the results
# df = pd.DataFrame(columns=["Gene Names", "Organism", "Organism (ID)", "Taxonomic lineage", "Length", "Sequence", "Chain", "Aligned Sequence"])

# # Process each job ID
# for gene_name, job_id in job_ids:
#     status_url = f"https://www.ebi.ac.uk/Tools/services/rest/clustalo/status/{job_id}"
#     result_url = f"https://www.ebi.ac.uk/Tools/services/rest/clustalo/result/{job_id}/fa"

#     # Check job status
#     status_response = get_url(status_url)
#     status = status_response.text.strip()
#     print(f"Status for {gene_name} ({job_id}): {status}")

#     if status == "FINISHED":
#         # Get the alignment result
#         result_response = get_url(result_url)
#         aligned_sequences = result_response.text

#         # Save the alignment result to a file
#         aligned_fasta_file = os.path.join(input_directory, f"{gene_name}_aligned.fasta")
#         with open(aligned_fasta_file, 'w') as file:
#             file.write(aligned_sequences)
#         print(f"Alignment result saved to {aligned_fasta_file}")

#         # Parse the aligned sequences and add to the DataFrame
#         for record in SeqIO.parse(aligned_fasta_file, "fasta"):
#             aligned_sequence = str(record.seq)
#             parts = record.description.split('|')
#             if len(parts) >= 4:
#                 organism = parts[1].strip()
#                 organism_id = parts[2].strip()
#                 chain = parts[3].strip()
#             else:
#                 organism = "Unknown"
#                 organism_id = "Unknown"
#                 chain = "Unknown"
#                 with open(error_log_file, 'a') as log:
#                     log.write(f"Incomplete description for {record.id} in gene {gene_name}: {record.description}\n")

#             df.loc[len(df)] = {
#                 "Gene Names": gene_name,
#                 "Organism": organism,
#                 "Organism (ID)": organism_id,
#                 "Taxonomic lineage": "",  # This can be filled with the correct lineage if available
#                 "Length": len(aligned_sequence),
#                 "Sequence": "",  # Original sequence is not required here
#                 "Chain": chain,
#                 "Aligned Sequence": aligned_sequence
#             }

#     else:
#         # Log any issues
#         with open(error_log_file, 'a') as log:
#             log.write(f"Job {job_id} for gene {gene_name} is not finished or has errors: Status {status}\n")
#         print(f"Job {job_id} for gene {gene_name} is not finished or has errors: Status {status}")

#     # Avoid hitting the server too hard
#     sleep(5)

# # Save DataFrame to CSV
# df.to_csv(output_file, index=False)
# print(f"Aligned sequences DataFrame saved to {output_file}")

# # Combine all aligned sequences into a single FASTA file
# all_sequences = []
# for _, row in df.iterrows():
#     record = SeqRecord(Seq(row['Aligned Sequence']),
#                        id=f"{row['Gene Names']}_{row.name}",
#                        description=f"GN={row['Gene Names']} | OS={row['Organism']} | OID={row['Organism (ID)']} | Chain={row['Chain']}")
#     all_sequences.append(record)

# SeqIO.write(all_sequences, fasta_output_file, "fasta")
# print(f"Combined FASTA file created: {fasta_output_file}")
