In [None]:
from Bio.Seq import Seq
from Bio.Data import CodonTable
from Bio.SeqUtils import six_frame_translations
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.SeqRecord import SeqRecord
import os

# Define the path to the input FASTA file

dir_r = "./data/raw/"
dir_r_cds ="./data/raw/cds"
dir_r_aa = "./data/raw/aa"
dir_p = "./data/processed/"
dir_f = "./data/final/"

# Create an empty list to hold SeqRecord objects
combined_records = []

# Iterate through all files in the directory
for filename in os.listdir(dir_r_cds):
    if filename.endswith(".fna"):
        filepath = os.path.join(dir_r_cds, filename)
        # Read each file and append the sequences to the combined_records list
        with open(filepath, "r") as handle:
            records = list(SeqIO.parse(handle, "fasta"))
            combined_records.extend(records)

# Create a combined fasta file with all sequences
fasta_file = os.path.join(dir_r, "combined_sequences_cds.fasta")
SeqIO.write(combined_records, fasta_file, "fasta")

In [None]:
#AA Combined Sequence list

# Create an empty list to hold SeqRecord objects
combined_records = []

# Iterate through all files in the directory
for filename in os.listdir(dir_r_aa):
    if filename.endswith(".faa"):
        filepath = os.path.join(dir_r_aa, filename)
        # Read each file and append the sequences to the combined_records list
        with open(filepath, "r") as handle:
            records = list(SeqIO.parse(handle, "fasta"))
            combined_records.extend(records)

# Create a combined fasta file with all sequences
fasta_file = os.path.join(dir_r, "combined_sequences_aa.fasta")
SeqIO.write(combined_records, fasta_file, "fasta")

In [None]:
from Bio import SeqIO
import re
import csv

def old_extract_cds_identifiers(cds_fasta):
    """
    Extracts the protein identifiers from CDS FASTA file headers.

    Args:
        cds_fasta (str): Path to the CDS FASTA file.

    Returns:
        dict: A dictionary with protein_id as the key and the full header as the value.
    """
    cds_ids = {}
    pattern = r"\[protein_id=([A-Za-z0-9_]+\.\d+)\]"

    with open(cds_fasta) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            match = re.search(pattern, record.description)
            if match:
                protein_id = match.group(1)
                cds_ids[protein_id] = record.description  # Store the full header or record if needed

    return cds_ids

def extract_aa_identifiers(aa_fasta):
    """
    Extracts the protein identifiers from AA FASTA file headers.

    Args:
        aa_fasta (str): Path to the AA FASTA file.

    Returns:
        dict: A dictionary with protein_id as the key and the full header as the value.
    """
    aa_ids = {}

    with open(aa_fasta) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            protein_id = record.id  # AA file headers are expected to start with >XP_, >OOF_, >OOG_, etc.stengodsantik 
            aa_ids[protein_id] = record.description  # Store the full header or record if needed

    return aa_ids


def create_mapping(cds_fasta, aa_fasta, output_csv):
    """
    Creates a mapping of CDS and AA protein identifiers and writes the results to a CSV file.

    Args:
        cds_fasta (str): Path to the CDS FASTA file.
        aa_fasta (str): Path to the AA FASTA file.
        output_csv (str): Path to the output CSV file.
    """
    # Extract identifiers
    cds_ids = extract_cds_identifiers(cds_fasta)
    aa_ids = extract_aa_identifiers(aa_fasta)

    # Match identifiers
    matched, unmatched_cds, unmatched_aa = match_identifiers(cds_ids, aa_ids)

    # Write the results to a CSV file
    with open(output_csv, mode='w', newline='') as file:
        writer = csv.writer(file)
        
        # Write header
        writer.writerow(["Protein ID", "CDS Header", "AA Header"])

        # Write matched results
        for protein_id, cds_header, aa_header in matched:
            writer.writerow([protein_id, cds_header, aa_header])

    print(f"Mapping complete. Matched: {len(matched)} | Unmatched CDS: {len(unmatched_cds)} | Unmatched AA: {len(unmatched_aa)}")
    print(f"Results saved to: {output_csv}")



cds_fasta = os.path.join(dir_r, "combined_sequences_cds.fasta")
aa_fasta = os.path.join(dir_r, "combined_sequences_aa.fasta")  





In [70]:
import os
import subprocess
import csv
from Bio import SeqIO

def run_hmmsearch(hmm_file, target_file, output_file):
    """
    Runs the HMMER hmmsearch command on the target AA FASTA file using the provided HMM model.
    
    Args:
        hmm_file (str): Path to the HMM file.
        target_file (str): Path to the target AA FASTA file.
        output_file (str): Path to store the hmmsearch output results.
    
    Returns:
        bool: True if the command succeeds, False otherwise.
    """
    command = ['hmmsearch', '-o', output_file, hmm_file, target_file]

    try:
        # Run the command and capture the output
        process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = process.communicate()

        if process.returncode == 0:
            print(f"hmmsearch completed successfully. Results saved to: {output_file}")
            return True
        else:
            print(f"Error running hmmsearch: {stderr.decode('utf-8')}")
            return False

    except Exception as e:
        print(f"Exception occurred while running hmmsearch: {e}")
        return False

def extract_protein_names(hmm_output_file, output_file):
    """
    Extracts protein names from the raw hmmsearch output and saves to a CSV file.

    Args:
        hmm_output_file (str): Path to the hmmsearch output file.
        output_file (str): Path to save the extracted protein names.

    Returns:
        bool: True if extraction is successful, False otherwise.
    """
    try:
        protein_names = []
        
        # Updated regex pattern to accommodate both formats (XX_XXXXXX.X and LettersNumbers.Number)
        pattern = re.compile(r"^([A-Z]{1,4}_[A-Z0-9]{6,11}\.\d|[A-Z]{2,5}[0-9]{4,11}\.\d)$")

        # Open the hmmsearch output file
        with open(hmm_output_file, "r") as file:
            rows = list(csv.reader(file, delimiter="\t"))

        # Extract protein names from rows starting from line 15
        row_index = 15
        while row_index < len(rows):
            line = ''.join(rows[row_index]).strip()  # Strip any extra whitespace
            fields = line.split()
            
            # Skip empty or malformed rows
            if len(fields) > 8:
                protein_id = fields[8]  # Protein name is in the 9th column (index 8)
                
                # Check if the protein ID matches the expected pattern
                if pattern.match(protein_id):
                    protein_names.append([protein_id])
                else:
                    # Stop processing if we encounter a line that does not match
                    print(f"Stopping extraction. Encountered non-matching protein ID: {protein_id}")
                    break
            
            row_index += 1

        # Write the extracted protein names to a new CSV file
        with open(output_file, "w", newline="") as file:
            csv_writer = csv.writer(file)
            csv_writer.writerow(["ProteinID"])  # Write header
            csv_writer.writerows(protein_names)  # Write protein names

        print(f"Protein names extraction successful. Results saved to: {output_file}")
        return True

    except Exception as e:
        print(f"Exception occurred while extracting protein names: {e}")
        return False
    

def extract_cds_identifiers(cds_fasta):
    """
    Extracts the protein identifiers from CDS FASTA file headers.

    Args:
        cds_fasta (str): Path to the CDS FASTA file.

    Returns:
        dict: A dictionary with protein_id as the key and the full header as the value.
    """
    cds_ids = {}
    pattern = r"\[protein_id=([A-Za-z0-9_]+\.\d+)\]"  # Match [protein_id=...]
    
    with open(cds_fasta) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            match = re.search(pattern, record.description)
            if match:
                protein_id = match.group(1)
                print(protein_id)
                cds_ids[protein_id] = record  # Store the full SeqRecord, not just description

    print(f"Extracted {len(cds_ids)} CDS identifiers from {cds_fasta}")
    return cds_ids
    
def extract_aa_identifiers(aa_fasta):
    """
    Extracts the protein identifiers from AA FASTA file headers.

    Args:
        aa_fasta (str): Path to the AA FASTA file.

    Returns:
        dict: A dictionary with protein_id as the key and the full header as the value.
    """
    aa_ids = {}

    with open(aa_fasta) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            protein_id = record.id  # AA file headers are expected to start with >XP_, >OOF_, >OOG_, etc.
            aa_ids[protein_id] = record.description  # Store the full header or record if needed

    return aa_ids

def filter_fasta_by_identifiers(aa_fasta, identifiers_csv, output_fasta):
    """
    Filters the AA FASTA file to include only sequences with identifiers in the provided CSV file.
    Maintains the order of the sequences as they appear in the CSV.
    
    Args:
        aa_fasta (str): Path to the AA FASTA file.
        identifiers_csv (str): Path to the CSV file containing the identifiers to keep.
        output_fasta (str): Path to save the filtered FASTA file.
    
    Returns:
        bool: True if filtering is successful, False otherwise.
    """
    try:
        # Read identifiers from the CSV file in order
        identifiers = []
        with open(identifiers_csv, "r") as file:
            csv_reader = csv.reader(file)
            next(csv_reader)  # Skip header
            for row in csv_reader:
                identifiers.append(row[0])  # Maintain the order of the protein IDs

        # Create a dictionary to store sequences by their ID
        seq_dict = {}
        with open(aa_fasta, "r") as input_handle:
            for record in SeqIO.parse(input_handle, "fasta"):
                seq_dict[record.id] = record  # Store sequence records by their ID

        # Write the sequences in the same order as the identifiers in the CSV
        with open(output_fasta, "w") as output_handle:
            for protein_id in identifiers:
                if protein_id in seq_dict:
                    SeqIO.write(seq_dict[protein_id], output_handle, "fasta")

        print(f"Filtering successful. Filtered FASTA file saved to: {output_fasta}")
        return True

    except Exception as e:
        print(f"Exception occurred while filtering FASTA file: {e}")
        return False

def apply_hmm_model(hmm_file, target_fasta, output_dir):
    """
    Applies an HMM model to the target AA FASTA file, runs hmmsearch, extracts the results, and filters the FASTA file.

    Args:
        hmm_file (str): Path to the HMM file.
        target_fasta (str): Path to the target AA FASTA file.
        output_dir (str): Directory where output files will be stored.

    Returns:
        None
    """
    # Define paths for output files
    output_hits_file = os.path.join(output_dir, "enzymehits.csv")
    output_protein_names_file = os.path.join(output_dir, "protein_names.csv")
    filtered_fasta_file = os.path.join(dir_f, "filtered_sequences_aa.fasta")
    
    # Run hmmsearch
    if run_hmmsearch(hmm_file, target_fasta, output_hits_file):
        # Extract protein names from the hmmsearch output
        if extract_protein_names(output_hits_file, output_protein_names_file):
            # Filter the AA FASTA file based on extracted protein identifiers
            filter_fasta_by_identifiers(target_fasta, output_protein_names_file, filtered_fasta_file)

# Example usage
if __name__ == "__main__":
    target_fasta = os.path.join(dir_r, "combined_sequences_aa.fasta")
    hmm_file = os.path.join(dir_p, "output.hmm")
    output_dir = "./data/final/"

    # Apply the HMM model and process the results
    apply_hmm_model(hmm_file, target_fasta, output_dir)


hmmsearch completed successfully. Results saved to: ./data/final/enzymehits.csv
Stopping extraction. Encountered non-matching protein ID: alifrom
Protein names extraction successful. Results saved to: ./data/final/protein_names.csv
Filtering successful. Filtered FASTA file saved to: ./data/final/filtered_sequences_aa.fasta


In [72]:
#Codeset for filtering result further based on the presence of a signal peptide or not

import os
import subprocess
from Bio import SeqIO
import re

def run_signalp6(fasta_file, output_dir):
    """
    Runs the SignalP6 tool on the given FASTA file to detect signal peptides.

    Args:
        fasta_file (str): Path to the input AA FASTA file.
        output_dir (str): Directory where the SignalP6 output files will be stored.

    Returns:
        str: Path to the GFF3 output file generated by SignalP6.
    """
    signalp6_output_dir = os.path.join(output_dir, "signalp6_output")

    command = [
        'signalp6', 
        '--fastafile', fasta_file, 
        '--output_dir', signalp6_output_dir,
        '--format', "none",  # Only prediction summary file
        '--organism', "eukarya",  # Eukaryotic organisms
        '--mode', "fast"  # Fast prediction mode
    ]

    try:
        # Run the SignalP6 command
        process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = process.communicate()

        if process.returncode == 0:
            print(f"SignalP6 completed successfully. Output directory: {signalp6_output_dir}")

            # Locate the GFF3 file inside the output directory
            gff3_file = os.path.join(signalp6_output_dir, "output.gff3")
            if os.path.exists(gff3_file):
                print(f"GFF3 file found: {gff3_file}")
                return gff3_file
            else:
                print("Error: GFF3 file not found.")
                return None
        else:
            print(f"Error running SignalP6: {stderr.decode('utf-8')}")
            return None

    except Exception as e:
        print(f"Exception occurred while running SignalP6: {e}")
        return None


def filter_fasta_by_signalp6(fasta_file, gff3_file, filtered_fasta_file):
    """
    Filters the FASTA file based on SignalP6 GFF3 output to keep only sequences with signal peptides.

    Args:
        fasta_file (str): Path to the input AA FASTA file.
        gff3_file (str): Path to the GFF3 output file from SignalP6.
        filtered_fasta_file (str): Path to save the filtered FASTA file with signal peptides.

    Returns:
        set: Set of protein IDs from the SignalP6 output.
    """
    signalp_ids = set()

    # Parse GFF3 file and extract protein IDs with signal peptides
    with open(gff3_file, 'r') as gff3:
        for line in gff3:
            if line.startswith('#'):
                continue
            fields = line.split('\t')
            if len(fields) > 8:
                protein_id = fields[0].split()[0]  # Get the first "word" (protein ID)
                signalp_ids.add(protein_id)

    # Write filtered FASTA file
    count = 0
    with open(fasta_file, 'r') as input_fasta, open(filtered_fasta_file, 'w') as output_fasta:
        for record in SeqIO.parse(input_fasta, "fasta"):
            if record.id in signalp_ids:
                SeqIO.write(record, output_fasta, "fasta")
                count += 1

    print(f"Filtered FASTA file saved to: {filtered_fasta_file}")
    print(f"Number of entries added: {count}")

    return signalp_ids  # Return the set of protein IDs


def extract_cds_for_signalp_proteins(cds_fasta_file, signalp_ids, output_cds_fasta):
    """
    Extracts CDS sequences from the CDS FASTA file corresponding to SignalP6 protein IDs.

    Args:
        cds_fasta_file (str): Path to the CDS FASTA file.
        signalp_ids (set): Set of protein IDs from the SignalP6 output.
        output_cds_fasta (str): Path to save the extracted CDS FASTA file.

    Returns:
        None
    """
    # Extract protein IDs and corresponding CDS records from the CDS FASTA
    cds_records = extract_cds_identifiers(cds_fasta_file)

    count = 0  # Track the number of written sequences
    # Write the CDS sequences corresponding to SignalP6 protein IDs
    with open(output_cds_fasta, 'w') as output_fasta:
        for protein_id in signalp_ids:
            if protein_id in cds_records:
                SeqIO.write(cds_records[protein_id], output_fasta, "fasta")
                count += 1

    print(f"Extracted CDS FASTA file saved to: {output_cds_fasta}")
    print(f"Number of entries added: {count}")


#Example Usage

if __name__ == "__main__":
    # Define the input files and output directories
    dir_f = "./data/final"  # Input AA FASTA directory
    dir_r = "./data/raw"  # Input CDS FASTA directory
    output_dir = "./data/final"  # Output directory for filtered sequences

    aa_fasta_file = os.path.join(dir_f, "filtered_sequences_aa.fasta")
    cds_fasta_file = os.path.join(dir_r, "combined_sequences_cds.fasta")
    
    # Run SignalP6 and get the GFF3 file path
    gff3_file = run_signalp6(aa_fasta_file, output_dir)

    # If the GFF3 file exists, proceed to filter AA FASTA and extract CDS
    if gff3_file:
        # Filter the AA FASTA file based on SignalP6 predictions
        filtered_fasta_file = os.path.join(output_dir, "filtered_sequences_aa_with_signal_peptides.fasta")
        signalp_ids = filter_fasta_by_signalp6(aa_fasta_file, gff3_file, filtered_fasta_file)
        
        # Pass the filtered protein IDs to the CDS extraction function
        output_cds_fasta = os.path.join(output_dir, "filtered_sequences_cds_with_signal_peptides.fasta")
        extract_cds_for_signalp_proteins(cds_fasta_file, signalp_ids, output_cds_fasta)



SignalP6 completed successfully. Output directory: ./data/final/signalp6_output
GFF3 file found: ./data/final/signalp6_output/output.gff3
Filtered FASTA file saved to: ./data/final/filtered_sequences_aa_with_signal_peptides.fasta
Number of entries added: 25
Extracted CDS FASTA file saved to: ./data/final/filtered_sequences_cds_with_signal_peptides.fasta
Number of entries added: 25


In [65]:
def filter_fasta_by_signalp6(aa_fasta_file, gff3_file, filtered_fasta_file):
    # Your existing code to filter AA FASTA based on SignalP6
    # Extract protein IDs from SignalP6 GFF3 output

    signalp_ids = set()  # Ensure this is populated
    # After collecting IDs
    print(f"SignalP IDs: {signalp_ids}")
    return signalp_ids  # Ensure this returns a valid set


In [69]:
def extract_cds_identifiers(cds_fasta):
    """
    Extracts the protein identifiers from CDS FASTA file headers.

    Args:
        cds_fasta (str): Path to the CDS FASTA file.

    Returns:
        dict: A dictionary with protein_id as the key and the full header as the value.
    """
    cds_ids = {}
    pattern = r"\[protein_id=([A-Za-z0-9_]+\.\d+)\]"  # Match [protein_id=...]
    
    with open(cds_fasta) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            match = re.search(pattern, record.description)
            if match:
                protein_id = match.group(1)
                print(protein_id)
                cds_ids[protein_id] = record  # Store the full SeqRecord, not just description

    print(f"Extracted {len(cds_ids)} CDS identifiers from {cds_fasta}")
    return cds_ids


extract_cds_identifiers("data/raw/combined_sequences_cds.fasta")

OOG01226.1
OOG01227.1
OOG00238.1
OOG01224.1
OOG01225.1
OOG00239.1
OOG00240.1
OOG00241.1
OOG00242.1
OOG00243.1
OOG00244.1
OOG00245.1
OOG00246.1
OOG00247.1
OOG00248.1
OOG00249.1
OOG00250.1
OOG00251.1
OOG00252.1
OOG00253.1
OOG00254.1
OOG00255.1
OOG00256.1
OOG00257.1
OOG00258.1
OOG00259.1
OOG00260.1
OOG00261.1
OOG00262.1
OOG00263.1
OOG00264.1
OOG00265.1
OOG00266.1
OOG00267.1
OOG00268.1
OOG00269.1
OOG00270.1
OOG00271.1
OOG00272.1
OOG00273.1
OOG00274.1
OOG00277.1
OOG00275.1
OOG00276.1
OOG00278.1
OOG00279.1
OOG00280.1
OOG00281.1
OOG00282.1
OOG00283.1
OOG00284.1
OOG00285.1
OOG00286.1
OOG00287.1
OOG00288.1
OOG00290.1
OOG00289.1
OOG00291.1
OOG00292.1
OOG00293.1
OOG00294.1
OOG00296.1
OOG00295.1
OOG00297.1
OOG00298.1
OOG00299.1
OOG00300.1
OOG00301.1
OOG00303.1
OOG00302.1
OOG00304.1
OOG00305.1
OOG00306.1
OOG00307.1
OOG00308.1
OOG00309.1
OOG00310.1
OOG00311.1
OOG00312.1
OOG00313.1
OOG00314.1
OOG00315.1
OOG00316.1
OOG00317.1
OOG00318.1
OOG00319.1
OOG00320.1
OOG00321.1
OOG00322.1
OOG00323.1
OOG00324.1

{'OOG01226.1': SeqRecord(seq=Seq('ATGGACATCAACAAACACATCCATCCCCTCTATCTTCACTCCGCCAGAAGGCCG...TAG'), id='lcl|KV907493.1_cds_OOG01226.1_1', name='lcl|KV907493.1_cds_OOG01226.1_1', description='lcl|KV907493.1_cds_OOG01226.1_1 [locus_tag=ASPCADRAFT_511297] [db_xref=InterPro:IPR000026,JGIDB:Aspca3_511297] [protein=hypothetical protein] [protein_id=OOG01226.1] [location=complement(join(112..507,573..665))] [gbkey=CDS]', dbxrefs=[]),
 'OOG01227.1': SeqRecord(seq=Seq('ATGTTCCCGATCAAGACCATCATCTCCCTCCTCCCCCTCTTCCTCTCCGTCTCG...TAG'), id='lcl|KV907493.1_cds_OOG01227.1_2', name='lcl|KV907493.1_cds_OOG01227.1_2', description='lcl|KV907493.1_cds_OOG01227.1_2 [locus_tag=ASPCADRAFT_511297] [db_xref=InterPro:IPR000026,JGIDB:Aspca3_1] [protein=hypothetical protein] [protein_id=OOG01227.1] [location=complement(join(112..507,573..587))] [gbkey=CDS]', dbxrefs=[]),
 'OOG00238.1': SeqRecord(seq=Seq('ATGCTAAGACCCGGGCAGGACATCCTCGCCATGTCCGACATCCAGGCCAACGAG...TAA'), id='lcl|KV907493.1_cds_OOG00238.1_3', name='lcl|K