# In [None]:
import csv
import os
from Bio import SeqIO

# Input paths: the GFF annotation and the FASTA sequences are both taken from
# the same `file1` value (NOTE(review): `file1` is defined outside this
# section -- confirm it is set before this cell runs).
gff_file = fasta_file = file1

# Folder that receives the extracted FASTA files; created up front so later
# writes do not fail on a missing directory.
output_dir = "outputs"
os.makedirs(output_dir, exist_ok=True)

def _parse_fusion_genes(gff_file):
    """Return ``(seqid, gene_id, start, end)`` for each fusion CDS in *gff_file*.

    A record qualifies when its feature type is ``CDS``, it carries a
    non-empty ``ID`` attribute, and its ``product`` attribute contains the
    substring ``fusion``. ``start`` and ``end`` are returned as integers,
    exactly as written in columns 4 and 5 of the GFF line.
    """
    genes = []

    with open(gff_file, 'r') as file:
        # QUOTE_NONE: GFF is plain tab-separated text, so a '"' in a field is
        # literal and must not be interpreted as the start of a quoted field.
        reader = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)

        for row in reader:
            if not row:
                continue  # blank line

            first_field = row[0]

            # Everything after a "##FASTA" directive (or a bare ">" header) is
            # sequence data rather than annotation, so stop scanning there.
            if first_field.startswith(("##FASTA", ">")):
                break

            # Skip comments/directives and lines with too few columns.
            if first_field.startswith("#") or len(row) < 9:
                continue

            # Index the needed columns instead of unpacking the whole row so
            # that stray extra columns (e.g. a trailing tab) do not raise.
            seqid = row[0]
            feature_type = row[2]
            start_position = row[3]
            end_position = row[4]
            attributes = row[8]

            # Cheap pre-filter before splitting the attribute column.
            if feature_type != "CDS" or 'fusion' not in attributes:
                continue

            id_value = None
            product_value = None
            for attr in attributes.split(';'):
                # partition() splits on the first '=' only, so values that
                # themselves contain '=' are kept intact.
                key, sep, value = attr.partition('=')
                if not sep:
                    continue
                if key == 'ID':
                    id_value = value
                elif key == 'product':
                    product_value = value

            # Keep the record only if 'fusion' is in the product value itself
            # (not merely somewhere else in the attribute column).
            if not (id_value and product_value and 'fusion' in product_value):
                continue

            try:
                start = int(start_position)
                end = int(end_position)
            except ValueError:
                continue  # non-numeric coordinates: treat as a malformed line

            genes.append((seqid, id_value, start, end))

    return genes


def process_file(gff_file,fasta_file,output_dir):
    """Extract the sequence of every fusion CDS and save each to its own FASTA.

    The GFF file is scanned for ``CDS`` features whose ``product`` attribute
    contains ``fusion``. For each such feature whose sequence ID is present
    in the FASTA file, the region ``start..end`` (1-based, inclusive, as
    given in the GFF) is sliced out and written to
    ``<output_dir>/<record id>_<gene id>_<start>_<end>.fasta``. Progress is
    reported with ``print``.

    NOTE(review): the strand column is not consulted, so the written
    sequence is the slice of the record as stored, even for features on the
    '-' strand -- confirm this is the intended output.

    Args:
        gff_file: Path to the GFF annotation file.
        fasta_file: Path to the FASTA file holding the annotated sequences
            (it may be the same file as ``gff_file``).
        output_dir: Directory that receives the per-gene FASTA files; it is
            created if missing when there is something to write.

    Returns:
        None.
    """
    genes = _parse_fusion_genes(gff_file)

    if genes:
        # Only load the sequences when there is something to extract.
        sequences = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta"))
        os.makedirs(output_dir, exist_ok=True)
    else:
        sequences = {}

    for seqid, gene_id, start_position, end_position in genes:
        if seqid not in sequences:
            continue  # annotation refers to a sequence absent from the FASTA

        seq_record = sequences[seqid]
        # GFF coordinates are 1-based and inclusive; Python slices are
        # 0-based and end-exclusive, hence the "- 1" on the start only.
        subsequence = seq_record.seq[start_position - 1 : end_position]

        record_name = f"{seq_record.id}_{gene_id}_{start_position}_{end_position}"
        output_fasta = os.path.join(output_dir, f"{record_name}.fasta")

        with open(output_fasta, "w") as out_fasta:
            out_fasta.write(f">{record_name}\n{subsequence}\n")

        print(f"Saved: {output_fasta}")

    print(f"Extraction complete. FASTA files saved in '{output_dir}'")
# Each entry is used as both the annotation source and the sequence source
# (NOTE(review): presumably GFF files with an embedded FASTA section --
# `gff_fasta_files` is defined outside this section, confirm there).
for combined_file in gff_fasta_files:
    process_file(
        gff_file=combined_file,
        fasta_file=combined_file,
        output_dir=output_dir,
    )
