In [None]:
from Bio import SearchIO
import pandas as pd

In [None]:
from Bio.Blast import NCBIXML

# Flatten the BLAST XML report into one record per HSP
# (query -> alignment/hit -> HSP) and load the records into a DataFrame.

# Column order of the resulting frame. Passing it to the DataFrame constructor
# guarantees the columns exist even when the report contains no HSPs, so later
# lookups such as blast_df['Gaps'] do not raise KeyError on an empty result.
BLAST_COLUMNS = [
    "Query ID",
    "Subject ID",
    "Accession",
    "Description",
    "Score",
    "Bit Score",
    "E-value",
    "Query Start",
    "Query End",
    "Subject Start",
    "Subject End",
    "Identities",
    "Alignment Length",
    "Gaps",
    "Query Sequence",
    "Match",
    "Subject Sequence",
]

blast_data = []
with open("blast_results.xml", "r") as xml_file:
    # NCBIXML.parse is iterated inside the `with` block, while the file is
    # still open.
    blast_records = NCBIXML.parse(xml_file)

    # One record per query sequence
    for blast_record in blast_records:
        query_id = blast_record.query  # Query identifier as reported by the parser

        # One alignment per database hit
        for alignment in blast_record.alignments:
            subject_id = alignment.hit_id  # Subject ID (from the database)
            description = alignment.hit_def  # Subject description
            accession = alignment.accession  # Subject accession number
            # Trace output: one line per hit
            print(query_id, description)

            # A hit can carry several HSPs; each becomes its own row
            for hsp in alignment.hsps:
                blast_data.append({
                    "Query ID": query_id,
                    "Subject ID": subject_id,
                    "Accession": accession,
                    "Description": description,
                    "Score": hsp.score,
                    "Bit Score": hsp.bits,
                    "E-value": hsp.expect,
                    "Query Start": hsp.query_start,
                    "Query End": hsp.query_end,
                    "Subject Start": hsp.sbjct_start,
                    "Subject End": hsp.sbjct_end,
                    "Identities": hsp.identities,
                    "Alignment Length": hsp.align_length,
                    # NOTE(review): presumably an integer gap count; confirm the
                    # parser's value for reports that omit the gap field.
                    "Gaps": hsp.gaps,
                    "Query Sequence": hsp.query,
                    "Match": hsp.match,
                    "Subject Sequence": hsp.sbjct
                })

blast_df = pd.DataFrame(blast_data, columns=BLAST_COLUMNS)

In [None]:
# Keep only the HSP rows whose 'Gaps' value equals 0.
gapless = blast_df['Gaps'] == 0
filtered_df = blast_df[gapless]


In [None]:
# Number of distinct 'Query ID' values across all parsed HSP rows.
blast_df['Query ID'].nunique()

In [None]:
# Number of distinct 'Query ID' values that remain in filtered_df.
filtered_df['Query ID'].nunique()

In [None]:
# Narrow filtered_df to rows whose description contains either keyword.
is_ribosomal = filtered_df['Description'].str.contains('ribosomal')
is_mitochondrial = filtered_df['Description'].str.contains('mitochondrion')
filtered_df = filtered_df[is_ribosomal | is_mitochondrial]

In [None]:
# Select the rows of filtered_df whose description contains 'ribosomal' or
# 'mitochondrion' and keep them under a separate name.
description_col = filtered_df['Description']
filtered_description = filtered_df[
    description_col.str.contains('ribosomal')
    | description_col.str.contains('mitochondrion')
]

In [None]:
# Preview the first few rows of filtered_df.
filtered_df.head()

In [None]:
# Reduce filtered_df to the columns listed below, in this order.
report_columns = ['Query ID', 'Accession', 'Description', 'Bit Score', 'E-value']
filtered_df = filtered_df[report_columns]

In [None]:
# Preview the first few rows of filtered_df again.
filtered_df.head()

In [None]:
# Write filtered_df to blast_results.csv; index=False leaves the row index out of the file.
filtered_df.to_csv('blast_results.csv', index=False)

In [None]:
# Display filtered_df ordered by 'Query ID'. The sorted result is not
# assigned, so filtered_df itself is left as it was.
filtered_df.sort_values('Query ID') 

In [None]:
# Write the 'Description' column of filtered_description to a text file.
# index=False is not passed here, so the row index is written next to each
# description (pandas default).
filtered_description['Description'].to_csv('A_filtered_description.txt')

In [None]:

# Materialise the 'Description' column as a plain Python list.
descriptions = list(filtered_description['Description'])

In [None]:
# Display the full list of descriptions.
descriptions

In [None]:
# Collect a two-word name from every description: the first two words, or the
# two words after a leading 'PREDICTED:' tag (presumably "Genus species" --
# verify against the actual hit descriptions).
# Descriptions that are empty or too short to supply two words are skipped
# instead of raising IndexError.

species = set()
for description in descriptions:
    words = description.split()
    # Drop the leading tag so both cases are handled by the same slice below
    if words and words[0] == 'PREDICTED:':
        words = words[1:]
    if len(words) < 2:
        continue
    species.add(words[0] + ' ' + words[1])

In [None]:
# Number of distinct two-word names collected in the species set.
len(species)

In [None]:
# Write one name per line to species.txt.
# A set has no defined iteration order (string hashing is randomized per
# interpreter process), so the names are sorted first to make the file
# identical from run to run.
with open('species.txt', 'w') as f:
    for item in sorted(species):
        f.write("%s\n" % item)