In [11]:
# Needed modules
import os
import subprocess
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [19]:
# Needed functions
def fasta_creator(csv_input, output_path):
    """Write the ``sseq`` field of every row of a table to a FASTA file.

    Each row becomes one record whose ID is ``Seq_<row index>_<sseqid>`` and
    whose description is "Leishmania infantum".

    Parameters
    ----------
    csv_input : object exposing ``iterrows()`` (e.g. a pandas DataFrame)
        Every row must provide the ``sseq`` and ``sseqid`` fields.
    output_path : str
        Destination handed to ``SeqIO.write``.
    """
    records = [
        SeqRecord(
            Seq(entry["sseq"]),
            id=f"Seq_{row_label}_{entry['sseqid']}",
            description="Leishmania infantum",
        )
        for row_label, entry in csv_input.iterrows()
    ]
    SeqIO.write(records, output_path, "fasta")

def blastn_dic(path_input, path_output):
    """Build a nucleotide BLAST database by running ``makeblastdb``.

    Parameters
    ----------
    path_input : str
        FASTA file passed to ``-in``.
    path_output : str
        Database name/prefix passed to ``-out``.

    Raises
    ------
    RuntimeError
        If ``makeblastdb`` exits with a non-zero status. The message carries
        the exit code and whatever the tool wrote to stderr.
    """
    # "-parse_seqids" is used to keep the sequence ID in the output.
    cmd = [
        "makeblastdb",
        "-in", str(path_input),
        "-dbtype", "nucl",
        "-parse_seqids",
        "-out", str(path_output),
    ]
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact. stdout stays silenced; stderr is captured so a
    # failure can be reported instead of being discarded.
    result = subprocess.run(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    if result.returncode != 0:
        details = (result.stderr or "").strip()
        raise RuntimeError(
            f"makeblastdb failed with exit code {result.returncode}: {details}"
        )

def blastn_blaster(query_path, dict_path, word_size=11):
    """Run ``blastn`` and return its hits as a DataFrame.

    Parameters
    ----------
    query_path : str
        FASTA file passed to ``-query``.
    dict_path : str
        BLAST database name passed to ``-db``.
    word_size : int, optional
        Value passed to ``-word_size`` (default 11).

    Returns
    -------
    pandas.DataFrame
        One row per hit with the columns qseqid, sseqid, pident, length,
        qstart, qend, sstart, send, evalue, bitscore, qlen and slen. Every
        column except the two IDs is converted with ``pd.to_numeric``.
        When there are no hits the frame is empty but keeps these columns.

    Raises
    ------
    subprocess.CalledProcessError
        If ``blastn`` exits with a non-zero status.
    """
    columns = [
        "qseqid", "sseqid", "pident", "length", "qstart", "qend",
        "sstart", "send", "evalue", "bitscore", "qlen", "slen",
    ]
    # NOTE(review): the original comment flagged the E-value as important, but
    # no "-evalue" threshold is passed here -- confirm that is intended.
    cmd = [
        "blastn",
        "-word_size", str(word_size),
        "-query", str(query_path),
        "-db", str(dict_path),
        # Format 10 is comma-separated values with the listed fields.
        "-outfmt", "10 " + " ".join(columns),
    ]
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    raw_output = subprocess.check_output(cmd, universal_newlines=True)
    rows = [line.split(",") for line in raw_output.splitlines() if line]
    # Passing `columns` to the constructor keeps the schema even when there
    # are zero hits; assigning `.columns` afterwards fails on an empty frame.
    data = pd.DataFrame(rows, columns=columns)
    for column in columns[2:]:
        data[column] = pd.to_numeric(data[column])
    return data

In [17]:
# Genome FASTA; also used as the BLAST database prefix when the database is built.
path_genome = "./data/L_infantum/dict/TriTrypDB-67_LinfantumJPCM5_Genome.fasta"
# Multi-FASTA whose records are BLASTed one at a time against the genome.
path_main_fasta = "./data/consensus+LmSIDER2A+B/neg_matched_with_LmSIDER2A+B.fasta"
# Output folder for the per-record CSV files; created here if it is missing.
path_folder = "./data/consensus+LmSIDER2A+B/neg_dat_filter_decider"
os.makedirs(path_folder, exist_ok=True)

In [20]:
# Build the nucleotide BLAST database from the genome FASTA. The same path is
# used as input and as output, so makeblastdb's "-out" prefix equals the FASTA
# path and later blastn calls can pass `path_genome` as the database name.
blastn_dic(path_input=path_genome, path_output=path_genome)

In [23]:
# BLAST every record of the main FASTA against the genome database and save its
# hits, sorted by ascending E-value, as "<record id>.csv" inside `path_folder`.

# Scratch query file; it is overwritten with the current record on each pass.
path_tmp_fasta = f"{path_folder}/tmp_fasta.fasta"

# The context manager closes the FASTA handle once all records are processed.
with open(path_main_fasta) as main_fasta_handle:
    for fasta in SeqIO.parse(main_fasta_handle, "fasta"):
        SeqIO.write(fasta, path_tmp_fasta, "fasta")
        data = blastn_blaster(query_path=path_tmp_fasta, dict_path=path_genome)
        # Reassign instead of sorting in place so the step is easy to re-run.
        data = data.sort_values(by="evalue", ascending=True)
        path_csv = f"{path_folder}/{fasta.id}.csv"
        data.to_csv(path_csv, index=False, sep=",", header=True)