In [2]:
# Needed modules
import os
import subprocess
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [18]:
# Needed functions
def fasta_creator(csv_input, output_path):
    """Write one FASTA record per row of a table.

    Args:
        csv_input: table iterated with ``.iterrows()``; every row must
            provide the "sseq" (sequence) and "sseqid" (subject ID) fields.
        output_path: path of the FASTA file to write.

    Each record ID has the form ``Seq_<row index>_<sseqid>`` and every
    record carries the description "Leishmania infantum".
    """
    records = [
        SeqRecord(
            Seq(entry["sseq"]),
            id=f"Seq_{idx}_{entry['sseqid']}",
            description="Leishmania infantum",
        )
        for idx, entry in csv_input.iterrows()
    ]
    SeqIO.write(records, output_path, "fasta")

def blastn_dic(path_input, path_output):
    """Build a nucleotide BLAST database from a FASTA file with ``makeblastdb``.

    Args:
        path_input: path of the FASTA file given to ``-in``.
        path_output: base path given to ``-out`` for the database files.

    Raises:
        FileNotFoundError: if the ``makeblastdb`` executable cannot be found.

    The command is passed as an argument list, without a shell, so paths
    containing spaces or shell metacharacters reach ``makeblastdb`` as a
    single, uninterpreted argument. The tool's stdout/stderr are discarded
    and its exit status is not checked.
    """
    cmd = [
        "makeblastdb",
        "-in", path_input,
        "-dbtype", "nucl",
        "-parse_seqids",  # keep the sequence IDs in the output
        "-out", path_output,
    ]
    subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

def blastn_blaster(query_path, dict_path):
    """Run ``blastn`` (word size 11) and return its hits as a DataFrame.

    Args:
        query_path: path of the FASTA file given to ``-query``.
        dict_path: BLAST database path given to ``-db``.

    Returns:
        pandas.DataFrame with the columns qseqid, sseqid, pident, length,
        qstart, qend, sstart, send, evalue, bitscore, qlen and slen. Every
        column except qseqid and sseqid is converted to a numeric dtype.
        When BLAST reports no hits, an empty DataFrame with the same
        columns is returned.

    Raises:
        subprocess.CalledProcessError: if ``blastn`` exits with a non-zero status.
        FileNotFoundError: if the ``blastn`` executable cannot be found.

    The command is passed as an argument list, without a shell, so paths
    containing spaces or shell metacharacters are handed to ``blastn``
    unchanged instead of being split or interpreted.
    """
    columns = [
        "qseqid", "sseqid", "pident", "length", "qstart", "qend",
        "sstart", "send", "evalue", "bitscore", "qlen", "slen",
    ]
    numeric_columns = columns[2:]  # everything but the two ID columns

    cmd = [
        "blastn",
        "-word_size", "11",
        "-query", query_path,
        "-db", dict_path,
        # Format 10 = comma-separated values, restricted to the fields above.
        "-outfmt", "10 " + " ".join(columns),
    ]
    output = subprocess.check_output(cmd, universal_newlines=True)

    # One hit per non-empty output line, fields separated by commas.
    rows = [line.split(",") for line in output.split("\n") if line]
    if not rows:
        return pd.DataFrame(columns=columns)

    data = pd.DataFrame(rows, columns=columns)
    data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric)
    return data

In [3]:
# Reference genome, also used as the BLASTn database path
path_genome = "./data/L_infantum/dict/TriTrypDB-67_LinfantumJPCM5_Genome.fasta"

# Negative data without LmSIDER2A+B: FASTA file and the folder that contains it
path_neg_data_no_LmSIDER2AB = "./data/consensus+LmSIDER2A+B/neg_dat_filter_decider/contrast/neg_data_no_LmSIDER2A+B/neg_data_no_LmSIDER2A+B.fasta"
path_folder_neg_data_no_LmSIDER2AB = os.path.split(path_neg_data_no_LmSIDER2AB)[0]

# Negative data "no_LmSIDER2A+B_and_evalue": FASTA file and the folder that contains it
path_neg_data_no_LmSIDER2AB_and_evalue = "./data/consensus+LmSIDER2A+B/neg_dat_filter_decider/contrast/neg_data_no_LmSIDER2A+B_and_evalue/neg_data_no_LmSIDER2A+B_and_evalue.fasta"
path_folder_neg_data_no_LmSIDER2AB_and_evalue = os.path.split(path_neg_data_no_LmSIDER2AB_and_evalue)[0]

# No os.makedirs call is made here: both folders are the parent directories
# of the input FASTA files above.

In [20]:
# Create BLASTn dict
# blastn_dic(path_input=path_genome, path_output=path_genome)

In [19]:
# Function that creates the CSV files with BLASTn
def csv_blasting(path_fasta, path_folder, genome_path):
    """BLASTn every record of a FASTA file and save one CSV of hits per record.

    Args:
        path_fasta: FASTA file whose records are used as queries, one at a time.
        path_folder: folder that receives the ``<record id>.csv`` files; it is
            also used for the scratch file ``tmp.fasta``.
        genome_path: BLAST database path passed to ``blastn_blaster``.

    Each CSV holds the hits of one record sorted by ascending e-value. The
    scratch query file is removed when the function finishes, including when
    it fails part-way.
    """
    # Scratch file that holds the single record currently used as query.
    path_tmp_fasta = os.path.join(path_folder, "tmp.fasta")
    try:
        for fasta in SeqIO.parse(path_fasta, "fasta"):
            SeqIO.write(fasta, path_tmp_fasta, "fasta")
            # BLASTn
            hits = blastn_blaster(query_path=path_tmp_fasta, dict_path=genome_path)
            hits = hits.sort_values(by="evalue", ascending=True)
            path_csv = os.path.join(path_folder, f"{fasta.id}.csv")
            hits.to_csv(path_csv, index=False, sep=",", header=True)
    finally:
        # Do not leave the scratch query behind in the data folder.
        if os.path.exists(path_tmp_fasta):
            os.remove(path_tmp_fasta)

In [20]:
# Run BLASTn for each sequence of the data without LmSIDER2A+B and create its CSV
csv_blasting(
    path_fasta=path_neg_data_no_LmSIDER2AB,
    path_folder=path_folder_neg_data_no_LmSIDER2AB,
    genome_path=path_genome,
)

In [21]:
# Run BLASTn for each sequence of the "evalue > 10 ** -3" data set and create its CSV
# NOTE(review): the previous comment said "with LmSIDER2A+B", but the input
# file is named "neg_data_no_LmSIDER2A+B_and_evalue" - confirm which is intended.
csv_blasting(
    path_fasta=path_neg_data_no_LmSIDER2AB_and_evalue,
    path_folder=path_folder_neg_data_no_LmSIDER2AB_and_evalue,
    genome_path=path_genome,
)

In [22]:
# Load the negative testing elements (comma-separated, first line is the header)
neg_data = pd.read_csv(
    "./data/consensus+LmSIDER2A+B/negatives_testing_elements.csv",
    sep=",",
    header=0,
)

In [23]:
# Display the row at integer position 317 of the negative testing elements table
neg_data.iloc[317]

sseqid                                               LinJ.27
sstart                                                     3
send                                                     114
sstrand                                                 plus
sseq       CCCTACACCCTAACCCTAACCCTAACCCAGTACACCCAGTACACCA...
Name: 317, dtype: object