In [5]:
import shlex
import subprocess

import pandas as pd
from Bio import SeqIO

- `main_data` (loaded from `path_main`) = Consensus+LmSIDER2A|B sequences that matched LmSIDER with e-value $\leq 10^{-3}$
- `contrast_1` = Consensus+LmSIDER2A|B sequences that matched LmSIDER with e-value $> 10^{-3}$
- `contrast_2` = Consensus+LmSIDER2A|B sequences that did NOT match LmSIDER

In [2]:
# Input locations, assembled from their shared directory prefixes.
_data_root = "./data"
_consensus_dir = _data_root + "/consensus+LmSIDER2A+B"
_contrast_dir = _consensus_dir + "/neg_dat_filter_decider/contrast"

# Genome FASTA / BLAST database name; filter_checker hands it to blastn as `-db`.
path_genome = _data_root + "/L_infantum/dict/TriTrypDB-67_LinfantumJPCM5_Genome.fasta"
# Main dataset FASTA.
path_main = _consensus_dir + "/neg_matched_with_LmSIDER2A+B_and_evalue_10pow-3.fasta"
# Contrast dataset FASTA files.
contrast_1 = (
    _contrast_dir
    + "/neg_data_LmSIDER2A+B_and_no_evalue"
    + "/neg_data_LmSIDER2A+B_and_no_evalue.fasta"
)
contrast_2 = (
    _contrast_dir
    + "/neg_data_no_LmSIDER2A+B"
    + "/neg_data_no_LmSIDER2A+B.fasta"
)

In [18]:
def blastn_blaster(query_path, dict_path):
    """Run ``blastn`` for a query against a BLAST database and return the hits.

    The command line is executed through bash (``shell=True``), so
    ``query_path`` may be a plain file path or a bash expression such as a
    process substitution ``<(...)``. Both arguments are inserted into the
    command verbatim (no quoting is applied here), so callers must pass
    values that are already safe for the shell.

    Parameters
    ----------
    query_path : str
        Text placed after ``-query``.
    dict_path : str
        Text placed after ``-db``.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty line of blastn's CSV output, with the 13
        columns requested through ``-outfmt``. Every column except
        ``qseqid`` and ``sseqid`` is converted with ``pd.to_numeric``.
        When blastn prints nothing, an empty frame with the same column
        names is returned.

    Raises
    ------
    RuntimeError
        If the command exits with a non-zero status. Previously such a
        failure was indistinguishable from "no hits found".
    """
    columns = ["qseqid", "sseqid", "pident", "qstart", "qend", "sstart", "send",
               "evalue", "bitscore", "length", "qlen", "qcovs", "slen"]
    numeric_columns = columns[2:]  # everything except the two identifier columns

    cmd = "blastn -word_size 11 -query " \
        + query_path + " -db " \
        + dict_path \
        + " -outfmt '10 " + " ".join(columns) + "'"
    # `text=True` already implies universal newlines; bash is required because
    # callers may pass a process substitution as the query.
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True,
                            executable='/usr/bin/bash')
    if result.returncode != 0:
        # Fail loudly instead of reporting a broken run as zero hits.
        raise RuntimeError(
            f"blastn exited with status {result.returncode}: {result.stderr.strip()}"
        )

    rows = [line.split(",") for line in result.stdout.split("\n") if line]
    if not rows:  # no hits: keep the schema so callers can still index columns
        return pd.DataFrame(columns=columns)

    data = pd.DataFrame(rows, columns=columns)
    data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric)
    return data

In [25]:
def filter_checker(fasta_input, evalue_filter, genome_path=None):
    """Count, per FASTA record, the BLAST hits at or below an e-value cutoff.

    Each record of ``fasta_input`` is sent individually to ``blastn_blaster``
    (through a bash process substitution, so no temporary file is written)
    and the hits with ``evalue <= evalue_filter`` are counted.

    Parameters
    ----------
    fasta_input : str
        Path of the FASTA file whose records are used as queries.
    evalue_filter : float
        Inclusive upper bound on the e-value of the hits that are counted.
    genome_path : str, optional
        Value passed to ``blastn_blaster`` as ``dict_path``. Defaults to the
        notebook-level ``path_genome``.

    Returns
    -------
    dict
        ``{record id: [number of hits kept, number of distinct sseqid values
        among them, evalue_filter]}``.
    """
    if genome_path is None:
        genome_path = path_genome  # notebook-level default, resolved at call time

    main_dict = {}
    # The context manager closes the FASTA handle once every record is processed.
    with open(fasta_input) as handle:
        for fasta in SeqIO.parse(handle, "fasta"):
            # Quote the record so ids containing quotes or other shell
            # metacharacters cannot break the command; printf '%s\n' prints it
            # literally (no backslash-escape interpretation, unlike `echo -e`).
            record_text = f">{fasta.id}\n{fasta.seq}"
            query = f"<(printf '%s\\n' {shlex.quote(record_text)})"
            hits = blastn_blaster(query_path=query, dict_path=genome_path)
            kept = hits[hits["evalue"] <= evalue_filter]
            main_dict[fasta.id] = [kept.shape[0], kept["sseqid"].nunique(), evalue_filter]
    return main_dict

In [31]:
# Hit summary for the main dataset; each value is
# [hits kept, distinct sseqid values, e-value cutoff] as built by filter_checker.
main_check = filter_checker(path_main, evalue_filter=1.0E-09)
for seq_id, summary in main_check.items():
    print(f"{seq_id}: {summary}")

Seq_11_LinJ.02: [9, 2, 1e-09]
Seq_13_LinJ.03: [4, 2, 1e-09]
Seq_26_LinJ.04: [4, 1, 1e-09]
Seq_27_LinJ.04: [5, 2, 1e-09]
Seq_28_LinJ.04: [4, 1, 1e-09]
Seq_29_LinJ.04: [4, 1, 1e-09]
Seq_102_LinJ.09: [6, 3, 1e-09]
Seq_189_LinJ.16: [18, 4, 1e-09]
Seq_190_LinJ.16: [20, 5, 1e-09]
Seq_205_LinJ.17: [10, 2, 1e-09]
Seq_206_LinJ.17: [10, 2, 1e-09]
Seq_207_LinJ.17: [10, 2, 1e-09]
Seq_211_LinJ.18: [7, 4, 1e-09]
Seq_279_LinJ.24: [12, 3, 1e-09]
Seq_280_LinJ.24: [10, 3, 1e-09]
Seq_281_LinJ.24: [10, 3, 1e-09]
Seq_294_LinJ.25: [11, 4, 1e-09]
Seq_340_LinJ.28: [14, 6, 1e-09]
Seq_378_LinJ.29: [5, 1, 1e-09]
Seq_400_LinJ.31: [5, 3, 1e-09]
Seq_646_LinJ.35: [10, 2, 1e-09]
Seq_671_LinJ.36: [3, 1, 1e-09]
Seq_681_LinJ.36: [10, 4, 1e-09]


In [32]:
# Hit summary for the first contrast dataset; each value is
# [hits kept, distinct sseqid values, e-value cutoff] as built by filter_checker.
contrast_1_check = filter_checker(contrast_1, evalue_filter=1.0E-09)
for seq_id, summary in contrast_1_check.items():
    print(f"{seq_id}: {summary}")

Seq_21_LinJ.04: [4, 1, 1e-09]
Seq_23_LinJ.04: [8, 3, 1e-09]
Seq_33_LinJ.05: [1, 1, 1e-09]
Seq_34_LinJ.05: [3, 1, 1e-09]
Seq_35_LinJ.05: [3, 1, 1e-09]
Seq_50_LinJ.08: [3, 2, 1e-09]
Seq_51_LinJ.08: [7, 2, 1e-09]
Seq_61_LinJ.08: [102, 4, 1e-09]
Seq_65_LinJ.08: [102, 4, 1e-09]
Seq_68_LinJ.08: [102, 4, 1e-09]
Seq_106_LinJ.09: [3, 2, 1e-09]
Seq_130_LinJ.12: [22, 1, 1e-09]
Seq_133_LinJ.12: [25, 1, 1e-09]
Seq_136_LinJ.12: [13, 1, 1e-09]
Seq_140_LinJ.12: [13, 1, 1e-09]
Seq_143_LinJ.12: [17, 1, 1e-09]
Seq_147_LinJ.12: [29, 1, 1e-09]
Seq_149_LinJ.12: [28, 1, 1e-09]
Seq_151_LinJ.12: [39, 1, 1e-09]
Seq_152_LinJ.12: [23, 1, 1e-09]
Seq_168_LinJ.13: [8, 3, 1e-09]
Seq_210_LinJ.18: [1, 1, 1e-09]
Seq_236_LinJ.19: [18, 2, 1e-09]
Seq_238_LinJ.19: [29, 3, 1e-09]
Seq_239_LinJ.19: [29, 3, 1e-09]
Seq_271_LinJ.23: [5, 2, 1e-09]
Seq_275_LinJ.24: [10, 2, 1e-09]
Seq_276_LinJ.24: [10, 3, 1e-09]
Seq_293_LinJ.25: [1, 1, 1e-09]
Seq_301_LinJ.26: [1, 1, 1e-09]
Seq_302_LinJ.26: [1, 1, 1e-09]
Seq_305_LinJ.26: [3, 1, 1e-09

In [33]:
# Hit summary for the second contrast dataset; each value is
# [hits kept, distinct sseqid values, e-value cutoff] as built by filter_checker.
contrast_2_check = filter_checker(contrast_2, evalue_filter=1.0E-09)
for seq_id, summary in contrast_2_check.items():
    print(f"{seq_id}: {summary}")

Seq_0_LinJ.01: [4, 1, 1e-09]
Seq_1_LinJ.01: [5, 4, 1e-09]
Seq_2_LinJ.01: [138, 3, 1e-09]
Seq_3_LinJ.02: [6, 2, 1e-09]
Seq_4_LinJ.02: [3, 2, 1e-09]
Seq_5_LinJ.02: [9, 5, 1e-09]
Seq_6_LinJ.02: [13, 4, 1e-09]
Seq_7_LinJ.02: [6, 2, 1e-09]
Seq_8_LinJ.02: [8, 4, 1e-09]
Seq_9_LinJ.02: [3, 3, 1e-09]
Seq_10_LinJ.02: [3, 1, 1e-09]
Seq_12_LinJ.02: [9, 2, 1e-09]
Seq_14_LinJ.03: [7, 3, 1e-09]
Seq_15_LinJ.03: [10, 4, 1e-09]
Seq_16_LinJ.04: [3, 2, 1e-09]
Seq_17_LinJ.04: [3, 2, 1e-09]
Seq_18_LinJ.04: [3, 2, 1e-09]
Seq_19_LinJ.04: [4, 2, 1e-09]
Seq_20_LinJ.04: [3, 2, 1e-09]
Seq_22_LinJ.04: [4, 1, 1e-09]
Seq_24_LinJ.04: [6, 2, 1e-09]
Seq_25_LinJ.04: [3, 2, 1e-09]
Seq_30_LinJ.05: [1, 1, 1e-09]
Seq_31_LinJ.05: [2, 1, 1e-09]
Seq_32_LinJ.05: [1, 1, 1e-09]
Seq_36_LinJ.05: [2, 2, 1e-09]
Seq_37_LinJ.05: [3, 3, 1e-09]
Seq_38_LinJ.06: [1, 1, 1e-09]
Seq_39_LinJ.06: [2, 2, 1e-09]
Seq_40_LinJ.06: [3, 2, 1e-09]
Seq_41_LinJ.06: [5, 4, 1e-09]
Seq_42_LinJ.06: [3, 2, 1e-09]
Seq_43_LinJ.07: [10, 5, 1e-09]
Seq_44_LinJ.07: