In [39]:
import os
import pandas as pd

In [14]:
# Project working directory: every relative path used below (./data/...) is
# resolved against it. The location can be overridden with the
# SIDER_PROJECT_DIR environment variable so the notebook is not tied to one
# machine; when the variable is unset the original path is used.
PROJECT_DIR = os.environ.get(
    "SIDER_PROJECT_DIR",
    "/home/viskuit/Documents/Work_CBMSO/Testing_Leishmania_project/11.True_Positive_coorectCoorCSV",
)
os.chdir(PROJECT_DIR)
os.getcwd()

'/home/viskuit/Documents/Work_CBMSO/Testing_Leishmania_project/11.True_Positive_coorectCoorCSV'

Prepare SIDERs data for each chromosome:

In [None]:
import glob
import re

In [8]:
# Per-chromosome SIDER FASTA files (these are used as BLAST queries later on).
results_files = glob.glob("./data/SIDERs/*.fasta")

# The chromosome number is encoded in each path as "Chr<number>".
pattern = re.compile(r"Chr(\d+)")


def chromosome_number(path):
    """Return the integer that follows "Chr" in `path`.

    Raises:
        ValueError: if `path` contains no "Chr<number>" token.
    """
    match = pattern.search(path)
    if match is None:
        raise ValueError(f"No 'Chr<number>' token found in SIDER file path: {path}")
    return int(match.group(1))


# Order the files by chromosome number.
results_files = sorted(results_files, key=chromosome_number)

# Map "LinJ.NN" -> file path. The key is built from the number parsed out of
# the file name rather than from the position in the sorted list, so a missing
# chromosome file cannot shift every later file onto the wrong key.
files_dict = {}
for path in results_files:
    chromosome_id = f"LinJ.{str(chromosome_number(path)).zfill(2)}"
    if chromosome_id in files_dict:
        # Two files for one chromosome would silently overwrite each other.
        raise ValueError(
            f"More than one SIDER file for {chromosome_id}: {files_dict[chromosome_id]} and {path}"
        )
    files_dict[chromosome_id] = path

Prepare the genome for each chromosome:

In [11]:
from Bio import SeqIO

In [15]:
genome_path = "./data/dict/Whole_genome/TriTrypDB-67_LinfantumJPCM5_Genome.fasta"
folder_path = "./data/dict"

# Split the whole-genome FASTA into one single-record FASTA per sequence,
# written to <folder_path>/<record id>/<record id>.fasta, and remember where
# each one was written (record id -> path).
chromosomes_dict = {}
for chromosome in SeqIO.parse(genome_path, "fasta"):
    chromosome_dir = os.path.join(folder_path, chromosome.id)
    os.makedirs(chromosome_dir, exist_ok=True)
    chromosome_fasta = os.path.join(chromosome_dir, f"{chromosome.id}.fasta")
    chromosomes_dict[chromosome.id] = chromosome_fasta
    with open(chromosome_fasta, "w") as handle:
        # Header line followed by the whole sequence on a single line.
        handle.writelines([f">{chromosome.id}\n", f"{chromosome.seq}\n"])

Create a BLASTn database (dict) for each chromosome file

In [18]:
import subprocess

In [23]:
# Build a nucleotide BLAST database for every per-chromosome FASTA file; the
# FASTA path itself is reused as the database name given to -out.
for fasta_path in chromosomes_dict.values():
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    result = subprocess.run(
        ["makeblastdb", "-in", fasta_path, "-dbtype", "nucl", "-parse_seqids", "-out", fasta_path],
        stdout=subprocess.DEVNULL,  # the normal progress output is not needed
        stderr=subprocess.PIPE,     # kept so a failure can be reported
        universal_newlines=True,
    )
    # Fail here, with the tool's own message, instead of letting a later
    # blastn call fail on a database that was never created.
    if result.returncode != 0:
        raise RuntimeError(
            f"makeblastdb failed for {fasta_path} (exit code {result.returncode}):\n{result.stderr}"
        )

Now let's run BLASTn to find the coordinates

In [64]:
def columns_to_numeric(data_input, columns_to_convert = ["pident", "length", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qlen", "slen"]):
    """Convert the listed columns of `data_input` to numeric values, in place.

    Every column named in `columns_to_convert` is replaced by the result of
    `pd.to_numeric(..., errors='coerce')`, so values that cannot be parsed
    become NaN. The DataFrame that was passed in is modified and returned.
    """
    def _as_number(series):
        # Unparseable entries are coerced to NaN rather than raising.
        return pd.to_numeric(series, errors='coerce')

    for name in columns_to_convert:
        data_input[name] = _as_number(data_input[name])
    return data_input

In [25]:
def blastn_blaster(query_path, dict_path, perc_identity):
    """Run BLASTn of `query_path` against the database `dict_path` and return the hits.

    Args:
        query_path: path of the FASTA file passed to `-query`.
        dict_path: BLAST database passed to `-db`.
        perc_identity: value passed to `-perc_identity`.

    Returns:
        pandas.DataFrame with one row per hit and the 14 columns requested in
        `-outfmt` (qseqid ... sseq). Every value is a string, exactly as
        printed by blastn. When there are no hits, an empty DataFrame with
        those 14 columns is returned.

    Raises:
        subprocess.CalledProcessError: if blastn exits with a non-zero status.
        FileNotFoundError: if the blastn executable cannot be found.
    """
    columns = ["qseqid", "sseqid", "pident", "length", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qlen", "slen", "sstrand", "sseq"]
    # Argument list instead of a shell string: paths with spaces or shell
    # metacharacters are passed through unchanged.
    cmd = [
        "blastn",
        "-word_size", "15",
        "-query", str(query_path),
        "-db", str(dict_path),
        "-perc_identity", str(perc_identity),
        # Format 10 = comma-separated values, restricted to the columns above.
        "-outfmt", "10 " + " ".join(columns),
    ]
    output = subprocess.check_output(cmd, universal_newlines=True)
    rows = [line.split(",") for line in output.split("\n") if line]
    # Passing the column names to the constructor keeps the frame well-formed
    # even when `rows` is empty (no hits).
    return pd.DataFrame(rows, columns=columns)

In [162]:
# For every chromosome: BLAST its SIDER sequences against that chromosome,
# print how many hits survive each filter, and collect the full-length hits.
full_length_hits = []  # one frame per chromosome, concatenated once after the loop
for key, value in files_dict.items():
    print("")
    print(f"{key}:")
    # Number of query sequences = number of FASTA header lines (starting with ">").
    # Counted in Python so `n` is an int with no trailing newline.
    with open(value) as fasta:
        n = sum(1 for line in fasta if line.startswith(">"))
    print(f"\t- Number of SIDERs: {n}")

    data = blastn_blaster(value, chromosomes_dict[key], 0)
    data = columns_to_numeric(data)
    print("")
    print(f"\t- Number of hits: {data.shape[0]}")
    print(f"\t- Number of unique hits: {data['qseqid'].nunique()}")

    # Keep hits whose alignment length equals the query length.
    data_filtered = data[data["length"] == data["qlen"]]
    print(f"\t- Number of hits where length == qlen: {data_filtered.shape[0]}")
    print(f"\t\t- Unique hits {data_filtered['qseqid'].nunique()}")

    # The label states the threshold that is actually applied (>= 99.00).
    data_filtered2 = data_filtered[data_filtered["pident"] >= 99.00]
    print(f"\t- Number of hits where pident >= 99.00: {data_filtered2.shape[0]}")
    print(f"\t\t- Unique hits {data_filtered2['qseqid'].nunique()}")

    # A chromosome only passes when every SIDER has at least one surviving hit.
    # Comparing row counts alone would let duplicate hits of one SIDER hide
    # another SIDER that has none.
    n_hits = data_filtered2.shape[0]
    n_queries_hit = data_filtered2["qseqid"].nunique()
    if n_queries_hit < n:
        print(f"\t- Original SIDERs == Hits: FALSE")
    elif n_hits == n:
        print(f"\t- Original SIDERs == Hits: TRUE")
    else:
        # Every SIDER was found and at least one has extra hits.
        print(f"\t- Original SIDERs == Hits: TRUE +")

    # NOTE(review): the length == qlen hits (data_filtered) are accumulated,
    # not the identity-filtered ones (data_filtered2) — confirm this is intended.
    full_length_hits.append(data_filtered)

# Single concatenation instead of growing the frame inside the loop.
whole_SIDERs = pd.concat(full_length_hits, ignore_index=True) if full_length_hits else pd.DataFrame()


LinJ.01:
	- Number of SIDERs: 7


	- Number of hits: 86
	- Number of unique hits: 7
	- Number of hits where length == qlen: 7
		- Unique hits 7
	- Number of hits where pident == 100.00: 7
		- Unique hits 7
	- Original SIDERs == Hits: TRUE

LinJ.02:
	- Number of SIDERs: 9


	- Number of hits: 60
	- Number of unique hits: 9
	- Number of hits where length == qlen: 18
		- Unique hits 9
	- Number of hits where pident == 100.00: 14
		- Unique hits 9
	- Original SIDERs == Hits: TRUE +

LinJ.03:
	- Number of SIDERs: 8


	- Number of hits: 24
	- Number of unique hits: 8
	- Number of hits where length == qlen: 8
		- Unique hits 8
	- Number of hits where pident == 100.00: 8
		- Unique hits 8
	- Original SIDERs == Hits: TRUE

LinJ.04:
	- Number of SIDERs: 16


	- Number of hits: 96
	- Number of unique hits: 16
	- Number of hits where length == qlen: 18
		- Unique hits 15
	- Number of hits where pident == 100.00: 15
		- Unique hits 15
	- Original SIDERs == Hits: FALSE

LinJ.05:
	- Number of SIDERs

In [182]:
# Re-run the BLAST for chromosome LinJ.21 on its own to inspect it in detail.
chromosome_id = "LinJ.21"
data = columns_to_numeric(
    blastn_blaster(files_dict[chromosome_id], chromosomes_dict[chromosome_id], 0)
)
print(data.shape)

(816, 14)


In [183]:
# Number of distinct query IDs (qseqid) among the hits in `data`.
data["qseqid"].nunique()

35

In [184]:
# Number of hits per query ID.
data["qseqid"].value_counts()

qseqid
21A-294619d      44
21A-104152d      40
21A-201975r      36
21A-266095d      35
21A-276680d      34
21-513961r       34
21A-223734r      32
21-8C-644995d    29
21A-219487r      28
21A-372468d      27
21A-242592d      26
21B-541286d      26
21B-573417r      26
21B-566924r      26
21A-163187r      26
21A-249545r      25
21D-693075r      25
21A-306832d      25
21B-577624r      24
21A-195942r      24
21D-652705d      23
21B-570089r      23
21A-226888d      22
21D-656836d      18
21B-561422r      18
21E-717300r      16
21C-658115r      16
21E-729870d      16
21C-618739r      16
21-15431d        15
21B-534180r      15
21-13466d        12
21-47696d        10
21-85406d         3
21-397676r        1
Name: count, dtype: int64

In [185]:
# Keep only the hits whose alignment length equals the query length.
covers_whole_query = data["length"].eq(data["qlen"])
data_2 = data.loc[covers_whole_query]
print(data_2.shape)
data_2.head()

(38, 14)


Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
0,21-13466d,LinJ.21,100.0,464,1,464,13844,14307,0.0,857.0,464,764861,plus,CTTCCCTGGTGGAGAGCATGTCAGCGTGGTGTATCAGGGCCCAGTT...
12,21-15431d,LinJ.21,100.0,379,1,379,15809,16187,0.0,701.0,379,764861,plus,CCTGATGCCGGAGAGCACCTCAGTGTGTTCTCACGGTCCAGTGTCT...
27,21-47696d,LinJ.21,100.0,471,1,471,48904,49374,0.0,870.0,471,764861,plus,GCACCCCTATTCCTGCCAGATGCAGAGCCACCTCTGCTGGTGTCAG...
37,21-85406d,LinJ.21,100.0,171,1,171,86612,86782,4.71e-88,316.0,171,764861,plus,GAGGAGGCACACATGCCTCCGTGCGCGGTATCTCAGGGCCCAGTGA...
40,21A-104152d,LinJ.21,99.587,484,1,484,105359,105840,0.0,881.0,484,764861,plus,CACATGCAACCCTCTCAGCGCGTGGCACCTCAGAGTCCAGTGCCCT...


In [186]:
# Number of distinct query IDs left in data_2 (the hits where length == qlen).
data_2["qseqid"].nunique()

33

In [187]:
# Hits per query ID in data_2; a count above 1 means one query has several
# hits with length == qlen.
data_2["qseqid"].value_counts()

qseqid
21A-226888d    4
21B-577624r    2
21B-561422r    2
21-13466d      1
21-15431d      1
21A-163187r    1
21A-195942r    1
21-85406d      1
21-47696d      1
21A-219487r    1
21A-223734r    1
21A-242592d    1
21A-249545r    1
21A-266095d    1
21A-276680d    1
21A-201975r    1
21A-104152d    1
21A-306832d    1
21A-294619d    1
21-513961r     1
21A-372468d    1
21B-541286d    1
21B-566924r    1
21B-570089r    1
21-397676r     1
21B-573417r    1
21C-618739r    1
21D-652705d    1
21D-656836d    1
21C-658115r    1
21D-693075r    1
21E-717300r    1
21E-729870d    1
Name: count, dtype: int64

In [188]:
# Query IDs that appear in data but not in data_2.
data["qseqid"].value_counts().index.difference(data_2["qseqid"].value_counts().index)


Index(['21-8C-644995d', '21B-534180r'], dtype='object', name='qseqid')

In [199]:
# All rows of data for the query ID "21B-534180r".
data[data["qseqid"] == "21B-534180r"]

Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
499,21B-534180r,LinJ.21,99.31,145,1,144,537199,537055,1.84e-71,261.0,144,764861,minus,TGATGACAAGGGGGGGAAGCACGCAGACACCCACCGCTCAGTGCGT...
500,21B-534180r,LinJ.21,98.54,137,1,135,544305,544441,2.3999999999999997e-65,241.0,144,764861,plus,TGATGACAAGGGGGGGGGAAGCACGCAGACACCCACCGCTCAGTGC...
501,21B-534180r,LinJ.21,95.205,146,1,144,576435,576291,1.87e-61,228.0,144,764861,minus,TGATGACGAGGGGGGGGGAAGCACGCAGACACCCACCGCTCAGTGC...
502,21B-534180r,LinJ.21,94.521,146,1,144,569945,569800,2.4200000000000002e-60,224.0,144,764861,minus,TGATGACGAGGGGGGGAAGCACGCAGACACCCACCGCTCAGTGCGT...
503,21B-534180r,LinJ.21,93.86,114,32,144,573108,572995,3.1899999999999996e-44,171.0,144,764861,minus,CACTGCTCAGTGCGTGGCATCAGAGGGTCCAGCCCCACTGCACCCC...
504,21B-534180r,LinJ.21,94.595,111,36,144,580642,580532,3.1899999999999996e-44,171.0,144,764861,minus,GCTCAGTGCGTGGCATCAGAGGGTCCAGCCCCACCGCACACCCCCC...
505,21B-534180r,LinJ.21,87.85,107,37,140,516971,516867,3.26e-29,121.0,144,764861,minus,CTCAATGCGTGATATCGCAGGGTCCAGTACCACCGCATCCCCCTCA...
506,21B-534180r,LinJ.21,78.571,98,37,134,656871,656780,7.21e-11,60.2,144,764861,minus,CTCAGTGCATGGTATCACAAGATCCAGTACCACCGC----CCCCCT...
507,21B-534180r,LinJ.21,94.737,38,100,135,652140,652177,2.59e-10,58.4,144,764861,plus,CTCTGCGTGTGTGTGAGGGAGGAAGCCCAGCAGCCCCC
508,21B-534180r,LinJ.21,94.737,38,100,135,655634,655671,2.59e-10,58.4,144,764861,plus,CTCTGCGTGTGTGTGAGGGAGGAAGCCCAGCAGCCCCC


In [190]:
# Query IDs that occur more than once in data_2, and the rows belonging to them.
hits_per_query = data_2["qseqid"].value_counts()
duplicate_qseqids = hits_per_query[hits_per_query > 1].index
filtered_data_2 = data_2.loc[data_2["qseqid"].isin(duplicate_qseqids)]
filtered_data_2

Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
226,21A-226888d,LinJ.21,100.0,512,1,512,228087,228598,0.0,946.0,512,764861,plus,CCGGGCAGCCCCCTCCCCCATCCCCTGCCAAGTGCCGAGCCGCTTC...
229,21A-226888d,LinJ.21,97.07,512,1,512,203099,202588,0.0,863.0,512,764861,minus,CCGGGCAGCCTCCTCCCCCATTCCCTGCCAAGTGCCGAGCCGCTTC...
230,21A-226888d,LinJ.21,96.68,512,1,512,220628,220123,0.0,846.0,512,764861,minus,CCGGGCAGCCCCCTCCCCCATCCCTTGCCAAGTGCCGAGCCGCTTC...
232,21A-226888d,LinJ.21,95.703,512,1,512,197105,196596,0.0,822.0,512,764861,minus,CCGGGCAGCCCCCTCCCCCATGCCTTGCCAAGTGCCGAGCCGCTTC...
540,21B-561422r,LinJ.21,100.0,333,1,333,564443,564111,8.5e-178,616.0,333,764861,minus,GCAGACACCCACCTCACCCTCCCCAATGCCGAGCCACTCCTGGTGG...
541,21B-561422r,LinJ.21,97.898,333,1,333,573012,572681,1.44e-165,575.0,333,764861,minus,GCAG-CCCCCACCTCACCCTCCCCAATGCCGAGCCACTCCTGGTGG...
633,21B-577624r,LinJ.21,100.0,420,1,420,580642,580223,0.0,776.0,420,764861,minus,GCTCAGTGCGTGGCATCAGAGGGTCCAGCCCCACCGCACACCCCCC...
635,21B-577624r,LinJ.21,97.619,420,1,420,544342,544759,0.0,719.0,420,764861,plus,GCTCAGTGCGTGGCATCAGAGGGTCCAGCCCCACCG--CACCCCCC...


In [191]:
# Query IDs that occur exactly once in data_2, and the rows belonging to them.
hits_per_query = data_2["qseqid"].value_counts()
unique_qseqids = hits_per_query[hits_per_query == 1].index
filtered_unique_data_2 = data_2.loc[data_2["qseqid"].isin(unique_qseqids)]
filtered_unique_data_2

Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
0,21-13466d,LinJ.21,100.0,464,1,464,13844,14307,0.0,857.0,464,764861,plus,CTTCCCTGGTGGAGAGCATGTCAGCGTGGTGTATCAGGGCCCAGTT...
12,21-15431d,LinJ.21,100.0,379,1,379,15809,16187,0.0,701.0,379,764861,plus,CCTGATGCCGGAGAGCACCTCAGTGTGTTCTCACGGTCCAGTGTCT...
27,21-47696d,LinJ.21,100.0,471,1,471,48904,49374,0.0,870.0,471,764861,plus,GCACCCCTATTCCTGCCAGATGCAGAGCCACCTCTGCTGGTGTCAG...
37,21-85406d,LinJ.21,100.0,171,1,171,86612,86782,4.71e-88,316.0,171,764861,plus,GAGGAGGCACACATGCCTCCGTGCGCGGTATCTCAGGGCCCAGTGA...
40,21A-104152d,LinJ.21,99.587,484,1,484,105359,105840,0.0,881.0,484,764861,plus,CACATGCAACCCTCTCAGCGCGTGGCACCTCAGAGTCCAGTGCCCT...
80,21A-163187r,LinJ.21,100.0,521,1,521,164385,163865,0.0,963.0,521,764861,minus,AGGCCAGGCAGCCCCCTCCCCCACCCCCTGCCAAGTGCCGAGCCGC...
106,21A-195942r,LinJ.21,100.0,551,1,551,197143,196593,0.0,1018.0,551,764861,minus,GTGACACTTCAGCACCCACTCTCTCTGTCTGGGAAAAGCCGGGCAG...
130,21A-201975r,LinJ.21,100.0,594,1,594,203175,202582,0.0,1098.0,594,764861,minus,GCGCGTGGCACACACACACCCCTCAGTGCGTGGCACCTCAGGGACC...
166,21A-219487r,LinJ.21,99.646,565,1,565,220687,220123,0.0,1033.0,565,764861,minus,CGCACGCACACCCCTCAGTGCGTGGCACTTCAGCACCCACTCTCTC...
194,21A-223734r,LinJ.21,100.0,558,1,558,224934,224377,0.0,1031.0,558,764861,minus,GGCGCTTCGAGGGCCCCGTGCGCCAACTTTGCTCCGGGGAAATCCG...


In [192]:
# From data_2, keep only the hits with 100 % identity.
is_perfect_match = data_2["pident"].eq(100.00)
data_3 = data_2.loc[is_perfect_match]
print(data_3.shape)
data_3.head()

(30, 14)


Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
0,21-13466d,LinJ.21,100.0,464,1,464,13844,14307,0.0,857.0,464,764861,plus,CTTCCCTGGTGGAGAGCATGTCAGCGTGGTGTATCAGGGCCCAGTT...
12,21-15431d,LinJ.21,100.0,379,1,379,15809,16187,0.0,701.0,379,764861,plus,CCTGATGCCGGAGAGCACCTCAGTGTGTTCTCACGGTCCAGTGTCT...
27,21-47696d,LinJ.21,100.0,471,1,471,48904,49374,0.0,870.0,471,764861,plus,GCACCCCTATTCCTGCCAGATGCAGAGCCACCTCTGCTGGTGTCAG...
37,21-85406d,LinJ.21,100.0,171,1,171,86612,86782,4.71e-88,316.0,171,764861,plus,GAGGAGGCACACATGCCTCCGTGCGCGGTATCTCAGGGCCCAGTGA...
80,21A-163187r,LinJ.21,100.0,521,1,521,164385,163865,0.0,963.0,521,764861,minus,AGGCCAGGCAGCCCCCTCCCCCACCCCCTGCCAAGTGCCGAGCCGC...


In [193]:
# Number of distinct query IDs in data_3.
data_3["qseqid"].nunique()

30

In [194]:
# Hits per query ID in data_3.
data_3["qseqid"].value_counts()

qseqid
21-13466d      1
21-15431d      1
21-47696d      1
21-85406d      1
21A-163187r    1
21A-195942r    1
21A-201975r    1
21A-223734r    1
21A-226888d    1
21A-242592d    1
21A-249545r    1
21A-266095d    1
21A-276680d    1
21A-294619d    1
21A-306832d    1
21A-372468d    1
21-397676r     1
21-513961r     1
21B-541286d    1
21B-561422r    1
21B-566924r    1
21B-570089r    1
21B-577624r    1
21C-618739r    1
21D-652705d    1
21D-656836d    1
21C-658115r    1
21D-693075r    1
21E-717300r    1
21E-729870d    1
Name: count, dtype: int64

In [195]:
# Query IDs that occur more than once in data_3, and the rows belonging to them.
hits_per_query = data_3["qseqid"].value_counts()
duplicate_qseqids = hits_per_query[hits_per_query > 1].index
filtered_data_3 = data_3.loc[data_3["qseqid"].isin(duplicate_qseqids)]
filtered_data_3

Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
