In [39]:
import os
import pandas as pd

In [14]:
# Move into the project directory so the relative "./data/..." paths used by the
# following cells resolve.
# NOTE(review): absolute, machine-specific path; this cell fails on any other
# machine. Consider making the project root configurable.
os.chdir("/home/viskuit/Documents/Work_CBMSO/Testing_Leishmania_project/11.True_Positive_coorectCoorCSV")
os.getcwd()

'/home/viskuit/Documents/Work_CBMSO/Testing_Leishmania_project/11.True_Positive_coorectCoorCSV'

Prepare SIDERs data for each chromosome:

In [None]:
import glob
import re

In [8]:
# SIDER FASTA files used later as BLAST queries
# (presumably one file per chromosome - confirm against the data folder).
results_files = glob.glob("./data/SIDERs/*.fasta")

# Chromosome number as written in each file path, e.g. "Chr12" -> 12
pattern = re.compile(r"Chr(\d+)")


def chromosome_number(path):
    """Return the integer that follows "Chr" in `path`.

    Raises:
        ValueError: if `path` contains no "Chr<number>" token.
    """
    match = pattern.search(path)
    if match is None:
        raise ValueError(f"Cannot find a 'Chr<number>' token in {path!r}")
    return int(match.group(1))


# Order the files by chromosome number
results_files = sorted(results_files, key=chromosome_number)

# Map "LinJ.NN" -> file path. The key comes from the number parsed out of the
# file name, not from the position in the list, so a missing or extra file
# cannot shift the labels of the remaining chromosomes.
files_dict = {f"LinJ.{chromosome_number(path):02d}": path for path in results_files}

Prepare the genome for each chromosome:

In [11]:
from Bio import SeqIO

In [15]:
# Whole-genome FASTA to split, and the folder that receives one sub-folder per record.
genome_path = "./data/dict/Whole_genome/TriTrypDB-67_LinfantumJPCM5_Genome.fasta"
folder_path = "./data/dict"

# Maps each record id in the genome FASTA to the path of its own single-record FASTA file.
chromosomes_dict = {}
for record in SeqIO.parse(genome_path, "fasta"):
    # Each record gets its own folder: <folder_path>/<record.id>/<record.id>.fasta
    chr_folder_path = os.path.join(folder_path, record.id)
    os.makedirs(chr_folder_path, exist_ok=True)
    output_file = f"{record.id}.fasta"
    output_path = os.path.join(chr_folder_path, output_file)
    chromosomes_dict[record.id] = output_path
    with open(output_path, "w") as f:
        # Header holds only the record id; the whole sequence is written on a single line.
        f.write(f">{record.id}\n{record.seq}\n")

Create a BLASTn database ("dict") for each chromosome file

In [18]:
import subprocess

In [200]:
# Build a nucleotide BLAST database for every chromosome FASTA file; the
# database prefix (-out) is the FASTA path itself.
for key, value in chromosomes_dict.items():
    # Argument list instead of a shell string, so paths containing spaces or
    # shell metacharacters are passed through unchanged.
    cmd = ["makeblastdb", "-in", value, "-dbtype", "nucl", "-parse_seqids", "-out", value]
    result = subprocess.run(
        cmd,
        stdout=subprocess.DEVNULL,  # keep the notebook output quiet on success
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    # A failed build used to pass silently and only surfaced later as a
    # confusing blastn error; fail here with the tool's own message instead.
    if result.returncode != 0:
        raise RuntimeError(f"makeblastdb failed for {key} ({value}):\n{result.stderr}")

Now let's run BLASTn to find the coordinates

In [None]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def get_fasta_format(sequence):
    """Print `sequence` rendered as a FASTA record.

    The sequence is wrapped in a Biopython ``SeqRecord`` and formatted with
    ``format("fasta")``. The text is printed, not returned.
    """
    print(SeqRecord(Seq(sequence)).format("fasta"))

In [64]:
def columns_to_numeric(data_input, columns_to_convert = ("pident", "length", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qlen", "slen")):
    """Convert the given columns of a DataFrame to numeric dtype, in place.

    Args:
        data_input: pandas.DataFrame containing every column named in
            `columns_to_convert`. It is modified in place.
        columns_to_convert: Iterable of column names to convert. The default is
            an immutable tuple so it cannot be altered between calls.

    Returns:
        The same DataFrame object that was passed in. Values that cannot be
        parsed become NaN (``errors='coerce'``).

    Raises:
        KeyError: if a requested column is missing from `data_input`.
    """
    for column in columns_to_convert:
        data_input[column] = pd.to_numeric(data_input[column], errors='coerce')
    return data_input

In [25]:
def blastn_blaster(query_path, dict_path, perc_identity):
    """Run BLASTn for a query FASTA against a BLAST database and return the hits.

    Args:
        query_path: Path to the query FASTA file (passed to ``-query``).
        dict_path: Path/prefix of the BLAST database (passed to ``-db``).
        perc_identity: Value passed to ``-perc_identity``.

    Returns:
        pandas.DataFrame with one row per hit and the 14 columns listed in
        ``columns`` below, all as strings. If BLASTn reports no hits, the frame
        is empty but still has those columns.

    Raises:
        subprocess.CalledProcessError: if ``blastn`` exits with a non-zero status.
    """
    columns = ["qseqid", "sseqid", "pident", "length", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qlen", "slen", "sstrand", "sseq"]
    # Argument list (no shell): paths with spaces or shell metacharacters are safe.
    # The -outfmt field list is built from `columns`, so the two cannot drift apart.
    cmd = [
        "blastn",
        "-word_size", "15",
        "-query", str(query_path),
        "-db", str(dict_path),
        "-perc_identity", str(perc_identity),
        "-outfmt", "10 " + " ".join(columns),
    ]
    output = subprocess.check_output(cmd, universal_newlines=True)
    rows = [line.split(",") for line in output.splitlines() if line]
    # Passing `columns` to the constructor keeps the schema even with zero rows;
    # assigning names afterwards raised ValueError on an empty result.
    return pd.DataFrame(rows, columns=columns)

In [205]:
# Minimum percent identity for a full-length hit to count in the second filter.
min_pident = 99.00

# Full-length hits of every chromosome; concatenated once after the loop
# instead of growing a DataFrame on every iteration.
sider_frames = []
for key, value in files_dict.items():
    print("")
    print(f"{key}:")
    # Count FASTA header lines in Python (same count as `grep '^>' | wc -l`);
    # `n` is an int, so no stray newline is printed.
    with open(value) as fasta_handle:
        n = sum(1 for line in fasta_handle if line.startswith(">"))
    print(f"\t- Number of SIDERs: {n}")

    data = blastn_blaster(value, chromosomes_dict[key], 0)
    data = columns_to_numeric(data)
    print(f"\t- Number of hits: {data.shape[0]}")
    print(f"\t- Number of unique hits: {data['qseqid'].nunique()}")

    # Hits whose alignment length equals the query length
    data_filtered = data[data["length"] == data["qlen"]]
    print(f"\t- Number of hits where length == qlen: {data_filtered.shape[0]}")
    print(f"\t\t- Unique hits {data_filtered['qseqid'].nunique()}")

    # The label now states the threshold actually applied; it used to read
    # "pident == 100.00" while the filter was `>= 99.00`.
    data_filtered2 = data_filtered[data_filtered["pident"] >= min_pident]
    print(f"\t- Number of hits where pident >= {min_pident:.2f}: {data_filtered2.shape[0]}")
    print(f"\t\t- Unique hits {data_filtered2['qseqid'].nunique()}")

    unique_queries = data_filtered2['qseqid'].nunique()
    total_hits = data_filtered2.shape[0]
    if unique_queries == n and total_hits == n:
        # Every SIDER has exactly one qualifying hit
        print(f"\t- Original SIDERs == Hits: TRUE")
    elif unique_queries == n and total_hits > n:
        # Every SIDER is found, but at least one has several qualifying hits
        print(f"\t- Original SIDERs == Hits: TRUE +")
    else:
        print(f"\t- Original SIDERs == Hits: FALSE")

    # NOTE(review): the length-only filter (`data_filtered`) is accumulated,
    # not the pident-filtered `data_filtered2` - confirm this is intended.
    sider_frames.append(data_filtered)

# Fall back to an empty frame when there were no files (pd.concat([]) raises).
whole_SIDERs = pd.concat(sider_frames, ignore_index=True) if sider_frames else pd.DataFrame()


LinJ.01:
	- Number of SIDERs: 7

	- Number of hits: 86
	- Number of unique hits: 7
	- Number of hits where length == qlen: 7
		- Unique hits 7
	- Number of hits where pident == 100.00: 7
		- Unique hits 7
	- Original SIDERs == Hits: TRUE

LinJ.02:
	- Number of SIDERs: 9

	- Number of hits: 60
	- Number of unique hits: 9
	- Number of hits where length == qlen: 18
		- Unique hits 9
	- Number of hits where pident == 100.00: 14
		- Unique hits 9
	- Original SIDERs == Hits: TRUE +

LinJ.03:
	- Number of SIDERs: 8

	- Number of hits: 24
	- Number of unique hits: 8
	- Number of hits where length == qlen: 8
		- Unique hits 8
	- Number of hits where pident == 100.00: 8
		- Unique hits 8
	- Original SIDERs == Hits: TRUE

LinJ.04:
	- Number of SIDERs: 15

	- Number of hits: 93
	- Number of unique hits: 15
	- Number of hits where length == qlen: 18
		- Unique hits 15
	- Number of hits where pident == 100.00: 15
		- Unique hits 15
	- Original SIDERs == Hits: TRUE

LinJ.05:
	- Number of SIDERs: 15


In [221]:
# Closer look at chromosome LinJ.06: BLAST its SIDER file against the LinJ.06
# database with -perc_identity 0 and convert the numeric columns.
data = blastn_blaster(files_dict["LinJ.06"], chromosomes_dict["LinJ.06"], 0)
data = columns_to_numeric(data)
print(data.shape)

(290, 14)


In [222]:
# Number of distinct query IDs (qseqid) among all LinJ.06 hits.
data["qseqid"].nunique()

17

In [223]:
# Number of hits per query ID.
data["qseqid"].value_counts()

qseqid
6C-451046d    25
6B-246729d    25
6B-189883d    24
6C-492436d    23
6C-501040r    22
6C-467907d    21
6B-180315d    21
6B-218084d    21
6B-137877d    21
6C-513440r    18
6C-455785d    15
6C-461271d    15
6-518081d     10
6A-8987d       9
6A-22189r      9
6-292261r      9
6-511666r      2
Name: count, dtype: int64

In [224]:
# Keep only the hits whose alignment length equals the query length (qlen).
data_2 = data[data["length"] == data["qlen"]]
print(data_2.shape)
data_2.head()

(21, 14)


Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
0,6A-8987d,LinJ.06,100.0,565,1,565,7034,7598,0.0,1044.0,565,525234,plus,TGACGACGTGGGGACACCTCAGCGCGTGGTGTCTCAGTGTCTTGTG...
1,6A-8987d,LinJ.06,99.646,565,1,565,20238,19674,0.0,1033.0,565,525234,minus,TGACGACGTGGGGACACCTCAGCGCGTGGTGTCTCAGTGTCTTGTG...
9,6A-22189r,LinJ.06,100.0,565,1,565,20238,19674,0.0,1044.0,565,525234,minus,TGACGACGTGGGGACACCTCAGCGCGTGGTGTCTCAGTGTCTTGTG...
10,6A-22189r,LinJ.06,99.646,565,1,565,7034,7598,0.0,1033.0,565,525234,plus,TGACGACGTGGGGACACCTCAGCGCGTGGTGTCTCAGTGTCTTGTG...
18,6B-137877d,LinJ.06,100.0,499,1,499,138605,139103,0.0,922.0,499,525234,plus,ACACGCACACATGCCTCTCAGTGCGTGCTGTCTCAGGCTCCAGTAC...


In [225]:
# Distinct query IDs that still have at least one hit after the length filter.
data_2["qseqid"].nunique()

16

In [226]:
# Hits per query ID after the length filter.
data_2["qseqid"].value_counts()

qseqid
6A-8987d      2
6A-22189r     2
6B-137877d    2
6B-189883d    2
6C-513440r    2
6B-180315d    1
6B-246729d    1
6B-218084d    1
6-292261r     1
6C-455785d    1
6C-467907d    1
6C-461271d    1
6C-492436d    1
6C-501040r    1
6-511666r     1
6-518081d     1
Name: count, dtype: int64

In [227]:
# Query IDs present in `data` but absent from `data_2`, i.e. queries with no
# hit whose length equals qlen.
data["qseqid"].value_counts().index.difference(data_2["qseqid"].value_counts().index)


Index(['6C-451046d'], dtype='object', name='qseqid')

In [237]:
# Subject sequence of the first hit of query 6C-451046d. Selected by query ID
# rather than by the positional index 139, which depends on the order in which
# BLAST happens to return its rows.
data.loc[data["qseqid"] == "6C-451046d", "sseq"].iloc[0]

'CTCAGCGTCCAGCGCCTACGACGTGGGAGGGAGGGGGAGCGCAGAGCGGTGTGGCGCTACGCATGTCGGCCATCAGATCCTGGATGGCGTTGCGTCGGAGCCACCCGCGGCCGCGGACACGTTTGTGGCACCCGTATGACAGGCGAAGCGTCCGCGTGCCTCGAACCTGTCCCACCCACGCCCCGGCCTTCACAGCGCCCGCTGCTGGTGTGGCGGGGAGCCTGAGGGCCGCCCCTCCCCCCCCCCCCCCGAGCGGGGATGCGCCAAGGCGGCGGACCTGCGAGGCGGCGGTGCGGGCAGAGGTTGAGGCAGGGGCCGTGCGCCGATGGCTGCGTCGGCGCATTGCTGTGCCGCGTGTGTCTACGGCCGGGCGTCGAGTGGCGCGGCGTTGAGCTCGTCGTGTATGGCTGAGAGATGGGCCACACGTTGGAAGCAACACGACAAAAGCACAGCGGCGCGTGG'

In [235]:
# All hits of the query that the length filter removed.
data[data["qseqid"] == "6C-451046d"]

Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
139,6C-451046d,LinJ.06,98.701,462,1,460,452826,453287,0.0,821.0,460,525234,plus,CTCAGCGTCCAGCGCCTACGACGTGGGAGGGAGGGGGAGCGCAGAG...
140,6C-451046d,LinJ.06,97.638,381,18,396,469688,470068,0.0,654.0,460,525234,plus,ACGACGTGGGAGGGAGGGGGGAGCGCAGAGCGGTGTGGCGCTACGC...
141,6C-451046d,LinJ.06,97.29,369,1,368,502801,502434,0.0,627.0,460,525234,minus,CTCGGCGTCCAGCGCCGACGACGTGGGAGGGAGGGGGGAGCGCAGA...
142,6C-451046d,LinJ.06,95.355,366,1,365,494224,494584,1.0799999999999999e-166,579.0,460,525234,plus,CTCGGCGTCCAGCGCCGACGACGTGGGAGGGAGGGGGGAGCGCAGA...
143,6C-451046d,LinJ.06,91.304,414,50,458,457704,458114,1.4e-160,558.0,460,525234,plus,TGTGTCGCCACGGATGTCGGCGGTCAGGTTCCCCGCATAGGCGTTG...
144,6C-451046d,LinJ.06,90.181,387,50,431,463188,463571,3.1e-142,497.0,460,525234,plus,TGTGTCGCCACGGATGTCGGCGGTCAGGTTCCCCGCATAGGCGTTG...
145,6C-451046d,LinJ.06,93.827,324,42,365,515209,514893,8.68e-138,483.0,460,525234,minus,CAGAGCGGTGTGGCGCTACGCATGTCGGCCATCAGATCCTGGATGG...
146,6C-451046d,LinJ.06,78.049,82,273,354,520161,520238,3.75e-07,49.1,460,525234,plus,GACCTGCGAGGCGGC--T-C-GGTGGCGTTTGAGACAGGGGCCGTG...
147,6C-451046d,LinJ.06,79.661,59,63,121,138798,138856,1.75e-05,43.6,460,525234,plus,ATGTCGGCGGTGAGGGCCTGGATGGCGTTGCGTCGGGGCGACCTGC...
148,6C-451046d,LinJ.06,79.661,59,63,121,181237,181295,1.75e-05,43.6,460,525234,plus,ATGTCGGCGGTGAGGGCCTGGATGGCGTTGCGTCGGGGCGACCTGC...


In [229]:
# Query IDs that appear more than once in data_2, and the rows belonging to them.
duplicate_qseqids = data_2["qseqid"].value_counts()[lambda x: x > 1].index
filtered_data_2 = data_2[data_2["qseqid"].isin(duplicate_qseqids)]
filtered_data_2

Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
0,6A-8987d,LinJ.06,100.0,565,1,565,7034,7598,0.0,1044.0,565,525234,plus,TGACGACGTGGGGACACCTCAGCGCGTGGTGTCTCAGTGTCTTGTG...
1,6A-8987d,LinJ.06,99.646,565,1,565,20238,19674,0.0,1033.0,565,525234,minus,TGACGACGTGGGGACACCTCAGCGCGTGGTGTCTCAGTGTCTTGTG...
9,6A-22189r,LinJ.06,100.0,565,1,565,20238,19674,0.0,1044.0,565,525234,minus,TGACGACGTGGGGACACCTCAGCGCGTGGTGTCTCAGTGTCTTGTG...
10,6A-22189r,LinJ.06,99.646,565,1,565,7034,7598,0.0,1033.0,565,525234,plus,TGACGACGTGGGGACACCTCAGCGCGTGGTGTCTCAGTGTCTTGTG...
18,6B-137877d,LinJ.06,100.0,499,1,499,138605,139103,0.0,922.0,499,525234,plus,ACACGCACACATGCCTCTCAGTGCGTGCTGTCTCAGGCTCCAGTAC...
19,6B-137877d,LinJ.06,99.8,499,1,499,247487,247985,0.0,917.0,499,525234,plus,ACACGCACACATGCCTCTCAGTGCGTGCTGTCTCAGGCTCCAGTAC...
60,6B-189883d,LinJ.06,100.0,588,1,588,190612,191199,0.0,1086.0,588,525234,plus,TTCTCGCCTGCTCGACACACGTGCACACGCACACATGCCTCTCAGT...
61,6B-189883d,LinJ.06,99.15,588,1,588,247463,248050,0.0,1059.0,588,525234,plus,TTCTCGCCTGCTCGACACACGTGCACACGCACACATGCCTCTCAGT...
262,6C-513440r,LinJ.06,99.385,325,1,325,515213,514891,1.2400000000000001e-169,588.0,325,525234,minus,AAGTCAGAGCGGTGTGGCGCTACGCATGTCGGCCATCAGATCCTGG...
266,6C-513440r,LinJ.06,95.077,325,5,323,469713,470037,3.57e-145,507.0,325,525234,plus,CAGAGCGGTGTGGCGCTACGCATGTCGGCCATCAGATCCTGGATGG...


In [230]:
# Query IDs that appear exactly once in data_2, and the rows belonging to them.
unique_qseqids = data_2["qseqid"].value_counts()[lambda x: x == 1].index
filtered_unique_data_2 = data_2[data_2["qseqid"].isin(unique_qseqids)]
filtered_unique_data_2

Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
39,6B-180315d,LinJ.06,100.0,628,1,628,181044,181671,0.0,1160.0,628,525234,plus,ACACACACACATGCCTCTCAGTGCGTGCTGTCTCAGGCTCCAGTAC...
84,6B-218084d,LinJ.06,100.0,609,1,609,218818,219426,0.0,1125.0,609,525234,plus,AGGTATGAAGGGTTGAGTGGGGGCAGCTGCGCGAAGCGACGGCCTC...
105,6B-246729d,LinJ.06,100.0,631,1,631,247463,248093,0.0,1166.0,631,525234,plus,TTCTCGCCTGCTCGACACACGTGCACACGCACACATGCCTCTCAGT...
130,6-292261r,LinJ.06,100.0,367,1,367,294045,293679,0.0,678.0,367,525234,minus,ACACACGCACACGCCTCTCAGTGCGTGATGTCTTAGGGTCCAGCAC...
164,6C-455785d,LinJ.06,99.638,552,1,552,457567,458116,0.0,1007.0,552,525234,plus,CTGATGGCGGAGGGGACACCTCCGTGAGTGGCATCTCAGGGTCCAG...
179,6C-461271d,LinJ.06,99.808,522,1,522,463051,463571,0.0,957.0,522,525234,plus,CTGATGGCGGAGGGGACACCTCCGTGAGTGGCATCTCAGGGTCCAG...
194,6C-467907d,LinJ.06,99.746,394,1,394,469686,470078,0.0,721.0,394,525234,plus,CGACGACGTGGGAGGGAGGGGGGAGCGCAGAGCGGTGTGGCGCTAC...
215,6C-492436d,LinJ.06,100.0,462,1,462,494211,494672,0.0,854.0,462,525234,plus,CCCTGCGCGGCATCTCGGCGTCCAGCGCCGACGACGTGGGAGGGAG...
238,6C-501040r,LinJ.06,99.785,466,1,466,502814,502350,0.0,854.0,466,525234,minus,CCCCGCGCGGCATCTCGGCGTCCAGCGCCGACGACGTGGGAGGGAG...
260,6-511666r,LinJ.06,100.0,428,1,428,513441,513014,0.0,791.0,428,525234,minus,TCTCTGTGAAAGCCAGGCGGCGGTTCTATCCCTAGCAATGCGAATC...


In [231]:
# From the full-length hits, keep only those with pident exactly 100.
# NOTE(review): the per-chromosome loop earlier in the notebook filters with
# `pident >= 99.00`; this cell is stricter.
data_3 = data_2[data_2["pident"] == 100.00]
print(data_3.shape)
data_3.head()

(11, 14)


Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
0,6A-8987d,LinJ.06,100.0,565,1,565,7034,7598,0.0,1044.0,565,525234,plus,TGACGACGTGGGGACACCTCAGCGCGTGGTGTCTCAGTGTCTTGTG...
9,6A-22189r,LinJ.06,100.0,565,1,565,20238,19674,0.0,1044.0,565,525234,minus,TGACGACGTGGGGACACCTCAGCGCGTGGTGTCTCAGTGTCTTGTG...
18,6B-137877d,LinJ.06,100.0,499,1,499,138605,139103,0.0,922.0,499,525234,plus,ACACGCACACATGCCTCTCAGTGCGTGCTGTCTCAGGCTCCAGTAC...
39,6B-180315d,LinJ.06,100.0,628,1,628,181044,181671,0.0,1160.0,628,525234,plus,ACACACACACATGCCTCTCAGTGCGTGCTGTCTCAGGCTCCAGTAC...
60,6B-189883d,LinJ.06,100.0,588,1,588,190612,191199,0.0,1086.0,588,525234,plus,TTCTCGCCTGCTCGACACACGTGCACACGCACACATGCCTCTCAGT...


In [232]:
# Distinct query IDs with a full-length hit at 100% identity.
data_3["qseqid"].nunique()

11

In [233]:
# Hits per query ID among the full-length hits at 100% identity.
data_3["qseqid"].value_counts()

qseqid
6A-8987d      1
6A-22189r     1
6B-137877d    1
6B-180315d    1
6B-189883d    1
6B-218084d    1
6B-246729d    1
6-292261r     1
6C-492436d    1
6-511666r     1
6-518081d     1
Name: count, dtype: int64

In [234]:
# Same duplicate check as for data_2, now on data_3: query IDs with more than
# one row, and the rows belonging to them.
# Note that this reassigns `duplicate_qseqids`.
duplicate_qseqids = data_3["qseqid"].value_counts()[lambda x: x > 1].index
filtered_data_3 = data_3[data_3["qseqid"].isin(duplicate_qseqids)]
filtered_data_3

Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
