In [39]:
import os
import pandas as pd

In [14]:
# Working directory for the whole notebook: the relative "./data/..." paths
# used in later cells are resolved against it. The location can be overridden
# with the LEISHMANIA_PROJECT_DIR environment variable so the notebook is not
# tied to one machine; the default is the original author's path.
PROJECT_DIR = os.environ.get(
    "LEISHMANIA_PROJECT_DIR",
    "/home/viskuit/Documents/Work_CBMSO/Testing_Leishmania_project/11.True_Positive_coorectCoorCSV",
)
os.chdir(PROJECT_DIR)
os.getcwd()

'/home/viskuit/Documents/Work_CBMSO/Testing_Leishmania_project/11.True_Positive_coorectCoorCSV'

Prepare the SIDERs data for each chromosome:

In [None]:
import glob
import re

In [8]:
# Regex that extracts the chromosome number from a SIDER file name ("Chr<N>").
pattern = re.compile(r"Chr(\d+)")


def build_files_dict(paths, chr_pattern=pattern):
    """Map chromosome IDs ("LinJ.NN") to their SIDER FASTA paths.

    The chromosome number is taken from the file name itself (first match of
    `chr_pattern`), not from the position of the file in the list, so a
    missing or extra file cannot shift the remaining chromosomes onto the
    wrong key.

    Parameters
    ----------
    paths : iterable of str
        FASTA file paths whose names contain "Chr<number>".
    chr_pattern : compiled regex, optional
        Pattern whose first group is the chromosome number.

    Returns
    -------
    dict
        ``{"LinJ.01": path, ...}`` ordered by chromosome number.

    Raises
    ------
    ValueError
        If a path has no chromosome number or two paths share one.
    """
    numbered = []
    for path in paths:
        match = chr_pattern.search(path)
        if match is None:
            raise ValueError(f"No chromosome number found in file name: {path!r}")
        numbered.append((int(match.group(1)), path))

    mapping = {}
    for number, path in sorted(numbered):
        chr_id = f"LinJ.{str(number).zfill(2)}"
        if chr_id in mapping:
            raise ValueError(
                f"Two SIDER files map to {chr_id}: {mapping[chr_id]!r} and {path!r}"
            )
        mapping[chr_id] = path
    return mapping


# Path to the per-chromosome SIDER FASTA files
results_files = glob.glob("./data/SIDERs/*.fasta")

# Dictionary with the chromosome ID as key and the file path as value,
# ordered by chromosome number
files_dict = build_files_dict(results_files)

# Keep the list itself ordered by chromosome number as well
results_files = list(files_dict.values())

Prepare the genome for each chromosome:

In [11]:
from Bio import SeqIO

In [15]:
genome_path = "./data/dict/Whole_genome/TriTrypDB-67_LinfantumJPCM5_Genome.fasta"
folder_path = "./data/dict"

# Split the whole-genome FASTA into one single-record FASTA per record, each
# written to its own sub-folder named after the record ID, and remember the
# path of every file that is written.
chromosomes_dict = {}
for chromosome in SeqIO.parse(genome_path, "fasta"):
    chromosome_dir = os.path.join(folder_path, chromosome.id)
    os.makedirs(chromosome_dir, exist_ok=True)
    fasta_path = os.path.join(chromosome_dir, f"{chromosome.id}.fasta")
    chromosomes_dict[chromosome.id] = fasta_path
    with open(fasta_path, "w") as handle:
        # Header line followed by the whole sequence on a single line.
        handle.write(f">{chromosome.id}\n{chromosome.seq}\n")

Create a BLASTn database ("dict") for each chromosome file:

In [18]:
import subprocess

In [200]:
# Build a nucleotide BLAST database for every chromosome FASTA. The database
# prefix (-out) is the FASTA path itself.
for key, value in chromosomes_dict.items():
    # Argument list, no shell: paths containing spaces or shell metacharacters
    # reach makeblastdb verbatim instead of being re-parsed by /bin/sh.
    cmd = ["makeblastdb", "-in", value, "-dbtype", "nucl", "-parse_seqids", "-out", value]
    result = subprocess.run(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    # Fail loudly here: a silently missing database would otherwise only
    # surface later as an error from blastn.
    if result.returncode != 0:
        raise RuntimeError(f"makeblastdb failed for {key} ({value}):\n{result.stderr}")

Now let's run BLASTn to find the coordinates:

In [238]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def get_fasta_format(sequence):
    """Print `sequence` as a FASTA record.

    The string is wrapped in a Biopython ``SeqRecord`` created without an ID
    or description, rendered with ``format("fasta")`` and written to stdout.
    Nothing is returned.
    """
    record = SeqRecord(Seq(sequence))
    print(record.format("fasta"))

In [64]:
def columns_to_numeric(data_input, columns_to_convert=("pident", "length", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qlen", "slen")):
    """Convert the given columns of a BLAST hit table to numeric dtypes.

    Parameters
    ----------
    data_input : pandas.DataFrame
        Table to convert. It is modified in place.
    columns_to_convert : iterable of str, optional
        Names of the columns to convert. Defaults to the numeric fields of the
        BLAST output used in this notebook. The default is an immutable tuple
        so it cannot be altered between calls.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, returned for convenience.

    Notes
    -----
    Values that cannot be parsed become NaN (``errors="coerce"``).
    """
    for column in columns_to_convert:
        data_input[column] = pd.to_numeric(data_input[column], errors="coerce")
    return data_input

In [25]:
def blastn_blaster(query_path, dict_path, perc_identity):
    """Run blastn (word size 15) and return the hits as a DataFrame.

    Parameters
    ----------
    query_path : str
        FASTA file with the query sequences.
    dict_path : str
        BLAST database to search (value passed to ``-db``).
    perc_identity : int or float
        Minimum percent identity passed to ``-perc_identity``.

    Returns
    -------
    pandas.DataFrame
        One row per hit with the columns qseqid, sseqid, pident, length,
        qstart, qend, sstart, send, evalue, bitscore, qlen, slen, sstrand and
        sseq. Every value is still a string, exactly as split from the CSV
        output. When blastn reports no hits, an empty DataFrame with the same
        14 columns is returned.

    Raises
    ------
    subprocess.CalledProcessError
        If blastn exits with a non-zero status.
    FileNotFoundError
        If the blastn executable is not on PATH.
    """
    columns = ["qseqid", "sseqid", "pident", "length", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qlen", "slen", "sstrand", "sseq"]
    # Argument list, no shell: paths with spaces or shell metacharacters are
    # passed to blastn unchanged. Output format 10 is comma-separated values.
    cmd = [
        "blastn",
        "-word_size", "15",
        "-query", str(query_path),
        "-db", str(dict_path),
        "-perc_identity", str(perc_identity),
        "-outfmt", "10 " + " ".join(columns),
    ]
    raw = subprocess.check_output(cmd, universal_newlines=True)
    rows = [line.split(",") for line in raw.split("\n") if line]
    # Passing the column names to the constructor (instead of assigning them
    # afterwards) keeps the zero-hit case from raising a length mismatch.
    return pd.DataFrame(rows, columns=columns)

In [205]:
# For every chromosome: BLAST its SIDER sequences against that chromosome,
# report how many hits survive each filter, and collect the full-length hits.
sider_frames = []
for key, value in files_dict.items():
    print("")
    print(f"{key}:")
    # Count the FASTA headers directly; n is an int, so the report line has no
    # trailing newline and no shell pipeline is needed.
    with open(value) as fasta:
        n = sum(1 for line in fasta if line.startswith(">"))
    print(f"\t- Number of SIDERs: {n}")

    data = blastn_blaster(value, chromosomes_dict[key], 0)
    data = columns_to_numeric(data)
    print(f"\t- Number of hits: {data.shape[0]}")
    # "Unique hits" below counts distinct query IDs (qseqid), not distinct loci.
    print(f"\t- Number of unique hits: {data['qseqid'].nunique()}")

    # Hits whose alignment length equals the full query length.
    data_filtered = data[data["length"] == data["qlen"]]
    print(f"\t- Number of hits where length == qlen: {data_filtered.shape[0]}")
    print(f"\t\t- Unique hits {data_filtered['qseqid'].nunique()}")

    # Full-length hits with at least 99 % identity; the label now states the
    # threshold that is actually applied.
    data_filtered2 = data_filtered[data_filtered["pident"] >= 99.00]
    print(f"\t- Number of hits where pident >= 99.00: {data_filtered2.shape[0]}")
    print(f"\t\t- Unique hits {data_filtered2['qseqid'].nunique()}")

    n_queries = data_filtered2['qseqid'].nunique()
    n_hits = data_filtered2.shape[0]
    if n_queries == n and n_hits == n:
        # Every SIDER recovered exactly once.
        print("\t- Original SIDERs == Hits: TRUE")
    elif n_queries == n and n_hits > n:
        # Every SIDER recovered, some of them at more than one position.
        print("\t- Original SIDERs == Hits: TRUE +")
    else:
        print("\t- Original SIDERs == Hits: FALSE")

    # NOTE(review): the full-length hits (data_filtered) are collected, not the
    # >= 99 % identity subset (data_filtered2) - confirm this is intended.
    sider_frames.append(data_filtered)

# Concatenate once after the loop instead of growing the frame per iteration.
whole_SIDERs = pd.concat(sider_frames, ignore_index=True) if sider_frames else pd.DataFrame()


LinJ.01:
	- Number of SIDERs: 7

	- Number of hits: 86
	- Number of unique hits: 7
	- Number of hits where length == qlen: 7
		- Unique hits 7
	- Number of hits where pident == 100.00: 7
		- Unique hits 7
	- Original SIDERs == Hits: TRUE

LinJ.02:
	- Number of SIDERs: 9

	- Number of hits: 60
	- Number of unique hits: 9
	- Number of hits where length == qlen: 18
		- Unique hits 9
	- Number of hits where pident == 100.00: 14
		- Unique hits 9
	- Original SIDERs == Hits: TRUE +

LinJ.03:
	- Number of SIDERs: 8

	- Number of hits: 24
	- Number of unique hits: 8
	- Number of hits where length == qlen: 8
		- Unique hits 8
	- Number of hits where pident == 100.00: 8
		- Unique hits 8
	- Original SIDERs == Hits: TRUE

LinJ.04:
	- Number of SIDERs: 15

	- Number of hits: 93
	- Number of unique hits: 15
	- Number of hits where length == qlen: 18
		- Unique hits 15
	- Number of hits where pident == 100.00: 15
		- Unique hits 15
	- Original SIDERs == Hits: TRUE

LinJ.05:
	- Number of SIDERs: 15


In [425]:
# BLAST the LinJ.35 SIDER file against the LinJ.35 database with perc_identity
# set to 0, then convert the numeric columns before inspecting the table.
data = columns_to_numeric(
    blastn_blaster(files_dict["LinJ.35"], chromosomes_dict["LinJ.35"], 0)
)
print(data.shape)

(2793, 14)


In [426]:
# Count the distinct query IDs (qseqid) present among the LinJ.35 hits.
data["qseqid"].nunique()

72

In [427]:
# Number of hits per query ID, most frequent first.
data["qseqid"].value_counts()

qseqid
35M-1693985r      67
35M-1676847d      67
35N-1818131d      64
35B-385760d       64
35N-1830577d      64
                  ..
35B-364332d       14
35K-1390003r      14
35A-2072704r      14
19D-35-858746d    14
19D-35-852414d    13
Name: count, Length: 72, dtype: int64

In [428]:
# Keep only the hits whose alignment length equals the full query length.
data_2 = data.loc[data["length"].eq(data["qlen"])]
print(data_2.shape)
data_2.head()

(100, 14)


Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
0,35A-41620r,LinJ.35,100.0,487,1,487,47291,46805,0.0,900.0,487,2019672,minus,CCCACTCTGTATGGGAAAGCCATACAGCCCCCTACTTGCTGACAAA...
50,35A-58693d,LinJ.35,100.0,502,1,502,64364,64865,0.0,928.0,502,2019672,plus,CCCGCTCTGTGTGGGAAAGCCATACAGCCCCCTACTTGCTGACAAA...
100,35-103785r,LinJ.35,100.0,588,1,588,109732,109145,0.0,1086.0,588,2019672,minus,TGATGGCGGGAGACACATACGTGCGTGATATCTCTCAGCGTCCAGT...
119,35-260414d,LinJ.35,100.0,559,1,559,193800,194358,0.0,1033.0,559,2019672,plus,CTCAGTGCGTGATATCTCAGCGTCCAGTGCACCCCCACACACACAC...
134,35-315582d,LinJ.35,100.0,578,1,578,248724,249301,0.0,1068.0,578,2019672,plus,GGAGGTGACACACCTCAGCGCATGGTATCCAGGCTCCAATGCCCCC...


In [429]:
# Distinct query IDs that still have at least one full-length hit.
data_2["qseqid"].nunique()

67

In [430]:
# Full-length hits per query ID; a count above 1 means the query has more than
# one full-length alignment on the chromosome.
data_2["qseqid"].value_counts()

qseqid
35F-2074715r    6
35O-1891302d    5
35C-416008d     3
35A-2069688r    3
35D-595714d     3
               ..
35-336806d      1
35B-364332d     1
35B-385760d     1
35A-58693d      1
35A-41620r      1
Name: count, Length: 67, dtype: int64

In [431]:
# Query IDs that have hits in `data` but no full-length hit in `data_2`.
data["qseqid"].value_counts().index.difference(data_2["qseqid"].value_counts().index)


Index(['19D-35-852414d', '19D-35-858746d', '35A-2072704r', '35F-659109r',
       '35L-1442543r'],
      dtype='object', name='qseqid')

In [445]:
# Print the first hit listed for SIDER 35L-1442543r as FASTA, with alignment
# gaps ("-") removed. The row is selected by query ID rather than by a
# hard-coded position, so the cell keeps pointing at the same SIDER if the
# BLAST table changes size or order.
get_fasta_format(
    data.loc[data["qseqid"] == "35L-1442543r", "sseq"].iloc[0].replace("-", "")
)

><unknown id> <unknown description>
CGAGCGCCCCCACTCTGCATGGATGCCGAACCACCCCCACTGCCCTATCCCTGCCAACGC
GGCGCCACTGATGGTGGCGACAGGGTCCAGCGCCCACGACGTGGCGTGGCGTCGGGGCGA
CCTGCGAGGCGGGTGGTGGGCAGAGTTCGAGGCCGCGGCCGTGCTCATGTGACTGAGTCG
GCGCATTGCTGTGACGCGCGTGTGTAGCGCTGCTTCGAGCAACGAGGTGTGGGGCCTGCG
GCAGCCCGGGGTCGAGTGGAGCTCGACTCGTGTTCAGA



In [444]:
# All LinJ.35 hits for 35L-1442543r, one of the query IDs without a
# full-length hit.
data[data["qseqid"] == "35L-1442543r"]

Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
1646,35L-1442543r,LinJ.35,99.64,278,1,277,1374166,1373889,1.15e-144,507.0,277,2019672,minus,CGAGCGCCCCCACTCTGCATGGATGCCGAACCACCCCCACTGCCCT...
1647,35L-1442543r,LinJ.35,99.267,273,1,272,1409928,1409656,3.2300000000000004e-140,492.0,277,2019672,minus,CGAGCGCCCCCACTCTGCATGGATGCCGAACCACCCCCACTGCCCT...
1648,35L-1442543r,LinJ.35,99.617,261,1,260,1368439,1368179,3.2499999999999997e-135,475.0,277,2019672,minus,CGAGCGCCCCCACTCTGCATGGATGCCGAACCACCCCCACTGCCCT...
1649,35L-1442543r,LinJ.35,99.563,229,1,228,1387812,1387584,2e-117,416.0,277,2019672,minus,CGAGCGCCCCCACTCTGCATGGATGCCGAACCACCCCCACTGCCCT...
1650,35L-1442543r,LinJ.35,75.581,172,100,266,270335,270502,3.86e-15,76.8,277,2019672,plus,GTGGCGCGGCTGT-GCGGCGACCTGC-AGAGCGGATGCTGAGTAGA...
1651,35L-1442543r,LinJ.35,79.787,94,115,204,319239,319328,1.08e-10,62.1,277,2019672,plus,GGCGGCACTGCGAGGCGGGTGGGTGGGTAGGGCTCCATGCAGGGGC...
1652,35L-1442543r,LinJ.35,82.54,63,142,204,109306,109244,5.02e-09,56.5,277,2019672,minus,GAGTTCCAGGCAGAGGTCCTGCTCAGATGACCGAGTCGGCGCATTG...
1653,35L-1442543r,LinJ.35,76.923,104,115,214,774063,774165,1.81e-08,54.7,277,2019672,plus,GGCGGCCTGCAGAGTGCAGGGTGGGCGGGTGGAGTGTGAGGCGGGG...
1654,35L-1442543r,LinJ.35,76.923,104,115,214,781388,781490,1.81e-08,54.7,277,2019672,plus,GGCGGCCTGCAGAGTGCAGGGTGGGCGGGTGGAGTGTGAGGCGGGG...
1655,35L-1442543r,LinJ.35,75.0,116,102,214,406437,406325,8.41e-07,49.1,277,2019672,minus,GGCGTTGGGTCGGGGCGACCTGCAGAGCGGTGGGTGG-GGGTGTAT...


In [434]:
# Full-length hits belonging to query IDs that have more than one such hit.
duplicate_qseqids = data_2["qseqid"].value_counts().loc[lambda counts: counts.gt(1)].index
filtered_data_2 = data_2.loc[data_2["qseqid"].isin(duplicate_qseqids)]
filtered_data_2

Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
298,35C-416008d,LinJ.35,100.0,421,1,421,349148,349568,0.0,778.0,421,2019672,plus,GGAAGCCATGCGGCCCCTCCCCCTTATCCCTGCCAATGCCGAAGCA...
299,35C-416008d,LinJ.35,99.287,421,1,421,360979,360559,0.0,761.0,421,2019672,minus,GGACGCCAAGCGGCCCCTCCCCCTTATCCCTGCCAATGCCGAAGCA...
300,35C-416008d,LinJ.35,99.05,421,1,421,409597,409177,0.0,756.0,421,2019672,minus,GGACGCCAAGCGGCCCCTCCCCCTTATCCCTGCCAATGCCGAAGCA...
343,35C-427900r,LinJ.35,100.0,494,1,494,361039,360546,0.0,913.0,494,2019672,minus,GACGACGAGGACTACCCCAGCGTGACATCACAGGGTCCAGCGCCCT...
344,35C-427900r,LinJ.35,99.19,494,1,494,409657,409164,0.0,891.0,494,2019672,minus,GACGACGAGGACTACCCCAGCGTGACATCACAGGGTCCAGCGCCCT...
536,35-498559r,LinJ.35,100.0,567,1,567,431694,431128,0.0,1048.0,567,2019672,minus,ATGACGGGGGATACTTCATTACGTGGCATCACAGGGTCCAGCGCCC...
537,35-498559r,LinJ.35,99.824,567,1,567,436889,436323,0.0,1042.0,567,2019672,minus,ATGACGGGGGATACTTCATTACGTGGCATCACAGGGTCCAGCGCCC...
585,35D-544895d,LinJ.35,100.0,518,1,518,483224,483741,0.0,957.0,518,2019672,plus,AGGCTCCAGTACACCCGCTCTGTGTGGAGAGGTCAAGCAGCCACTC...
586,35D-544895d,LinJ.35,99.807,518,1,518,492087,491570,0.0,952.0,518,2019672,minus,AGGGTCCAGTACACCCGCTCTGTGTGGAGAGGTCAAGCAGCCACTC...
629,35D-553758r,LinJ.35,100.0,518,1,518,492087,491570,0.0,957.0,518,2019672,minus,AGGGTCCAGTACACCCGCTCTGTGTGGAGAGGTCAAGCAGCCACTC...


In [435]:
# Full-length hits belonging to query IDs that have exactly one such hit.
unique_qseqids = data_2["qseqid"].value_counts().loc[lambda counts: counts.eq(1)].index
filtered_unique_data_2 = data_2.loc[data_2["qseqid"].isin(unique_qseqids)]
filtered_unique_data_2

Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
0,35A-41620r,LinJ.35,100.0,487,1,487,47291,46805,0.0,900.0,487,2019672,minus,CCCACTCTGTATGGGAAAGCCATACAGCCCCCTACTTGCTGACAAA...
50,35A-58693d,LinJ.35,100.0,502,1,502,64364,64865,0.0,928.0,502,2019672,plus,CCCGCTCTGTGTGGGAAAGCCATACAGCCCCCTACTTGCTGACAAA...
100,35-103785r,LinJ.35,100.0,588,1,588,109732,109145,0.0,1086.0,588,2019672,minus,TGATGGCGGGAGACACATACGTGCGTGATATCTCTCAGCGTCCAGT...
119,35-260414d,LinJ.35,100.0,559,1,559,193800,194358,0.0,1033.0,559,2019672,plus,CTCAGTGCGTGATATCTCAGCGTCCAGTGCACCCCCACACACACAC...
134,35-315582d,LinJ.35,100.0,578,1,578,248724,249301,0.0,1068.0,578,2019672,plus,GGAGGTGACACACCTCAGCGCATGGTATCCAGGCTCCAATGCCCCC...
175,35-336806d,LinJ.35,100.0,587,1,587,269955,270541,0.0,1085.0,587,2019672,plus,CCCATTGATGACGGGGAACACCTCACTGTTGCACCTCAGGGTTCAT...
220,35B-364332d,LinJ.35,100.0,453,1,453,297479,297931,0.0,837.0,453,2019672,plus,CCCACTCTCAGTGTGGAGGGAAAGCCACGCAGCCCCCTACCCCCTA...
234,35B-385760d,LinJ.35,100.0,522,1,522,318904,319425,0.0,965.0,522,2019672,plus,CCCGCTCTCTGTGAGGGGGGAAGTCGAGCAGCCCCCTACCCCCTAT...
395,35C-452591d,LinJ.35,100.0,527,1,527,385729,386255,0.0,974.0,527,2019672,plus,GATGACGAGGACTACCCCAGCGTGACATCACAGGGTCCAGCGCCCT...
451,35-473451r,LinJ.35,100.0,351,1,351,406586,406236,0.0,649.0,351,2019672,minus,GCCCCTTCTTTTCCCCGGCATTTGTTGAGCTCTGATGACGAGGAAC...


In [436]:
# Of the full-length hits, keep those with exactly 100 % identity.
data_3 = data_2.loc[data_2["pident"].eq(100.00)]
print(data_3.shape)
data_3.head()

(69, 14)


Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
0,35A-41620r,LinJ.35,100.0,487,1,487,47291,46805,0.0,900.0,487,2019672,minus,CCCACTCTGTATGGGAAAGCCATACAGCCCCCTACTTGCTGACAAA...
50,35A-58693d,LinJ.35,100.0,502,1,502,64364,64865,0.0,928.0,502,2019672,plus,CCCGCTCTGTGTGGGAAAGCCATACAGCCCCCTACTTGCTGACAAA...
100,35-103785r,LinJ.35,100.0,588,1,588,109732,109145,0.0,1086.0,588,2019672,minus,TGATGGCGGGAGACACATACGTGCGTGATATCTCTCAGCGTCCAGT...
119,35-260414d,LinJ.35,100.0,559,1,559,193800,194358,0.0,1033.0,559,2019672,plus,CTCAGTGCGTGATATCTCAGCGTCCAGTGCACCCCCACACACACAC...
134,35-315582d,LinJ.35,100.0,578,1,578,248724,249301,0.0,1068.0,578,2019672,plus,GGAGGTGACACACCTCAGCGCATGGTATCCAGGCTCCAATGCCCCC...


In [437]:
# Distinct query IDs with at least one full-length, 100 %-identity hit.
data_3["qseqid"].nunique()

66

In [438]:
# Full-length, 100 %-identity hits per query ID.
data_3["qseqid"].value_counts()

qseqid
35A-2069688r    2
35N-1818131d    2
35N-1830577d    2
35-1554674d     1
35L-1477408r    1
               ..
35E-629742r     1
35E-623371r     1
35E-617016d     1
35D-595714d     1
35F-735969d     1
Name: count, Length: 66, dtype: int64

In [439]:
# Full-length, 100 %-identity hits of the query IDs that have more than one.
duplicate_qseqids = data_3["qseqid"].value_counts().loc[lambda counts: counts.gt(1)].index
filtered_data_3 = data_3.loc[data_3["qseqid"].isin(duplicate_qseqids)]
filtered_data_3

Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen,sstrand,sseq
2054,35N-1818131d,LinJ.35,100.0,610,1,610,1771565,1772174,0.0,1127.0,610,2019672,plus,GATGACGGGAGGGAGGGAACACACCTCAGTGCGGTGGTGTAACAGA...
2055,35N-1818131d,LinJ.35,100.0,610,1,610,1784011,1784620,0.0,1127.0,610,2019672,plus,GATGACGGGAGGGAGGGAACACACCTCAGTGCGGTGGTGTAACAGA...
2118,35N-1830577d,LinJ.35,100.0,610,1,610,1771565,1772174,0.0,1127.0,610,2019672,plus,GATGACGGGAGGGAGGGAACACACCTCAGTGCGGTGGTGTAACAGA...
2119,35N-1830577d,LinJ.35,100.0,610,1,610,1784011,1784620,0.0,1127.0,610,2019672,plus,GATGACGGGAGGGAGGGAACACACCTCAGTGCGGTGGTGTAACAGA...
2690,35A-2069688r,LinJ.35,100.0,498,1,498,8482,7985,0.0,920.0,498,2019672,minus,CTCTGTATGGGAAAGCCATACAGCCCCCTACTTGCTGACAAATGCG...
2691,35A-2069688r,LinJ.35,100.0,498,1,498,11831,11334,0.0,920.0,498,2019672,minus,CTCTGTATGGGAAAGCCATACAGCCCCCTACTTGCTGACAAATGCG...
