In [52]:
import numpy as np
import pandas as pd
import subprocess
import os

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [53]:
# Prepare functions
def blastn_dic(path_input):
    os.system("makeblastdb -in " + path_input + " -dbtype nucl -parse_seqids")
    print("\nBlast Dictionary created in", path_input)
    
def blastn_blaster(query, db, perc_indentity):
    cmd = "blastn -word_size 11 " \
    + " -query " + query \
    + " -db "  + db \
    + " -perc_identity " + str(perc_indentity) \
    + " -outfmt '10 qseqid sseqid pident length qlen slen mismatch gapopen qstart qend sstart send evalue bitscore qcovhsp sstrand sseq'"
    data = subprocess.check_output(cmd, shell=True, universal_newlines=True)
    data = pd.DataFrame([x.split(",") for x in data.split("\n") if x])
    data.columns = ["qseqid", "sseqid", "pident", "length", "qlen", "slen", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qcovhsp", "sstrand", "sseq"]

    return data

In [54]:
blastn_dic("./genome_data/blastn_dicts/positive_filtered_elements/positives.fasta")  # Positive elements > 5 chromosomes and E-value < 1e-10



Building a new DB, current time: 04/24/2024 10:55:37
New DB name:   /home/rfpacheco/Desktop/Projects/Testing_Leishmania_project/7.LmSIDERa_and_LmSIDERb_hallmark_BLASTn/genome_data/blastn_dicts/positive_filtered_elements/positives.fasta
New DB title:  ./genome_data/blastn_dicts/positive_filtered_elements/positives.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /home/rfpacheco/Desktop/Projects/Testing_Leishmania_project/7.LmSIDERa_and_LmSIDERb_hallmark_BLASTn/genome_data/blastn_dicts/positive_filtered_elements/positives.fasta
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 1951 sequences in 0.194401 seconds.



Blast Dictionary created in ./genome_data/blastn_dicts/positive_filtered_elements/positives.fasta


In [55]:
# Launch BLASTn vs my elements
LmSIDER2a = blastn_blaster("./genome_data/LmSIDER2a.fasta",
                           "./genome_data/blastn_dicts/positive_filtered_elements/positives.fasta",
                           60)
LmSIDER2b = blastn_blaster("./genome_data/LmSIDER2b.fasta",
                            "./genome_data/blastn_dicts/positive_filtered_elements/positives.fasta",
                            60)

print(f"LmSIDER2a: {LmSIDER2a.shape[0]} hits")
print(f"LmSIDER2b: {LmSIDER2b.shape[0]} hits")

LmSIDER2a: 339 hits
LmSIDER2b: 538 hits


In [62]:
# Let's see the unique elements by row[1]
LmSIDER2a_unique = LmSIDER2a.copy().drop_duplicates(subset="sseqid")
LmSIDER2b_unique = LmSIDER2b.copy().drop_duplicates(subset="sseqid")

print(f"LmSIDER2a: {LmSIDER2a_unique.shape[0]} unique hits")
print(f"LmSIDER2b: {LmSIDER2b_unique.shape[0]} unique hits")

LmSIDER2a: 331 unique hits
LmSIDER2b: 500 unique hits


In [63]:
LmSIDER2a_unique.head()

Unnamed: 0,qseqid,sseqid,pident,length,qlen,slen,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,qcovhsp,sstrand,sseq
0,LmSIDER2a,Seq_1027,88.608,79,79,985,7,2,1,78,545,468,1.7e-21,95.3,99,minus,CCCTGATAACGGGGGACACCTCAGCGTGGTATCAGGGTCCAGTACC...
1,LmSIDER2a,Seq_1138,87.179,78,79,1133,8,2,1,77,695,619,2.85e-19,87.9,97,minus,CCCTGATGGCGGGGGGACACCTCAGCGTGGTATCAGGGTCCAGTAC...
2,LmSIDER2a,Seq_523,88.312,77,79,580,3,5,1,73,5,79,2.85e-19,87.9,92,plus,CCCTGATGACGAGGAACGCCTCAGTGCGTGGTATCCGCGGTCCAGT...
3,LmSIDER2a,Seq_299,84.81,79,79,878,9,3,2,78,520,443,6.18e-16,76.8,97,minus,CCTGATGACGGGGAACACCTCAGCGTGCTATCAGAGTCCGGTAC-C...
4,LmSIDER2a,Seq_1031,83.951,81,79,990,9,4,1,78,552,473,2.22e-15,75.0,99,minus,CCCTGATAACATAGGTGACACCTCAGCGTGGTATCAGGGCCCAGTG...


In [64]:
LmSIDER2b_unique.head()

Unnamed: 0,qseqid,sseqid,pident,length,qlen,slen,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,qcovhsp,sstrand,sseq
0,LmSIDER2b,Seq_1485,89.041,73,77,583,7,1,5,77,102,173,7.67e-20,89.8,95,plus,GCCAATGCCGCACCGCATCTGGTGGTGACAGGGTCCAGTGCCTACG...
1,LmSIDER2b,Seq_666,88.0,75,77,548,8,1,3,77,97,170,2.76e-19,87.9,97,plus,CTGCCAATGCCGAGCCACTTCTGGTGGTGACAGGGTCAAGCGCCTA...
2,LmSIDER2b,Seq_665,88.0,75,77,924,8,1,3,77,434,361,2.76e-19,87.9,97,minus,CTGCCAATGCCGAGCCACTTCTGGTGGTGACAGGGTCAAGCGCCTA...
3,LmSIDER2b,Seq_1914,87.013,77,77,880,9,1,1,77,400,325,9.92e-19,86.1,100,minus,CCCTGCCAATGCCGCGCCACATCTGGTGGTGACAGGGTCAAATGCC...
4,LmSIDER2b,Seq_1400,86.076,79,77,867,9,2,1,77,435,357,3.57e-18,84.2,100,minus,CCCTGCCAAATGCCGGGCCACCTCTGGTGGTGACAGGGCCGGGTGC...


In [71]:
# Sort them by the seq values
LmSIDER2a_unique["Seq_num"] = LmSIDER2a_unique["sseqid"].str.extract(r"_(\d+)$").astype(int)
LmSIDER2a_unique = LmSIDER2a_unique.sort_values(by="Seq_num")
LmSIDER2a_unique.head()


Unnamed: 0,qseqid,sseqid,pident,length,qlen,slen,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,qcovhsp,sstrand,sseq,Seq_num
198,LmSIDER2a,Seq_5,88.889,27,79,868,1,1,18,42,265,291,0.014,32.5,32,plus,ACCTCAGCGTGGCATCCCAGGGTCCAG,5
338,LmSIDER2a,Seq_39,100.0,12,79,856,0,0,1,12,504,493,8.2,23.3,15,minus,CCCTGATGACGA,39
252,LmSIDER2a,Seq_45,100.0,15,79,798,0,0,20,34,436,422,0.18,28.8,19,minus,CTCAGCGTGGTATCA,45
71,LmSIDER2a,Seq_46,88.889,36,79,716,2,2,1,34,8,43,6.26e-06,43.6,43,plus,CCCTGATGACGAGGAGTGCAACTCAGCGTGGTATCA,46
251,LmSIDER2a,Seq_55,94.444,18,79,1000,1,0,60,77,405,422,0.18,28.8,23,plus,GGAAGCCGAGCAGCCCCC,55


In [72]:
# Nos the same with b
LmSIDER2b_unique["Seq_num"] = LmSIDER2b_unique["sseqid"].str.extract(r"_(\d+)$").astype(int)
LmSIDER2b_unique = LmSIDER2b_unique.sort_values(by="Seq_num")
LmSIDER2b_unique.head()

Unnamed: 0,qseqid,sseqid,pident,length,qlen,slen,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,qcovhsp,sstrand,sseq,Seq_num
301,LmSIDER2b,Seq_1,94.595,37,77,699,2,0,1,37,123,159,2.16e-10,58.4,48,plus,CCCTGCCAACGCCGAACCACTTCTGGTGCTGACAGGG,1
300,LmSIDER2b,Seq_2,94.595,37,77,845,2,0,1,37,501,465,2.16e-10,58.4,48,minus,CCCTGCCAACGCCGAACCACTTCTGGTGCTGACAGGG,2
299,LmSIDER2b,Seq_4,94.595,37,77,888,2,0,1,37,544,508,2.16e-10,58.4,48,minus,CCCTGCCAACGCCGAACCACTTCTGGTGCTGACAGGG,4
298,LmSIDER2b,Seq_5,94.595,37,77,868,2,0,1,37,354,390,2.16e-10,58.4,48,plus,CCCTGCCAACGCCGAACCACTTCTGGTGCTGACAGGG,5
134,LmSIDER2b,Seq_26,84.507,71,77,616,10,1,1,71,145,214,9.99e-14,69.4,92,plus,CCCTGCCAATGCCGAACCACCTCTGGCAGTGACAGGGTGAGGCGCC...,26


Now I've got both tables with "sseqid" orderer. With the table in `genome_data/positives_testing_elements.csv` we ca nextract the data

In [80]:
data = pd.read_csv("./genome_data/positives_testing_elements.csv", sep=",", header=0)
data.head()

Unnamed: 0,LinJ.01,173,1,173.1,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCGTCACGCCCCCGTCCTGTTGGAGAGGGTGTCGCTGTGCAGGGAATCAGTCGAGAGAAAAACCCTAACCCGTACCGGTACC
0,LinJ.01,699,24093,24791,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
1,LinJ.01,845,35316,36160,plus,GAGGTGGAGGCCGCTCTGCCCCCCCCGCCGCCGAGTGCTGCAGGCA...
2,LinJ.01,892,39698,40589,plus,CTCACCCTCATCCCACCCCTCCTCGTCCATCGACGGGAGTGGGCGG...
3,LinJ.01,888,54885,55772,plus,TGTTTGGTCTTCCGCGTGTCCGTTTTCGCTGCCGCACACTGCGAGG...
4,LinJ.01,868,75659,76526,plus,GAGCGCCGCTGAGCAGGCAAGCGAGGCAACCTACGAAAACATGGCA...


Now I've got the index in python numering for LmSIDER2a and LmSIDER2b tables and the data with the index

In [86]:
# Extract rows form data by the index in LmSIDEra["Seq_num"]
LmSIDER2a_real = data.loc[LmSIDER2a_unique["Seq_num"].values]
print(f"LmSIDER2a_real: {LmSIDER2a_real.shape[0]} elements")
LmSIDER2a_real.head()

LmSIDER2a_real: 331 elements


Unnamed: 0,LinJ.01,173,1,173.1,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCGTCACGCCCCCGTCCTGTTGGAGAGGGTGTCGCTGTGCAGGGAATCAGTCGAGAGAAAAACCCTAACCCGTACCGGTACC
5,LinJ.01,306,86582,86887,plus,ATTTGGGCACATGCAGCTGGCGCTGCTGGCCCGTCGCGTGCGCCTC...
39,LinJ.04,423,37307,37729,plus,CCCGGATGACATCGGGGCACCTCAGAGTGTGGTATCGCAGAGTCTA...
45,LinJ.04,716,437658,438373,plus,TTCTATACCCTGATGACGAGGAGTGCAACTCAGCGTGGTATCACAT...
46,LinJ.05,215,162,376,plus,TACACCAGTACACCAGTACACCAGTACACCGTCACGCCCCCGTCCT...
55,LinJ.05,458,227384,227841,plus,CTCAAGCACTCCTCCACCCCCTCTATCTCATGCCAAATCCCGAGCC...


In [87]:
# Now for b
LmSIDER2b_real = data.loc[LmSIDER2b_unique["Seq_num"].values]
print(f"LmSIDER2b_real: {LmSIDER2b_real.shape[0]} elements")
LmSIDER2b_real.head()

LmSIDER2b_real: 500 elements


Unnamed: 0,LinJ.01,173,1,173.1,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCGTCACGCCCCCGTCCTGTTGGAGAGGGTGTCGCTGTGCAGGGAATCAGTCGAGAGAAAAACCCTAACCCGTACCGGTACC
1,LinJ.01,845,35316,36160,plus,GAGGTGGAGGCCGCTCTGCCCCCCCCGCCGCCGAGTGCTGCAGGCA...
2,LinJ.01,892,39698,40589,plus,CTCACCCTCATCCCACCCCTCCTCGTCCATCGACGGGAGTGGGCGG...
4,LinJ.01,868,75659,76526,plus,GAGCGCCGCTGAGCAGGCAAGCGAGGCAACCTACGAAAACATGGCA...
5,LinJ.01,306,86582,86887,plus,ATTTGGGCACATGCAGCTGGCGCTGCTGGCCCGTCGCGTGCGCCTC...
26,LinJ.02,806,335928,336733,plus,CCCCACTCCATTCCATCTCCCTCTCTCTCACGCTGAAGGCAGCGTG...


In [94]:
# Let's check the index that are the asme in LmSIDER2a and LmSIDER2b
print(LmSIDER2a_real.index.intersection(LmSIDER2b_real.index))  # elements in both LmSIDER2a and LmSIDER2b
print(LmSIDER2a_real.index.difference(LmSIDER2b_real.index))  # elements only in LmSIDER2a
print(LmSIDER2b_real.index.difference(LmSIDER2a_real.index))  # elements only in LmSIDER2b


Index([   5,   39,   67,   69,  183,  197,  198,  199,  200,  202,
       ...
       1713, 1721, 1722, 1723, 1724, 1725, 1730, 1795, 1870, 1894],
      dtype='int64', length=145)
Index([  45,   46,   55,   62,   63,   65,   68,   80,   81,   87,
       ...
       1874, 1875, 1876, 1884, 1885, 1891, 1901, 1904, 1921, 1924],
      dtype='int64', length=186)
Index([   1,    2,    4,   26,   28,   30,   38,   49,   50,   51,
       ...
       1889, 1899, 1902, 1906, 1914, 1927, 1928, 1931, 1938, 1939],
      dtype='int64', length=355)


In [107]:
# Sum of all them.
LmSIDER2a_real.index.intersection(LmSIDER2b_real.index).shape[0] \
+ LmSIDER2a_real.index.difference(LmSIDER2b_real.index).shape[0] \
+ LmSIDER2b_real.index.difference(LmSIDER2a_real.index).shape[0] 


686