In [7]:
import os
import subprocess

import numpy as np  # own alias: binding numpy to `pd` was immediately shadowed by pandas
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [2]:
# Relative path to the input CSV (filtered positive testing elements)
data_path = "../BEDOPS_join_strands/filtered_data/positives_testing_elements.csv"

In [4]:
# Load the elements table. It is read with header=None, so pandas labels the
# columns with integers (0, 1, 2, ...).
# NOTE(review): judging by the preview, the columns look like chromosome id,
# length, start, end, strand, sequence -- confirm against the file's producer.
data_df = pd.read_csv(
    filepath_or_buffer=data_path,
    header=None,
    sep=",",
)
data_df.head()

Unnamed: 0,0,1,2,3,4,5
0,LinJ.01,173,1,173,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...
1,LinJ.01,699,24093,24791,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
2,LinJ.01,845,35316,36160,plus,GAGGTGGAGGCCGCTCTGCCCCCCCCGCCGCCGAGTGCTGCAGGCA...
3,LinJ.01,892,39698,40589,plus,CTCACCCTCATCCCACCCCTCCTCGTCCATCGACGGGAGTGGGCGG...
4,LinJ.01,888,54885,55772,plus,TGTTTGGTCTTCCGCGTGTCCGTTTTCGCTGCCGCACACTGCGAGG...


In [6]:
# Keep only the elements of chromosome 1 (rows whose column 0 equals "LinJ.01")
on_chr1 = data_df[0] == "LinJ.01"
data_df_chr1 = data_df.loc[on_chr1]
data_df_chr1

Unnamed: 0,0,1,2,3,4,5
0,LinJ.01,173,1,173,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...
1,LinJ.01,699,24093,24791,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
2,LinJ.01,845,35316,36160,plus,GAGGTGGAGGCCGCTCTGCCCCCCCCGCCGCCGAGTGCTGCAGGCA...
3,LinJ.01,892,39698,40589,plus,CTCACCCTCATCCCACCCCTCCTCGTCCATCGACGGGAGTGGGCGG...
4,LinJ.01,888,54885,55772,plus,TGTTTGGTCTTCCGCGTGTCCGTTTTCGCTGCCGCACACTGCGAGG...
5,LinJ.01,868,75659,76526,plus,GAGCGCCGCTGAGCAGGCAAGCGAGGCAACCTACGAAAACATGGCA...
6,LinJ.01,306,86582,86887,plus,ATTTGGGCACATGCAGCTGGCGCTGCTGGCCCGTCGCGTGCGCCTC...
7,LinJ.01,803,130118,130920,plus,CCTCTGCCGCGCAGCGCGAAGTCATCCCATCACGCCAACGCCGACT...
8,LinJ.01,609,137465,138073,plus,GGGGTGGGGCGGCGGCGCACAGACACACACACACACACACACACAC...
9,LinJ.01,681,145352,146032,plus,GGGGTCGGGCACGCGCTGCAGGCCCTTGTACGGACTGGGCAAGTGG...


In [9]:
# Define fasta_creator: write the sequences stored in a DataFrame to a FASTA file
def fasta_creator(data, fasta_output_path, seq_column=5, description="Leishmania infantum"):
    """Write one FASTA record per row of ``data``.

    Every record is given the id ``Seq_<row label + 1>``, so the ids follow the
    DataFrame's index labels (they are not renumbered after filtering).

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the sequences. Its index labels must support ``+ 1``
        (e.g. the default integer index).
    fasta_output_path : str or path-like
        Destination passed to ``SeqIO.write``.
    seq_column : hashable, default 5
        Label of the column that contains the nucleotide sequence.
    description : str, default "Leishmania infantum"
        Description stored in every record.

    Returns
    -------
    None
    """
    records = [
        SeqRecord(
            Seq(sequence),
            id="Seq_" + str(row_label + 1),  # shift the row label by one, as before
            description=description,
        )
        for row_label, sequence in data[seq_column].items()
    ]
    SeqIO.write(records, fasta_output_path, "fasta")

In [10]:
# Write the chr1 elements of our own dataset to a FASTA file.
# NOTE(review): the BLAST database built later in this notebook reads
# "./ingi_vs_mine/dict/mine_chr1.fasta", not the file written here -- confirm
# how that file is produced/kept in sync.
fasta_creator(
    data=data_df_chr1,
    fasta_output_path="./ingi_vs_mine/mine_chr1.fasta",
)

In [11]:
# Prepare functions
# Build the BLASTn database ("dictionary") from a FASTA file
def blastn_dic(path_input):
    """Create a nucleotide BLAST database from ``path_input`` with makeblastdb.

    The command is run as an argument list (no shell), so paths containing
    spaces or shell metacharacters are passed through verbatim; note that the
    shell no longer expands ``~`` or wildcards in the path.

    Parameters
    ----------
    path_input : str or path-like
        FASTA file given to ``makeblastdb -in``.

    Raises
    ------
    subprocess.CalledProcessError
        If makeblastdb exits with a non-zero status (previously ignored).
    FileNotFoundError
        If the ``makeblastdb`` executable cannot be found.
    """
    subprocess.run(
        ["makeblastdb", "-in", str(path_input), "-dbtype", "nucl", "-parse_seqids"],
        check=True,  # only report success when makeblastdb really succeeded
    )
    print("\nBlast Dictionary created in", path_input)

# And the blaster: run a BLASTn search of a query against a database
def blastn_blaster(query_path, dict_path, perc_identity, word_size=11):
    """Run ``blastn`` and return its raw output as text.

    The command is run as an argument list (no shell), so paths containing
    spaces or shell metacharacters are safe. No ``-evalue`` option is passed,
    so blastn's own default applies.

    Parameters
    ----------
    query_path : str or path-like
        FASTA file with the query sequences (``-query``).
    dict_path : str or path-like
        BLAST database name/path (``-db``).
    perc_identity : int or float
        Minimum percent identity (``-perc_identity``).
    word_size : int, default 11
        Word size for the initial match (``-word_size``).

    Returns
    -------
    str
        blastn standard output in ``-outfmt 10`` (comma-separated) format.

    Raises
    ------
    subprocess.CalledProcessError
        If blastn exits with a non-zero status.
    """
    cmd = [
        "blastn",
        "-word_size", str(word_size),
        "-query", str(query_path),
        "-db", str(dict_path),
        "-perc_identity", str(perc_identity),
        "-outfmt", "10",
    ]
    return subprocess.check_output(cmd, universal_newlines=True)

In [13]:
# Build the BLAST database for our chr1 elements.
# NOTE(review): this FASTA lives under dict/, while the FASTA written earlier in
# this notebook goes to "./ingi_vs_mine/mine_chr1.fasta"; the makeblastdb log
# reports 12 sequences although 10 chr1 rows are displayed -- confirm the file
# under dict/ is up to date.
blastn_dic(path_input="./ingi_vs_mine/dict/mine_chr1.fasta")



Building a new DB, current time: 04/17/2024 16:19:04
New DB name:   /home/rfpacheco/Desktop/Projects/Testing_Leishmania_project/Compare_OrigPositvSIDER_vs_LastFiltered/ingi_vs_mine/dict/mine_chr1.fasta
New DB title:  ./ingi_vs_mine/dict/mine_chr1.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /home/rfpacheco/Desktop/Projects/Testing_Leishmania_project/Compare_OrigPositvSIDER_vs_LastFiltered/ingi_vs_mine/dict/mine_chr1.fasta
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 12 sequences in 0.0764821 seconds.



Blast Dictionary created in ./ingi_vs_mine/dict/mine_chr1.fasta


In [16]:
# BLAST the ingi query against our chr1 elements (minimum identity: 60 %)
blastn_chr1_mine = blastn_blaster(
    "./ingi.fasta",
    "./ingi_vs_mine/dict/mine_chr1.fasta",
    60,
)
# The blaster returns comma-separated text: one hit per non-empty line, one field
# per column. With BLAST's default tabular fields the 12 columns are qseqid,
# sseqid, pident, length, mismatch, gapopen, qstart, qend, sstart, send, evalue,
# bitscore. All values are kept as strings.
blastn_chr1_mine = pd.DataFrame(
    [hit.split(",") for hit in filter(None, blastn_chr1_mine.split("\n"))]
)
print(blastn_chr1_mine.shape)
blastn_chr1_mine.head()

(4, 12)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,ingi_1,Seq_6,85.185,27,2,1,18,42,265,291,0.004,27.0
1,ingi_1,Seq_5,88.235,17,2,0,32,48,619,603,0.18,21.4
2,ingi_1,Seq_3,88.235,17,2,0,32,48,577,561,0.18,21.4
3,ingi_1,Seq_2,100.0,11,0,0,32,42,48,58,0.18,21.4


In [18]:
# Hits per subject sequence (column 1), then the number of distinct subjects hit
hits_per_subject_mine = blastn_chr1_mine[1].value_counts()
print(hits_per_subject_mine)
print(hits_per_subject_mine.shape)

1
Seq_6    1
Seq_5    1
Seq_3    1
Seq_2    1
Name: count, dtype: int64
(4,)


In [19]:
# Now repeat the same procedure with the true positives: build their BLAST database
blastn_dic(path_input="./ingi_vs_TP/dict/SIDER2_Chr1_L_Infantum.fasta")



Building a new DB, current time: 04/17/2024 16:25:34
New DB name:   /home/rfpacheco/Desktop/Projects/Testing_Leishmania_project/Compare_OrigPositvSIDER_vs_LastFiltered/ingi_vs_TP/dict/SIDER2_Chr1_L_Infantum.fasta
New DB title:  ./ingi_vs_TP/dict/SIDER2_Chr1_L_Infantum.fasta
Sequence type: Nucleotide
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 7 sequences in 0.00102806 seconds.



Blast Dictionary created in ./ingi_vs_TP/dict/SIDER2_Chr1_L_Infantum.fasta


In [20]:
# BLAST the ingi query against the true-positive chr1 database (minimum identity: 60 %)
blastn_chr1_TP = blastn_blaster(
    "./ingi.fasta",
    "./ingi_vs_TP/dict/SIDER2_Chr1_L_Infantum.fasta",
    60,
)
# One hit per non-empty line of the comma-separated output; values stay as strings
blastn_chr1_TP = pd.DataFrame(
    [hit.split(",") for hit in filter(None, blastn_chr1_TP.split("\n"))]
)
print(blastn_chr1_TP.shape)
blastn_chr1_TP.head()

(3, 12)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,ingi_1,1-55910r,100.0,11,0,0,32,42,31,41,0.083,21.4
1,ingi_1,1-36305r,100.0,11,0,0,32,42,42,52,0.083,21.4
2,ingi_1,1-24471d,100.0,11,0,0,32,42,42,52,0.083,21.4


In [21]:
# Hits per subject sequence (column 1), then the number of distinct subjects hit
hits_per_subject_TP = blastn_chr1_TP[1].value_counts()
print(hits_per_subject_TP)
print(hits_per_subject_TP.shape)

1
1-55910r    1
1-36305r    1
1-24471d    1
Name: count, dtype: int64
(3,)
