In [23]:
import numpy as np
import pandas as pd
import subprocess
import os

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [2]:
# Load the positive elements; the CSV has no header row, so columns get integer labels
data_positive = pd.read_csv(
    "./positives_testing_elements.csv",
    sep=",",
    header=None,
)
data_positive.head()

Unnamed: 0,0,1,2,3,4,5
0,LinJ.01,173,1,173,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...
1,LinJ.01,699,24093,24791,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
2,LinJ.01,845,35316,36160,plus,GAGGTGGAGGCCGCTCTGCCCCCCCCGCCGCCGAGTGCTGCAGGCA...
3,LinJ.01,892,39698,40589,plus,CTCACCCTCATCCCACCCCTCCTCGTCCATCGACGGGAGTGGGCGG...
4,LinJ.01,888,54885,55772,plus,TGTTTGGTCTTCCGCGTGTCCGTTTTCGCTGCCGCACACTGCGAGG...


In [12]:
# Define the fasta_creator function
def fasta_creator(data, fasta_output_path):
    """Write one FASTA record per row of ``data`` to ``fasta_output_path``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose rows are iterated with ``iterrows()``; the sequence of
        each row is read from ``row[5]``.
    fasta_output_path : str
        Destination handed to ``SeqIO.write``.

    Each record gets the id ``"Seq_"`` followed by the row's index label
    plus one, and the description ``"Leishmania infantum"``.
    """
    records = [
        SeqRecord(
            Seq(row[5]),  # position 5 of the row holds the sequence
            id="Seq_" + str(row_label + 1),
            description="Leishmania infantum",
        )
        for row_label, row in data.iterrows()
    ]
    SeqIO.write(records, fasta_output_path, "fasta")

In [13]:
 # Create fasta
fasta_creator(
    data=data_positive,
    fasta_output_path="./blaster/positives.fasta",
)

Now let's run BLASTN.

Let's prepare the "ingi" query sequence as a FASTA file.

In [21]:
# Nucleotide sequence used as the "ingi" query; the bare len() below displays its length.
ingi = "CCCTGGCGATGCCGGCCACCTCAACGTGGTGCCAGGGTCCAGTACCCCGTATCATCGGGGGAAGCCAAGAGCCAGCAGC"
len(ingi)

79

In [61]:
# Prepare functions
# Let's define the BLASTn dictionary (database) function
def blastn_dic(path_input):
    """Build a nucleotide BLAST database from the FASTA file at ``path_input``.

    Runs ``makeblastdb -in <path_input> -dbtype nucl -parse_seqids``. The
    command is passed as an argument list (no shell), so paths containing
    spaces or shell metacharacters are handled correctly.

    Parameters
    ----------
    path_input : str or os.PathLike
        FASTA file to index.

    Raises
    ------
    FileNotFoundError
        If the ``makeblastdb`` executable cannot be found.
    subprocess.CalledProcessError
        If ``makeblastdb`` exits with a non-zero status.
    """
    subprocess.run(
        ["makeblastdb", "-in", str(path_input), "-dbtype", "nucl", "-parse_seqids"],
        check=True,  # do not report success when the database was not built
    )
    print("\nBlast Dictionary created in", path_input)

# And the blaster
def blastn_blaster(query_path, dict_path, perc_identity):
    """Run ``blastn`` for a query against a BLAST database and return its output.

    The command is passed as an argument list (no shell), so paths containing
    spaces or shell metacharacters are handled correctly.

    Parameters
    ----------
    query_path : str or os.PathLike
        FASTA file with the query sequence(s).
    dict_path : str or os.PathLike
        BLAST database name, as created with ``makeblastdb``.
    perc_identity : int or float
        Value passed to ``-perc_identity``.

    Returns
    -------
    str
        The standard output of ``blastn`` in ``-outfmt 10`` (comma-separated
        values), one hit per line.

    Raises
    ------
    FileNotFoundError
        If the ``blastn`` executable cannot be found.
    subprocess.CalledProcessError
        If ``blastn`` exits with a non-zero status.
    """
    # NOTE(review): the original comment stressed the E value, but no -evalue
    # threshold is passed here, so BLAST's own default applies.
    cmd = [
        "blastn",
        "-word_size", "11",
        "-query", str(query_path),
        "-db", str(dict_path),
        "-perc_identity", str(perc_identity),
        "-outfmt", "10",
    ]
    return subprocess.check_output(cmd, universal_newlines=True)

In [62]:
# Build the BLAST database from the positives FASTA
blastn_dic(path_input="./blaster/positives.fasta")



Building a new DB, current time: 04/17/2024 13:57:12
New DB name:   /home/rfpacheco/Desktop/Projects/Testing_Leishmania_project/BEDOPS_join_strands/filtered_data/blaster/positives.fasta
New DB title:  ./blaster/positives.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /home/rfpacheco/Desktop/Projects/Testing_Leishmania_project/BEDOPS_join_strands/filtered_data/blaster/positives.fasta
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 1951 sequences in 0.137699 seconds.



Blast Dictionary created in ./blaster/positives.fasta


In [63]:
# Build a FASTA file that holds only the ingi sequence:
def fasta_creator2(sequence, index, fasta_output_path):
    """Write ``sequence`` as a single FASTA record to ``fasta_output_path``.

    The record id is ``"ingi_"`` followed by ``str(index)`` and its
    description is ``"Trypanosoma brucei ingi element"``.
    """
    ingi_record = SeqRecord(
        Seq(sequence),
        id=f"ingi_{index!s}",
        description="Trypanosoma brucei ingi element",
    )
    SeqIO.write(ingi_record, fasta_output_path, "fasta")

In [64]:
fasta_creator2(
    sequence=ingi,
    index=1,
    fasta_output_path="./blaster/ingi.fasta",
)

In [65]:
blastn_data = blastn_blaster(
    query_path="./blaster/ingi.fasta",
    dict_path="./blaster/positives.fasta",
    perc_identity=0,
)

In [66]:
# Show the raw BLAST output as a list of lines (one comma-separated hit per line)
blastn_data.strip().split("\n")

['ingi_1,Seq_1028,83.333,60,6,4,17,73,529,471,1.04e-08,52.8',
 'ingi_1,Seq_1948,86.957,46,3,3,1,44,17,61,1.35e-07,49.1',
 'ingi_1,Seq_1943,86.957,46,3,3,1,44,17,61,1.35e-07,49.1',
 'ingi_1,Seq_701,87.805,41,5,0,2,42,792,832,1.35e-07,49.1',
 'ingi_1,Seq_688,87.805,41,5,0,2,42,706,666,1.35e-07,49.1',
 'ingi_1,Seq_685,87.805,41,5,0,2,42,706,666,1.35e-07,49.1',
 'ingi_1,Seq_682,87.805,41,5,0,2,42,706,666,1.35e-07,49.1',
 'ingi_1,Seq_862,79.710,69,11,3,7,73,559,492,4.84e-07,47.3',
 'ingi_1,Seq_862,84.211,38,4,2,10,46,471,435,0.001,36.2',
 'ingi_1,Seq_1139,81.034,58,9,2,17,73,678,622,1.74e-06,45.4',
 'ingi_1,Seq_482,91.176,34,1,2,8,39,8,41,1.74e-06,45.4',
 'ingi_1,Seq_1710,77.108,83,14,4,1,79,677,758,6.26e-06,43.6',
 'ingi_1,Seq_768,88.889,36,2,2,14,47,554,519,6.26e-06,43.6',
 'ingi_1,Seq_725,80.000,65,8,3,5,68,1,61,6.26e-06,43.6',
 'ingi_1,Seq_1032,80.000,60,8,4,17,73,534,476,2.25e-05,41.7',
 'ingi_1,Seq_887,77.922,77,9,7,9,79,420,346,2.25e-05,41.7',
 'ingi_1,Seq_180,85.000,40,6,0,8,47,313,

In [67]:
# Parse the comma-separated BLAST output into a data frame (one hit per row)
blastn_data_df = pd.DataFrame(
    [hit_line.split(",") for hit_line in blastn_data.strip().split("\n")]
)
blastn_data_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,ingi_1,Seq_1028,83.333,60,6,4,17,73,529,471,1.04e-08,52.8
1,ingi_1,Seq_1948,86.957,46,3,3,1,44,17,61,1.35e-07,49.1
2,ingi_1,Seq_1943,86.957,46,3,3,1,44,17,61,1.35e-07,49.1
3,ingi_1,Seq_701,87.805,41,5,0,2,42,792,832,1.35e-07,49.1
4,ingi_1,Seq_688,87.805,41,5,0,2,42,706,666,1.35e-07,49.1


In [68]:
# (number of rows, number of columns) of the parsed BLAST table
blastn_data_df.shape

(210, 12)

In [69]:
# Count the distinct values in column 1, which holds the "Seq_N" subject ids
blastn_data_df[1].nunique()

204

In [71]:
# Fraction of the positive elements that received at least one ingi hit.
# Computed from the data rather than from hand-copied counts, so it stays
# correct when the inputs change and the notebook is re-run.
blastn_data_df[1].nunique() / len(data_positive)

0.10456176319835982