In [3]:
# Load needed libraries
import numpy as np
import pandas as pd
import subprocess

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [2]:
# Load the sequences table and take a first look at it.
# header=None: the first line of the CSV is read as data, so pandas labels the
# columns with integers starting at 0.
data = pd.read_csv("./sequences.csv", sep=",", header=None)
print(data.shape)  # (number of rows, number of columns)
data.head()  # last expression of the cell -> rendered as the cell output

(3009, 6)


Unnamed: 0,0,1,2,3,4,5
0,LinJ.01,173,1,173,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...
1,LinJ.01,1000,13302,14301,plus,GTCGACCGTGCAAGAGGAACTCAACACCGCTCTACTCGGCGTGTCA...
2,LinJ.01,699,24093,24791,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
3,LinJ.01,845,35316,36160,plus,GAGGTGGAGGCCGCTCTGCCCCCCCCGCCGCCGAGTGCTGCAGGCA...
4,LinJ.01,892,39698,40589,plus,CTCACCCTCATCCCACCCCTCCTCGTCCATCGACGGGAGTGGGCGG...


In [6]:
# Load needed information
# dict_path is the value handed to blastn's "-db" option by blastn_blaster.
# NOTE(review): "-db" expects the name of a BLAST database, so presumably
# makeblastdb was run with this FASTA path as the database name — confirm.
dict_path = "../Data/genome/TriTrypDB-67_LinfantumJPCM5_Genome.fasta"  # Path to the genome

In [7]:
# Prepare functions
def blastn_blaster(query_path, dict_path, evalue):
    """Run ``blastn`` for one query file and return its raw CSV output.

    The command is passed to ``subprocess`` as an argument list and executed
    without a shell, so paths containing spaces or shell metacharacters reach
    ``blastn`` verbatim instead of being split or interpreted by the shell.

    Parameters
    ----------
    query_path : str or path-like
        File passed to ``-query``.
    dict_path : str or path-like
        Value passed to ``-db``.
    evalue : float
        Threshold passed to ``-evalue`` (converted with ``str``).

    Returns
    -------
    str
        Standard output of ``blastn`` decoded as text. The output format is
        ``-outfmt 10`` (comma-separated values), one line per reported hit.

    Raises
    ------
    subprocess.CalledProcessError
        If ``blastn`` exits with a non-zero status.
    FileNotFoundError
        If the ``blastn`` executable cannot be found on ``PATH``.
    """
    cmd = [
        "blastn",
        "-word_size", "11",
        "-query", str(query_path),
        "-db", str(dict_path),
        "-evalue", str(evalue),  # Important the E value
        "-outfmt", "10",
    ]
    # universal_newlines=True makes check_output return str instead of bytes.
    data = subprocess.check_output(cmd, universal_newlines=True)
    return data

def fasta_creator(sequence, index, fasta_output_path):
    """Write a single sequence to ``fasta_output_path`` in FASTA format.

    The record id is ``"Seq_"`` followed by ``index`` and the description is
    fixed to ``"Leishmania infantum"``. Nothing is returned.
    """
    record_id = "Seq_" + str(index)
    nucleotides = Seq(sequence)
    record = SeqRecord(nucleotides, id=record_id, description="Leishmania infantum")
    SeqIO.write(record, fasta_output_path, "fasta")

Let's run a quick test: write a single query sequence to a FASTA file and BLAST it against the genome.

In [21]:
# Query sequence used for a one-off check of fasta_creator and blastn_blaster
query = "GTCACCACCATGGCCGCCCACACAACGCACAAGGTGAGGATGCCAGCAACGTTCAGTGCCAGGCAGACCCAACGCAAGCAGGGGCAGCAGAAGAGCAGAAGCAACCCAAACGTGAATGCCGCAAAGTATACAAGGATGGAGATGATAGCGAACGCCTGTGCAGCGTGGAAGCGGTTCAGGCGG"

# Write the query to its own FASTA file (record id "Seq_1").
# NOTE(review): assumes the ./filtered_data/ directory already exists — confirm.
fasta_creator(query, 1, "./filtered_data/test1.fasta")
# BLAST the file against dict_path with an E-value threshold of 1e-09; the
# returned text is stripped and split on newlines, so `test` is a list with
# one string per output line.
test = blastn_blaster("./filtered_data/test1.fasta", dict_path, 1.0E-09).strip().split("\n")

In [26]:
# Show the raw BLAST output lines (each element is one comma-separated string)
test

['Seq_1,LinJ.34,100.000,183,0,0,1,183,767842,768024,4.50e-93,339',
 'Seq_1,LinJ.34,100.000,183,0,0,1,183,771173,771355,4.50e-93,339',
 'Seq_1,LinJ.34,100.000,183,0,0,1,183,784281,784463,4.50e-93,339',
 'Seq_1,LinJ.34,100.000,183,0,0,1,183,787612,787794,4.50e-93,339',
 'Seq_1,LinJ.34,100.000,183,0,0,1,183,790944,791126,4.50e-93,339',
 'Seq_1,LinJ.34,100.000,183,0,0,1,183,794278,794460,4.50e-93,339',
 'Seq_1,LinJ.34,100.000,183,0,0,1,183,797610,797792,4.50e-93,339',
 'Seq_1,LinJ.34,99.454,183,0,1,1,183,780954,781135,7.54e-91,331',
 'Seq_1,LinJ.34,98.901,182,0,2,1,182,774493,774672,1.26e-88,324',
 'Seq_1,LinJ.34,99.432,176,0,1,8,183,777838,778012,5.87e-87,318',
 'Seq_1,LinJ.34,79.096,177,33,3,1,175,433072,432898,6.39e-27,119',
 'Seq_1,LinJ.34,79.096,177,33,3,1,175,446253,446079,6.39e-27,119',
 'Seq_1,LinJ.34,79.096,177,33,3,1,175,450672,450498,6.39e-27,119',
 'Seq_1,LinJ.34,78.378,185,36,3,1,183,515549,515731,2.30e-26,117',
 'Seq_1,LinJ.34,78.531,177,34,3,1,175,424292,424118,2.97e-25,113'

In [22]:
# Check the container type produced by .split("\n")
type(test)

list

In [25]:
# Split every non-empty output line on commas and build a table with one row
# per line. Columns keep pandas' default integer labels (0, 1, ...) and every
# value is still a string at this point.
# NOTE(review): per the BLAST+ manual the default "-outfmt 10" fields are
# qseqid, sseqid, pident, length, mismatch, gapopen, qstart, qend, sstart,
# send, evalue, bitscore — confirm, and consider naming the columns.
test_df = pd.DataFrame([x.split(",") for x in test if x])
test_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,Seq_1,LinJ.34,100.0,183,0,0,1,183,767842,768024,4.5e-93,339
1,Seq_1,LinJ.34,100.0,183,0,0,1,183,771173,771355,4.5e-93,339
2,Seq_1,LinJ.34,100.0,183,0,0,1,183,784281,784463,4.5e-93,339
3,Seq_1,LinJ.34,100.0,183,0,0,1,183,787612,787794,4.5e-93,339
4,Seq_1,LinJ.34,100.0,183,0,0,1,183,790944,791126,4.5e-93,339


In [42]:
# Number of distinct values in column 1 (presumably the subject sequence id,
# i.e. the chromosome that was hit — confirm against the BLAST column order)
test_df[1].nunique()

4