Load modules

In [7]:
import numpy as np
import pandas as pd
import os
import subprocess


from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

import data

In [2]:
# The CSV has no header row, so pandas labels the columns with integers.
all_data = pd.read_csv("./data/3009_sequences.csv", header=None)
all_data.head(n=5)

Unnamed: 0,0,1,2,3,4,5
0,LinJ.01,173,1,173,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...
1,LinJ.01,1000,13302,14301,plus,GTCGACCGTGCAAGAGGAACTCAACACCGCTCTACTCGGCGTGTCA...
2,LinJ.01,699,24093,24791,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
3,LinJ.01,845,35316,36160,plus,GAGGTGGAGGCCGCTCTGCCCCCCCCGCCGCCGAGTGCTGCAGGCA...
4,LinJ.01,892,39698,40589,plus,CTCACCCTCATCCCACCCCTCCTCGTCCATCGACGGGAGTGGGCGG...


Prepare the FASTA creation function

In [5]:
# Define the fasta_creator function
def fasta_creator(data, fasta_output_path):
    """Write one FASTA record per row of ``data`` to ``fasta_output_path``.

    For every row yielded by ``data.iterrows()`` a ``SeqRecord`` is built with:

    * the sequence taken from the row's element ``5``,
    * the id ``"Seq_<index>"``, where ``<index>`` is the row's index label
      used as-is (no offset is added),
    * the fixed description ``"Leishmania infantum"``.

    All records are then written in a single ``SeqIO.write`` call using the
    ``"fasta"`` format. Nothing is returned.
    """
    records = [
        SeqRecord(
            Seq(row[5]),
            id="Seq_" + str(label),
            description="Leishmania infantum",
        )
        for label, row in data.iterrows()
    ]
    SeqIO.write(records, fasta_output_path, "fasta")

Create the FASTA file from the 3009 sequences

In [6]:
# Export every row of all_data as a FASTA record.
fasta_creator(
    data=all_data,
    fasta_output_path="./data/3009_sequences.fasta",
)

Prepare the rest of the functions

In [66]:
# Prepare functions
# Let's define the BLASTn dictionary function
def blastn_dic(path_input):
    """Build a nucleotide BLAST database from the FASTA file at ``path_input``.

    Runs ``makeblastdb -in <path_input> -dbtype nucl -parse_seqids`` and prints
    a confirmation message once the command has finished successfully.

    The command is passed to ``subprocess.run`` as an argument list, so the
    path is handed to ``makeblastdb`` verbatim: spaces or shell metacharacters
    in it are not interpreted by a shell.

    Args:
        path_input: Path of the FASTA file to index (converted with ``str``).

    Raises:
        FileNotFoundError: If the ``makeblastdb`` executable cannot be found.
        subprocess.CalledProcessError: If ``makeblastdb`` exits with a
            non-zero status; the confirmation message is not printed then.
    """
    subprocess.run(
        ["makeblastdb", "-in", str(path_input), "-dbtype", "nucl", "-parse_seqids"],
        check=True,  # fail loudly instead of reporting success on error
    )
    print("\nBlast Dictionary created in", path_input)

#And the blaster
def blastn_blaster(query_path, dict_path, perc_identity):
    """Run ``blastn`` and return its hits as a DataFrame.

    The search uses ``-word_size 11`` and CSV output (``-outfmt 10``) with the
    twelve fields listed in ``columns`` below, which include the e-value.

    The command is passed to ``subprocess.check_output`` as an argument list
    (no shell), so paths containing spaces or shell metacharacters are handed
    to ``blastn`` verbatim.

    Args:
        query_path: Path of the query FASTA file.
        dict_path: Path/name of the BLAST database to search.
        perc_identity: Value forwarded to blastn's ``-perc_identity`` option.

    Returns:
        pandas.DataFrame with one row per output line and the columns
        ``qseqid, sseqid, pident, length, qstart, qend, sstart, send, evalue,
        bitscore, qlen, slen``. Every value is kept as the string emitted by
        blastn; cast explicitly before doing arithmetic. When blastn reports
        no hits an empty DataFrame with these twelve columns is returned.

    Raises:
        FileNotFoundError: If the ``blastn`` executable cannot be found.
        subprocess.CalledProcessError: If ``blastn`` exits with a non-zero
            status.
    """
    columns = ["qseqid", "sseqid", "pident", "length", "qstart", "qend",
               "sstart", "send", "evalue", "bitscore", "qlen", "slen"]
    cmd = [
        "blastn",
        "-word_size", "11",
        "-query", str(query_path),
        "-db", str(dict_path),
        "-perc_identity", str(perc_identity),
        "-outfmt", "10 " + " ".join(columns),
    ]
    raw_output = subprocess.check_output(cmd, universal_newlines=True)
    rows = [line.split(",") for line in raw_output.split("\n") if line]
    # Passing the column names to the constructor (instead of assigning them
    # afterwards) keeps the zero-hit case from raising a length mismatch.
    return pd.DataFrame(rows, columns=columns)

In [67]:
# Search the exported sequences against the database under ./data/dict;
# the 0 is forwarded unchanged to blastn's -perc_identity option.
blastn_result = blastn_blaster(
    query_path="./data/3009_sequences.fasta",
    dict_path="./data/dict/SIDER2_all_36Chr.fasta",
    perc_identity=0,
)

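* row[8] = subject alignment start (`sstart`)
* row[9] = subject alignment end (`send`)
* row[12] = subject sequence length (`slen`)

Note: these positional indices do not all line up with the 12-column table returned by `blastn_blaster` (0-based positions there: `sstart` = 6, `send` = 7, `slen` = 11). Prefer the column names when reading the table.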

In [70]:
# Size of the raw hit table, followed by its first rows.
print(blastn_result.shape)
blastn_result.head(n=5)

(866192, 12)


Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen
0,Seq_0,21A-306832d,86.957,23,113,135,551,573,0.63,27.0,173,625
1,Seq_0,21A-294619d,86.957,23,113,135,555,577,0.63,27.0,173,613
2,Seq_1,22C-279447d,84.444,45,9,52,488,444,3.96e-05,43.6,1000,492
3,Seq_1,22C-279447d,91.304,23,38,60,444,466,0.086,32.5,1000,492
4,Seq_1,22C-271803r,84.091,44,10,52,487,444,0.000142,41.7,1000,510


In [72]:
# Number of distinct subject ids present in the hit table.
blastn_result["sseqid"].nunique(dropna=False)

1112

In [75]:
# Keep only the hits whose alignment length is greater than 100.
blastn_result_v2 = blastn_result.loc[
    blastn_result["length"].astype(int).gt(100)
]
print(blastn_result_v2.shape)
blastn_result_v2.head(n=5)

(428230, 12)


Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen
36,Seq_2,1-24471d,100.0,660,7,666,1,660,0.0,1219,699,660
37,Seq_2,1-55910r,99.38,645,23,666,6,650,0.0,1168,699,650
38,Seq_2,1-36305r,98.708,619,7,624,1,618,0.0,1098,699,621
39,Seq_2,1-204526d,86.207,174,189,362,194,364,4.31e-48,185,699,432
40,Seq_2,1-136422d,86.207,174,189,362,222,392,4.31e-48,185,699,461


In [76]:
# Keep only the hits whose alignment length is more than 0.8 times the
# subject sequence length (length / slen > 0.8).
blastn_result_v3 = blastn_result_v2.loc[
    blastn_result_v2["length"].astype(int)
    .div(blastn_result_v2["slen"].astype(int))
    .gt(0.8)
]
print(blastn_result_v3.shape)
blastn_result_v3.head(n=5)

(22965, 12)


Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen
36,Seq_2,1-24471d,100.0,660,7,666,1,660,0.0,1219.0,699,660
37,Seq_2,1-55910r,99.38,645,23,666,6,650,0.0,1168.0,699,650
38,Seq_2,1-36305r,98.708,619,7,624,1,618,0.0,1098.0,699,621
82,Seq_2,24I-780953d,72.829,357,38,387,32,372,1.61e-22,100.0,699,384
509,Seq_2,14D-534755d,70.105,475,98,563,65,517,7.58e-11,62.1,699,586


In [77]:
# Keep the first remaining hit for each subject id and drop the rest.
blastn_result_v3_unique = blastn_result_v3.drop_duplicates(
    subset=["sseqid"],
    keep="first",
)
print(blastn_result_v3_unique.shape)
blastn_result_v3_unique.head(n=5)

(1065, 12)


Unnamed: 0,qseqid,sseqid,pident,length,qstart,qend,sstart,send,evalue,bitscore,qlen,slen
36,Seq_2,1-24471d,100.0,660,7,666,1,660,0.0,1219.0,699,660
37,Seq_2,1-55910r,99.38,645,23,666,6,650,0.0,1168.0,699,650
38,Seq_2,1-36305r,98.708,619,7,624,1,618,0.0,1098.0,699,621
82,Seq_2,24I-780953d,72.829,357,38,387,32,372,1.61e-22,100.0,699,384
509,Seq_2,14D-534755d,70.105,475,98,563,65,517,7.58e-11,62.1,699,586
