In [1]:
import numpy as np
import pandas as pd
import subprocess
import os

In [4]:
# Prepare functions
def blastn_dic(path_input):
    """Build a nucleotide BLAST database from a FASTA file with ``makeblastdb``.

    The command is run as an argument list (no shell), so paths containing
    spaces or shell metacharacters are passed through unchanged.

    Parameters
    ----------
    path_input : str or path-like
        FASTA file handed to ``makeblastdb -in``.

    Raises
    ------
    subprocess.CalledProcessError
        If ``makeblastdb`` exits with a non-zero status. The confirmation
        message below is therefore only printed after a successful build.
    FileNotFoundError
        If the ``makeblastdb`` executable cannot be found.
    """
    subprocess.run(
        ["makeblastdb", "-in", str(path_input), "-dbtype", "nucl", "-parse_seqids"],
        check=True,  # fail loudly instead of reporting a database that was never built
    )
    print("\nBlast Dictionary created in", path_input)
    
def blastn_blaster(query, db, perc_indentity):
    """Run ``blastn`` for a query FASTA against a BLAST database and return the hits.

    The command is run as an argument list (no shell), so paths containing
    spaces or shell metacharacters are passed through unchanged.

    Parameters
    ----------
    query : str or path-like
        Query FASTA file (``-query``).
    db : str or path-like
        Nucleotide BLAST database (``-db``).
    perc_indentity : int or float
        Minimum percent identity (``-perc_identity``). The parameter keeps its
        original spelling so existing keyword callers keep working.

    Returns
    -------
    pandas.DataFrame
        One row per hit, with the 17 columns requested through ``-outfmt``.
        Every value is kept as the text printed by ``blastn`` (no numeric
        conversion is done here), so cast coordinate columns before sorting
        them or doing arithmetic. When there are no hits, an empty frame with
        the same 17 columns is returned.

    Raises
    ------
    subprocess.CalledProcessError
        If ``blastn`` exits with a non-zero status.
    FileNotFoundError
        If the ``blastn`` executable cannot be found.
    """
    columns = ["qseqid", "sseqid", "pident", "length", "qlen", "slen", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore", "qcovhsp", "sstrand", "sseq"]
    cmd = [
        "blastn",
        "-word_size", "11",
        "-query", str(query),
        "-db", str(db),
        "-perc_identity", str(perc_indentity),
        # Format 10 is comma-separated; the field list doubles as the column names.
        "-outfmt", "10 " + " ".join(columns),
    ]
    raw = subprocess.check_output(cmd, universal_newlines=True)
    rows = [line.split(",") for line in raw.split("\n") if line]
    # Passing the names to the constructor keeps the schema even for zero hits;
    # assigning 17 names to an empty, column-less frame would raise ValueError.
    return pd.DataFrame(rows, columns=columns)

In [8]:
# Build the BLAST nucleotide database for the L. infantum JPCM5 genome
genome_fasta = "./genome_data/blastn_dicts/L_infantum/TriTrypDB-67_LinfantumJPCM5_Genome.fasta"
blastn_dic(genome_fasta)



Building a new DB, current time: 04/24/2024 10:11:37
New DB name:   /home/rfpacheco/Desktop/Projects/Testing_Leishmania_project/7.LmSIDERa_and_LmSIDERb_hallmark_BLASTn/genome_data/blastn_dicts/L_infantum/TriTrypDB-67_LinfantumJPCM5_Genome.fasta
New DB title:  ./genome_data/blastn_dicts/L_infantum/TriTrypDB-67_LinfantumJPCM5_Genome.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /home/rfpacheco/Desktop/Projects/Testing_Leishmania_project/7.LmSIDERa_and_LmSIDERb_hallmark_BLASTn/genome_data/blastn_dicts/L_infantum/TriTrypDB-67_LinfantumJPCM5_Genome.fasta
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 36 sequences in 0.389548 seconds.



Blast Dictionary created in ./genome_data/blastn_dicts/L_infantum/TriTrypDB-67_LinfantumJPCM5_Genome.fasta


Building a new DB, current time: 04/24/2024 10:11:37
New DB name:   /home/rfpacheco/Desktop/Projects/Testing_Leishmania_project/7.LmSIDERa_and_LmSIDERb_hallmark_BLASTn/genome_data/b

In [7]:
# Search both hallmark sequences against the whole genome database,
# using the same database and the same minimum percent identity for each.
genome_db = "./genome_data/blastn_dicts/L_infantum/TriTrypDB-67_LinfantumJPCM5_Genome.fasta"
min_identity = 60

LmSIDER2a = blastn_blaster("./genome_data/LmSIDER2a.fasta", genome_db, min_identity)
LmSIDER2b = blastn_blaster("./genome_data/LmSIDER2b.fasta", genome_db, min_identity)

# Report how many hits each hallmark produced
print(f"LmSIDER2a has {len(LmSIDER2a)} hits")
print(f"LmSIDER2b has {len(LmSIDER2b)} hits")

LmSIDER2a has 258 hits
LmSIDER2b has 620 hits


In [10]:
# Sort the hits by chromosome ("sseqid") and then by subject start ("sstart").
# The BLAST table holds every field as text, so "sstart" is compared through a
# numeric key; as plain strings "100000" would be placed before "24215".
def _numeric_sstart(column):
    """Sort key: compare "sstart" as numbers and leave the other columns as they are."""
    return pd.to_numeric(column) if column.name == "sstart" else column

LmSIDER2a = LmSIDER2a.sort_values(["sseqid", "sstart"], key=_numeric_sstart)
LmSIDER2b = LmSIDER2b.sort_values(["sseqid", "sstart"], key=_numeric_sstart)

In [11]:
# Preview the first rows of the LmSIDER2a hit table
LmSIDER2a.head()

Unnamed: 0,qseqid,sseqid,pident,length,qlen,slen,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,qcovhsp,sstrand,sseq
245,LmSIDER2a,LinJ.01,88.889,27,79,278267,1,1,18,42,75923,75949,0.3,32.5,32,plus,ACCTCAGCGTGGCATCCCAGGGTCCAG
256,LmSIDER2a,LinJ.03,86.207,29,79,389657,3,1,46,73,263663,263691,1.1,30.7,35,plus,ACCCCTCGCGGTGCAGGAAGCCGAGCAGC
257,LmSIDER2a,LinJ.03,100.0,16,79,389657,0,0,62,77,281600,281615,1.1,30.7,20,plus,AAGCCGAGCAGCTCCC
212,LmSIDER2a,LinJ.04,78.125,64,79,466507,7,4,16,72,331652,331589,0.085,34.4,72,minus,ACACCTCAGCGCGTGGTGCCTCAGCGGCCAGTGCACGCCCACTCTC...
213,LmSIDER2a,LinJ.04,78.125,64,79,466507,7,4,16,72,362905,362968,0.085,34.4,72,plus,ACACCTCAGCGCGTGGTGCCTCAGCGGCCAGTGCACGCCCACTCTC...


In [12]:
# Preview the first rows of the LmSIDER2b hit table
LmSIDER2b.head()

Unnamed: 0,qseqid,sseqid,pident,length,qlen,slen,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,qcovhsp,sstrand,sseq
587,LmSIDER2b,LinJ.01,94.595,37,77,278267,2,0,1,37,24215,24251,4.86e-09,58.4,48,plus,CCCTGCCAACGCCGAACCACTTCTGGTGCTGACAGGG
588,LmSIDER2b,LinJ.01,94.595,37,77,278267,2,0,1,37,35816,35780,4.86e-09,58.4,48,minus,CCCTGCCAACGCCGAACCACTTCTGGTGCTGACAGGG
589,LmSIDER2b,LinJ.01,94.595,37,77,278267,2,0,1,37,55428,55392,4.86e-09,58.4,48,minus,CCCTGCCAACGCCGAACCACTTCTGGTGCTGACAGGG
590,LmSIDER2b,LinJ.01,94.595,37,77,278267,2,0,1,37,76012,76048,4.86e-09,58.4,48,plus,CCCTGCCAACGCCGAACCACTTCTGGTGCTGACAGGG
489,LmSIDER2b,LinJ.02,95.455,22,77,356299,1,0,56,77,204357,204336,0.023,36.2,29,minus,GGGGAGGTCAGAGCGATGTATC


In [18]:
# Keep only the subject-side fields of each LmSIDER2a hit and strip the
# alignment gap characters ("-") from the subject sequence.
hallmark_columns = ["sseqid", "sstart", "send", "sstrand", "sseq"]
LmSIDER2a_real = LmSIDER2a.loc[:, hallmark_columns].assign(
    sseq=lambda frame: frame["sseq"].str.replace("-", "")
)
print(f"LmSIDER2a_real: {LmSIDER2a_real.shape[0]} elements")
LmSIDER2a_real.head()


LmSIDER2a_real: 258 elements


Unnamed: 0,sseqid,sstart,send,sstrand,sseq
245,LinJ.01,75923,75949,plus,ACCTCAGCGTGGCATCCCAGGGTCCAG
256,LinJ.03,263663,263691,plus,ACCCCTCGCGGTGCAGGAAGCCGAGCAGC
257,LinJ.03,281600,281615,plus,AAGCCGAGCAGCTCCC
212,LinJ.04,331652,331589,minus,ACACCTCAGCGCGTGGTGCCTCAGCGGCCAGTGCACGCCCACTCTC...
213,LinJ.04,362905,362968,plus,ACACCTCAGCGCGTGGTGCCTCAGCGGCCAGTGCACGCCCACTCTC...


In [19]:
# Keep only the subject-side fields of each LmSIDER2b hit and strip the
# alignment gap characters ("-") from the subject sequence.
hallmark_columns = ["sseqid", "sstart", "send", "sstrand", "sseq"]
LmSIDER2b_real = LmSIDER2b.loc[:, hallmark_columns].assign(
    sseq=lambda frame: frame["sseq"].str.replace("-", "")
)
print(f"LmSIDER2b_real: {LmSIDER2b_real.shape[0]} elements")
LmSIDER2b_real.head()

LmSIDER2b_real: 620 elements


Unnamed: 0,sseqid,sstart,send,sstrand,sseq
587,LinJ.01,24215,24251,plus,CCCTGCCAACGCCGAACCACTTCTGGTGCTGACAGGG
588,LinJ.01,35816,35780,minus,CCCTGCCAACGCCGAACCACTTCTGGTGCTGACAGGG
589,LinJ.01,55428,55392,minus,CCCTGCCAACGCCGAACCACTTCTGGTGCTGACAGGG
590,LinJ.01,76012,76048,plus,CCCTGCCAACGCCGAACCACTTCTGGTGCTGACAGGG
489,LinJ.02,204357,204336,minus,GGGGAGGTCAGAGCGATGTATC


In [21]:
# Write both hallmark tables to CSV. The output folder is created first so the
# export also works when "./output_csv" does not exist yet (to_csv does not
# create missing directories and would raise OSError).
os.makedirs("./output_csv", exist_ok=True)

LmSIDER2a_real.to_csv("./output_csv/Hallmarks_LmSIDER2a.csv", index=False, sep=",", header=True)

LmSIDER2b_real.to_csv("./output_csv/Hallmarks_LmSIDER2b.csv", index=False, sep=",", header=True)