In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from glob import glob
from Bio import Entrez, SeqIO

In [6]:
# Genome assembly accessions; they double as the file-name stems used below.
# NOTE(review): presumably S. cerevisiae and Y. lipolytica, judging by the
# variable names — confirm.
sacc_id, yl_id = 'GCF_000146045.2_R64', 'GCF_000002525.2_ASM252v1'
ids = [sacc_id, yl_id]


In [11]:
# Define a function to make a BLAST database for either protein or nucleotide sequences
def make_blast_db(id,folder='prot',db_type='prot'):
    """Build a BLAST database for one FASTA file with ``makeblastdb``.

    The input file is ``<folder>/<id>.fna`` when ``db_type == 'nucl'`` and
    ``<folder>/<id>.faa`` otherwise. If an index for that input already
    exists, nothing is run.

    Parameters
    ----------
    id : str
        File-name stem of the FASTA file (without extension).
    folder : str
        Directory containing the FASTA file; the database is written next to it.
    db_type : str
        Value passed to ``-dbtype`` ('prot' or 'nucl').

    Returns
    -------
    None
        Progress and failures are reported with ``print``; no exception is
        raised when ``makeblastdb`` is missing or exits with an error.
    """
    import os
    import subprocess

    if db_type == 'nucl':
        ext, index_ext, alias_ext = 'fna', 'nin', 'nal'
    else:
        ext, index_ext, alias_ext = 'faa', 'pin', 'pal'

    fasta = '%s/%s.%s' % (folder, id, ext)

    # makeblastdb names the database after its -in file, so the index sits at
    # <fasta>.pin (protein) or <fasta>.nin (nucleotide). A database split into
    # several volumes has an alias file (.pal / .nal) at that name instead.
    markers = ['%s.%s' % (fasta, index_ext), '%s.%s' % (fasta, alias_ext)]
    if any(os.path.exists(marker) for marker in markers):
        print (id, 'already has a blast db')
        return

    # An argument list (no shell) keeps paths containing spaces intact.
    args = ['makeblastdb', '-in', fasta, '-dbtype', str(db_type)]

    print ('making blast db with following command line...')
    print (' '.join(args))
    try:
        result = subprocess.run(args)
    except FileNotFoundError:
        print ('makeblastdb executable not found on PATH')
        return
    if result.returncode != 0:
        print ('makeblastdb exited with status %d' % result.returncode)

In [12]:
# Build one protein BLAST database per assembly from prot/<id>_protein.faa.
for seq in ids:
    new_seq = '{}_protein'.format(seq)
    make_blast_db(new_seq, 'prot', 'prot')

making blast db with following command line...
makeblastdb -in prot/GCF_000146045.2_R64_protein.faa -dbtype prot


Building a new DB, current time: 10/14/2022 12:03:08
New DB name:   /home/kkrishnan/SBRG/Sequencing/Y-Lipolytica/blast/prot/GCF_000146045.2_R64_protein.faa
New DB title:  prot/GCF_000146045.2_R64_protein.faa
Sequence type: Protein
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 6016 sequences in 0.109475 seconds.
making blast db with following command line...
makeblastdb -in prot/GCF_000002525.2_ASM252v1_protein.faa -dbtype prot


Building a new DB, current time: 10/14/2022 12:03:08
New DB name:   /home/kkrishnan/SBRG/Sequencing/Y-Lipolytica/blast/prot/GCF_000002525.2_ASM252v1_protein.faa
New DB title:  prot/GCF_000002525.2_ASM252v1_protein.faa
Sequence type: Protein
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 6472 sequences in 0.115157 seconds.


In [None]:
# define a function to run BLASTp
def run_blastp(seq,db,in_folder='prot', out_folder='bbh', out=None,outfmt=6,evalue=0.001,threads=1):
    """Run ``blastp`` with one protein FASTA as query against a protein database.

    Parameters
    ----------
    seq : str
        File-name stem of the query; the query file is ``<in_folder>/<seq>.faa``.
    db : str
        File-name stem of the database; the database path is ``<in_folder>/<db>.faa``.
    in_folder : str
        Directory holding both the query FASTA and the BLAST database.
    out_folder : str
        Directory used for the default output path.
    out : str or None
        Output file. Defaults to ``<out_folder>/<seq>_vs_<db>.txt``.
    outfmt : int or str
        Value passed to ``-outfmt``. A multi-field string is passed as a
        single argument.
    evalue : float
        Value passed to ``-evalue``.
    threads : int
        Value passed to ``-num_threads``.

    Returns
    -------
    str
        The output path, returned whether or not ``blastp`` succeeded;
        failures are reported with ``print`` only.
    """
    import os
    import subprocess

    if out is None:
        out='%s/%s_vs_%s.txt'%(out_folder, seq, db)
        print(out)

    # blastp cannot create missing directories, so make sure the target exists.
    out_dir = os.path.dirname(out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    print ('blasting %s vs %s'%(seq, db))

    db_path = '%s/%s.faa'%(in_folder, db)
    query_path = '%s/%s.faa'%(in_folder, seq)

    # An argument list (no shell) keeps paths with spaces and multi-field
    # -outfmt strings intact.
    args = [
        'blastp',
        '-db', db_path,
        '-query', query_path,
        '-out', out,
        '-evalue', str(evalue),
        '-outfmt', str(outfmt),
        '-num_threads', '%i' % threads,
    ]

    print ('running blastp with following command line...')
    print (' '.join(args))
    try:
        result = subprocess.run(args)
    except FileNotFoundError:
        print ('blastp executable not found on PATH')
        return out
    if result.returncode != 0:
        print ('blastp exited with status %d' % result.returncode)
    return out