In [10]:
from Bio.Blast.Applications import NcbiblastpCommandline
from Bio.Blast.Applications import NcbimakeblastdbCommandline
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.Blast import NCBIXML
import pandas as pd

In [2]:
# translate DNA to amino acids

# Genetic code keyed by upper-case DNA codon; '_' marks a stop codon.
# Built once at definition time instead of on every call to translate().
_CODON_TABLE = {
    'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
    'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
    'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
    'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
    'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
    'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
    'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
    'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
    'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
    'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
    'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
    'TAC': 'Y', 'TAT': 'Y', 'TAA': '_', 'TAG': '_',
    'TGC': 'C', 'TGT': 'C', 'TGA': '_', 'TGG': 'W',
}


def translate(seq):
    """Translate a DNA sequence into a protein sequence, reading from base 0.

    Parameters
    ----------
    seq : str or Seq
        DNA sequence; it is converted with ``str()`` and upper-cased before
        lookup, so lower-case input is accepted.

    Returns
    -------
    Seq
        One letter per complete codon, with ``'_'`` for stop codons.
        Translation does not stop at a stop codon. Trailing bases that do
        not form a complete codon are ignored.

    Raises
    ------
    KeyError
        If a codon contains a character other than A, C, G or T.
    """
    dna = str(seq).upper()
    # Stop at the last complete codon using an explicit end index. The
    # previous `seq[:-rest]` evaluated to `seq[:-0] == ''` whenever the
    # length was already a multiple of three, so such sequences were
    # translated to an empty protein.
    usable = len(dna) - len(dna) % 3
    protein = "".join(_CODON_TABLE[dna[i:i + 3]] for i in range(0, usable, 3))
    return Seq(protein)

# Load every gene record, replace its DNA sequence with the translated
# protein sequence, and write the result out as the FASTA file used for BLAST.
db = list(SeqIO.parse('data/genes_e_coli_new.fa', 'fasta'))
for gene in db:
    gene.seq = translate(gene.seq)
# Last expression of the cell: SeqIO.write returns the number of records written.
SeqIO.write(db, 'data/db.fa', 'fasta')

2550

In [3]:
# Build a protein BLAST database from the translated gene FASTA file.
cline = NcbimakeblastdbCommandline(
    input_file='data/db.fa',
    dbtype="prot",
)
print(cline)  # echo the makeblastdb command line before running it
stdout, stderr = cline()

makeblastdb -dbtype prot -in data/db.fa


In [4]:
# BLASTp algorithm: search the protein fragments against the translated gene
# database with an e-value cutoff of 0.0001, writing the report to
# data/blastp.xml (outfmt 5), which is parsed as XML further down.
cline = NcbiblastpCommandline(query='data/protein_fragments.fa', db='data/db.fa',
                              evalue=0.0001, outfmt=5, out='data/blastp.xml')
print(cline)  # echo the blastp command line before running it
stdout, stderr = cline()

blastp -out data/blastp.xml -outfmt 5 -query data/protein_fragments.fa -db data/db.fa -evalue 0.0001


In [5]:
# parse BLAST result
# The parser output is turned into a list inside the `with` block, so every
# record is read before the file is closed; the handle no longer stays open
# for the lifetime of the kernel.
with open('data/blastp.xml') as result_handle:
    blast_records = list(NCBIXML.parse(result_handle))

In [6]:
# print BLAST results: for each query, show every HSP of its first alignment
for counter, blast_record in enumerate(blast_records, start=1):
    print('\n\nNEW RECORD', counter)
    # A query with no hit under the e-value cutoff has an empty alignments
    # list; report it instead of failing with IndexError on alignments[0].
    if not blast_record.alignments:
        print("no hits")
        continue
    alignment = blast_record.alignments[0]
    for hsp in alignment.hsps:
        print("****Alignment****")
        print("sequence:", alignment.title)
        print("length:", alignment.length)
        print("e value:", hsp.expect)
        # Only the first 50 columns of the query / match / subject lines are shown.
        print(hsp.query[0:50] + "...")
        print(hsp.match[0:50] + "...")
        print(hsp.sbjct[0:50] + "...")



NEW RECORD 1
****Alignment****
sequence: gnl|BL_ORD_ID|1841 queA coding sequence
length: 356
e value: 6.43504e-101
FTDLLDKLNPMDLLVFNNTRVIPARLFGRKASGGKIEVLVEYMLDDKRIL...
FTDLLDKLNP DLLVFNNTRVIPARLFGRKASGGKIEVLVE MLDDKRIL...
FTDLLDKLNPGDLLVFNNTRVIPARLFGRKASGGKIEVLVERMLDDKRIL...


NEW RECORD 2
****Alignment****
sequence: gnl|BL_ORD_ID|81 hupA coding sequence
length: 90
e value: 3.99289e-57
MNKTQLIDVIAEKAELSKTQAKPALESTLAAITESLKEGDAVQLVGFGTF...
MNKTQLIDVIAEKAELSKTQAK ALESTLAAITESLKEGDAVQLVGFGTF...
MNKTQLIDVIAEKAELSKTQAKAALESTLAAITESLKEGDAVQLVGFGTF...


NEW RECORD 3
****Alignment****
sequence: gnl|BL_ORD_ID|88 hupB coding sequence
length: 90
e value: 5.66038e-55
VNKSQLIDKIAAGADILKAAAGRALDACIASVTESLKEGDDVALVGFGTF...
VNKSQLIDKIAAGADI KAAAGRALDA IASVTESLKEGDDVALVGFGTF...
VNKSQLIDKIAAGADISKAAAGRALDAIIASVTESLKEGDDVALVGFGTF...


NEW RECORD 4
****Alignment****
sequence: gnl|BL_ORD_ID|1057 marR coding sequence
length: 144
e value: 2.14227e-95
VKSTSDLFNECIPLGRLIHMVNQKKDRLLNEYLSPLDITAAAFKVLCSIR...
V

In [7]:
# Record ids in file order: the query fragments, then the translated genes
# that make up the BLAST database.
protein_ids = [fragment.id for fragment in SeqIO.parse('data/protein_fragments.fa', 'fasta')]
ecoli_ids = [gene.id for gene in SeqIO.parse('data/db.fa', 'fasta')]

In [8]:
# Pair every query fragment with the gene of its first BLAST alignment.
# Each row is [protein_id, ecoli_id, e_value]; row k belongs to the k-th query.
# Alignment titles look like 'gnl|BL_ORD_ID|1841 queA coding sequence': the
# number after this prefix is used as a position in ecoli_ids.
ORD_ID_PREFIX = 'gnl|BL_ORD_ID|'
results = []
for ind, blast_record in enumerate(blast_records):
    if not blast_record.alignments:
        # No hit under the e-value cutoff. Keep a placeholder row so that row
        # positions stay aligned with the queries instead of raising IndexError.
        results.append([protein_ids[ind], None, None])
        continue
    alignment = blast_record.alignments[0]
    # Take the characters between the prefix and the first space.
    index = int(alignment.title[len(ORD_ID_PREFIX):].split(' ', 1)[0])
    results.append([protein_ids[ind], ecoli_ids[index], alignment.hsps[0].expect])

In [9]:
# Show one [protein_id, ecoli_id, e_value] row per line.
for row in results:
    print(row)

['groupA_0', 'queA', 6.43504e-101]
['groupA_1', 'hupA', 3.99289e-57]
['groupA_2', 'hupB', 5.66038e-55]
['groupA_3', 'marR', 2.14227e-95]
['groupA_4', 'nanA', 3.37178e-100]
['groupA_5', 'acnB', 3.30561e-89]
['groupA_6', 'proP', 8.5255e-84]
['groupA_7', 'fadB', 8.06442e-92]
['groupA_8', 'rplM', 1.32952e-97]
['groupA_9', 'dmsA', 5.80295e-94]
['groupA_10', 'narK', 1.46921e-77]
['groupA_11', 'nirB', 1.22552e-99]
['groupA_12', 'mazE', 6.58137e-52]
['groupA_13', 'narG', 6.23408e-93]
['groupA_14', 'deoC', 1.20258e-101]
['groupA_15', 'aldB', 1.95169e-94]
['groupA_16', 'mglA', 2.51712e-97]
['groupA_17', 'pyrD', 2.31555e-98]
['groupA_18', 'lpd', 4.41091e-95]
['groupA_19', 'ndh', 2.83778e-97]
['groupA_20', 'glnA', 1.00564e-95]
['groupA_21', 'pflB', 7.86608e-99]
['groupA_22', 'trg', 2.38412e-86]
['groupA_23', 'fumB', 2.58052e-99]
['groupA_24', 'nrfA', 1.18563e-100]
['groupA_25', 'trmA', 8.76267e-102]
['groupA_26', 'cbpA', 2.5208e-96]
['groupA_27', 'nrdA', 1.37646e-97]
['groupA_28', 'glnQ', 2.65771e

In [28]:
# Persist the problem-1 hit table (one row per query fragment) as CSV,
# without the DataFrame index column.
df = pd.DataFrame(
    data=results,
    columns=['protein_id', 'ecoli_id', 'e_value'],
)
df.to_csv('data/ex1.csv', index=False)

In [36]:
import pandas as pd
import numpy as np
from Bio import SeqIO
from Bio import motifs

In [31]:
# Split the problem-1 hits into two groups by row position and collect, for
# each group, the promoter records whose id matches one of the group's genes.
# NOTE(review): the split position 63 is taken from the original code;
# presumably the first 63 query fragments belong to group A. Confirm against
# protein_fragments.fa.
N_GROUP_A = 63
ecoli_hits = pd.read_csv('data/ex1.csv')['ecoli_id']  # read the CSV once for both groups
group_ids_A = np.asarray(ecoli_hits[:N_GROUP_A])
group_ids_B = np.asarray(ecoli_hits[N_GROUP_A:])

# Sets make each membership test constant-time instead of scanning the whole
# array for every promoter record.
ids_A, ids_B = set(group_ids_A), set(group_ids_B)

proms_A, proms_B = [], []
for record in SeqIO.parse('data/proms_e_coli_new.fa', 'fasta'):
    # Two independent checks: a gene hit by both groups is kept in both lists.
    if record.id in ids_A:
        proms_A.append(record)
    if record.id in ids_B:
        proms_B.append(record)

SeqIO.write(proms_A, 'data/proms_A.fa', 'fasta')
# Last expression of the cell: the number of group-B promoter records written.
SeqIO.write(proms_B, 'data/proms_B.fa', 'fasta')

35

In [37]:
# Parse the MEME result for group A.
# NOTE(review): the recorded run of this cell ended with
#   ValueError: Improper input file. File should contain a line starting MEME version.
# Presumably the 'MEME' parser of the installed Biopython expects MEME's
# plain-text report rather than this XML file. Verify against the
# Bio.motifs documentation for the installed version before relying on it.
with open('data/meme_A.xml') as meme_file:
    record = motifs.parse(meme_file, 'MEME')

ValueError: Improper input file. File should contain a line starting MEME version.