In [3]:
def test_function():
    '''
    Print a fixed marker message saying that all functions in this stage are
    found in ssbio4.
    '''
    # Parenthesised single-argument form: prints the same text on Python 2 and
    # is also valid syntax on Python 3 (the bare print statement is not).
    print('all functions in this stage found in ssbio4')

In [4]:
# ESSENTIAL
import os
import ast
import json
import pickle
import operator
import warnings
import numpy as np
import pandas as pd
import collections
from collections import defaultdict
from dateutil.parser import parse as dateparse

In [13]:
def run_alignment2(a_id, a_seq, b_id, b_seq):
    '''
    Runs the needle alignment program on two sequences and returns the parsed
    per-position alignment table.

    Input:  a_id - sequence ID #1 (string)
            a_seq - sequence #1 (string)
            b_id - sequence ID #2 (string)
            b_seq - sequence #2 (string)
    Output: the DataFrame returned by get_alignment_allpos_df for the alignment
            (the raw needle output is left in /tmp/<a_id>_<b_id>_align.txt)

    DEPENDENCIES:
    get_alignment_allpos_df
    '''

    from Bio.Emboss.Applications import NeedleCommandline

    # Both IDs are interpolated straight into the output file name, so they
    # must not contain path separators.
    alignment_file = "/tmp/%s_%s_align.txt" % (a_id, b_id)

    # The sequences are handed over inline with the "asis::" prefix instead of
    # being written to input files first.
    needle_cline = NeedleCommandline(asequence="asis::"+a_seq, bsequence="asis::"+b_seq, gapopen=10, gapextend=0.5, outfile=alignment_file)
    # The call's return value (captured stdout/stderr) is not needed; the
    # alignment itself is read back from alignment_file.
    needle_cline()

    return get_alignment_allpos_df(alignment_file, a_id, b_id)

def get_corresponding_resnum(alignment_table, ref_id, ref_pos, new_id):
    '''
    Map a residue number of the reference sequence onto the residue number it
    is aligned with in the other sequence of the alignment table.

    Input:  alignment_table - DataFrame with columns id_a, id_b, id_a_pos and
                              id_b_pos (the layout built by get_alignment_allpos_df)
            ref_id - ID of the reference sequence; must equal the table's id_a or id_b
            ref_pos - residue number in the reference sequence
            new_id - ID of the other sequence (kept for interface compatibility;
                     the other side is picked automatically from ref_id)
    Output: int residue number in the other sequence;
            np.nan if the reference residue is aligned to a gap or ref_pos does
            not occur in the table;
            None if ref_id is neither id_a nor id_b (a message is printed)
    '''
    id_a = alignment_table.id_a.unique()[0]
    id_b = alignment_table.id_b.unique()[0]

    if ref_id == id_a:
        ref_id_col, ref_pos_col, new_pos_col = 'id_a', 'id_a_pos', 'id_b_pos'
    elif ref_id == id_b:
        ref_id_col, ref_pos_col, new_pos_col = 'id_b', 'id_b_pos', 'id_a_pos'
    else:
        print('Reference ID not in alignment table!')
        return None

    row_mask = (alignment_table[ref_id_col] == ref_id) & (alignment_table[ref_pos_col] == ref_pos)
    hits = alignment_table[row_mask][new_pos_col].values

    # A position that is not in the table has no counterpart either; report it
    # the same way as a gap instead of failing with IndexError on hits[0].
    if len(hits) == 0:
        return np.nan

    try:
        return int(hits[0])
    except ValueError:
        # int(NaN) raises ValueError: the reference residue faces a gap
        return np.nan
    
def _classify_alignment_column(a, b):
    '''Label one aligned column (a over b); returns None when both sides are gaps.'''
    if a == '-' and b == '-':
        return None
    if a == b:
        return 'match'
    if a == '-':
        return 'insertion'
    if b == '-':
        return 'deletion'
    # Both sides are residues and they differ. An 'X' on either side is
    # reported as unresolved, symmetrically for a and b.
    if a == 'X' or b == 'X':
        return 'unresolved'
    return 'mutation'


def get_alignment_allpos_df(alignment_file, a_seq_id=None, b_seq_id=None):
    '''
    Parse an EMBOSS-format alignment file into a table with one row per
    aligned column.

    Input:  alignment_file - path of the alignment file (parsed with format "emboss")
            a_seq_id - ID to record for the first sequence; when empty, the ID
                       of the first record of the first alignment is used
            b_seq_id - same for the second sequence / second record
    Output: DataFrame with columns id_a, id_b, type, id_a_aa, id_a_pos,
            id_b_aa, id_b_pos. type is one of match, mutation, insertion
            (gap in a), deletion (gap in b) or unresolved (an X on one side).
            Positions are 1-based counters over the non-gap characters of each
            sequence; the side that has a gap is left as NaN. An alignment
            without any columns gives an empty table with the same columns.
    '''
    # Imported here so the function works no matter which other cells or
    # functions have already imported AlignIO.
    from Bio import AlignIO

    columns = ['id_a', 'id_b', 'type', 'id_a_aa', 'id_a_pos', 'id_b_aa', 'id_b_pos']

    appender = defaultdict(dict)
    idx = 0
    for alignment in AlignIO.parse(alignment_file, "emboss"):
        records = list(alignment)
        if not a_seq_id:
            a_seq_id = records[0].id
        if not b_seq_id:
            b_seq_id = records[1].id
        a_seq = str(records[0].seq)
        b_seq = str(records[1].seq)

        # Residue counters restart for every alignment in the file
        a_idx = 1
        b_idx = 1

        for a, b in zip(a_seq, b_seq):
            aa_flag = _classify_alignment_column(a, b)
            if aa_flag is None:
                # A column with gaps on both sides carries no residue; skip it
                # rather than reuse the flag of the previous column.
                continue

            row = appender[idx]
            row['id_a'] = a_seq_id
            row['id_b'] = b_seq_id
            row['type'] = aa_flag

            if aa_flag != 'insertion':
                # sequence a has a residue in this column
                row['id_a_aa'] = a
                row['id_a_pos'] = a_idx
                a_idx += 1

            if aa_flag != 'deletion':
                # sequence b has a residue in this column
                row['id_b_aa'] = b
                row['id_b_pos'] = b_idx
                b_idx += 1

            idx += 1

    alignment_df = pd.DataFrame.from_dict(appender, orient='index')
    # reindex (instead of column selection) also copes with columns that were
    # never filled in, e.g. an alignment without any residue on one side.
    alignment_df = alignment_df.reindex(columns=columns).fillna(value=np.nan)

    return alignment_df

In [17]:
def get_struct_faa(path_to_file, file_name):
    '''
    Read a structure file with PDBParser and build one amino-acid string from
    all chains of its first model.

    Input:  path_to_file - prefix put directly in front of file_name (plain
                           string concatenation, so it has to end with the
                           path separator already)
            file_name - name of the structure file; also used as structure ID
    Output: (pdb_res, chains, file_path)
                pdb_res - concatenated AAdict values of the residues, chain after chain
                chains - list of chain IDs of the first model
                file_path - the path that was read
            (None, None, None) when the file does not exist (a message is printed)

    DEPENDENCIES:
    AAdict (lookup keyed by upper-case residue name, defined outside this
    function; presumably three-letter to one-letter codes - TODO confirm)
    '''
    file_path = path_to_file + file_name
    if not os.path.exists(file_path):
        print("Wrong file path in code 'get_struct_faa' ")
        # Same arity as the success case, so callers can always unpack three values
        return None, None, None

    # Imported only once there is actually a file to parse
    from Bio.PDB.PDBParser import PDBParser

    structure = PDBParser().get_structure(file_name, file_path)
    model = structure[0]
    chains = [chain.id for chain in model.child_list]

    letters = []
    for chain_id in chains:
        for residue in list(model[chain_id].child_list):
            # Only residues whose first id field is blank are used
            if residue.id[0] != ' ':
                continue
            resname = str(residue.resname.upper())
            try:
                letters.append(str(AAdict[resname]))
            except KeyError:
                # HIE is written as H; any other name missing from AAdict is
                # left out of the sequence.
                if resname == 'HIE':
                    letters.append('H')

    return ''.join(letters), chains, file_path

In [18]:
from Bio import Struct 
from Bio import PDB

#calculate combinations
def combinations(iterable, r):
    '''
    Generate all r-length combinations of the items of iterable, each one as a
    list, in the order produced by itertools.combinations.

    combinations('ABCD', 2) --> AB AC AD BC BD CD
    combinations(range(4), 3) --> 012 013 023 123

    Input:  iterable - items to combine
            r - size of each combination
    Output: generator of lists; empty when r is larger than the number of
            items. A negative r raises ValueError (from itertools).
    '''
    # Local import: the standard-library version replaces the hand-written
    # index juggling, which assigned into range(r) and therefore only worked
    # where range() returns a list.
    import itertools

    for combo in itertools.combinations(iterable, r):
        yield list(combo)
                
#calculate the magnitude of distance vector
def magni(a,b,c):
    '''Return the length of the vector (a, b, c): square root of the sum of squares.'''
    sum_of_squares = a ** 2 + b ** 2 + c ** 2
    return sum_of_squares ** 0.5
 
def generate_distance_matrices(struct_dir, df, cutoff=35):
    '''
    For every gene with at least two rows in df, read its structure file and
    list all pairs of selected residues whose centroids lie within cutoff.

    Input:  struct_dir - prefix put directly in front of the pdb_file value
                         (plain string concatenation)
            df - DataFrame with columns gene, pdb_file and p_pdb_aa; the
                 p_pdb_aa values are matched against residue.id[1]
            cutoff - largest centroid distance that is still reported
                     (default 35, in the coordinate units of the structure file)
    Output: DataFrame with columns gene, pdb, chainA, chainB, locationA,
            locationB, resA, resB, distance - one row per residue pair within
            the cutoff; empty (but with these columns) when there is none

    DEPENDENCIES:
    combinations, magni
    '''
    columns = ['gene','pdb','chainA','chainB','locationA','locationB','resA','resB','distance']
    closelist = []

    for gene in df.gene.unique():
        gene_rows = df[df.gene == gene]
        if len(gene_rows) < 2:
            # a single position cannot form a pair
            continue

        # The structure file is taken from the first row of the gene
        filepath = gene_rows.pdb_file.values[0]
        s = Struct.read(struct_dir + filepath)
        model = s[0]
        res_list = PDB.Selection.unfold_entities(model, 'R')
        pdb_start_list = gene_rows.p_pdb_aa.tolist()

        # Residue numbers are matched in every chain of the model; residues
        # named HOH are ignored.
        ires_list = [res for res in res_list
                     if res.id[1] in pdb_start_list and res.resname != 'HOH']

        for res_a, res_b in combinations(ires_list, 2):
            chainA = PDB.Selection.unfold_entities(res_a, 'C')[0]
            chainB = PDB.Selection.unfold_entities(res_b, 'C')[0]
            # difference between the mean atom coordinates of the two residues
            centroid_a = np.array([atom.get_coord() for atom in res_a]).mean(axis=0)
            centroid_b = np.array([atom.get_coord() for atom in res_b]).mean(axis=0)
            vec = list(centroid_a - centroid_b)
            distance = magni(vec[0], vec[1], vec[2])
            if distance <= cutoff:
                # the 'pdb' column holds the structure file name of the gene
                closelist.append([gene, filepath, chainA.id, chainB.id,
                                  res_a.id[1], res_b.id[1],
                                  res_a.resname, res_b.resname, distance])

    # Passing the column names to the constructor keeps an empty result valid
    return pd.DataFrame(closelist, columns=columns)

def calculate_res_distance(gene, res_1, res_2, df, struct_dir):
    '''
    Distance between the mean atom coordinates of two residues in the
    structure file listed for a gene.

    Input:  gene - value matched against df.gene
            res_1, res_2 - residue numbers, matched against residue.id[1]
            df - DataFrame with columns gene and pdb_file
            struct_dir - prefix put directly in front of the pdb_file value
                         (plain string concatenation)
    Output: (res_chk_1, name_chk_1, res_chk_2, name_chk_2, distance, n_residues)
            where n_residues is the number of residues in the first model;
            the string "Unknown interaction" when fewer than two matching
            residues were found

    DEPENDENCIES:
    combinations, magni, AAdict (lookup keyed by residue name, defined outside
    this function)
    '''
    filepath = df[df.gene == gene].pdb_file.values[0]
    s = Struct.read(struct_dir + filepath)
    model = s[0]
    res_list = PDB.Selection.unfold_entities(model, 'R')

    ires_list = []
    res_chk_1 = ''
    res_chk_2 = ''
    name_chk_1 = ''
    name_chk_2 = ''
    for residue in res_list:
        if residue.id[1] in [res_1, res_2] and residue.resname != 'HOH':
            ires_list.append(residue)
            # The first matching residue fills slot 1; every later match
            # overwrites slot 2, so the last match is the one reported there.
            if res_chk_1 == '' and res_chk_2 == '':
                res_chk_1 = residue.id[1]
                name_chk_1 = AAdict[residue.resname]
            else:
                res_chk_2 = residue.id[1]
                name_chk_2 = AAdict[residue.resname]

    # Residue numbers are matched in every chain, so more than one pair can be
    # generated; the distance of the last pair is the one that is returned.
    distance = None
    for res_a, res_b in combinations(ires_list, 2):
        centroid_a = np.array([atom.get_coord() for atom in res_a]).mean(axis=0)
        centroid_b = np.array([atom.get_coord() for atom in res_b]).mean(axis=0)
        vec = list(centroid_a - centroid_b)
        distance = magni(vec[0], vec[1], vec[2])

    # No pair at all means fewer than two matching residues were found
    if distance is None:
        return "Unknown interaction"

    return (res_chk_1, name_chk_1, res_chk_2, name_chk_2, distance, len(res_list))

In [6]:
# Lookup from model/strain identifier to a two-letter group code
# (EP, IP, CM or SG).
# NOTE(review): the meaning of the codes is not stated in this notebook -
# presumably strain categories; confirm before relying on it.
strandict = {
    'iAPECO1_1312': 'EP',
    'ic_1306': 'EP',
    'iE2348C_1286': 'EP',
    'iEC042_1314': 'EP',
    'iEC55989_1330': 'IP',
    'iECABU_c1320': 'EP',
    'iECB_1328': 'CM',
    'iEcE24377_1341': 'IP',
    'iECH74115_1262': 'IP',
    'iECIAI1_1343': 'IP',
    'iECIAI39_1322': 'EP',
    'iECNA114_1301': 'EP',
    'iECO103_1326': 'IP',
    'iECO111_1330': 'IP',
    'iECO26_1355': 'IP',
    'iECOK1_1307': 'EP',
    'iEcolC_1368': 'CM',
    'iECP_1309': 'EP',
    'iECs_1301': 'IP',
    'iECS88_1305': 'EP',
    'iECSE_1348': 'IP',
    'iECSF_1327': 'EP',
    'iEcSMS35_1347': 'EP',
    'iECSP_1301': 'IP',
    'iECUMN_1333': 'EP',
    'iETEC_1333': 'CM',
    'iG2583_1286': 'IP',
    'iLF82_1304': 'EP',
    'iNRG857_1313': 'EP',
    'iS_1188': 'SG',
    'iSbBS512_1146': 'SG',
    'iSBO_1134': 'SG',
    'iSDY_1059': 'SG',
    'iSFV_1184': 'SG',
    'iSFxv_1172': 'SG',
    'iSSON_1240': 'SG',
    'iUMNK88_1353': 'CM',
    'iUTI89_1310': 'EP',
    'iBWG_1329': 'CM',
    'iECDH10B_1368': 'CM',
    'iEcDH1_1363': 'CM',
    'iJO1366': 'CM',
    'iECDH1ME8569_1439': 'CM',
}

In [19]:
def find_residue_depth(res,gene_name,df):
    '''
    Look up residue res in the msms annotation of a gene.

    Input:  res - value compared with element 1 of every msms entry
            gene_name - value matched against df.gene
            df - DataFrame with columns gene and msms; msms holds the text of
                 a Python literal (a sequence of indexable entries)
    Output: (entry[2], entry[3]) of the first entry whose element 1 equals
            res, or None if there is no such entry
    '''
    # msms text of the first row of this gene
    msms_text = df[df.gene == gene_name].msms.values[0]
    entries = ast.literal_eval(msms_text)
    hits = ((entry[2], entry[3]) for entry in entries if entry[1] == res)
    return next(hits, None)

In [23]:
# Residue colouring. Palette families as used below:
# + = reds, - = blues, hydrophobic = greens, polar = purples,
# neutral = greys, special (C, U, P) = yellow-orange-brown
from brewer2mpl import * 

# One 8-step sequential palette per residue class
positive_charge_colors = brewer2mpl.get_map('reds', 'sequential', 8).mpl_colors
negative_charge_colors = brewer2mpl.get_map('Blues', 'sequential', 8).mpl_colors
hydrophobic_colors = brewer2mpl.get_map('greens', 'sequential', 8).mpl_colors
polar_colors = brewer2mpl.get_map('purples', 'sequential', 8).mpl_colors
neutral_colors = brewer2mpl.get_map('greys', 'sequential', 8).mpl_colors
special_colors = brewer2mpl.get_map('YlOrBr', 'sequential', 8).mpl_colors

# One-letter residue code -> colour; the index selects the step within the
# palette of the residue's class.
color_dict = {
    # positive charge
    'R': positive_charge_colors[4],
    'K': positive_charge_colors[5],
    'H': positive_charge_colors[6],
    # negative charge
    'D': negative_charge_colors[5],
    'E': negative_charge_colors[7],
    # polar
    'S': polar_colors[3],
    'T': polar_colors[4],
    'N': polar_colors[5],
    'Q': polar_colors[6],
    # special
    'C': special_colors[2],
    'U': special_colors[1],
    'P': special_colors[3],
    # neutral
    'G': neutral_colors[2],
    'A': neutral_colors[5],
    # hydrophobic
    'I': hydrophobic_colors[7],
    'L': hydrophobic_colors[6],
    'M': hydrophobic_colors[5],
    'F': hydrophobic_colors[4],
    'W': hydrophobic_colors[3],
    'V': hydrophobic_colors[2],
    'Y': hydrophobic_colors[1],
}

# One colour per residue class, in the order positive, negative, polar, hydrophobic
color_dict_course = [
    positive_charge_colors[6],
    negative_charge_colors[7],
    polar_colors[6],
    hydrophobic_colors[6],
]

In [25]:
def describe_mutations_per_gene(df):
    '''
    Average the per-residue composition columns of df within three residue
    depth bins and draw the class averages as a bar chart.

    Input:  df - DataFrame with columns ca_res_depth, p_pdb_aa and gene; the
                 columns at positions 10 to 36 are the 27 value columns that
                 get averaged (they are relabelled below, so their order has
                 to match that list)
    Output: DataFrame with one row per depth bin (surface: depth < 4,
            med: 4 <= depth < 7.5, deep: depth >= 7.5) and 27 columns holding
            the bin means. As a side effect a new figure is created with the
            positive / negative / polar / nonpolar means as bars.

    DEPENDENCIES:
    color_dict_course, plt (expected to be matplotlib.pyplot, made available
    elsewhere in the notebook)
    '''
    depth = df.ca_res_depth
    # One combined mask per bin. Chaining two [] filters would apply a mask
    # built on the full frame to an already filtered one.
    depth_bins = [
        df[depth < 4],
        df[(depth >= 4) & (depth < 7.5)],
        df[depth >= 7.5],
    ]

    bin_means = []
    for bin_rows in depth_bins:
        # Count every structure position once. The column list is passed
        # positionally because the keyword was renamed from cols to subset
        # between pandas versions.
        unique_positions = bin_rows.drop_duplicates(['p_pdb_aa'])
        # Positional slice; .iloc replaces the removed .ix indexer
        bin_means.append(unique_positions.iloc[:, 10:37].mean().values.tolist())

    lmh = pd.DataFrame(bin_means)
    lmh.columns = [u'R', u'H', u'K', u'D', u'E', u'S', u'T', u'N', u'Q', u'C', u'U', u'Y', u'P', u'G', u'A', u'I', u'L', u'M', u'F', u'W', u'V', u'X', u'positive', u'negative', u'polar', u'nonpolar', u'unknown']
    lmh.index = ['surface','med','deep']

    f = plt.figure()
    # color= is the supported keyword for the per-column bar colours
    lmh[['positive','negative','polar','nonpolar']].plot(kind='bar', ax=f.gca(), color=color_dict_course, alpha=0.5)
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
    f.suptitle('gene: %s'%str(df.gene.unique()[0]))
    return lmh
