In [1]:
import sys
sys.path.append("./SimilarityRegression/")

In [2]:
import pandas as pd
import itertools
import os
import gzip
import glob

from tqdm.notebook import tqdm_notebook

from similarityregression import PairwiseAlignment as pwsaln
from similarityregression import AlignmentTools as alntools
from similarityregression import PredictSimilarity as srpred

In [3]:
# When True, only motif-feature sequences (plus the Sponge domains) are collected
# for alignment; when False, sequences from prot_features are collected as well.
matchOnlyMotifs = True

# Read Data

## Sponge

In [4]:
# Load the Sponge domain table (no header row in the file) and label its columns
sponge_columns = ['protein', 'tf', 'domain', 'start', 'end', 'seq', 'domain_order']
df_sponge = pd.read_table('Sponge/SpongeDomains.tab', header=None)
df_sponge.columns = sponge_columns
df_sponge.head()

  


Unnamed: 0,protein,tf,domain,start,end,seq,domain_order
0,NP_001001768.2,ECB-GENE-23082762,HLH,24,81,RKSSKPQMEKRRRARINDSLGQLKALILEATNKDSSRHSKLEKADI...,HLH
1,NP_001005725.1,ECB-GENE-23171730,GATA,261,295,CVNCGAISTPLWRRDGTGHYLCNACGLYHKMNGYN,GATA
2,NP_001005725.1,ECB-GENE-23171730,GATA,316,350,CANCHTSTTTLWRRNKDGEPVCNACGLYFKLHGVN,GATA
3,NP_001009577.1,ECB-GENE-23077688,Homeobox,294,350,RRARTAFTYEQLVALENKFKTTRYLSVCERLNLALSLSLTETQVKI...,Homeobox
4,NP_001015510.1,ECB-GENE-23031747,Pou,445,516,TDLEELEQFARTFKQRRIKLGFTQGDVGLAMGKLYGNDFSQTTISR...,"Pou,Homeobox"


In [28]:
# For each protein with more than one domain row, record whether its 'start'
# values are monotonically increasing in row order.
dinc = [
    protein_rows['start'].is_monotonic_increasing
    for protein_id, protein_rows in df_sponge.groupby('protein')
    if protein_rows.shape[0] > 1
]

In [30]:
# True when no multi-domain protein has its domain starts out of order
all(dinc)

True

In [169]:
# Report how many distinct proteins and genes (tf) the Sponge table contains.
# print() with one pre-formatted string gives the same text on Python 2 and 3.
print('N proteins: {}'.format(len(df_sponge['protein'].unique())))
print('N genes: {}'.format(len(df_sponge['tf'].unique())))

N proteins: 1564
N genes: 1013


## Cis-BP

In [170]:
# 2.0
loc_DBFiles = 'REF/DB_2.00/'


def _read_cisbp_table(table_name):
    """Read one tab-separated table from loc_DBFiles.

    The second line of the file (row index 1) is skipped and the first
    column becomes the index.
    """
    return pd.read_csv(loc_DBFiles + table_name, sep='\t', skiprows=[1], index_col=0)


domains = _read_cisbp_table('domains.tab')
pfam_by_domain = domains['Pfam_Name']

tfs = _read_cisbp_table('tfs.tab')
tf_families = _read_cisbp_table('tf_families.tab')

# Motif tables; each motif feature is tagged with the Pfam name of its domain
# (None when the Domain_ID is not in the domains table)
motifs = _read_cisbp_table('motifs.tab')
motif_features = _read_cisbp_table('motif_features.tab')
motif_features['Pfam'] = [pfam_by_domain.get(domain_id) for domain_id in motif_features['Domain_ID']]

# Protein tables, tagged the same way
proteins = _read_cisbp_table('proteins.tab')
prot_features = _read_cisbp_table('prot_features.tab')
prot_features['Pfam'] = [pfam_by_domain.get(domain_id) for domain_id in prot_features['Domain_ID']]

In [171]:
sr_models = {}

# Family_Name -> Family_ID lookup for the 2.00 families. It does not depend on the
# model being read, so it is built once instead of once per model file.
family_id_by_name = tf_families.reset_index().set_index('Family_Name')['Family_ID']

# Collate SR Models (1.97d), keyed by 2.00 Family_ID; the generic model is kept
# under the key 'NO_THRESHOLD'. Files are visited in sorted order so that the
# result does not depend on the order glob happens to return them in.
for loc_SRModel in sorted(glob.glob('REF/DB_1.97/SRModels/*json')):
    SRModel = srpred.ReadSRModel(loc_SRModel)
    if SRModel['Family_Name'] == 'NO_THRESHOLD':
        sr_models['NO_THRESHOLD'] = SRModel
        continue

    fid_2 = family_id_by_name.get(SRModel['Family_Name'])
    if fid_2 is not None:
        SRModel['Family_ID'] = fid_2
        sr_models[fid_2] = SRModel
    else:
        # Name the family and file rather than dumping the whole model dict
        print('Warning missing: {} ({})'.format(SRModel['Family_Name'], loc_SRModel))
sr_models

{'F009_2.00': {u'Baseline': {u'Class': u'SequenceIdentity',
   u'Name': u'PctID_L',
   u'Threshold.Dis': 0.538461538462,
   u'Threshold.HSim': 0.775510204082},
  u'Family_ID': 'F009_2.00',
  u'Family_Name': u'AP2',
  u'Model.Class': u'SimilarityRegression',
  u'Model.Name': u'AvgB62.Regression',
  u'SR.FeatureScales.mean': array([ 0.05555556, -0.51463616,  2.82580911,  2.16233766,  5.46856318,
          3.62090291,  1.54442383,  0.21057514,  1.86940837,  0.9992785 ,
          0.03483818,  2.03061224,  2.13739435,  1.70428778,  6.22892187,
          0.51937745,  2.02494331,  2.19202226,  0.9837147 ,  1.4102247 ,
          1.24056895,  0.73613688,  0.40878169,  0.06936714, -0.00752422,
         -0.06019377,  0.01463616,  0.22716966,  2.0861678 ,  1.49299114,
          4.210369  ,  2.41486291,  2.46846011,  1.78324057,  3.84559885,
          0.60039167,  1.31467739,  1.75582354,  0.66996496,  3.64708308,
          1.67862296,  0.50257679,  1.35106164,  2.1002886 ,  2.18315811,
         -0

### Assign TF Families

In [172]:
# Map the sorted tuple of each family's DBD names to its Family_ID
Pfams2Fam = {
    tuple(sorted(family_row['DBDs'].split(','))): family_id
    for family_id, family_row in tf_families.iterrows()
}
Pfams2Fam

{('AFT',): 'F007_2.00',
 ('AFT', 'FAR1'): 'F008_2.00',
 ('AP2',): 'F009_2.00',
 ('AP2', 'B3'): 'F010_2.00',
 ('AP2', 'DUF260'): 'F019_2.00',
 ('AP2', 'DUF573'): 'F022_2.00',
 ('AP2', 'FAR1'): 'F014_2.00',
 ('AP2', 'GATA'): 'F015_2.00',
 ('AP2', 'HSF_DNA-bind'): 'F018_2.00',
 ('AP2', 'HSF_DNA-bind', 'MBD', 'Myb_DNA-binding', 'zf-CXXC'): 'F013_2.00',
 ('AP2', 'Homeobox'): 'F016_2.00',
 ('AP2', 'Homeobox', 'Myb_DNA-binding'): 'F017_2.00',
 ('AP2', 'Myb_DNA-binding'): 'F020_2.00',
 ('AP2', 'TCR'): 'F023_2.00',
 ('AP2', 'zf-BED'): 'F011_2.00',
 ('AP2', 'zf-CXXC'): 'F012_2.00',
 ('ARID',): 'F028_2.00',
 ('ARID', 'AT_hook'): 'F029_2.00',
 ('ARID', 'CENP-B_N', 'RFX_DNA_binding'): 'F032_2.00',
 ('ARID', 'HMG_box'): 'F038_2.00',
 ('ARID', 'Myb_DNA-binding'): 'F033_2.00',
 ('ARID', 'Myb_DNA-binding', 'Rap1-DNA-bind'): 'F034_2.00',
 ('ARID', 'RFX_DNA_binding'): 'F036_2.00',
 ('ARID', 'RFX_DNA_binding', 'zf-C2H2'): 'F031_2.00',
 ('ARID', 'Rap1-DNA-bind'): 'F035_2.00',
 ('ARID', 'SAND'): 'F037_2.00'

In [173]:
# Assign each motif the family whose DBD set equals the motif's set of Pfam names.
# The family is resolved once per motif and then mapped onto the rows, instead of
# re-scanning the whole Motif_ID column for every motif.
family_by_motif = {
    mid: Pfams2Fam.get(tuple(sorted(set(mdata['Pfam']))))
    for mid, mdata in motif_features.groupby('Motif_ID')
}
# Motifs whose Pfam combination matches no family get None; rows whose Motif_ID
# is not a groupby key keep the '' placeholder.
motif_features['Family_ID'] = [family_by_motif.get(mid, '') for mid in motif_features['Motif_ID']]
    

In [174]:
# Assign each Sponge protein the family whose DBD set equals the protein's set of
# domains. The family is resolved once per protein and then mapped onto the rows,
# instead of re-scanning the whole protein column for every protein.
family_by_protein = {
    pid: Pfams2Fam.get(tuple(sorted(set(pdata['domain']))))
    for pid, pdata in df_sponge.groupby('protein')
}
# Proteins whose domain combination matches no family get None; rows whose
# protein is not a groupby key keep the '' placeholder.
df_sponge['Family_ID'] = [family_by_protein.get(pid, '') for pid in df_sponge['protein']]
    

## Parse Domains For Alignment

In [175]:
# Check no new domains: True means every Sponge domain is already a Cis-BP Pfam name.
# (isdisjoint() was False as soon as the two sets shared a single name, so it could
# not detect a Sponge-only domain.)
set(df_sponge['domain']).issubset(set(domains['Pfam_Name']))

False

In [176]:
# Collect Sequences & Run Alignment
# For every Pfam name: write the unique DBD sequences to Domains/<pfam>.fa (each
# record uses the sequence itself as its header) and run RunAPHID.py on that file.

# Make sure the FASTA output directory exists before writing into it
if not os.path.isdir('Domains'):
    os.makedirs('Domains')

for pfam_name in domains['Pfam_Name'].unique():
    DBDseqs = set()
    if not matchOnlyMotifs:
        DBDseqs.update(prot_features.loc[prot_features['Pfam'] == pfam_name, 'ProtFeature_Sequence'].dropna())

    # Missing sequences are dropped: they cannot be written as FASTA records
    DBDseqs.update(motif_features.loc[motif_features['Pfam'] == pfam_name, 'MotifFeature_Sequence'].dropna())
    DBDseqs.update(df_sponge.loc[df_sponge['domain'] == pfam_name, 'seq'].dropna())

    if len(DBDseqs) > 0:
        with open('Domains/{}.fa'.format(pfam_name), 'w') as outfile:
            # Sorted so the file content is the same from run to run
            for DBDseq in sorted(DBDseqs):
                outfile.write('>' + DBDseq + '\n' + DBDseq + '\n')

        print(pfam_name)
        status = os.system('python RunAPHID.py REF/Pfam_HMMs/{}.hmm Domains/{}.fa semiglobal'.format(pfam_name, pfam_name))
        if status != 0:
            # Surface a failed alignment instead of silently carrying on
            print('Warning: RunAPHID.py returned non-zero status {} for {}'.format(status, pfam_name))

BAF1_ABF1
AFT
AP2
TF_AP-2
KilA-N
ARID
AT_hook
B3
GAGA_bind
zf-BED
HLH
BrkDBD
bZIP_1
zf-C2H2
zf-C2HC
CBFB_NFYA
zf-CCCH
CENP-B_N
CG-1
Copper-fist
CSD
LAG1-DNAbind
CUT
zf-CXXC
DM
zf-Dof
E2F_TDP
COE1_DBD
EIN3
Ets
FAR1
FLYWCH
Forkhead
GATA
GCM
GCR1_C
CP2
GRAS
GTF2I
Homeobox
HSF_DNA-bind
IBD
IRF
FLO_LFY
DUF260
MADF_DNA_bdg
SRF-TF
MATalpha_HMGbox
MBD
mTERF
Myb_DNA-binding
NAM
NDT80_PhoG
zf-NF-X1
zf-C4
P53
PAX
HTH_psq
PLATZ
Pou
HPD
Rap1-DNA-bind
RHD_DNA_bind
RFX_DNA_binding
RRM_1
Runt
RWP-RK
S1FA
SAND
SBP
MH1
HMG_box
STAT_bind
STE
DUF573
T-box
TBP
TCP
TCR
TEA
THAP
UNKNOWN
Vhr1
WRC
WRKY
Zn_clus


In [177]:
#Read results into dictionary that maps the DBD sequence to its reference alignment
AlnDict_ByPfam = {} # Pfam : {Seq : Reference Alignment}
AlnLength_ByPfam = {} # Pfam : alignment length (only set when all alignments share one length)
for loc_aligned in glob.glob('DBDMatchPos_aphid/*matchpos_semiglobal.fa'):
    # The Pfam name is the file name up to its first '.'
    pfam_name = os.path.basename(loc_aligned).split('.')[0]
    AlnDict_ByPfam[pfam_name] = {}
    lengths = set()
    for seq, aln in alntools.FastaIter(fileloc=loc_aligned):
        # Upper-case the alignment and use '-' as the only gap character
        AlnDict_ByPfam[pfam_name][seq] = aln.upper().replace('.', '-')
        lengths.add(len(aln))

    if len(lengths) == 1:
        AlnLength_ByPfam[pfam_name] = lengths.pop()
    elif not lengths:
        # A file without records used to raise IndexError on lengths[0]
        print('Empty Alignment File {}'.format(pfam_name))
    else:
        print('Variable Alignment Length {}'.format(pfam_name))

In [178]:
# Number of (Pfam, sequence) groups in motif_features plus the number of
# (domain, seq) groups in df_sponge; a pair present in both tables is counted twice.
len(motif_features.groupby(['Pfam', 'MotifFeature_Sequence'])) + len(df_sponge.groupby(['domain', 'seq']))

13249

In [179]:
# Score every pair of DBD arrays (Cis-BP motifs and Sponge proteins) inside each TF
# family that contains at least one Sponge protein, then tabulate and save the scores.
SR_Scores_i = []    # one (Family_ID, ID_x, ID_y) tuple per scored pair
SR_Score_rows = []  # one row of score columns per scored pair, parallel to SR_Scores_i

# The set of families present in the Sponge data is loop-invariant: build it once
sponge_families = set(df_sponge['Family_ID'])

for tf_family, fdata in tf_families.iterrows():
    DBDs = fdata['DBDs'].split(',')
    if tf_family not in sponge_families:
        continue

    # Get SR Model (families without their own model use the 'NO_THRESHOLD' model)
    SRModel_family = sr_models.get(tf_family)
    if SRModel_family is None:
        SRModel_family = sr_models.get('NO_THRESHOLD')
        model_label = 'NO_THRESHOLD'
    else:
        model_label = SRModel_family['Model.Class']
    # Single pre-formatted argument: same text on Python 2 and 3
    print('{} {} {} {}'.format(tf_family, fdata['Family_Name'], DBDs, model_label))

    # Columns that are the same for every pair scored in this family
    family_columns = [SRModel_family['Model.Class'], SRModel_family['Family_Name'], fdata['Family_Name']]

    # Joint alignment for each unaligned sequence: its own alignment in its DBD's slot
    # and gaps in every other DBD's slot, with slots in the order of the family's DBDs.
    # NOTE(review): keyed by the unaligned sequence only, so a sequence that occurs
    # under two DBDs of one family keeps just the last joint alignment written.
    JointSeqDict = {}
    for currentDBD in DBDs:
        for unaln, aln in AlnDict_ByPfam[currentDBD].items():
            JointSeqDict[unaln] = ''.join(
                aln if DBD == currentDBD else '-' * AlnLength_ByPfam[DBD]
                for DBD in DBDs
            )

    # Motifs
    MotifSequences = {} # MID: comma-joined joint alignments of its DBDs, in row order
    family_motifs = motif_features[motif_features['Family_ID'] == tf_family]
    for motif_id, MID_mfeats in family_motifs.groupby('Motif_ID'):
        MotifSequences[motif_id] = ','.join(
            JointSeqDict[unaln] for unaln in MID_mfeats['MotifFeature_Sequence'])

    # Sponge Proteins
    ProteinSequences = {} # Sponge protein: comma-joined joint alignments of its DBDs, in row order
    family_proteins = df_sponge[df_sponge['Family_ID'] == tf_family]
    for protein, protein_feats in family_proteins.groupby('protein'):
        ProteinSequences[protein] = ','.join(
            JointSeqDict[unaln] for unaln in protein_feats['seq'])

    # Flipped Dict: each distinct DBD array is aligned once and shared by all its IDs
    uSeqs = {} # seq : set of motif / protein ids
    for seqs_by_id in (MotifSequences, ProteinSequences):
        for key, seq in seqs_by_id.items():
            uSeqs.setdefault(seq, set()).add(key)

    # Score Against itself (e.g. identical): pairs of IDs that share one DBD array
    for uSeq, ids in tqdm_notebook(uSeqs.items()):
        l_ids = sorted(ids)
        if len(l_ids) < 2:
            # No pair to report, so the self-alignment result would go unused
            continue
        sr_alignment = pwsaln.AlignDBDArrays(('i', uSeq.split(',')),
                                             ('j', uSeq.split(',')))
        SR_Score, SR_Class = srpred.ScoreAlignmentResult(resultDict=sr_alignment, scoreDict=SRModel_family)
        for i, j in itertools.combinations(l_ids, 2):
            SR_Scores_i.append((tf_family, i, j))
            SR_Score_rows.append([sr_alignment['PctID_L'], SR_Score, SR_Class] + family_columns)

    #Score unique seq x seq
    combos = list(itertools.combinations(uSeqs.keys(), 2))
    for i, j in tqdm_notebook(combos):
        ids_i = list(uSeqs[i])
        ids_j = list(uSeqs[j])
        sr_alignment = pwsaln.AlignDBDArrays(('|'.join(ids_i), i.split(',')),
                                             ('|'.join(ids_j), j.split(',')))
        SR_Score, SR_Class = srpred.ScoreAlignmentResult(resultDict=sr_alignment, scoreDict=SRModel_family)
        # Every ID carrying array i pairs with every ID carrying array j
        for id_i, id_j in itertools.product(ids_i, ids_j):
            id_x, id_y = sorted([id_i, id_j])  # the smaller ID always goes in ID_x
            SR_Scores_i.append((tf_family, id_x, id_y))
            SR_Score_rows.append([sr_alignment['PctID_L'], SR_Score, SR_Class] + family_columns)

SR_Scores = pd.DataFrame(SR_Score_rows, columns=['AA %ID', 'SR_Score', 'SR_Class', 'SRModel_Class', 'SRModel_Name', 'Family_Name',])
SR_Scores.index = pd.MultiIndex.from_tuples(SR_Scores_i, names=['Family_ID', 'ID_x', 'ID_y'])
# Sort the DF: by family, then best score first, then IDs
SR_Scores = SR_Scores.reset_index()
SR_Scores = SR_Scores.sort_values(['Family_ID', 'SR_Score', 'ID_x', 'ID_y'], ascending=[True, False, True, True])

# Display Output
SR_Scores.to_csv('SR_Sponge_All.csv.gz', compression='gzip')
SR_Scores.head()


F024_2.00 AP-2 ['TF_AP-2'] NO_THRESHOLD


  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/36 [00:00<?, ?it/s]

F028_2.00 ARID/BRIGHT ['ARID'] SimilarityRegression


  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/253 [00:00<?, ?it/s]

F036_2.00 ARID/BRIGHT,RFX ['ARID', 'RFX_DNA_binding'] NO_THRESHOLD


  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

F039_2.00 AT hook ['AT_hook'] SimilarityRegression


  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/666 [00:00<?, ?it/s]

F050_2.00 B3 ['B3'] NO_THRESHOLD


  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/171 [00:00<?, ?it/s]

F061_2.00 BBR ['GAGA_bind'] NO_THRESHOLD


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

F062_2.00 BED ZF,C2H2 ZF ['zf-BED', 'zf-C2H2'] NO_THRESHOLD


  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

F077_2.00 BED ZF ['zf-BED'] NO_THRESHOLD


  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

F082_2.00 bHLH ['HLH'] SimilarityRegression


  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/44850 [00:00<?, ?it/s]

F086_2.00 bHLH,Sox ['HLH', 'HMG_box'] NO_THRESHOLD


  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

F091_2.00 bZIP ['bZIP_1'] SimilarityRegression


  0%|          | 0/280 [00:00<?, ?it/s]

  0%|          | 0/39060 [00:00<?, ?it/s]

F092_2.00 bZIP,C2H2 ZF ['bZIP_1', 'zf-C2H2'] NO_THRESHOLD


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

F111_2.00 C2H2 ZF,FAR1 ['zf-C2H2', 'FAR1'] NO_THRESHOLD


  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

F116_2.00 C2H2 ZF,Homeodomain ['zf-C2H2', 'Homeobox'] NO_THRESHOLD


  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

F119_2.00 C2H2 ZF,MADF ['zf-C2H2', 'MADF_DNA_bdg'] NO_THRESHOLD


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

F123_2.00 C2H2 ZF,Myb/SANT ['zf-C2H2', 'Myb_DNA-binding'] NO_THRESHOLD


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

F127_2.00 C2H2 ZF,Pipsqueak ['zf-C2H2', 'HTH_psq'] NO_THRESHOLD


  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

F134_2.00 C2H2 ZF,THAP finger ['zf-C2H2', 'THAP'] NO_THRESHOLD


  0%|          | 0/1 [00:00<?, ?it/s]

0it [00:00, ?it/s]

F135_2.00 C2H2 ZF ['zf-C2H2'] SimilarityRegression


  0%|          | 0/1291 [00:00<?, ?it/s]

  0%|          | 0/832695 [00:00<?, ?it/s]

F137_2.00 C2HC ZF ['zf-C2HC'] NO_THRESHOLD


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

F138_2.00 CBF/NF-Y ['CBFB_NFYA'] NO_THRESHOLD


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

F141_2.00 CENPB ['CENP-B_N'] NO_THRESHOLD


  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

F152_2.00 CG-1 ['CG-1'] NO_THRESHOLD


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

F158_2.00 CSD ['CSD'] NO_THRESHOLD


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/190 [00:00<?, ?it/s]

F161_2.00 CSL ['LAG1-DNAbind'] NO_THRESHOLD


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

F162_2.00 CUT ['CUT'] NO_THRESHOLD


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

F164_2.00 CUT,Homeodomain ['CUT', 'Homeobox'] NO_THRESHOLD


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

F169_2.00 CxxC ['zf-CXXC'] NO_THRESHOLD


  0%|          | 0/26 [00:00<?, ?it/s]

  0%|          | 0/325 [00:00<?, ?it/s]

F170_2.00 DM ['DM'] SimilarityRegression


  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/595 [00:00<?, ?it/s]

F174_2.00 E2F ['E2F_TDP'] SequenceIdentity


  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/595 [00:00<?, ?it/s]

F178_2.00 EBF1 ['COE1_DBD'] NO_THRESHOLD


  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

F180_2.00 Ets ['Ets'] SimilarityRegression


  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/1770 [00:00<?, ?it/s]

F182_2.00 FAR1 ['FAR1'] NO_THRESHOLD


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

F190_2.00 FLYWCH ['FLYWCH'] NO_THRESHOLD


  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

F196_2.00 Forkhead ['Forkhead'] SimilarityRegression


  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/7381 [00:00<?, ?it/s]

F201_2.00 GATA ['GATA'] SimilarityRegression


  0%|          | 0/77 [00:00<?, ?it/s]

  0%|          | 0/2926 [00:00<?, ?it/s]

F202_2.00 GATA,Myb/SANT ['GATA', 'Myb_DNA-binding'] NO_THRESHOLD


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

F212_2.00 GCM ['GCM'] SequenceIdentity


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

F215_2.00 Grainyhead ['CP2'] NO_THRESHOLD


  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

F219_2.00 GTF2I-like ['GTF2I'] NO_THRESHOLD


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

F223_2.00 Homeodomain ['Homeobox'] SimilarityRegression


  0%|          | 0/685 [00:00<?, ?it/s]

  0%|          | 0/234270 [00:00<?, ?it/s]

F230_2.00 Homeodomain,Paired box ['Homeobox', 'PAX'] NO_THRESHOLD


  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/666 [00:00<?, ?it/s]

F231_2.00 Homeodomain,POU ['Homeobox', 'Pou'] SequenceIdentity


  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/1081 [00:00<?, ?it/s]

F238_2.00 HSF ['HSF_DNA-bind'] SimilarityRegression


  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/406 [00:00<?, ?it/s]

F241_2.00 IBD ['IBD'] NO_THRESHOLD


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

F243_2.00 IRF ['IRF'] NO_THRESHOLD


  0%|          | 0/26 [00:00<?, ?it/s]

  0%|          | 0/325 [00:00<?, ?it/s]

F251_2.00 MADF ['MADF_DNA_bdg'] SimilarityRegression


  0%|          | 0/78 [00:00<?, ?it/s]

  0%|          | 0/3003 [00:00<?, ?it/s]

F256_2.00 MADS box ['SRF-TF'] NO_THRESHOLD


  0%|          | 0/97 [00:00<?, ?it/s]

  0%|          | 0/4656 [00:00<?, ?it/s]

F259_2.00 MBD ['MBD'] NO_THRESHOLD


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

F263_2.00 mTERF ['mTERF'] NO_THRESHOLD


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

F266_2.00 Myb/SANT ['Myb_DNA-binding'] SimilarityRegression


  0%|          | 0/251 [00:00<?, ?it/s]

  0%|          | 0/31375 [00:00<?, ?it/s]

F275_2.00 Ndt80/PhoG ['NDT80_PhoG'] NO_THRESHOLD


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

F276_2.00 NFX ['zf-NF-X1'] NO_THRESHOLD


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

F278_2.00 Nuclear receptor ['zf-C4'] SimilarityRegression


  0%|          | 0/168 [00:00<?, ?it/s]

  0%|          | 0/14028 [00:00<?, ?it/s]

F279_2.00 p53 ['P53'] NO_THRESHOLD


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

F281_2.00 Paired box ['PAX'] NO_THRESHOLD


  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/210 [00:00<?, ?it/s]

F282_2.00 Pipsqueak ['HTH_psq'] SequenceIdentity


  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/435 [00:00<?, ?it/s]

F290_2.00 Prospero ['HPD'] NO_THRESHOLD


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

F293_2.00 Rel ['RHD_DNA_bind'] NO_THRESHOLD


  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/465 [00:00<?, ?it/s]

F294_2.00 RFX ['RFX_DNA_binding'] SimilarityRegression


  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/351 [00:00<?, ?it/s]

F296_2.00 Runt ['Runt'] NO_THRESHOLD


  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

F299_2.00 SAND ['SAND'] SimilarityRegression


  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/231 [00:00<?, ?it/s]

F304_2.00 SMAD ['MH1'] NO_THRESHOLD


  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/210 [00:00<?, ?it/s]

F305_2.00 Sox ['HMG_box'] SimilarityRegression


  0%|          | 0/97 [00:00<?, ?it/s]

  0%|          | 0/4656 [00:00<?, ?it/s]

F308_2.00 STAT ['STAT_bind'] NO_THRESHOLD


  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

F310_2.00 Storekeeper ['DUF573'] NO_THRESHOLD


  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/36 [00:00<?, ?it/s]

F312_2.00 T-box ['T-box'] SimilarityRegression


  0%|          | 0/44 [00:00<?, ?it/s]

  0%|          | 0/946 [00:00<?, ?it/s]

F313_2.00 TBP ['TBP'] NO_THRESHOLD


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

F315_2.00 TCR/CxC ['TCR'] SimilarityRegression


  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/406 [00:00<?, ?it/s]

F317_2.00 TEA ['TEA'] NO_THRESHOLD


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/105 [00:00<?, ?it/s]

F318_2.00 THAP finger ['THAP'] NO_THRESHOLD


  0%|          | 0/36 [00:00<?, ?it/s]

  0%|          | 0/630 [00:00<?, ?it/s]

Unnamed: 0,Family_ID,ID_x,ID_y,AA %ID,SR_Score,SR_Class,SRModel_Class,SRModel_Name,Family_Name
37,F024_2.00,M00111_2.00,M02768_2.00,1.0,1.0,HSim,SequenceIdentity,NO_THRESHOLD,AP-2
38,F024_2.00,M00111_2.00,M02769_2.00,1.0,1.0,HSim,SequenceIdentity,NO_THRESHOLD,AP-2
39,F024_2.00,M00111_2.00,M02770_2.00,1.0,1.0,HSim,SequenceIdentity,NO_THRESHOLD,AP-2
40,F024_2.00,M00111_2.00,M08704_2.00,1.0,1.0,HSim,SequenceIdentity,NO_THRESHOLD,AP-2
0,F024_2.00,M00112_2.00,M02753_2.00,1.0,1.0,HSim,SequenceIdentity,NO_THRESHOLD,AP-2


In [180]:
# Sponge -> Motif Inferences
# Keep the scored pairs made of one Cis-BP motif (ID ends with '_2.00') and one
# Sponge protein. Each side is classified explicitly, so the motif/protein columns
# are right even when a protein ID sorts before the motif ID and lands in ID_x.
sponge_ids = set(df_sponge['protein'])
x_is_sponge = SR_Scores['ID_x'].isin(sponge_ids)
y_is_sponge = SR_Scores['ID_y'].isin(sponge_ids)
x_is_motif = SR_Scores['ID_x'].str.endswith('_2.00')
y_is_motif = SR_Scores['ID_y'].str.endswith('_2.00')
is_cross = (x_is_motif & y_is_sponge) | (x_is_sponge & y_is_motif)

cross_pairs = SR_Scores[is_cross]
motif_in_x = x_is_motif[is_cross]

Sponge_MotifInferences = cross_pairs.rename({'ID_x' : 'Motif_ID', 'ID_y' : 'Strongylocentrotus_Protein'}, axis = 1)
# Take the motif from ID_x where ID_x is the motif, otherwise from ID_y (and the
# protein from the opposite column)
Sponge_MotifInferences['Motif_ID'] = cross_pairs['ID_x'].where(motif_in_x, cross_pairs['ID_y'])
Sponge_MotifInferences['Strongylocentrotus_Protein'] = cross_pairs['ID_y'].where(motif_in_x, cross_pairs['ID_x'])

# Protein -> gene (tf) lookup from the Sponge table
protein_tf_pairs = df_sponge[['protein','tf']].drop_duplicates()
p2tf = dict(zip(protein_tf_pairs['protein'], protein_tf_pairs['tf']))
Sponge_MotifInferences['Strongylocentrotus_TF'] = [p2tf.get(x) for x in Sponge_MotifInferences['Strongylocentrotus_Protein']]

# Annotate each motif with its Cis-BP TF (species, database ID, name)
Sponge_MotifInferences['Motif.TF_ID'] = [motifs['TF_ID'].get(x) for x in Sponge_MotifInferences['Motif_ID']]
Sponge_MotifInferences = pd.merge(Sponge_MotifInferences,
                                    tfs[['TF_Species','DBID', 'TF_Name']].rename({'TF_Species' : 'Motif.Species', 'DBID' : 'Motif.TF_DBID', 'TF_Name': 'Motif.TF_Name'}, axis = 1),
                                    how = 'left',
                                    left_on='Motif.TF_ID', right_index=True)
Sponge_MotifInferences.to_csv('Strongylocentrotus_MotifInferences.csv.gz', compression='gzip')
Sponge_MotifInferences.head()

Unnamed: 0,Family_ID,Motif_ID,Strongylocentrotus_Protein,AA %ID,SR_Score,SR_Class,SRModel_Class,SRModel_Name,Family_Name,Strongylocentrotus_TF,Motif.TF_ID,Motif.Species,Motif.TF_DBID,Motif.TF_Name
502,F024_2.00,M00112_2.00,XP_030839379.1,0.805128,0.805128,HSim,SequenceIdentity,NO_THRESHOLD,AP-2,ECB-GENE-23059216,T010907_2.00,Mus_musculus,ENSMUSG00000025927,Tfap2b
508,F024_2.00,M02753_2.00,XP_030839379.1,0.805128,0.805128,HSim,SequenceIdentity,NO_THRESHOLD,AP-2,ECB-GENE-23059216,T010820_2.00,Homo_sapiens,ENSG00000008196,TFAP2B
507,F024_2.00,M02754_2.00,XP_030839379.1,0.805128,0.805128,HSim,SequenceIdentity,NO_THRESHOLD,AP-2,ECB-GENE-23059216,T010820_2.00,Homo_sapiens,ENSG00000008196,TFAP2B
501,F024_2.00,M02755_2.00,XP_030839379.1,0.805128,0.805128,HSim,SequenceIdentity,NO_THRESHOLD,AP-2,ECB-GENE-23059216,T010820_2.00,Homo_sapiens,ENSG00000008196,TFAP2B
503,F024_2.00,M04046_2.00,XP_030839379.1,0.805128,0.805128,HSim,SequenceIdentity,NO_THRESHOLD,AP-2,ECB-GENE-23059216,T010820_2.00,Homo_sapiens,ENSG00000008196,TFAP2B


In [188]:
# NOTE(review): `motif_x_motif` is not defined in any cell visible in this notebook, so
# this cell presumably relies on leftover kernel state and would raise NameError on a
# fresh Restart & Run All - confirm where it comes from, or drop the cell.
# Shows 10 random rows scored with the 'Homeodomain,POU' model (no random_state is set).
motif_x_motif[(motif_x_motif['SRModel_Name'] == 'Homeodomain,POU')].sample(n=10)

Unnamed: 0,Family_ID,ID_x,ID_y,AA %ID,SR_Score,SR_Class,SRModel_Class,SRModel_Name,Family_Name
6742564,F231_2.00,M01255_2.00,M10858_2.00,0.674419,0.674419,Amb,SequenceIdentity,"Homeodomain,POU","Homeodomain,POU"
6748476,F231_2.00,M05472_2.00,M10836_2.00,0.542636,0.542636,Amb,SequenceIdentity,"Homeodomain,POU","Homeodomain,POU"
6743020,F231_2.00,M05461_2.00,M09590_2.00,0.542636,0.542636,Amb,SequenceIdentity,"Homeodomain,POU","Homeodomain,POU"
6756151,F231_2.00,M00550_2.00,M05351_2.00,0.496124,0.496124,Amb,SequenceIdentity,"Homeodomain,POU","Homeodomain,POU"
6757564,F231_2.00,M00361_2.00,M05451_2.00,0.55814,0.55814,Amb,SequenceIdentity,"Homeodomain,POU","Homeodomain,POU"
6738201,F231_2.00,M00360_2.00,M02236_2.00,0.565891,0.565891,Amb,SequenceIdentity,"Homeodomain,POU","Homeodomain,POU"
6750189,F231_2.00,M03312_2.00,M10824_2.00,0.55814,0.55814,Amb,SequenceIdentity,"Homeodomain,POU","Homeodomain,POU"
6755957,F231_2.00,M03318_2.00,M05465_2.00,0.573643,0.573643,Amb,SequenceIdentity,"Homeodomain,POU","Homeodomain,POU"
6750644,F231_2.00,M00548_2.00,M08031_2.00,0.627907,0.627907,Amb,SequenceIdentity,"Homeodomain,POU","Homeodomain,POU"
6740010,F231_2.00,M05482_2.00,M07542_2.00,0.96124,0.96124,Amb,SequenceIdentity,"Homeodomain,POU","Homeodomain,POU"


In [182]:
# List the columns of the inference table, one name per line
print('\n'.join(Sponge_MotifInferences.columns))

Family_ID
Motif_ID
Strongylocentrotus_Protein
AA %ID
SR_Score
SR_Class
SRModel_Class
SRModel_Name
Family_Name
Strongylocentrotus_TF
Motif.TF_ID
Motif.Species
Motif.TF_DBID
Motif.TF_Name


In [183]:
# Inference rows whose protein has no gene (tf) in p2tf; an empty frame means none
tf_is_missing = Sponge_MotifInferences['Strongylocentrotus_TF'].isnull()
Sponge_MotifInferences.loc[tf_is_missing]

Unnamed: 0,Family_ID,Motif_ID,Strongylocentrotus_Protein,AA %ID,SR_Score,SR_Class,SRModel_Class,SRModel_Name,Family_Name,Strongylocentrotus_TF,Motif.TF_ID,Motif.Species,Motif.TF_DBID,Motif.TF_Name


In [184]:
# Spot-check the Sponge rows recorded for one protein
is_example_protein = df_sponge['protein'] == 'XP_030828650.1'
df_sponge.loc[is_example_protein]

Unnamed: 0,protein,tf,domain,start,end,seq,domain_order,Family_ID
1816,XP_030828650.1,ECB-GENE-23166856,THAP,3,102,VCAIIGCINSTRLLNAWYVQLCDIHGRKNGSCICHPPFTLFPFPSE...,THAP,F318_2.00


In [185]:
# Spot-check the protein -> gene (tf) lookup for the same protein (None if absent)
p2tf.get('XP_030828650.1')

'ECB-GENE-23166856'