In [2]:
# %load ../start.py
# Load useful extensions

# Activate the autoreload extension for easy reloading of external packages
# (%reload_ext is safe to run again if the extension is already loaded)
%reload_ext autoreload
%autoreload 1

# Turn on the watermark: prints author, last-updated date and the
# Python/IPython versions so the run environment is recorded with the notebook
%reload_ext watermark
%watermark -a "Justin M Fear" -u -d -v

# Load ipycache extension and point its %%cache magic at a shared cache dir
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Add project library to path (inserted first so it takes precedence on import)
import sys
sys.path.insert(0, '../../lib/python')


Justin M Fear 
last updated: 2017-01-24 

CPython 3.5.2
IPython 5.1.0


In the last S2 cell RNAi project meeting we reviewed Yijie's network model. One concern that I had was the addition of edges from transcription factors to genes when a gene is highly correlated with a known target of that transcription factor. For example:

```
TFa -> A

A and B are highly correlated...

Add edge TFa ->B
```

This is a good idea, but does not necessarily hold true. We decided the addition of motif information would be important. Then we could only add `TFa->B` when `B` has a motif from `TFa`. 

Lee had started generating a TF motif list/weights, but Brian wanted him to stop this and have me do it. The basic analysis plan is:

1. Download annotated TF motifs from various online sources.
2. Map motifs to the genome and identify motifs within some range of the TSS. 
3. Do the same thing across Drosophila species and calculate a conservation score (see DSX motif paper).
4. Build a weight matrix where each row is a gene and each column is a transcription factor. Values can either be binary or weights indicating whether the TF motif was within the regulatory region of the gene.

Lee has looked for sources of motifs and concluded that the MEME website is a good source. Here is an email from Lee updating me on this information:

    Justin, 

    My R scripts seem to be not super-helpful.  There are just about how I defined TSS regions (+ first introns).  I assigned a motif to a gene when a motif is within the range between 1kb upstream to min(500bp downstream or first intron end).  Just briefly go over for fun.


    You can download the weight matrices from MEME, but I attach here, too. 

    OnTheFly data, used very weird IDs, which are mixture of Swissprot and others.  It is very nagging to deal with it, so I actually crawled their website (which also in in the Handling_motif_data.R).  I attach the ID conversion matrix.  This will be very useful.


    FIMO results are quite large, and I don’t think that you will use them.  But the links follow.
    https://www.dropbox.com/s/czse5ur5md8wm1u/OnTheFly_2014_p0.0001.txt?dl=0
    https://www.dropbox.com/s/hq812lsv6hijqet/fly_factor_survey_p0.0001.txt?dl=0

    An example of my FIMO code is below
    cd /data/leehang/motif/fly_factor_survey; fimo --qv-thresh --thresh 0.05 ./fly_factor_survey.meme ~/Annotation/Dmel.FB6_06.fa


    Lee

I think I am going to approach this cleanly so that I know exactly where and how files were downloaded.

In [3]:
# Imports
import os
import re
import tarfile
from urllib.request import urlretrieve, urlopen
from tempfile import mkstemp

import pandas as pd

from Bio import motifs

# My library
import meme

In [4]:
# Download MEME motif database
# Only the FLY motif collections are extracted. The whole step is skipped when
# the expected motif file is already on disk, so re-running the cell is cheap.
if not os.path.exists('../../data/external/meme/motif_databases/FLY/fly_factor_survey.meme'):
    # mkstemp returns an already-open OS-level file descriptor plus the path.
    # urlretrieve only needs the path, so close the descriptor immediately
    # instead of leaking it for the lifetime of the kernel.
    fd, tgz_path = mkstemp(suffix='.tgz')
    os.close(fd)

    try:
        # Download file
        urlretrieve(url='http://meme-suite.org/meme-software/Databases/motifs/motif_databases.12.15.tgz', filename=tgz_path)

        # Make sure output dir is there (creates missing parent dirs as well)
        os.makedirs('../../data/external/meme', exist_ok=True)

        # Extract only the Fly data
        def fly(members):
            """Yield only the archive members whose path contains 'FLY'."""
            for tarinfo in members:
                if 'FLY' in tarinfo.name:
                    yield tarinfo

        # NOTE(review): extractall trusts the member paths stored in the
        # archive; acceptable for this known MEME release, but do not reuse
        # this pattern on archives from untrusted sources.
        with tarfile.open(tgz_path) as tar:
            tar.extractall(path='../../data/external/meme', members=fly(tar))
    finally:
        # Clean up the temporary download even if the download or the
        # extraction failed part-way through
        os.unlink(tgz_path)

In [5]:
%%cache -s flyfactory.pkl flyFactoryTFS
# Verify Meme downloads
flyFactoryTFS = meme.memeFile('../../data/external/meme/motif_databases/FLY/fly_factor_survey.meme')
# According to the meme website the fly factor database has 656 motifs, this will error if not
assert flyFactoryTFS .count() == 656

In [31]:
%%cache -s -f onthefly.pkl onTheFlyTFS
# map OnTheFly to FBgn

# The OnTheFly motifs are not as stright forward because they use their own identifier. There is 
# an added step to query their website and get the FBgn value.
URL = 'https://bhapp.c2b2.columbia.edu/OnTheFly/cgi-bin/protein_entry.php?protein_ID={0}'
def map_onthefly(query):
    try:
        with urlopen(URL.format(query)) as fh:
            return re.findall(r'FBgn\d+', fh.read().decode('utf-8'))[0]
    except:
        print(query)
        raise
    
onTheFlyTFS = meme.memeFile('../../data/external/meme/motif_databases/FLY/OnTheFly_2014_Drosophila.meme')
# According to the meme website the fly factor database has 608 motifs, this will error if not
assert onTheFlyTFS.count() == 608

In [40]:
# Resolve every OnTheFly motif to an FBgn by scraping the OnTheFly website:
# the motif page links to a protein page, and the protein page lists the FBgn.

# Protein-page URL template; loop-invariant, so it is defined once up front.
URL = 'https://bhapp.c2b2.columbia.edu/OnTheFly/cgi-bin/protein_entry.php?protein_ID={0}'

# (motif key, OnTheFly ID, motif name, FBgn) for every motif that resolved
results = []
# (OnTheFly ID, motif name, error) for every motif whose FBgn lookup failed
skipped = []

for key, values in onTheFlyTFS.items():
    for value in values:
        # grab link to protein page from url provided
        with urlopen(value.url) as fh:
            page = fh.read().decode('UTF-8')
        name = re.findall(r'protein_entry.php\?protein_ID=(.*?)\'', page)[0]
        ID = re.findall(r'ID: (OTF\d+\.\d+)', page)[0]

        # The FBgn lookup stays best-effort: a motif whose protein page cannot
        # be fetched or carries no FBgn is left out of `results`. Catch
        # Exception rather than using a bare except so Ctrl-C still stops the
        # loop, and record the failure instead of dropping it silently.
        try:
            with urlopen(URL.format(name)) as fh:
                fbgn = re.findall(r'FBgn\d+', fh.read().decode('utf-8'))[0]
        except Exception as err:
            skipped.append((ID, value.name, err))
            continue

        results.append((key, ID, value.name, fbgn))

# Make dropped motifs visible so a row-count mismatch can be explained.
if skipped:
    print('Could not resolve an FBgn for {0} motif(s):'.format(len(skipped)))
    for skipped_id, skipped_name, skipped_err in skipped:
        print('  {0} ({1}): {2!r}'.format(skipped_id, skipped_name, skipped_err))

In [41]:
# Tabulate the scraped (motif key, OnTheFly ID, motif name, FBgn) tuples.
# NOTE(review): 'portein' looks like a typo for 'protein'; it is kept as-is
# because other cells may already select the column by this name.
result_columns = ['name', 'id', 'portein', 'FBgn']
df = pd.DataFrame(results, columns=result_columns)

In [53]:
# Order the table by OnTheFly ID, then show its size and the first rows.
# The IDs are strings, so the ordering is lexicographic rather than numeric.
df = df.sort_values(by='id')
print(df.shape)
df.head()

(607, 4)


Unnamed: 0,name,id,portein,FBgn
230,OTF0001,OTF0001.1,7UP1_DROME_B1H,FBgn0003651
242,OTF0002,OTF0002.1,A0AQF9_DROME_B1H,FBgn0034599
362,OTF0003,OTF0003.1,A0JQ60_DROME_SELEX,FBgn0000567
363,OTF0003,OTF0003.2,A0JQ60_DROME_DNaseI,FBgn0000567
253,OTF0004,OTF0004.1,A1A6R5_DROME_B1H,FBgn0004914


In [62]:
# Inspect the parsed OnTheFly motif collection (displays the object's repr)
onTheFlyTFS

<meme.memeFile at 0x7f8b24c39470>