In [37]:
# %load ../start.py
# Load useful extensions
import os
import sys

# Activate the autoreload extension for easy reloading of external packages
%reload_ext autoreload
%autoreload 1

# Set up cashdir
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Trun on the water mark
%reload_ext watermark
%watermark -u -d -v -g

# Load ipycache extension
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Add project library to path
sys.path.insert(0, '../../lcdb-wf/lib')
sys.path.insert(0, '../../lib/python')

# Set up references
import yaml
with open('../../config/config.yml') as fh:
    config = yaml.load(fh)

assembly = config['assembly']
tag = config['aligner']['tag']
REF = os.path.join(os.environ['REFERENCES_DIR'], assembly, tag)


last updated: 2017-09-01 

CPython 3.5.4
IPython 6.1.0
Git hash: 7c556f64840197dcf4c787f27f21abc390e73829


In [38]:
# imports
import re
from tempfile import TemporaryDirectory
import pandas as pd
import numpy as np

import GEOparse

In [87]:
# Read the FlyBase annotation and synonym tables for the configured tag, then
# outer-join them on the primary FBgn identifier.
anno_path = os.path.join(REF, 'fb_annotation/dmel_{}.fb_annotation'.format(tag))
syn_path = os.path.join(REF, 'fb_synonym/dmel_{}.fb_synonym'.format(tag))

fb_anno = pd.read_table(anno_path)
fb_syn = pd.read_table(syn_path)
fb = pd.merge(
    fb_anno,
    fb_syn,
    how='outer',
    left_on='primary_FBgn',
    right_on='primary_FBid',
)

In [88]:
# Preview the first rows of the merged annotation/synonym table
fb.head()

Unnamed: 0,gene_symbol,primary_FBgn,secondary_FBgn,annotation_ID,secondary_annotation_ID,primary_FBid,current_symbol,current_fullname,fullname_synonym(s),symbol_synonym(s)
0,d,FBgn0262029,"FBgn0032045,FBgn0086896,FBgn0000410,FBgn002597...",CG42840,"CG10595,CG13087,CG31610",FBgn0262029,d,dachs,"Dachs,lethal (2) c00146,myosin 29D,unconventio...","AAF52683,l(2)c00146,dachs,29C3-D1,Myo3A,CG4284..."
1,CG32532,FBgn0052532,"FBgn0031027,FBgn0031028,FBgn0031029",CG32532,"CG14203,CG14202,CG14201",FBgn0052532,CG32532,,,"CG14203,CG14202,CG14201"
2,CG3156,FBgn0023536,,CG3156,,FBgn0023536,CG3156,,,EG:171D11.2
3,mRpL30,FBgn0029718,,CG7038,,FBgn0029718,mRpL30,mitochondrial ribosomal protein L30,Mitochondrial ribosomal protein L30,"CG7038,L30,mRPL30"
4,CG1631,FBgn0031101,,CG1631,,FBgn0031101,CG1631,,,


In [84]:
# Preview the first rows of the synonym table on its own
fb_syn.head()

Unnamed: 0,primary_FBid,current_symbol,current_fullname,fullname_synonym(s),symbol_synonym(s)
0,FBgn0262029,d,dachs,"Dachs,lethal (2) c00146,myosin 29D,unconventio...","AAF52683,l(2)c00146,dachs,29C3-D1,Myo3A,CG4284..."
1,FBgn0263393,Mmus\Itpr1,"Mus musculus inositol 1,4,5-trisphosphate rece...",,Itpr1
2,FBgn0052532,CG32532,,,"CG14203,CG14202,CG14201"
3,FBgn0023536,CG3156,,,EG:171D11.2
4,FBgn0029718,mRpL30,mitochondrial ribosomal protein L30,Mitochondrial ribosomal protein L30,"CG7038,L30,mRPL30"


In [78]:
# Build a long-format lookup table: one row per FBgn (primary or secondary),
# each paired with the gene symbol of the record it came from.
#
# `fb` is the result of an outer merge, so rows that exist only in the synonym
# table carry NaN for both `primary_FBgn` and `gene_symbol`. Such rows have no
# identifier to look up and are skipped instead of being emitted as NaN records.
new_records = []
for record in fb.itertuples(index=False):
    primary = record.primary_FBgn
    if not isinstance(primary, str):
        continue

    symbol = record.gene_symbol
    new_records.append({
        'gene_symbol': symbol,
        'fbgn': primary,
        'type': 'primary'
    })

    # `secondary_FBgn` is a comma-separated string, or NaN when there are none
    secondary = record.secondary_FBgn
    if isinstance(secondary, str):
        new_records.extend(
            {'gene_symbol': symbol, 'fbgn': fbgn, 'type': 'secondary'}
            for fbgn in secondary.split(',')
        )

# Explicit columns keep the layout stable across pandas versions and yield a
# correctly-shaped (empty) frame when there are no records.
fb_stack = pd.DataFrame(new_records, columns=['fbgn', 'gene_symbol', 'type'])

In [79]:
# Show a bounded preview instead of rendering the whole stacked table
fb_stack.head(10)

Unnamed: 0,fbgn,gene_symbol,type
0,FBgn0262029,d,primary
1,FBgn0032045,d,secondary
2,FBgn0086896,d,secondary
3,FBgn0000410,d,secondary
4,FBgn0025975,d,secondary
5,FBgn0032046,d,secondary
6,FBgn0051610,d,secondary
7,FBgn0069196,d,secondary
8,FBgn0052532,CG32532,primary
9,FBgn0031027,CG32532,secondary


In [None]:
# Build sample table
# Query GEO
# NOTE: the temporary download directory persists until `tmpDir` is garbage
# collected or `tmpDir.cleanup()` is called.
tmpDir = TemporaryDirectory()
gse = GEOparse.get_GEO("GSE81221", destdir=tmpDir.name, silent=True)

# Patterns used to pull attributes out of each sample's metadata
title_re = re.compile(r'^.*_(?P<fbgn>FBgn(\d+|NA))_(?P<symbol>.*?)_.*(?P<drsc>DRSC(\d+|NA))_replicate(?P<rep>\d)(\s\[Plate(?P<plate_id>\d+)-\d_F3\]|$)')
supp_re = re.compile(r'.*_DRSC_Plate(?P<plate_id>\d+)-\d_(?P<well_id>\w\d+)_.*')
well_re = re.compile(r'(?P<plate_row>\w)(?P<plate_column>\d+)')
relation_re = re.compile(r'(\w+):.*[\/=](\w+\d+)$')

# Pull out sample attributes and build data frame
attributes = []
for gsm, dat in gse.gsms.items():
    title_match = title_re.match(dat.metadata['title'][0])
    if title_match is None:
        # Report the unparsable title and skip this sample. Falling through
        # would either raise NameError (first sample) or silently overwrite and
        # re-append the previous sample's attribute dict.
        print(gsm, dat.metadata['title'])
        continue

    attrs = title_match.groupdict()
    attrs['GEO'] = gsm

    # The supplementary file name also carries a plate id; it overrides the one
    # from the title (which is None when the title has no "[Plate...]" suffix).
    attrs.update(supp_re.match(dat.metadata['supplementary_file_2'][0]).groupdict())

    # Split the well id into its row and column parts
    attrs.update(well_re.match(attrs['well_id']).groupdict())

    # Each relation entry becomes a "<name>: <accession>" attribute; the
    # columns used below are 'SRA' and 'BioSample'.
    for x in dat.metadata['relation']:
        k, v = relation_re.match(x).groups()
        attrs[k] = v
    attributes.append(attrs)

cols = [
    'BioSample', 'GEO', 'drsc', 'fbgn', 'symbol', 'rep',
    'plate_id', 'well_id', 'plate_row', 'plate_column'
]
# Index by SRA accession and keep only the columns of interest, in this order
df = pd.DataFrame(attributes).set_index('SRA')[cols]

In [138]:
# Sanity check FBgns for changes