In [6]:
# %load ../start.py
# Load useful extensions

# Activate the autoreload extension for easy reloading of external packages
%reload_ext autoreload
%autoreload 1

# Set up cashdir
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Trun on the water mark
%reload_ext watermark
%watermark -u -d -v -g

# Load ipycache extension
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Add project library to path
import sys
sys.path.insert(0, '../../lcdb-wf/lib')
sys.path.insert(0, '../../lib/python')

import yaml
with open('../../config/config.yml') as fh:
    config = yaml.load(fh)


last updated: 2017-09-01 

CPython 3.5.4
IPython 6.1.0
Git hash: b1807333b845adb20912da5246c0c8754330f435


In [2]:
# imports
import os
import re
from tempfile import TemporaryDirectory
import pandas as pd
import numpy as np

import GEOparse

In [None]:
# load flybase annotations

In [None]:
# NOTE(review): os.path.join receives a single argument here, so the path
# handed to read_table is just $REFERENCES_DIR itself. The annotation file
# name presumably still has to be appended — TODO confirm the intended file.
fb = pd.read_table(os.path.join(os.environ['REFERENCES_DIR']))

In [None]:
# List the contents of the dmel/r6-16 directory under $REFERENCES_DIR
!ls $REFERENCES_DIR/dmel/r6-16/

In [9]:
# Show the top-level keys of the loaded config
config.keys()

dict_keys(['assembly', 'rrna', 'references', 'kallisto', 'aggregation_dir', 'sampletable', 'gtf', 'sample_dir', 'aligner'])

In [14]:
# Show the 'references' entry of the config for 'dmel' / 'r6-16'
config['references']['dmel']['r6-16']

{'fasta': {'indexes': ['bowtie2', 'hisat2'],
  'postprocess': 'lib.postprocess.dm6.fasta_postprocess',
  'url': 'ftp://ftp.flybase.net/genomes/Drosophila_melanogaster/dmel_r6.16_FB2017_03/fasta/dmel-all-chromosome-r6.16.fasta.gz'},
 'gtf': {'conversions': ['refflat',
   {'gffutils': {'disable_infer_genes': True,
     'gtf_gene_key': 'gene_id',
     'gtf_transcript_key': 'transcript_id',
     'id_spec': {'gene': ['gene_id', 'gene_symbol'],
      'transcript': ['transcript_id', 'transcript_symbol']},
     'merge_strategy': 'merge'}},
   {'genelist': {'gene_id': 'gene_id'}},
   {'annotation_hub': {'ahkey': 'AH49581', 'keytype': 'ENSEMBL'}}],
  'postprocess': 'lib.postprocess.dm6.gtf_postprocess',
  'url': 'ftp://ftp.flybase.net/genomes/Drosophila_melanogaster/dmel_r6.16_FB2017_03/gtf/dmel-all-r6.16.gtf.gz'}}

In [12]:
from common import references_dict

In [22]:
# Take the first element returned by references_dict(config), index it by
# 'dmel' then 'r6-16', and list the keys available for that entry
references_dict(config)[0]['dmel']['r6-16'].keys()

dict_keys(['gtf', 'hisat2', 'gffutils', 'bowtie2', 'genelist', 'chromsizes', 'refflat', 'fasta', 'annotation_hub'])

In [136]:
# Build sample table
# Query GEO
gse = GEOparse.get_GEO(geo="GSE81221", silent=True)

# Patterns applied to the GEO metadata strings of each sample.
# Sample title -> fbgn, symbol, drsc, rep and (optionally) plate_id.
title_pattern = re.compile(
    r'^.*_(?P<fbgn>FBgn(\d+|NA))_(?P<symbol>.*?)_.*(?P<drsc>DRSC(\d+|NA))'
    r'_replicate(?P<rep>\d)(\s\[Plate(?P<plate_id>\d+)-\d_F3\]|$)'
)
# Second supplementary file name -> plate_id and well_id.
plate_pattern = re.compile(r'.*_DRSC_Plate(?P<plate_id>\d+)-\d_(?P<well_id>\w\d+)_.*')
# well_id -> plate_row (one word character) and plate_column (digits).
well_pattern = re.compile(r'(?P<plate_row>\w)(?P<plate_column>\d+)')
# 'relation' entry -> (name before the colon, trailing accession after '/' or '=').
relation_pattern = re.compile(r'(\w+):.*[\/=](\w+\d+)$')

# Pull out sample attributes and build data frame
attributes = []
for gsm, dat in gse.gsms.items():
    title_match = title_pattern.match(dat.metadata['title'][0])
    if title_match is None:
        # Report the unparseable title and skip this sample. Falling through
        # would reuse the previous sample's dict (already stored in
        # `attributes`) and overwrite its values, or raise NameError if the
        # very first title failed to match.
        print(gsm, dat.metadata['title'])
        continue

    attrs = title_match.groupdict()
    attrs['GEO'] = gsm

    # plate_id parsed from the file name replaces the optional one captured
    # from the title (which is None when the title has no [Plate...] suffix).
    attrs.update(plate_pattern.match(dat.metadata['supplementary_file_2'][0]).groupdict())
    attrs.update(well_pattern.match(attrs['well_id']).groupdict())

    # Each relation contributes one column named after its prefix; the
    # 'SRA' and 'BioSample' columns used below are expected to come from here.
    for relation in dat.metadata['relation']:
        key, accession = relation_pattern.match(relation).groups()
        attrs[key] = accession
    attributes.append(attrs)

cols = [
    'BioSample', 'GEO', 'drsc', 'fbgn', 'symbol', 'rep',
    'plate_id', 'well_id', 'plate_row', 'plate_column'
]
# Index by SRA accession and keep the columns in a fixed order; built in one
# chain so re-running the cell never mutates an existing frame in place.
df = pd.DataFrame(attributes).set_index('SRA')[cols]

In [138]:
# Sanity check FBgns for changes

In [185]:
# import FBgn to symbol
# gffutils is not imported by any earlier cell of this notebook, so bring it
# into scope here; otherwise this cell raises NameError on a fresh kernel.
import gffutils

db = gffutils.FeatureDB(
    os.path.join(os.environ['REFERENCES_DIR'], 'dmel/r6-11/gtf/dmel_r6-11.gtf.db'))

# One record per 'gene' feature: the feature id as FBgn and its first
# gene_symbol value, with parentheses rewritten as square brackets.
genes = [
    {
        'FBgn': gene.id,
        'r6-11_symbol': gene.attributes['gene_symbol'][0].replace('(', '[').replace(')', ']'),
    }
    for gene in db.features_of_type(featuretype='gene')
]

# Gene table indexed by FBgn, ready to be joined against the sample table
fb = pd.DataFrame(genes).set_index('FBgn')

In [186]:
# Left-join the sample table to the r6-11 gene table on FBgn, then display the
# samples whose symbol differs from the r6-11 symbol. Samples without an
# r6-11 match are listed too, since a missing value compares as not-equal.
merged = pd.merge(df, fb, left_on='fbgn', right_index=True, how='left')
merged.loc[merged['symbol'].ne(merged['r6-11_symbol'])]

Unnamed: 0_level_0,BioSample,GEO,drsc,fbgn,symbol,rep,plate_id,well_id,plate_row,plate_column,r6-11_symbol
SRA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
SRX1749426,SAMN04959619,GSM2145690,DRSC20029,FBgn0085451,CG34422,1,4,B9,B,9,htk
SRX1748909,SAMN04959132,GSM2145175,DRSC04554,FBgn0026582,CG9418,1,1,G5,G,5,Hmg-2
SRX1750082,SAMN04960612,GSM2146343,DRSC18419,FBgn0030093,dalao,2,7,A9,A,9,Bap111
SRX1749968,SAMN04960053,GSM2146230,DRSC26287,FBgn0032940,Mio,2,6,H3,H,3,Mondo
SRX1748894,SAMN04959118,GSM2145161,DRSCNA,FBgnNA,LacZ,1,1,F3,F,3,
SRX1749370,SAMN04959441,GSM2145634,DRSC02668,FBgn0032940,Mio,2,3,F1,F,1,Mondo
SRX1749364,SAMN04959435,GSM2145628,DRSC08367,FBgn0035160,CG13897,2,3,E7,E,7,hng3
SRX1750341,SAMN04960902,GSM2146601,DRSC37496,FBgn0085451,CG34422,2,8,G6,G,6,htk
SRX1749224,SAMN04959791,GSM2145489,DRSC11182,FBgn0261283,HLH106,1,3,B10,B,10,SREBP
SRX1750381,SAMN04960761,GSM2146641,DRSC37996,FBgn0035407,CG14962,1,9,C11,C,11,Asciz


In [191]:
# Display the samples for which the left join produced no r6-11 symbol
merged.loc[pd.isnull(merged['r6-11_symbol'])]

Unnamed: 0_level_0,BioSample,GEO,drsc,fbgn,symbol,rep,plate_id,well_id,plate_row,plate_column,r6-11_symbol
SRA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
SRX1748894,SAMN04959118,GSM2145161,DRSCNA,FBgnNA,LacZ,1,1,F3,F,3,
SRX1749796,SAMN04960482,GSM2146058,DRSC27862,FBgn0005630,lola,1,6,A9,A,9,
SRX1749267,SAMN04959804,GSM2145532,DRSC07647,FBgn0005630,lola,1,3,E6,E,6,
SRX1749944,SAMN04960089,GSM2146206,DRSCNA,FBgnNA,LacZ,2,6,F3,F,3,
SRX1749276,SAMN04959813,GSM2145541,DRSCNA,FBgnNA,LacZ,1,3,F3,F,3,
SRX1748990,SAMN04959545,GSM2145256,DRSCNA,FBgnNA,LacZ,2,1,F3,F,3,
SRX1749467,SAMN04960126,GSM2145731,DRSCNA,FBgnNA,LacZ,1,4,F3,F,3,
SRX1750421,SAMN04960680,GSM2146681,DRSCNA,FBgnNA,LacZ,1,9,F3,F,3,
SRX1749658,SAMN04960345,GSM2145921,DRSCNA,FBgnNA,LacZ,1,5,F3,F,3,
SRX1749753,SAMN04960230,GSM2146016,DRSCNA,FBgnNA,LacZ,2,5,F3,F,3,


In [194]:
from tempfile import NamedTemporaryFile
# Import the submodule explicitly: a bare `import urllib` does not load
# `urllib.request`, so `urllib.request.urlopen` is not guaranteed to resolve.
# The name `urllib` is still bound, exactly as before.
import urllib.request

In [None]:
# NOTE(review): placeholder cell — urlopen is called without the URL argument
# it requires, so this raises TypeError as written. TODO supply the URL.
urllib.request.urlopen()

In [197]:
from common import get_references_dir

In [199]:
import yaml

In [203]:
# Re-read the project configuration. safe_load only constructs plain Python
# containers and scalars, so a tag in the YAML file cannot trigger arbitrary
# object construction (and no Loader argument is required).
with open('../../config/config.yml') as fh:
    config = yaml.safe_load(fh)

In [204]:
# Display the full parsed configuration
config

{'aggregation_dir': 'data/aggregation',
 'aligner': {'index': 'hisat2', 'tag': 'r6-16'},
 'assembly': 'dmel',
 'gtf': {'tag': 'r6-16'},
 'kallisto': {'tag': 'r6-16_transcriptome'},
 'references': {'adapters': {'default': {'fasta': {'indexes': ['bowtie2'],
     'postprocess': 'lib.postprocess.adapters.fasta_postprocess',
     'url': 'https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/adapters.fa'}}},
  'dicty': {'ensembl_2.7': {'fasta': {'indexes': ['bowtie2', 'hisat2'],
     'url': 'ftp://ftp.ensemblgenomes.org/pub/protists/release-33/fasta/dictyostelium_discoideum/dna/Dictyostelium_discoideum.dicty_2.7.dna_sm.toplevel.fa.gz'}},
   'ensembl_2.7.33': {'gtf': {'conversions': ['refflat'],
     'url': 'ftp://ftp.ensemblgenomes.org/pub/release-33/protists/gtf/dictyostelium_discoideum/Dictyostelium_discoideum.dicty_2.7.33.gtf.gz'}},
   'ensembl_transcriptome_2.7': {'fasta': {'indexes': ['kallisto'],
     'postprocess': 'lib.common.cat',
     'url': ['ftp://ftp.ensemblgenom