In [1]:
# %load ../start.py
# Load useful extensions
import os
import sys

# Activate the autoreload extension for easy reloading of external packages
%reload_ext autoreload
%autoreload 1

# Set up cachedir
# NOTE(review): the same assignment is repeated below, after the ipycache
# extension is (re)loaded; this first one looks redundant -- confirm before removing.
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Turn on the watermark (prints last-updated date, Python/IPython versions and git hash)
%reload_ext watermark
%watermark -u -d -v -g

# Load ipycache extension and point its cache directory at ../cachedir
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Add project library to path
# Both entries are inserted at position 0, so '../../lib/python' ends up
# ahead of '../../lcdb-wf/lib' on sys.path.
sys.path.insert(0, '../../lcdb-wf/lib')
sys.path.insert(0, '../../lib/python')

# Set up references
# Read the project configuration and derive the reference directory for the
# configured assembly / aligner tag.
import yaml
with open('../../config/config.yml') as fh:
    # safe_load instead of yaml.load: calling yaml.load without an explicit
    # Loader can construct arbitrary Python objects on older PyYAML and is an
    # error on PyYAML >= 6. Only plain mapping/string values are read here.
    config = yaml.safe_load(fh)

assembly = config['assembly']
tag = config['aligner']['tag']
# REFERENCES_DIR must be set in the environment; a missing variable raises KeyError.
REF = os.path.join(os.environ['REFERENCES_DIR'], assembly, tag)


last updated: 2017-09-04 

CPython 3.5.4
IPython 6.1.0
Git hash: 7c556f64840197dcf4c787f27f21abc390e73829


In [2]:
# imports
import re
from tempfile import TemporaryDirectory
import pandas as pd
import numpy as np

import GEOparse

In [127]:
# load flybase annotations
# Read the FlyBase annotation table for the configured reference, keep only
# the ID/symbol columns, and shorten the column names used by later cells.
fb_annotation_path = os.path.join(REF, 'fb_annotation/dmel_{}.fb_annotation'.format(tag))
fb = (
    pd.read_table(fb_annotation_path)[['primary_FBgn', 'gene_symbol', 'secondary_FBgn']]
    .rename(columns={'primary_FBgn': 'FBgn', 'gene_symbol': 'symbol'})
)

In [113]:
# Build two lookups from the annotation table:
#   fbgns: any FBgn (current or secondary/old) -> current FBgn
#   genes: current FBgn -> current gene symbol
fbgns = {}
genes = {}
for current, name, secondary in zip(fb['FBgn'], fb['symbol'], fb['secondary_FBgn']):
    fbgns[current] = current
    genes[current] = name

    # Rows without secondary IDs do not hold a string here (presumably NaN
    # from pandas), so there is nothing more to register for them.
    if not isinstance(secondary, str):
        continue

    # Secondary IDs are stored as one comma-separated string.
    for old in secondary.strip().split(','):
        fbgns[old] = current

In [151]:
# Build sample table using GEO entry
# Query GEO
# NOTE(review): tmpDir is never cleaned up explicitly; call tmpDir.cleanup()
# once the downloaded files are no longer needed.
tmpDir = TemporaryDirectory()
gse = GEOparse.get_GEO("GSE81221", destdir=tmpDir.name, silent=True)

# Pull out sample attributes and build data frame
# One dict per GEO sample (GSM), filled from three metadata fields:
#   title                -> fbgn, symbol, drsc, rep (and sometimes plate_id)
#   supplementary_file_2 -> plate_id, well_id (then split into row/column)
#   relation             -> one key per relation entry (e.g. BioSample, SRA)
attributes = []
for gsm, dat in gse.gsms.items():
    title_match = re.match(r'^.*_(?P<fbgn>FBgn(\d+|NA))_(?P<symbol>.*?)_.*(?P<drsc>DRSC(\d+|NA))_replicate(?P<rep>\d)(\s\[Plate(?P<plate_id>\d+)-\d_F3\]|$)', dat.metadata['title'][0])
    if title_match is None:
        # Report the unparseable title and skip this sample. Falling through
        # would either hit an undefined `attrs` (first sample) or silently
        # overwrite and re-append the previous sample's dict.
        print(gsm, dat.metadata['title'])
        continue

    attrs = title_match.groupdict()
    attrs['GEO'] = gsm

    # plate_id parsed from the file name replaces whatever the title gave.
    attrs.update(re.match(r'.*_DRSC_Plate(?P<plate_id>\d+)-\d_(?P<well_id>\w\d+)_.*',
                          dat.metadata['supplementary_file_2'][0]).groupdict())

    # Split the well ID (e.g. 'B12') into its row letter and column number.
    attrs.update(re.match(r'(?P<plate_row>\w)(?P<plate_column>\d+)', attrs['well_id']).groupdict())

    # Each relation looks like '<name>: ...<sep><accession>'; keep name -> accession.
    for x in dat.metadata['relation']:
        k, v = re.match(r'(\w+):.*[\/=](\w+\d+)$', x).groups()
        attrs[k] = v
    attributes.append(attrs)

# Index the table by the SRA accession, exposed as 'sample_id'.
df = (
    pd.DataFrame(attributes)
    .rename(columns={'SRA': 'sample_id'})
    .set_index('sample_id')
)

# Grab useful columns and reorder
cols = [
    'BioSample', 'GEO', 'drsc', 'fbgn', 'symbol', 'rep',
    'plate_id', 'well_id', 'plate_row', 'plate_column'
]
df = df[cols]

# Reformat gene symbols: square brackets in the GEO titles become parentheses
df['symbol'] = df['symbol'].str.replace('[', '(').str.replace(']', ')')

In [153]:
# Sanity check FBgns for changes

In [154]:
# Map FBgn and symbols to current version
# Register the 'FBgnNA' placeholder by hand so the lookups below succeed for
# it; it maps to itself and is labelled 'LacZ'.
fbgns['FBgnNA'] = 'FBgnNA'
genes['FBgnNA'] = 'LacZ'

# An FBgn missing from the lookup raises KeyError rather than producing NaN.
df['curr_fbgn'] = [fbgns[old] for old in df['fbgn']]
df['curr_symbol'] = [genes[current] for current in df['curr_fbgn']]

In [155]:
# Number of FBgn changes.
# One row per distinct (GEO FBgn, current FBgn, current symbol) combination.
dd = (
    df[['fbgn', 'curr_fbgn', 'curr_symbol']]
    .reset_index(drop=True)
    .drop_duplicates()
)
fbgn_changed = dd['fbgn'] != dd['curr_fbgn']
print('There were {} FBgns that changed number.'.format(fbgn_changed.sum()))
dd[fbgn_changed]

There were 3 FBgns that changed number.


Unnamed: 0,fbgn,curr_fbgn,curr_symbol
75,FBgn0005630,FBgn0283521,lola
236,FBgn0010575,FBgn0285917,sbb
1222,FBgn0000054,FBgn0284249,Adf1


In [156]:
# Number of gene symbol changes
# One row per distinct (current FBgn, GEO symbol, current symbol) combination.
dd = (
    df[['curr_fbgn', 'symbol', 'curr_symbol']]
    .reset_index(drop=True)
    .drop_duplicates()
)
symbol_changed = dd['symbol'] != dd['curr_symbol']
print('There were {} genes that changed gene symbol.'.format(symbol_changed.sum()))
dd[symbol_changed]

There were 28 genes that changed gene symbol.


Unnamed: 0,curr_fbgn,symbol,curr_symbol
10,FBgn0030093,dalao,Bap111
40,FBgn0038551,CG7357,Odj
47,FBgn0037634,CG8359,hng2
60,FBgn0030891,dik,Ada3
84,FBgn0052133,ptip,Ptip
100,FBgn0032016,CG7818,Mettl14
129,FBgn0036134,fd68A,FoxK
155,FBgn0034599,CG9437,hng1
256,FBgn0261283,HLH106,SREBP
260,FBgn0035407,CG14962,Asciz


In [200]:
# Drop the identifiers parsed from GEO and expose the current FlyBase ones
# under the target_* names.
cleaned = (
    df
    .drop(['fbgn', 'symbol'], axis=1)
    .rename(columns={'curr_fbgn': 'target_FBgn', 'curr_symbol': 'target_symbol'})
)

In [201]:
# Add a column for DRSC replicate (based on DRSC sort order by FBgn)
# Distinct (reagent, target gene) pairs, ordered by DRSC ID.
drsc = (
    cleaned[['drsc', 'target_FBgn']]
    .reset_index(drop=True)
    .drop_duplicates()
    .sort_values('drsc')
)

# Within each target gene, number its reagents 1..n in that sorted order.
drscs = [
    grp.assign(drsc_rep=range(1, len(grp) + 1))
    for _, grp in drsc.groupby('target_FBgn')
]

# Keep only the reagent -> replicate number mapping.
drsc = pd.concat(drscs, ignore_index=True)[['drsc', 'drsc_rep']]

In [202]:
# Merge on to cleaned dataset
# reset_index() turns the sample_id index back into a column before joining
# on the shared 'drsc' key.
# NOTE: this cell reassigns `cleaned` from itself, so running it twice in a
# row merges a second time; re-run the cell that creates `cleaned` first.
cleaned = cleaned.reset_index().merge(drsc, on='drsc')

In [204]:
# Display the column names of the merged table
cleaned.columns

Index(['sample_id', 'BioSample', 'GEO', 'drsc', 'rep', 'plate_id', 'well_id',
       'plate_row', 'plate_column', 'target_FBgn', 'target_symbol',
       'drsc_rep'],
      dtype='object')

In [205]:
# Reorder columns
# Sample identifiers first, then the targeted gene, then replicate and plate
# layout. The reordered frame is only displayed; `cleaned` itself is unchanged.
cols = [
    'sample_id',
    'BioSample',
    'GEO',
    'drsc',
    'target_FBgn',
    'target_symbol',
    'drsc_rep',
    'rep',
    'plate_id',
    'well_id',
    'plate_row',
    'plate_column',
]

cleaned[cols]

Unnamed: 0,sample_id,BioSample,GEO,drsc,target_FBgn,target_symbol,drsc_rep,rep,plate_id,well_id,plate_row,plate_column
0,SRX1750276,SAMN04960867,GSM2146536,DRSC37576,FBgn0001324,kto,2,2,8,B12,B,12
1,SRX1750180,SAMN04960560,GSM2146441,DRSC37576,FBgn0001324,kto,2,1,8,B12,B,12
2,SRX1749474,SAMN04960131,GSM2145738,DRSC05281,FBgn0000581,E(Pc),1,1,4,G10,G,10
3,SRX1749570,SAMN04960287,GSM2145833,DRSC05281,FBgn0000581,E(Pc),1,2,4,G10,G,10
4,SRX1748883,SAMN04959317,GSM2145150,DRSC03561,FBgn0001990,wek,1,1,1,E4,E,4
5,SRX1748979,SAMN04959669,GSM2145245,DRSC03561,FBgn0001990,wek,1,2,1,E4,E,4
6,SRX1749660,SAMN04960347,GSM2145923,DRSC22041,FBgn0086350,tef,2,1,5,F5,F,5
7,SRX1749755,SAMN04960232,GSM2146018,DRSC22041,FBgn0086350,tef,2,2,5,F5,F,5
8,SRX1749653,SAMN04960340,GSM2145916,DRSC25000,FBgn0024321,NK7.1,2,1,5,F10,F,10
9,SRX1749748,SAMN04960225,GSM2146011,DRSC25000,FBgn0024321,NK7.1,2,2,5,F10,F,10
