# Table of Contents
 <p>

In [20]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import epistasis as epi
import sklearn.decomposition
import tissue_enrichment_analysis as ea

from matplotlib import rc

import os
rc('text', usetex=True)
rc('text.latex', preamble=r'\usepackage{cmbright}')
rc('font', **{'family': 'sans-serif', 'sans-serif': ['Helvetica']})


%matplotlib inline

# This enables SVG graphics inline. 
%config InlineBackend.figure_formats = {'png', 'retina'}

# JB's favorite Seaborn settings for notebooks
rc = {'lines.linewidth': 2, 
      'axes.labelsize': 18, 
      'axes.titlesize': 18, 
      'axes.facecolor': 'DFDFE5'}
sns.set_context('notebook', rc=rc)
sns.set_style("dark")

mpl.rcParams['xtick.labelsize'] = 16 
mpl.rcParams['ytick.labelsize'] = 16 
mpl.rcParams['legend.fontsize'] = 14

In [4]:
# Fetch the tissue, phenotype and GO dictionaries that are passed to
# ea.enrichment_analysis in the cells below.
# The package is imported as `ea`; the name `tea` is not bound at this point
# on a fresh kernel (it is only assigned later, to an enrichment result).
tissue = ea.fetch_dictionary('tissue')
pheno = ea.fetch_dictionary('phenotype')
go = ea.fetch_dictionary('go')


In [5]:
# Strain identifiers, in the order that defines their integer codes.
strains = [
    'ps4187', 'ps4087', 'ps4176', 'ew15', 'mt21245', 'mt4866',
    'n2731', 'n2376', 'n767', 'n2731-n767', 'tm1489',
    'n3809', 'n3917', 'n745', 'n767-tm1489', 'n2731-tm1489',
    'ia4', 'ok1402', 'ok161', 'sa307', 'sa307-ok161', 'sa307-ia4',
    'q71',
]

# Map each strain name to its position in `strains`.
strain_dict = {name: position for position, name in enumerate(strains)}

In [6]:
# Significance cutoff: later cells select rows with `tidy.qval < q`.
q = 0.1

In [115]:
# Space-delimited table; lines starting with '#' are skipped as comments.
genmap = pd.read_csv(
    '../sleuth/rna_seq_info.txt',
    sep=' ',
    comment='#',
)

# Table with the strain, target_id, ens_gene, ext_gene, b and qval columns
# used by the cells below.
tidy = pd.read_csv('../input/quantifications.csv')

In [116]:
# Overlap between the sa307 and sa307-ok161 strains as computed by
# epi.find_overlap. The following cell evaluates len(stress), so the name
# has to be defined here for the notebook to run on a fresh kernel.
stress = epi.find_overlap(['sa307', 'sa307-ok161'], tidy)

# Optional filter, left disabled: keep only the mt21245 and mt4866 rows and
# drop every target_id found in `stress`. Enabling it overwrites `tidy`.
# tidy = tidy[(tidy.strain.isin(['mt21245', 'mt4866'])) & (~tidy.target_id.isin(stress))]

In [117]:
# Number of entries in `stress`; the name must already exist in the kernel
# when this cell runs.
len(stress)

2966

In [125]:
# Report how many rows of `tidy` pass the q-value cutoff for each of the two
# strains. The label is built from the strain name so it cannot drift from
# the strain actually being counted.
# NOTE(review): "sans stress" is only accurate if `tidy` has had the stress
# targets removed; the filter that does so is commented out in the cell
# above as shown here -- confirm before quoting these numbers.
for strain_name in ('mt21245', 'mt4866'):
    n_significant = len(tidy[(tidy.strain == strain_name) & (tidy.qval < q)])
    print('{0} DE genes sans stress: {1}'.format(strain_name, n_significant))

mt2124 DE genes sans stress: 3096
mt4866 DE genes sans stress: 899


In [120]:
# Overlap between the mt21245 and mt4866 strains, as computed by
# epi.find_overlap; used below to select rows by target_id.
ras = epi.find_overlap(['mt21245', 'mt4866'], tidy)

in_ras = tidy.target_id.isin(ras)
gf = tidy[(tidy.strain == 'mt21245') & in_ras].copy()
lf = tidy[(tidy.strain == 'mt4866') & in_ras].copy()

# Ratio of the mt4866 coefficient to the mt21245 coefficient for the same
# target. Rows are paired through target_id rather than by row position, so
# the result does not depend on both strains being listed in the same order
# in `tidy`. A target missing from `gf` yields NaN and falls out of both
# sign-based sets below; duplicate target_ids within mt21245 raise instead
# of being paired arbitrarily.
gf_b = gf.set_index('target_id').b
lf['standardized_b'] = lf.b.values / lf.target_id.map(gf_b).values

# Negative ratio: the two strains change in opposite directions.
ras_ac = lf[lf.standardized_b < 0].ens_gene.unique()
# Positive ratio: the two strains change in the same direction.
ras_corr = lf[lf.standardized_b > 0].ens_gene.unique()

# Genes outside both sign-based overlap sets; the mask is the same for both
# strains, so it is computed once.
notin = (~tidy.ens_gene.isin(ras_ac)) & (~tidy.ens_gene.isin(ras_corr))
significant = tidy.qval < q

# Significant in mt21245 and not part of the overlap sets.
ind = significant & (tidy.strain == 'mt21245')
ras_gf = tidy[ind & notin]

# Significant in mt4866 and not part of the overlap sets.
ind = significant & (tidy.strain == 'mt4866')
ras_lf = tidy[ind & notin]


print('ras overlap: {0}'.format(len(ras)))
print('ras anticorr: {0}'.format(len(ras_ac)))
print('ras corr: {0}'.format(len(ras_corr)))
print('ras gf: {0}'.format(len(ras_gf)))
print('ras lf: {0}'.format(len(ras_lf)))

ras overlap: 269
ras anticorr: 73
ras corr: 190
ras gf: 2817
ras lf: 621


In [121]:
# Enrichment against the tissue dictionary for the unique genes in ras_gf
# (significant in mt21245, outside the overlap sets).
tea_ = ea.enrichment_analysis(
    ras_gf['ens_gene'].unique(),
    tissue,
    show=False,
)
tea_

Unnamed: 0,Tissue,Expected,Observed,Enrichment Fold Change,P value,Q value
131,male WBbt:0007850,364.351076,1240,3.403311,0.0,0.0
99,reproductive system WBbt:0005747,1423.246391,1974,1.38697,1.0552889999999999e-57,1.466852e-55
240,amphid sheath cell WBbt:0006754,88.810575,219,2.465923,1.088077e-39,1.008285e-37
212,oocyte WBbt:0006797,21.348696,51,2.388905,2.832873e-10,1.968846e-08
201,AB WBbt:0004015,16.225009,32,1.972264,3.695849e-05,0.002054892
108,epithelial system WBbt:0005730,564.601844,645,1.142398,0.0001121203,0.005194907
247,ABalpppa WBbt:0006649,5.550661,14,2.522222,0.0001651727,0.006559716
3,ABplpppa WBbt:0006423,5.266012,13,2.468661,0.0003309609,0.01150089
53,ABplpppp WBbt:0006647,5.408336,13,2.403697,0.0004557815,0.01407858
275,male distal tip cell WBbt:0006864,88.525926,117,1.321647,0.0006493009,0.01805056


In [122]:
# Enrichment against the phenotype dictionary for the unique genes in ras_gf.
pea = ea.enrichment_analysis(
    ras_gf['ens_gene'].unique(),
    pheno,
    show=False,
)
pea

Unnamed: 0,Tissue,Expected,Observed,Enrichment Fold Change,P value,Q value
196,linker cell migration variant WBPhenotype:0001511,6.635976,25,3.767343,3.826431e-10,8.265092e-08
167,lipid metabolism variant WBPhenotype:0000725,73.765126,118,1.599672,1.425114e-07,1.539123e-05
160,fat content reduced WBPhenotype:0001183,44.624535,73,1.635871,1.133377e-05,0.0008160311
97,fat content increased WBPhenotype:0001184,25.293648,42,1.660496,0.0004099866,0.02213928
163,mRNA levels reduced WBPhenotype:0000137,12.40639,24,1.934487,0.000534118,0.0230739
211,neuropil development variant WBPhenotype:0000945,9.809704,20,2.038798,0.0006322923,0.0230739
94,ectopic expression transgene WBPhenotype:0001276,11.829349,23,1.944317,0.0006162698,0.0230739
60,ventral cord patterning variant WBPhenotype:00...,8.07858,17,2.10433,0.0009186366,0.02480319
23,transgene expression undetectable WBPhenotype:...,6.443629,14,2.172689,0.00149488,0.03587711
44,axon fasciculation variant WBPhenotype:0000632,8.559448,17,1.986109,0.001842144,0.03979031


In [123]:
# Enrichment against the GO dictionary for the unique genes in ras_gf.
gea = ea.enrichment_analysis(
    ras_gf['ens_gene'].unique(),
    go,
    show=False,
)
# Display only the terms whose enrichment fold change exceeds 3.
gea.loc[gea['Enrichment Fold Change'].gt(3)]

Unnamed: 0,Tissue,Expected,Observed,Enrichment Fold Change,P value,Q value
201,developmental process GO:0032502,186.932061,652,3.487898,3.0762329999999997e-168,9.228698e-166
175,embryo development GO:0009790,108.209968,354,3.271418,8.637915e-84,1.2956870000000001e-81
161,reproduction GO:0000003,104.589805,343,3.279478,1.665717e-81,1.665717e-79
5,regulation of cell shape GO:0008360,3.323968,69,20.758323,4.5508589999999996e-79,3.413144e-77
138,peptidyl-serine modification GO:0018209,5.002407,79,15.792397,5.1615520000000003e-76,3.096931e-74
74,immune system process GO:0002376,11.288327,71,6.289683,3.611371e-36,1.54773e-34
244,multi-organism process GO:0051704,25.439874,105,4.127379,8.309687e-35,3.116133e-33
208,aging GO:0007568,31.067582,101,3.250977,3.7268690000000002e-25,1.118061e-23
67,side of membrane GO:0098552,3.949269,30,7.596343,2.9154469999999995e-19,7.95122e-18
49,oviposition GO:0018991,13.592067,49,3.605044,4.515978e-15,8.467458e-14


In [102]:
# Enrichment against the tissue dictionary for the unique genes in ras_lf
# (significant in mt4866, outside the overlap sets).
tea = ea.enrichment_analysis(
    ras_lf['ens_gene'].unique(),
    tissue,
    show=False,
)
tea

Unnamed: 0,Tissue,Expected,Observed,Enrichment Fold Change,P value,Q value
89,spermathecal-uterine valve cell WBbt:0008217,20.189172,54,2.674701,4.331298e-11,4.305777e-09
93,Nucleus WBbt:0006803,19.86354,54,2.718549,2.349354e-11,4.305777e-09
78,gon herm sujn WBbt:0008210,19.646452,54,2.748588,1.548841e-11,4.305777e-09
56,spermathecal-uterine junction WBbt:0006756,21.301747,55,2.581948,1.110762e-10,7.719799e-09
139,somatic gonad WBbt:0005785,33.540076,66,1.967795,1.192552e-07,6.630589e-06
63,hermaphrodite WBbt:0007849,81.869263,121,1.477966,1.123997e-05,0.0005207851
52,intestine WBbt:0005772,151.690149,195,1.285515,0.0001729151,0.0068672
128,reproductive tract WBbt:0005744,66.8902,94,1.405288,0.0005008505,0.01740456
154,hmc WBbt:0004697,1.655295,6,3.624732,0.001303964,0.04027801


In [103]:
# Enrichment against the phenotype dictionary for the unique genes in ras_lf.
pea = ea.enrichment_analysis(
    ras_lf['ens_gene'].unique(),
    pheno,
    show=False,
)
pea

Unnamed: 0,Tissue,Expected,Observed,Enrichment Fold Change,P value,Q value
47,mitochondria alignment variant WBPhenotype:000...,3.418119,11,3.218144,0.000173,0.037388


In [104]:
# Enrichment against the GO dictionary for the unique genes in ras_lf.
gea_lf = ea.enrichment_analysis(
    ras_lf['ens_gene'].unique(),
    go,
    show=False,
)
# Display only the terms whose enrichment fold change exceeds 3.
gea_lf.loc[gea_lf['Enrichment Fold Change'].gt(3)]

Unnamed: 0,Tissue,Expected,Observed,Enrichment Fold Change,P value,Q value
167,developmental process GO:0032502,41.544344,136,3.273611,8.661666000000001e-33,2.5984999999999998e-30
134,reproduction GO:0000003,23.244353,82,3.527739,1.893923e-22,2.8408839999999996e-20
146,embryo development GO:0009790,24.048909,82,3.409718,1.525863e-21,1.346361e-19
5,post-embryonic development GO:0009791,17.166298,68,3.96125,1.346361e-21,1.346361e-19
115,collagen trimer GO:0005581,1.397002,16,11.4531,1.13446e-13,6.80676e-12
127,aging GO:0007568,6.904553,26,3.765631,4.125474e-09,2.062737e-07
128,cell death GO:0008219,5.602635,22,3.926724,2.34949e-08,8.810589e-07
47,positive regulation of growth GO:0045927,2.896401,15,5.17884,6.118043e-08,2.039348e-06
17,multicellular organism growth GO:0035264,3.028056,15,4.953673,1.117494e-07,3.352481e-06
199,immune system process GO:0002376,2.508752,13,5.18186,3.552715e-07,9.689222e-06


In [105]:
# Enrichment against the tissue dictionary for ras_ac, the overlap genes
# with a negative standardized_b.
tea_ac = ea.enrichment_analysis(
    ras_ac,
    tissue,
    show=False,
)
tea_ac

Unnamed: 0,Tissue,Expected,Observed,Enrichment Fold Change,P value,Q value
17,somatic gonad WBbt:0005785,3.267622,14,4.284461,1e-06,0.000386
29,gon herm sujn WBbt:0008210,1.914044,10,5.22454,5e-06,0.000631
19,spermathecal-uterine junction WBbt:0006756,2.07531,10,4.818557,1e-05,0.000631
18,Nucleus WBbt:0006803,1.935194,10,5.167441,5e-06,0.000631
11,spermathecal-uterine valve cell WBbt:0008217,1.966918,10,5.084095,6e-06,0.000631
13,hermaphrodite WBbt:0007849,7.976065,18,2.256752,0.000488,0.022589


In [106]:
# Enrichment against the phenotype dictionary for ras_ac.
pea_ac = ea.enrichment_analysis(
    ras_ac,
    pheno,
    show=False,
)
pea_ac

Unnamed: 0,Tissue,Expected,Observed,Enrichment Fold Change,P value,Q value


In [107]:
# Enrichment against the GO dictionary for ras_ac.
gea_ac = ea.enrichment_analysis(
    ras_ac,
    go,
    show=False,
)
gea_ac

Unnamed: 0,Tissue,Expected,Observed,Enrichment Fold Change,P value,Q value
2,post-embryonic development GO:0009791,1.524448,9,5.903776,4e-06,0.001193
3,lytic vacuole GO:0000323,0.088986,2,22.475525,0.000106,0.015931
45,response to biotic stimulus GO:0009607,0.114317,2,17.495153,0.000222,0.02221
29,embryo development GO:0009790,2.135656,8,3.745921,0.000342,0.025623
25,developmental process GO:0032502,3.689334,11,2.981568,0.000391,0.025623


In [108]:
# Enrichment against the tissue dictionary for ras_corr, the overlap genes
# with a positive standardized_b.
tea_corr = ea.enrichment_analysis(
    ras_corr,
    tissue,
    show=False,
)
tea_corr

Unnamed: 0,Tissue,Expected,Observed,Enrichment Fold Change,P value,Q value


In [109]:
# Enrichment against the phenotype dictionary for ras_corr.
pea_corr = ea.enrichment_analysis(
    ras_corr,
    pheno,
    show=False,
)
pea_corr

Unnamed: 0,Tissue,Expected,Observed,Enrichment Fold Change,P value,Q value


In [110]:
# Enrichment against the GO dictionary for ras_corr.
gea_corr = ea.enrichment_analysis(
    ras_corr,
    go,
    show=False,
)
# Keep terms with more than one expected gene or more than four observed
# genes, then display those whose enrichment fold change exceeds 3.
ind = gea_corr['Expected'].gt(1) | gea_corr['Observed'].gt(4)
gea_corr[gea_corr['Enrichment Fold Change'].gt(3) & ind]

Unnamed: 0,Tissue,Expected,Observed,Enrichment Fold Change,P value,Q value
115,developmental process GO:0032502,17.484993,62,3.545898,9.917349e-18,2.975205e-15
119,embryo development GO:0009790,10.121595,42,4.149544,7.17774e-15,1.076661e-12
109,reproduction GO:0000003,9.782977,33,3.373206,8.550326e-10,8.550326e-08
3,post-embryonic development GO:0009791,7.224873,26,3.598679,1.054614e-08,7.909603e-07
163,myosin filament organization GO:0031033,0.452517,7,15.469048,2.396128e-08,1.437677e-06
19,cellular component assembly involved in morpho...,0.50177,7,13.950613,5.346464e-08,2.673232e-06
33,striated muscle cell differentiation GO:0051146,0.523318,7,13.376176,7.397818e-08,3.170493e-06
17,muscle cell development GO:0055001,0.523318,7,13.376176,7.397818e-08,3.170493e-06
186,actomyosin structure organization GO:0031032,0.551024,7,12.703631,1.100203e-07,3.667342e-06
76,development of primary sexual characteristics ...,3.059874,14,4.575352,7.456211e-07,2.236863e-05


In [114]:
# Unique ext_gene values for the rows of `tidy` whose ens_gene is in ras_ac.
tidy.loc[tidy['ens_gene'].isin(ras_ac), 'ext_gene'].unique()

array(['pdi-6', 'B0513.4', 'gck-4', 'C17F4.7', 'C17G1.2', 'C18E9.5',
       'best-7', 'C32H11.4', 'C42D4.1', 'nspc-20', 'ddo-3', 'ifd-2',
       'F32H5.1', 'nhr-8', 'F37H8.5', 'cpr-4', 'F48E3.4', 'skpo-1',
       'F52E1.14', 'lys-5', 'lys-6', 'lpin-1', 'cah-3', 'M28.10', 'dpy-27',
       'T04G9.7', 'lin-59', 'pod-2', 'Y105C5B.15', 'cls-3', 'ZK1307.1',
       'ZK673.1'], dtype=object)