In [12]:
# q-value cutoff compared against the per-genotype q-value columns in the
# gene selections further down (e.g. `df1.qval < q`).
q = 0.1

In [2]:
# Specify the genotypes to refer to:
single_mutants = ['b', 'c', 'd', 'e', 'g']
# Specify which genotypes are double mutants
# and of what single mutants:
double_mutants = {'a' : 'bd', 'f':'bc'}

# initialize the morgan.hunt object:
# target_id is the column with isoform specific names
# b is the name of the column with the GLM regression coefficients
# tpm is the name of the column with the TPM numbers
# qval is the name of the column with the FDR corrected q-values
thomas = morgan.hunt('target_id', 'b', 'tpm', 'qval')

# input the genmap file:
thomas.add_genmap('../input/library_genotype_mapping.txt', comment='#')

# add the names of the single mutants
thomas.add_single_mutant(single_mutants)

# add the names of the double mutants; both argument lists are derived from
# `double_mutants` so the mapping is written down in exactly one place
# (keys() and values() of the same dict are always pairwise aligned)
thomas.add_double_mutants(list(double_mutants.keys()),
                          list(double_mutants.values()))

# set the q-value threshold for significance to its default value, 0.1
thomas.set_qval()

# Add the tpm files:
kallisto_loc = '../input/kallisto_all/'
thomas.add_tpm(kallisto_loc, '/kallisto/abundance.tsv', '')

# load all the beta values for each genotype:
sleuth_loc = '../sleuth_all_adjusted/kallisto/'
# list the same directory the files are loaded from instead of repeating the path
for beta_file in os.listdir(sleuth_loc):
    if beta_file.startswith('beta'):
        # the genotype letter is the single character right before the
        # 4-character file extension
        letter = beta_file[-5:-4].lower()
        thomas.add_beta(sleuth_loc + beta_file, letter)
        # sort by isoform id and renumber the rows so that every genotype's
        # table shares the same row order and index
        thomas.beta[letter].sort_values('target_id', inplace=True)
        thomas.beta[letter].reset_index(inplace=True)

In [3]:
# Filter the loaded tables through morgan; the call reports the number of NA genes.
# NOTE(review): the two arguments are presumably a TPM cutoff (0) and a q-value
# cutoff (0.1) — confirm against morgan.hunt.filter_data.
thomas.filter_data(0, 0.1)

Number of na genes: 232


In [4]:
# Annotation dictionaries used by the enrichment analyses below.
# The tissue dictionary is fetched through the tea package; the phenotype and
# GO dictionaries are read from local CSV files.
tissue_df = tea.fetch_dictionary()
phenotype_df, go_df = map(
    pd.read_csv,
    ('../input/phenotype_ontology.csv', '../input/go_dictionary.csv'),
)

In [5]:
def _melt_annotations(wide_df):
    """Reshape a wide gene-by-term table into long form, keeping annotated pairs.

    Every column other than 'wbid' is treated as a term; only rows whose
    melted value equals 1 are returned.
    """
    long_df = pd.melt(wide_df, id_vars='wbid', var_name='term', value_name='expressed')
    return long_df[long_df.expressed == 1]


melted_tissue = _melt_annotations(tissue_df)

melted_phenotype = _melt_annotations(phenotype_df)

melted_go = _melt_annotations(go_df)

In [6]:
# Label each genotype's beta table and stack them into one long frame.
# The two new columns are written onto the tables held in thomas.beta
# themselves, so later copies of those tables carry them as well.
frames = []
for genotype, beta_table in thomas.beta.items():
    beta_table['genotype'] = genotype
    beta_table['abs_b'] = beta_table['b'].abs()
    frames.append(beta_table)

tidy_data = pd.concat(frames)

In [9]:
# Build one wide table: start from a copy of genotype 'f' (so thomas.beta['f']
# itself is not widened) and append the coefficient, standard error and
# q-value of the other genotypes as suffixed columns.
df1 = thomas.beta['f'].copy()
df2, df3, df4, df5, df6 = (thomas.beta[letter] for letter in 'cebad')

# (column suffix, source table), in the order the columns are appended
partners = (('c', df2), ('e', df3), ('b', df4), ('a', df5), ('d', df6))

# (new-column prefix, column read from the source table)
for prefix, source_col in (('b_', 'b'), ('se_b_', 'se_b'), ('q_', 'qval')):
    for suffix, table in partners:
        # column assignment aligns on the row index of df1
        df1[prefix + suffix] = table[source_col]

# HIF-1-OH

In [60]:
# Direction tests use the product of two regression coefficients:
# positive product -> same direction, negative product -> opposite direction.
egl_matches_hif = (df1.b_b * df1.b_c).gt(0)     # egl and hif effects point the same way
egl_anti_vhl = (df1.b_b * df1.b_d).lt(0)        # egl effect is opposite to the vhl effect
egl_matches_eglhif = (df1.b_b * df1.b).gt(0)    # egl should be epistatic to hif
egl_matches_eglvhl = (df1.b_b * df1.b_a).gt(0)  # egl should be epistatic to vhl
ind = egl_matches_hif & egl_anti_vhl & egl_matches_eglhif & egl_matches_eglvhl

# Significance filter at threshold q.
# NOTE(review): q_a is not tested here although b_a is used above — confirm
# that leaving it out is intentional.
ind2 = (df1.qval < q) & (df1.q_b < q) & (df1.q_c < q) & (df1.q_d < q)

df1.loc[ind & ind2, ['ext_gene']]

Unnamed: 0,ext_gene
6894,ftn-1
7210,ftn-2
9850,F20D6.11
11479,cat-4


# HIF-1

In [14]:
# Significant in all six genotypes. The shared threshold `q` replaces the
# hard-coded 0.1 so every selection in the notebook uses the same cutoff.
ind = ((df1.qval < q) & (df1.q_c < q) &
       (df1.q_e < q) & (df1.q_b < q) &
       (df1.q_a < q) & (df1.q_d < q))
# Positive coefficient in genotypes e, b, a and d ...
ind2 = ((df1.b_e > 0) & (df1.b_b > 0) &
        (df1.b_a > 0) & (df1.b_d > 0))
# ... and negative coefficient in genotypes f (column `b`) and c.
ind3 = (df1.b < 0) & (df1.b_c < 0)

df1[ind & ind2 & ind3][['ext_gene', 'b_e', 'q_e']].sort_values('q_e')

Unnamed: 0,ext_gene,b_e,q_e
19626,R08E5.3,4.577666,0.0
31611,nit-1,3.369282,4.598481e-32


In [15]:
# Significant in genotypes e, b, a and d (shared threshold `q`).
sig = ((df1.q_e < q) & (df1.q_b < q) &
       (df1.q_a < q) & (df1.q_d < q))
# Positive coefficient in those same four genotypes.
up = ((df1.b_e > 0) & (df1.b_b > 0) &
      (df1.b_a > 0) & (df1.b_d > 0))
# Not significantly up in genotype c, nor in genotype f (columns `qval` / `b`).
nochange = (~((df1.q_c < q) & (df1.b_c > 0)) &
            ~((df1.qval < q) & (df1.b > 0)))
hypoxia_direct_targets = df1[sig & up & nochange]
print(hypoxia_direct_targets.shape[0])
# Write the file first: only the last expression of a cell is displayed, so the
# preview below must come last or it is silently discarded.
hypoxia_direct_targets.to_csv('../output/hypoxia_targets_candidates.csv', index=False)
hypoxia_direct_targets[['ext_gene', 'b_e', 'q_e']].sort_values('q_e').head()

133


In [16]:
# Enrichment of the candidate hypoxia targets against each annotation
# dictionary; results are shown for tissue and phenotype but not for GO.
ids = hypoxia_direct_targets.ens_gene.unique()
for dictionary, show_result in ((tissue_df, True), (phenotype_df, True), (go_df, False)):
    _ = tea.enrichment_analysis(ids, dictionary, show=show_result)

Executing script

                          Tissue  Expected  Observed  Enrichment Fold Change  \
33  coelomic system WBbt:0005749  5.910394        16                2.707095   

     P value   Q value  
33  0.000119  0.032602  
Executing script

                                         Tissue  Expected  Observed  \
46  oxygen response variant WBPhenotype:0000464  0.325218         4   

    Enrichment Fold Change  P value   Q value  
46               12.299444  0.00002  0.004965  


# Egl-9

In [19]:
# Significant in genotypes e, b and a, but not in d or c.
# The shared threshold `q` replaces the hard-coded 0.1.
ind = ((df1.q_e < q) & (df1.q_b < q) &
       (df1.q_a < q) & (df1.q_d > q) &
       (df1.q_c > q))
# Coefficients of e, b, a and d all point in the same direction
# (every pairwise product tested here is positive).
ind2 = ((df1.b_e*df1.b_b > 0) &
        (df1.b_e*df1.b_a > 0) &
        (df1.b_b*df1.b_a > 0)
        & (df1.b_b*df1.b_d > 0) & (df1.b_e*df1.b_d > 0))

egl_targets = df1[ind & ind2]
print(egl_targets.ens_gene.unique().shape[0])
egl_targets.to_csv('../output/egl_downstream.csv', index=False)
egl_targets[['ext_gene', 'b_b', 'q_b']].sort_values('q_b').head(5)

432


Unnamed: 0,ext_gene,b_b,q_b
3558,C18H9.6,-3.021398,1.269703e-72
17056,nas-33,3.10255,4.313509e-38
25104,W10G11.3,-2.330393,8.49166e-34
15295,cdo-1,2.171999,6.303173e-32
18213,nas-11,1.521469,7.037006000000001e-32


In [20]:
# Enrichment of the egl target set against each annotation dictionary;
# results are shown for tissue and phenotype but not for GO.
ids = egl_targets.ens_gene.unique()
for dictionary, show_result in ((tissue_df, True), (phenotype_df, True), (go_df, False)):
    _ = tea.enrichment_analysis(ids, dictionary, show=show_result)

Executing script

                                 Tissue   Expected  Observed  \
149  anal depressor muscle WBbt:0004292  11.604908        33   
76       intestinal muscle WBbt:0005796   5.002116        15   
103          hermaphrodite WBbt:0007849  56.273800        83   
56                   Psub1 WBbt:0006874   5.827465        15   

     Enrichment Fold Change       P value   Q value  
149                2.843624  3.565821e-08  0.000010  
76                 2.998731  4.965669e-05  0.006778  
103                1.474931  2.128106e-04  0.019366  
56                 2.574018  2.912939e-04  0.019881  
Executing script

                                                Tissue   Expected  Observed  \
50   pleiotropic defects severe early emb WBPhenoty...   5.541985        30   
204  pachytene progression during oogenesis variant...  12.503817        44   
146  germ cell compartment expansion variant WBPhen...  20.015267        56   
89                   rachis narrow WBPhenotype:0001941  1

# Vhl-1

In [21]:
# should be statistically altered in vhl-1 mutants
# (genotypes a and d), using the shared threshold `q`:
ind = (df1.q_a < q) & (df1.q_d < q)
# should NOT be altered in egl-9, hif-1 or rhy-1 KO
# (genotypes e, b, c and f, the latter via column `qval`)
ind2 = ((df1.q_e > q) & (df1.q_b > q) &
        (df1.q_c > q) & (df1.qval > q))

vhl_targets = df1[ind & ind2]
print(vhl_targets.shape[0])
vhl_targets[['ext_gene', 'b_d', 'q_d', 'b_b', 'b_e']].sort_values('q_d').head(26)

36


Unnamed: 0,ext_gene,b_d,q_d,b_b,b_e
8285,F08G12.1,-2.989972,7.173738999999999e-19,-0.395204,-0.271049
11227,F31C3.4,0.556422,1.653782e-08,-0.039171,0.069805
11226,F31C3.3,0.509236,5.936751e-07,0.108664,0.062222
19090,R03A10.5,-1.249179,1.642922e-05,0.329048,0.320421
7621,ttr-49,-0.800108,7.00502e-05,-0.224866,-0.397218
1291,ugt-51,-1.858265,0.0003809678,-0.849478,-0.429457
22569,cyp-29A2,-0.728152,0.00404075,-0.417813,-0.210286
11228,psf-2,0.592778,0.004516616,0.224087,0.094825
11606,F33H2.6,0.390757,0.006374993,-0.127979,-0.199501
31155,duxl-1,0.382592,0.006407984,0.136819,0.063909


In [23]:
# Enrichment of the vhl target set against each annotation dictionary;
# results are shown for tissue and phenotype but not for GO.
ids = vhl_targets.ens_gene.unique()
for dictionary, show_result in ((tissue_df, True), (phenotype_df, True), (go_df, False)):
    _ = tea.enrichment_analysis(ids, dictionary, show=show_result)

Executing script

Analysis returned no enriched tissues.
Executing script

Analysis returned no enriched tissues.


# Core Hypoxia Response

In [54]:
# select all the genes that are stat. sig. with the exception
# of the egl-9;hif-1 suppressor (column `qval` is deliberately not tested);
# the shared threshold `q` replaces the hard-coded 0.1
ind = ((df1.q_c < q) & (df1.q_e < q) &
       (df1.q_b < q) & (df1.q_a < q) &
       (df1.q_d < q))
y = df1[ind].copy()
y.shape

(55, 32)

In [48]:
# Coefficient columns whose sign is tested; 'ens_gene' is carried along as an
# identifier only.
beta_cols = ['b_e', 'b_b', 'b_d', 'b_c', 'b', 'b_a']
y_bs = df1[ind][['ens_gene'] + beta_cols]

# Compare only the numeric columns with 0: comparing the whole frame would also
# compare the string column 'ens_gene' with an integer, which raises a
# TypeError on current pandas. A row counts as "down" when every coefficient is
# negative (NaN never compares as negative) and its gene id is present.
is_down = (y_bs[beta_cols] < 0).all(axis=1) & y_bs['ens_gene'].notnull()
all_down = y_bs.index[is_down]
s = 'Number of genes that go down: {0}'
message = s.format(len(all_down))
print(message)

Number of genes that go down: 13


In [50]:
cols = ['ext_gene', 'ens_gene',
        'b_e', 'b_b', 'b_d', 'b_c', 'b_a',
        'q_e', 'q_b', 'q_d', 'q_c', 'q_a']

# Rows of y_bs whose coefficients are all negative. The sign test is limited to
# the numeric columns because comparing the string column 'ens_gene' with 0
# raises a TypeError on current pandas.
down_beta_cols = ['b_e', 'b_b', 'b_d', 'b_c', 'b', 'b_a']
down_rows = y_bs.index[(y_bs[down_beta_cols] < 0).all(axis=1)
                       & y_bs['ens_gene'].notnull()]

# Select by row index, as the matching "up" table does, so that only the rows
# that passed the filters are kept; matching on ens_gene could also pull in
# other rows of df1 that merely share a gene id with a selected row.
sel = df1.index.isin(down_rows)
all_down = df1[sel][cols].sort_values('q_a')

In [53]:
# Rows of y_bs whose coefficients are all positive. The sign test is limited to
# the numeric columns because comparing the string column 'ens_gene' with 0
# raises a TypeError on current pandas.
up_beta_cols = ['b_e', 'b_b', 'b_d', 'b_c', 'b', 'b_a']
up_rows = y_bs.index[(y_bs[up_beta_cols] > 0).all(axis=1)
                     & y_bs['ens_gene'].notnull()]
s = 'Number of genes that go up: {0}'
message = s.format(len(up_rows))
print(message)

cols = ['ext_gene', 'ens_gene',
        'b_e', 'b_b', 'b_d', 'b_c', 'b_a',
        'q_e', 'q_b', 'q_d', 'q_c', 'q_a']
# `all_up` is bound once, to the final table, rather than first to an index and
# then to a frame.
all_up = df1[df1.index.isin(up_rows)][cols].sort_values('q_a')

Number of genes that go up: 23


In [55]:
# Enrichment of the commonly up-regulated set against each annotation
# dictionary; results are shown for tissue and phenotype but not for GO.
# Gene ids are de-duplicated first, as in the other enrichment cells, so a gene
# that appears on several rows is only submitted once.
ids = all_up.ens_gene.unique()
_ = tea.enrichment_analysis(ids, tissue_df, show=True)
_ = tea.enrichment_analysis(ids, phenotype_df, show=True)
_ = tea.enrichment_analysis(ids, go_df, show=False)

Executing script

Analysis returned no enriched tissues.
Executing script

Analysis returned no enriched tissues.
