In [1]:
import synapseclient
import pandas as pd
import numpy as np

# Open a Synapse session; silent=True is passed to login().
syn = synapseclient.Synapse()
syn.login(silent=True)

# Synapse ID of the rna dataset, kept separate from the fetched entity.
RNA_SYN_ID = 'syn12177499'
rna = syn.get(RNA_SYN_ID)

# The entity's `path` is read as records-oriented JSON into a DataFrame.
rna_df = pd.read_json(rna.path, orient='records')
rna_df.head(5)

Unnamed: 0,ensembl_gene_id,hgnc_symbol,logfc,fc,ci_l,ci_r,adj_p_val,tissue,study,model
0,ENSG00000228521,AC099552.3,1.646715,3.131199,1.219379,2.074052,2.261704e-10,CBE,MayoRNAseq,AD Diagnosis (males and females)
1,ENSG00000163221,S100A12,1.584079,2.998163,0.890001,2.278157,0.0001323673,CBE,MayoRNAseq,AD Diagnosis (males and females)
2,ENSG00000273802,HIST1H2BG,1.373508,2.590998,0.970485,1.776531,1.59062e-08,CBE,MayoRNAseq,AD Diagnosis (males and females)
3,ENSG00000124107,SLPI,1.369576,2.583946,0.564874,2.174278,0.004777964,CBE,MayoRNAseq,AD Diagnosis (males and females)
4,ENSG00000168329,CX3CR1,-1.362178,0.388995,-1.87164,-0.852716,7.491453e-06,CBE,MayoRNAseq,AD Diagnosis (males and females)


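We first select the columns of interest (`tissue`, `model` and `logfc`) and compute summary statistics of `logfc` for each tissue/model group.  We then rename the quartile columns (`25%`, `50%`, `75%`) to `first_quartile`, `median` and `third_quartile` so that they can be used to calculate the interquartile range: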

In [2]:
# Summarise logfc per (tissue, model) group, keep only the statistics that are
# used afterwards, and give the percentile columns descriptive names.
quartile_names = {'25%': 'first_quartile', '50%': 'median', '75%': 'third_quartile'}
kept_columns = ['model', 'tissue', 'min', 'max', '25%', '50%', '75%']

logfc_stats = (
    rna_df[['tissue', 'model', 'logfc']]
    .groupby(['tissue', 'model'])['logfc']
    .describe()
    .reset_index()
)
rna_df = logfc_stats[kept_columns].rename(columns=quartile_names)

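We compute the interquartile range (IQR) and use it to replace `min` and `max` with the bounds `first_quartile - 1.5 * IQR` and `third_quartile + 1.5 * IQR`, then round all values to four decimal places.  Since we don't need the interquartile range for the portal, we drop that column.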

In [3]:
# Interquartile range, held as a local Series so that no temporary column
# has to be added to the frame and dropped again.
iqr = rna_df['third_quartile'] - rna_df['first_quartile']
whisker_reach = 1.5 * iqr

# Overwrite min/max with bounds lying 1.5 * IQR beyond the quartiles.
rna_df['min'] = rna_df['first_quartile'] - whisker_reach
rna_df['max'] = rna_df['third_quartile'] + whisker_reach

# Round every reported statistic to four decimal places.
rounded_columns = ['min', 'max', 'median', 'first_quartile', 'third_quartile']
rna_df[rounded_columns] = rna_df[rounded_columns].round(4)