In [2]:
import synapseclient
import pandas as pd

syn = synapseclient.Synapse()
syn.login()

diff_exp_data = syn.get(entity='syn14237651')
gene_info = syn.get(entity='syn25953363')
target_list = syn.get('syn12540368')
eqtl = syn.get('syn12514912')

diff_exp_data = pd.read_csv(diff_exp_data.path, sep='\t')
gene_info = pd.read_feather(gene_info.path)
target_list = pd.read_csv(target_list.path)
eqtl = pd.read_csv(eqtl.path)


models_to_keep = ["Diagnosis AD-CONTROL ALL", "Diagnosis.AOD AD-CONTROL ALL", "Diagnosis.Sex AD-CONTROL FEMALE", "Diagnosis.Sex AD-CONTROL MALE"]


# interestingly, the list will hold references to the original objects instead of copying them
for df in [diff_exp_data, gene_info, target_list, eqtl]:
    df.columns = df.columns.str.replace("[#,@,&,*,^,?,(,),%,$,#,!,/]", "")
    df.columns = df.columns.str.replace("[' ', '-', '.']", "_")
    df.columns = map(str.lower, df.columns)
    
    for column in df:
        dt = df[column].dtype
        if dt == int or dt == float:
            df[column] = df[column].fillna(0)
        else:
            df[column] = df[column].fillna("")

    df = df.replace(["NA", "n/a", "N/A", "na", "n/A", "N/a", "Na", "nA"], "")
    
    
    print(df.shape)

Welcome, Matthew Fazza!



  df.columns = df.columns.str.replace("[#,@,&,*,^,?,(,),%,$,#,!,/]", "")

  df.columns = df.columns.str.replace("[' ', '-', '.']", "_")



(1330018, 18)
(56742, 9)
(693, 13)
(19396, 3)


In [3]:
# gene_info = gene_info[gene_info['hgnc_symbol'].notna()]
eqtl = eqtl[['ensembl_gene_id', 'haseqtl']]
gene_info = pd.merge(left=gene_info, right=eqtl, on='ensembl_gene_id', how='left')
print(gene_info.shape)

(56742, 10)


The next few blocks is analogous to the function get_rnaseq_diff_expr_data in the lib.R in the original repo:

In [4]:
diff_exp_data['tmp'] = diff_exp_data[['model', 'comparison', 'sex']].agg(' '.join, axis=1)
print(diff_exp_data.shape[0])
diff_exp_data = diff_exp_data[diff_exp_data['tmp'].isin(models_to_keep)]
print(diff_exp_data.shape[0])

1330018
460008


In [5]:
diff_exp_data['study'].replace(to_replace = {'MAYO': 'MayoRNAseq', 'MSSM': 'MSBB'}, inplace=True)
diff_exp_data['sex'].replace(to_replace={'ALL': 'males and females', 'FEMALE': 'females only', 'MALE': 'males only'}, inplace=True)
diff_exp_data['model'].replace(to_replace='\\.', value=' x ', regex=True)
diff_exp_data['model'].replace(to_replace = {'Diagnosis': 'AD Diagnosis'}, inplace=True)
diff_exp_data['logfc'] = diff_exp_data['logfc'].round(decimals=3)
diff_exp_data['fc'] = 2**diff_exp_data['logfc']
diff_exp_data['model'] = diff_exp_data['model'] + " (" + diff_exp_data['sex'] + ")"

[print(diff_exp_data[col].value_counts(dropna=False)) for col in ['study', 'sex', 'model', 'logfc', 'fc']]

print(diff_exp_data[['logfc', 'fc']].iloc[:1]) # quick check on the fc column


MSBB          261568
MayoRNAseq    136104
ROSMAP         62336
Name: study, dtype: int64
males and females    230004
females only         115002
males only           115002
Name: sex, dtype: int64
Diagnosis.AOD (males and females)    115002
Diagnosis.Sex (males only)           115002
Diagnosis.Sex (females only)         115002
AD Diagnosis (males and females)     115002
Name: model, dtype: int64
 0.000    39694
-0.001    24739
 0.001    22687
-0.002    10407
 0.002     9156
          ...  
 1.263        1
 1.658        1
 1.138        1
 1.229        1
-2.242        1
Name: logfc, Length: 2158, dtype: int64
1.000000    39694
0.999307    24739
1.000693    22687
0.998615    10407
1.001387     9156
            ...  
0.445346        1
0.576343        1
2.406606        1
0.436181        1
2.061937        1
Name: fc, Length: 2158, dtype: int64
        logfc        fc
136104  0.363  1.286097


The cells below are analogous to get_target_list in the lib.R:

In [6]:
adj_p_value_threshold = 1


print(diff_exp_data.shape)

adjusted_diff_exp_data = diff_exp_data.loc[((diff_exp_data['adj_p_val'] <= adj_p_value_threshold) 
                                            | (diff_exp_data['ensembl_gene_id'].isin(target_list['ensembl_gene_id']))) 
                                            & (diff_exp_data['ensembl_gene_id'].isin(gene_info['ensembl_gene_id']))
                                                ]

print(adjusted_diff_exp_data.shape)

adjusted_diff_exp_data = adjusted_diff_exp_data.drop_duplicates(['ensembl_gene_id'])
adjusted_diff_exp_data = adjusted_diff_exp_data[['ensembl_gene_id']]

print(adjusted_diff_exp_data.shape)

diff_exp_data = diff_exp_data[diff_exp_data['ensembl_gene_id'].isin(adjusted_diff_exp_data['ensembl_gene_id'])]
diff_exp_data = diff_exp_data[['ensembl_gene_id', 'logfc', 'fc', 'ci_l', 'ci_r',
                  'adj_p_val', 'tissue', 'study', 'model', 'hgnc_symbol']]
                            
print(diff_exp_data.shape)
diff_exp_data = pd.merge(left=diff_exp_data, right=gene_info, on='ensembl_gene_id', how='left')
print(diff_exp_data.shape)
diff_exp_data = diff_exp_data[diff_exp_data['hgnc_symbol'].notna()]
diff_exp_data = diff_exp_data[['ensembl_gene_id', 'hgnc_symbol', 'logfc', 'fc', 'ci_l', 'ci_r', 'adj_p_val', 'tissue',
       'study', 'model']]

(460008, 20)
(457408, 20)
(18381, 1)
(457408, 10)
(457436, 19)


In [7]:
diff_exp_data.shape

(457436, 10)

In [8]:
old_rna = syn.get('syn25721516')
old_rna = pd.read_json(old_rna.path, orient='records')
old_rna.shape

(458944, 10)