In [None]:
import synapseclient
import pandas as pd

# Authenticate against Synapse using whatever credentials the client finds.
syn = synapseclient.Synapse()
syn.login()

# Download each input; the entity objects expose the local file via `.path`.
diff_exp_entity = syn.get('syn14237651')
gene_info_entity = syn.get('syn25953363')
target_list_entity = syn.get('syn12540368')
eqtl_entity = syn.get('syn12514912')

# Parse the downloaded files: the differential expression table is
# tab-separated, gene_info is a feather file, the other two are plain CSV.
diff_exp_data = pd.read_csv(diff_exp_entity.path, sep='\t')
gene_info = pd.read_feather(gene_info_entity.path)
target_list = pd.read_csv(target_list_entity.path)
eqtl = pd.read_csv(eqtl_entity.path)


# "<model> <comparison> <sex>" combinations kept when the differential
# expression table is filtered in a later cell.
models_to_keep = [
    "Diagnosis AD-CONTROL ALL",
    "Diagnosis.AOD AD-CONTROL ALL",
    "Diagnosis.Sex AD-CONTROL FEMALE",
    "Diagnosis.Sex AD-CONTROL MALE",
]

# Spellings of "not available" that are blanked out in every frame.
na_tokens = ["NA", "n/a", "N/A", "na", "n/A", "N/a", "Na", "nA"]


def standardize_frame(frame):
    """Return a cleaned copy of ``frame``.

    * Column names: special characters are stripped, spaces / hyphens / dots
      become underscores, and the result is lower-cased.
    * Missing values: numeric columns are filled with 0, all others with "".
    * Remaining textual NA spellings (``na_tokens``) are replaced with "".

    The input frame is left untouched.
    """
    cleaned = frame.copy()

    # regex=True must be explicit: pandas 2.0 changed the default to a literal
    # match. The character classes list the characters directly; the hyphen
    # is escaped so it is matched literally instead of forming a range.
    cleaned.columns = (
        cleaned.columns
        .str.replace(r"[#@&*^?()%$!/]", "", regex=True)
        .str.replace(r"[ \-.]", "_", regex=True)
        .str.lower()
    )

    for column in cleaned:
        # is_numeric_dtype covers every int/float width, not only int64/float64.
        fill_value = 0 if pd.api.types.is_numeric_dtype(cleaned[column]) else ""
        cleaned[column] = cleaned[column].fillna(fill_value)

    # replace() returns a new frame, so its result has to be the return value.
    return cleaned.replace(na_tokens, "")


# Rebind the notebook-level names to the cleaned frames.
diff_exp_data, gene_info, target_list, eqtl = (
    standardize_frame(frame)
    for frame in (diff_exp_data, gene_info, target_list, eqtl)
)

print(diff_exp_data.columns, gene_info.columns)

In [None]:
# Disabled filter, kept for reference:
# gene_info = gene_info[gene_info['hgnc_symbol'].notna()]

# Left join: every gene_info row is kept and gains the eqtl columns for its
# ensembl_gene_id where a match exists.
gene_info = gene_info.merge(eqtl, how='left', on='ensembl_gene_id')

The next few cells are analogous to the function `get_rnaseq_diff_expr_data` in `lib.R` in the original repo:

In [None]:
# Build the "<model> <comparison> <sex>" key that models_to_keep is written in.
# Vectorised string concatenation replaces the row-wise ' '.join.
diff_exp_data = diff_exp_data.assign(
    tmp=diff_exp_data['model'] + ' ' + diff_exp_data['comparison'] + ' ' + diff_exp_data['sex']
)
print(len(diff_exp_data))  # row count before filtering

# .copy() makes the filtered result an independent frame; later cells assign
# columns on it, which would otherwise trigger SettingWithCopyWarning.
diff_exp_data = diff_exp_data.loc[diff_exp_data['tmp'].isin(models_to_keep)].copy()
print(len(diff_exp_data))  # row count after filtering

In [None]:
# Display labels for the raw study and sex codes.
study_labels = {'MAYO': 'MayoRNAseq', 'MSSM': 'MSBB'}
sex_labels = {'ALL': 'males and females', 'FEMALE': 'females only', 'MALE': 'males only'}

sex_display = diff_exp_data['sex'].replace(sex_labels)

# "Diagnosis.Sex" -> "AD Diagnosis x Sex". The dot replacement has to be kept
# (its result was previously discarded), and the prefix is applied to every
# model that starts with "Diagnosis", not only to the exact value "Diagnosis".
# NOTE(review): confirm the prefixing of interaction models against lib.R.
model_display = (
    diff_exp_data['model']
    .str.replace('.', ' x ', regex=False)
    .str.replace(r'^Diagnosis', 'AD Diagnosis', regex=True)
)

# fc is derived from the rounded logfc, as in the original cell.
logfc_rounded = diff_exp_data['logfc'].round(decimals=3)

# assign() builds a new frame, avoiding chained in-place replacement on a
# column selection (deprecated, and a no-op under pandas Copy-on-Write).
diff_exp_data = diff_exp_data.assign(
    study=diff_exp_data['study'].replace(study_labels),
    sex=sex_display,
    model=model_display + " (" + sex_display + ")",
    logfc=logfc_rounded,
    fc=2 ** logfc_rounded,
)

for col in ['study', 'sex', 'model', 'logfc', 'fc']:
    print(diff_exp_data[col].value_counts(dropna=False))

print(diff_exp_data[['logfc', 'fc']].iloc[:1]) # quick check on the fc column


The cells below are analogous to `get_target_list` in `lib.R`:

In [None]:
# Rows at or below this adjusted p-value are kept.
# NOTE(review): with a threshold of 1 this presumably keeps every row — confirm.
adj_p_value_threshold = 1

gene_ids = diff_exp_data['ensembl_gene_id']
below_threshold = diff_exp_data['adj_p_val'] <= adj_p_value_threshold
in_target_list = gene_ids.isin(target_list['ensembl_gene_id'])
in_gene_info = gene_ids.isin(gene_info['ensembl_gene_id'])

# Unique gene ids that (pass the threshold OR are on the target list) AND are
# present in gene_info.
adjusted_diff_exp_data = (
    diff_exp_data
    .loc[(below_threshold | in_target_list) & in_gene_info, ['ensembl_gene_id']]
    .drop_duplicates()
)

# Restrict to those genes, keep only the measurement columns, then bring in
# the gene_info columns with a left join on the gene id.
measurement_columns = ['ensembl_gene_id', 'logfc', 'fc', 'ci_l', 'ci_r',
                       'adj_p_val', 'tissue', 'study', 'model']
is_selected_gene = gene_ids.isin(adjusted_diff_exp_data['ensembl_gene_id'])
diff_exp_data = (
    diff_exp_data
    .loc[is_selected_gene, measurement_columns]
    .merge(gene_info, how='left', on='ensembl_gene_id')
)

# Drop rows without a gene symbol and settle on the final column order.
# NOTE(review): the loading cell fills missing text values with "", so
# notna() may not remove genes whose symbol is blank — confirm intent.
final_columns = ['ensembl_gene_id', 'hgnc_symbol', 'logfc', 'fc', 'ci_l', 'ci_r',
                 'adj_p_val', 'tissue', 'study', 'model']
has_symbol = diff_exp_data['hgnc_symbol'].notna()
diff_exp_data = diff_exp_data.loc[has_symbol, final_columns]

In [None]:
# (rows, columns) of the processed differential expression table.
diff_exp_data.shape

In [None]:
# Download the reference JSON (presumably an earlier version of this output —
# confirm) and parse it as a list of records.
old_rna_entity = syn.get('syn25721516')
old_rna = pd.read_json(old_rna_entity.path, orient='records')
old_rna.shape

In [None]:
# Rows of gene_info whose `symbol` and `hgnc_symbol` columns disagree.
symbols_differ = gene_info['symbol'].ne(gene_info['hgnc_symbol'])
weird = gene_info.loc[symbols_differ]

# Number of distinct genes among the mismatching rows.
len(weird.groupby('ensembl_gene_id'))

In [None]:
# Side-by-side comparison of the reference table and the freshly built one:
# first both shapes, then the per-gene row counts of each.
frames_to_compare = (old_rna, diff_exp_data)

for frame in frames_to_compare:
    print(frame.shape)

for frame in frames_to_compare:
    print(frame['ensembl_gene_id'].value_counts())

# s_old = set(old_rna['ensembl_gene_id'])
# s_new = set(diff_exp_data['ensembl_gene_id'])
# print(s_old.symmetric_difference(s_new))