In [1]:
import synapseclient
import pandas as pd
from numpy import NaN

In [2]:
syn = synapseclient.Synapse()
syn.login(silent=True)

In [3]:
gene_info = 'syn25953363'
igap = 'syn12514826' 
eqtl = 'syn12514912'
brain_expression_change = 'syn11914808'
rna_change = 'syn14237651'
proteomics = 'syn18689335'

gene_info = syn.get(gene_info)
igap = syn.get(igap)
eqtl = syn.get(eqtl)
brain_expression_change = syn.get(brain_expression_change)
rna_change = syn.get(rna_change)
proteomics = syn.get(proteomics)

gene_info = pd.read_feather(gene_info.path)
igap = pd.read_csv(igap.path)
eqtl = pd.read_csv(eqtl.path)
brain_expression_change = pd.read_csv(brain_expression_change.path, sep='\t')
rna_change = pd.read_csv(rna_change.path, sep='\t')
proteomics = pd.read_csv(proteomics.path)

Prepare gene_info to remove duplicate ensembl_gene_ids (this should take 5 or 6 minutes):

In [4]:
print(gene_info.shape)
gene_info = gene_info[['ensembl_gene_id', 'symbol', '_version', 'alias', 'name', 'summary']]
gene_info = gene_info.groupby('ensembl_gene_id').apply(lambda x: x.nlargest(1, "_version")).reset_index(drop=True)
print(gene_info.shape)

gene_info = gene_info[['ensembl_gene_id', 'symbol',  'name', 'summary', 'alias']]

(60727, 9)
(56710, 6)


Create gene_metadata dataset by joining gene_info with the others.  First is igap:

In [5]:
gene_metadata = pd.merge(left = gene_info, right=igap, how='left', on='ensembl_gene_id')
gene_metadata['igap'] = gene_metadata.apply(lambda row: False if row['hgnc_symbol'] is NaN else True, axis=1)
gene_metadata['igap'].fillna(False, inplace=True)
gene_metadata = gene_metadata[['ensembl_gene_id', 'symbol', 'name', 'summary', 'alias', 'igap']]
gene_metadata['igap'].value_counts()

False    56685
True        25
Name: igap, dtype: int64

Join eqtl to the gene_metadata table:

In [6]:
gene_metadata = pd.merge(left = gene_metadata, right=eqtl, how='left', on='ensembl_gene_id')
gene_metadata = gene_metadata[['ensembl_gene_id', 'symbol', 'name', 'summary', 'alias', 'igap', 'hasEqtl']]
gene_metadata.rename(columns={'hasEqtl':'eqtl'}, inplace=True)
gene_metadata['eqtl'] = gene_metadata['eqtl'].replace({'TRUE': True}).fillna(False)
gene_metadata['eqtl'].value_counts()

False    38388
True     18322
Name: eqtl, dtype: int64

This is the current state of the table: ['ensembl_gene_id', 'igap', 'eqtl']
Next we add brain_expression_data to it.

In [7]:
ad_brain_change_threshold = 0.05

gene_metadata = pd.merge(left = gene_metadata, right=brain_expression_change, how='left', on='ensembl_gene_id')
gene_metadata['fdr.random'] = gene_metadata['fdr.random'].fillna(-1)
gene_metadata['brain_change_studied'] = gene_metadata.apply(lambda row: False if row['fdr.random'] == -1 else True, axis=1)
gene_metadata['ad_brain_change'] = gene_metadata.apply(lambda row: True if row['fdr.random'] <= ad_brain_change_threshold else False, axis=1)
gene_metadata = gene_metadata[['ensembl_gene_id', 'symbol', 'name', 'summary', 'alias', 'igap', 'eqtl', 'ad_brain_change', 'brain_change_studied']]

gene_metadata['ad_brain_change'].value_counts()

True     46113
False    10597
Name: ad_brain_change, dtype: int64

Current datafreame is ['ensembl_gene_id', 'igap', 'eqtl', 'ad_brain_change'].
Lastly we need to add rna change in the brain:

In [8]:
adjusted_p_value_threshold = 0.05

rna_change = rna_change[['ensembl_gene_id', 'adj.P.Val']]
rna_change = rna_change.groupby('ensembl_gene_id')['adj.P.Val'].agg('min').reset_index()

gene_metadata = pd.merge(left = gene_metadata, right=rna_change, how='left', on='ensembl_gene_id')
gene_metadata['adj.P.Val'] = gene_metadata['adj.P.Val'].fillna(-1)
gene_metadata['rna_brain_change_studied'] = gene_metadata.apply(lambda row: False if row['adj.P.Val'] == -1 else True, axis=1)
gene_metadata['rna_in_ad_brain_change'] = gene_metadata.apply(lambda row: True if row['adj.P.Val'] <= adjusted_p_value_threshold else False, axis=1)

gene_metadata = gene_metadata[['ensembl_gene_id', 'name', 'summary', 'alias', 'igap', 'symbol', 'eqtl', 'ad_brain_change', 'brain_change_studied', 'rna_in_ad_brain_change', 'rna_brain_change_studied']]

gene_metadata['rna_in_ad_brain_change'].value_counts()

True     55248
False     1462
Name: rna_in_ad_brain_change, dtype: int64

In [9]:
protein_level_threshold = 0.05

proteomics = proteomics.groupby('ENSG')['Cor_PVal'].agg('min').reset_index()

gene_metadata = pd.merge(left = gene_metadata, right=proteomics, how='left', left_on='ensembl_gene_id', right_on='ENSG')
gene_metadata['Cor_PVal'] = gene_metadata['Cor_PVal'].fillna(-1)
gene_metadata['protein_brain_change_studied'] = gene_metadata.apply(lambda row: False if row['Cor_PVal'] == -1 else True, axis=1)
gene_metadata['protein_in_ad_brain_change'] = gene_metadata.apply(lambda row: True if row['Cor_PVal'] <= protein_level_threshold else False, axis=1)

gene_metadata = gene_metadata[['ensembl_gene_id', 'name', 'summary', 'symbol', 'alias', 'igap', 'eqtl', 'ad_brain_change', 'brain_change_studied', 'rna_in_ad_brain_change', 'rna_brain_change_studied', 'protein_in_ad_brain_change', 'protein_brain_change_studied']]
gene_metadata.rename(columns={'symbol': 'hgnc_symbol'}, inplace=True)

In [10]:
gene_metadata.to_json('../output/gene_metadata.json', orient='records')