In [93]:
import synapseclient
import pandas as pd
from numpy import NaN

In [94]:
# Create the Synapse client used by the cells below to fetch input files.
syn = synapseclient.Synapse()
# No credentials are passed here, so login must find them outside this notebook.
# NOTE(review): silent=True presumably suppresses the login banner - confirm
# against the synapseclient documentation.
syn.login(silent=True)

In [95]:
# Synapse IDs of the input files, keyed by the name of the DataFrame each one
# is loaded into further down.
synapse_ids = {
    'gene_info': 'syn25953363',
    'igap': 'syn12514826',
    'eqtl': 'syn12514912',
    'brain_expression_change': 'syn11914808',
    'rna_change': 'syn14237651',
}

# Fetch every entity before parsing any of them; each entity exposes the
# location of its file through `.path`.
entities = {name: syn.get(synapse_id) for name, synapse_id in synapse_ids.items()}

# gene_info is a feather file; igap and eqtl are comma-separated;
# the two *_change tables are tab-separated.
gene_info = pd.read_feather(entities['gene_info'].path)
igap = pd.read_csv(entities['igap'].path)
eqtl = pd.read_csv(entities['eqtl'].path)
brain_expression_change = pd.read_csv(entities['brain_expression_change'].path, sep='\t')
rna_change = pd.read_csv(entities['rna_change'].path, sep='\t')

Prepare gene_info by removing duplicate ensembl_gene_ids, keeping the row with the largest _version for each gene (this should take 5 or 6 minutes):

In [96]:
print(gene_info.shape)

# Keep one row per ensembl_gene_id: the one with the largest `_version`.
# A sort followed by drop_duplicates replaces a per-gene Python callback
# (groupby().apply() with nlargest), which is far slower on ~57k groups and
# relies on apply() passing the grouping column to the callback (deprecated
# in recent pandas).
gene_info = (
    gene_info[['ensembl_gene_id', 'symbol', '_version']]
    # groupby() ignores rows whose key is missing, so drop them explicitly.
    .dropna(subset=['ensembl_gene_id'])
    # Ascending gene id gives the same output order as the groupby did.
    # Descending version puts the row to keep first; the multi-column sort is
    # stable, so ties keep their original order. Rows with a missing
    # `_version` sort last and are kept only when a gene has no versioned row.
    .sort_values(['ensembl_gene_id', '_version'], ascending=[True, False])
    .drop_duplicates(subset='ensembl_gene_id', keep='first')
    .reset_index(drop=True)
)
print(gene_info.shape)

# Only the de-duplicated ids are needed as the left side of the joins below.
gene_info = gene_info['ensembl_gene_id']

(56742, 9)
(56740, 3)


Create the gene_metadata dataset by left-joining gene_info with each of the other tables. The first is igap:

In [97]:
# Left-join so every gene id in gene_info is kept; genes without a match in
# igap get missing values in the igap columns.
gene_metadata = (
    pd.merge(left=gene_info, right=igap, how='left', on='ensembl_gene_id')
    # A gene is flagged when the join produced an hgnc_symbol for it.
    # notna() is vectorized and detects any missing value, whereas an
    # `is NaN` identity test only matches the one numpy.nan object.
    .assign(igap=lambda merged: merged['hgnc_symbol'].notna())
    .loc[:, ['ensembl_gene_id', 'igap']]
)
gene_metadata['igap'].value_counts()

False    56715
True        25
Name: igap, dtype: int64

Join eqtl to the gene_metadata table:

In [98]:
# Left-join so genes without an eqtl record are kept (hasEqtl is missing for them).
gene_metadata = (
    pd.merge(left=gene_metadata, right=eqtl, how='left', on='ensembl_gene_id')
    # Build a real boolean column: True only where hasEqtl is the boolean True
    # or the literal string 'TRUE'; missing or any other value becomes False.
    # Deriving the column inside the chain avoids assigning into a
    # column-sliced frame and the object-dtype result of replace().fillna().
    .assign(eqtl=lambda merged: merged['hasEqtl'].isin([True, 'TRUE']))
    .loc[:, ['ensembl_gene_id', 'igap', 'eqtl']]
)
gene_metadata['eqtl'].value_counts()

False    38412
True     18328
Name: eqtl, dtype: int64

The table currently has these columns: ['ensembl_gene_id', 'igap', 'eqtl']
Next we add brain_expression_change to it.

In [99]:
# A gene is flagged when its 'fdr.random' value is at or below this threshold.
ad_brain_change_threshold = 0.05

# Left-join so genes without a brain_expression_change record are kept.
gene_metadata = (
    pd.merge(left=gene_metadata, right=brain_expression_change, how='left', on='ensembl_gene_id')
    .loc[:, ['ensembl_gene_id', 'igap', 'eqtl', 'fdr.random']]
    # Vectorized comparison instead of a row-wise apply. A missing fdr.random
    # compares as False, so genes without a record are not flagged.
    .assign(ad_brain_change=lambda merged: merged['fdr.random'].le(ad_brain_change_threshold))
)
gene_metadata['ad_brain_change'].value_counts()

False    52970
True      3770
Name: ad_brain_change, dtype: int64