In [1]:
import pandas as pd
import json
import re


In [2]:
gwas_df=pd.read_csv('gwas_catalog_v1.0.2-associations_e105_r2022-02-21.tsv', sep='\t', low_memory=False)

In [3]:
bipolar_df = gwas_df[gwas_df['MAPPED_TRAIT'].str.contains("Bipolar|lithium", case = False)==True]
bipolar_df.shape

(1610, 38)

In [4]:
flist = ['original_dataset_NR_vs_LR_results.csv', 
         'joined_dataset_NR_vs_LR_results.csv', 
         'original_dataset_CTRL_vs_BD_results.csv'] 

padj_th = 0.1
log2FoldChange_th =0

deseq_dfs=[]

for fname in flist:

    df=pd.read_csv(fname, index_col=0)
    df.dropna(axis=0, inplace=True)
    deseq_dfs.append(df)
    

In [5]:
common_genes_list =[]

for key, study_df in bipolar_df.groupby('STUDY ACCESSION'): 
    reported_genes = set(study_df['REPORTED GENE(S)'].to_list())

    for f_idx, df in enumerate(deseq_dfs): 
        gene_name_list = set(df[(abs(df['log2FoldChange'])> log2FoldChange_th) & (df['padj'] < padj_th)].index)
        common_genes = reported_genes.intersection(gene_name_list)
        
        if common_genes: 
            common_genes_list.append({'STUDY ACCESSION': key, 
                                      'link' : study_df['LINK'].iloc[0],
                                      'title': study_df['STUDY'].iloc[0],
                                      'common genes': common_genes, 
                                      'dataset' : f_idx, 
                                      'total reported genes': len(reported_genes)
                                    })


In [10]:
pd.DataFrame(common_genes_list)

Unnamed: 0,STUDY ACCESSION,link,title,common genes,dataset,total reported genes
0,GCST005081,www.ncbi.nlm.nih.gov/pubmed/29121268,Association of Polygenic Score for Schizophren...,{ADCY1},0,8
1,GCST008103,www.ncbi.nlm.nih.gov/pubmed/31043756,Genome-wide association study identifies 30 lo...,"{NPTX1, RIMS1, BCL11B}",0,146
2,GCST009600,www.ncbi.nlm.nih.gov/pubmed/31835028,"Genomic Relationships, Novel Loci, and Pleiotr...","{RIMS1, ZSWIM6, BCL11B}",0,111
3,GCST011102,www.ncbi.nlm.nih.gov/pubmed/33263727,Novel Risk Loci Associated With Genetic Risk f...,{RIMS1},0,23
4,GCST012465,www.ncbi.nlm.nih.gov/pubmed/34002096,"Genome-wide association study of more than 40,...",{BCL11B},0,63


In [11]:
bipolar_df.shape

(1610, 38)

In [8]:

bipolar_df.drop_duplicates(subset='LINK').shape

(55, 38)

In [26]:
bipolar_df.drop_duplicates(subset='LINK').loc[:,['STUDY ACCESSION', 'LINK', 'STUDY', 'REPORTED GENE(S)', 'MAPPED_GENE']].to_csv('Bipolar_lithium_GWAS results.csv', index=None)