In [1]:
## Import libraries

import pandas as pd
import numpy as np

## Notebook display settings: show every row of a dataframe and never truncate cell text
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [2]:
def _extract_attribute(attributes, key):
    """Pull the quoted value of one attribute out of each GTF attribute string.

    Parameters
    ----------
    attributes : pandas.Series
        Strings such as 'gene_id "ENSG..."; gene_name "ABC";'.
    key : str
        Attribute name made of plain word characters (it is inserted into a
        regular expression unescaped).

    Returns
    -------
    pandas.Series
        The text between the double quotes following `key`, or NaN for rows
        that do not carry that attribute.
    """
    # (?<!\w) prevents `key` from matching as the tail of a longer attribute name.
    pattern = r'(?<!\w)' + key + r' "([^"]*)"'
    return attributes.str.extract(pattern, expand=False)


def parse_df_columns(df, is_ref=True, is_transcript=False, is_exon=False):
    """Parse the aggregate attribute column of a gtf/gff into useful columns.

    The dataframe is modified in place (and also returned): new columns are
    appended and the "other", "dot_1" and "dot_2" columns are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        "Raw" gtf/gff table; must contain the columns "other" (attribute
        string), "dot_1" and "dot_2".
    is_ref : bool, default True
        If True, add "gene_id", "gene_name" and "gene_biotype" (plus the
        transcript/exon columns requested below). If False, add "gene_id",
        "transcript_id" and "exon_number"; the two flags below are ignored.
    is_transcript : bool, default False
        With is_ref=True, also add "transcript_id" and "transcript_biotype".
    is_exon : bool, default False
        With is_ref=True, also add "transcript_id", "transcript_biotype" and
        "exon_number".

    Returns
    -------
    pandas.DataFrame
        The same dataframe object. Attributes absent from a row are NaN.
    """
    attributes = df["other"]

    ## Get gene ids
    df["gene_id"] = _extract_attribute(attributes, "gene_id")

    if is_ref:

        ## Get gene names
        df["gene_name"] = _extract_attribute(attributes, "gene_name")

        ## Get gene biotype
        df["gene_biotype"] = _extract_attribute(attributes, "gene_biotype")

        ## Transcript-level columns are shared by transcript and exon records
        if is_transcript or is_exon:
            df["transcript_id"] = _extract_attribute(attributes, "transcript_id")
            df["transcript_biotype"] = _extract_attribute(attributes, "transcript_biotype")

        if is_exon:
            df["exon_number"] = _extract_attribute(attributes, "exon_number")

    else:

        ## Get transcript ids
        df["transcript_id"] = _extract_attribute(attributes, "transcript_id")

        ## Get exon number
        df["exon_number"] = _extract_attribute(attributes, "exon_number")

    ## Drop "other" column and the two placeholder columns
    df.drop(columns=["other", "dot_1", "dot_2"], inplace=True)

    ## Normalise every missing value to NaN (np.NaN was removed in NumPy 2.0)
    for col in df.columns:
        missing = df[col].isnull()
        if missing.any():
            df.loc[missing, col] = np.nan

    return df

In [3]:
## Load the original reference annotation (tab-separated GTF without a header row).
## Text from a "#" to the end of a line is ignored; the last column ("other")
## holds the raw attribute string.
original_ref = pd.read_csv(
    "../../references/Homo_sapiens.GRCh38.107_ERCC.gtf",
    sep="\t",
    header=None,
    names=["chr", "source", "type", "start", "end", "dot_1", "strand", "dot_2", "other"],
    comment="#",
    low_memory=False,
)

In [4]:
## Keep only the "gene" records of the reference, then parse their attributes
## (a copy is passed because parse_df_columns modifies its argument in place)
original_ref_gene = original_ref[original_ref["type"] == "gene"].copy()
orig_ref_gene = parse_df_columns(original_ref_gene.copy(), is_ref=True)

In [5]:
## Drop genes without name
## Only gene_name is checked, so a named gene is kept even if another column is empty
orig_ref_gene.dropna(subset=["gene_name"], inplace=True)

In [6]:
## Parkinson's disease (PD) gene list
## Curated from the following articles:
## https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6201690/
## https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8048219/

dict_parkinsons = dict(gene_name=[
    'LRRK2', 'PARK2', 'PRNP', 'PARK7', 'PINK1', 'SNCA', 'MAPT', 'GBA', 'ACMSD', 'ASXL3', 'BCKDK', 'BRIP1', 'BST1',
    'C5orf24', 'CAB39L', 'CCDC62', 'CD19', 'CHRNB1', 'CLCN3', 'CRLS1', 'DDRGK1', 'DGKQ', 'DNAH17', 'DYRK1A',
    'FAM171A2', 'FAM47E', 'FAM49B', 'FBRSL1', 'FCGR2A', 'FGF20', 'FYN', 'GAK', 'GBA', 'GBF1', 'GPNMB', 'HIP1', 'HLA-DQB1',
    'HLA-DRA', 'HLA-DRB5', 'INPP5F', 'KCNIP3', 'KCNS3', 'KPNA1', 'LAMP', 'LCORL', 'LINC00693', 'MBNL2', 'MCCC1/3',
    'MED12L', 'MEX3C', 'MIPOL1', 'NOD2', 'NUCKS', 'PAM', 'RAB29', 'RAB7L1', 'RAI1', 'RIMS1', 'RIT2', 'RNF141', 'RPS12', 'RPS6KL1',
    'SCAF11', 'SCARB2', 'SIPA1L2', 'SPTSSB', 'SREBF1', 'STBD1', 'STK39', 'STX1B', 'SYT11', 'TMEM163', 'TMEM175',
    'TRIM40', 'UBAP2', 'UBTF', 'VAMP4', 'VPS13C',
])

## One row per listed gene name, each tagged with the disease label "PD"
df_parkinsons = pd.DataFrame(dict_parkinsons).assign(disease="PD")

In [7]:
## Schizophrenia gene list
## Curated from the following article:
## https://www.nature.com/articles/s41586-022-04556-w

dict_schizophrenia = dict(gene_name=[
    'TRIO', 'SP4', 'CUL1', 'XPO7', 'RB1CC1', 'SETD1A', 'HERC1',
    'GRIN2A', 'CACNA1G', 'GRIA3',
])

## One row per listed gene name, each tagged with the disease label "Schizophrenia"
df_schizophrenia = pd.DataFrame(dict_schizophrenia).assign(disease="Schizophrenia")

In [8]:
## Mood disorders gene list
## Curated from the following articles:
## https://www.nature.com/articles/s41588-022-01034-x (Bipolar Disorder)
## https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6065213/ (Major Depressive Disorder)
## Note: genes from both sources receive the single label "MDD"

dict_mood = dict(gene_name=[
    'DRD4', 'HTR1A', 'MAOA', 'PCLO', 'SLC6A3', 'SLC6A4', 'TPH2', "ACE", "APOE",
    "MTHFR", "CHST11", "PTPRR", "ADCY9", "ITPR1", "DNAJB2", "EHD3", "FREM3",
    "GNB3", "PHACTR3", "HS6ST3", "KLHL29", "LHFPL2", "SLC25A21", "UGT2A1", "VGLL4",
])

## One row per listed gene name, each tagged with the disease label "MDD"
df_mood = pd.DataFrame(dict_mood).assign(disease="MDD")

In [9]:
## LATE (Limbic Predominant TDP-43 Encephalopathy) gene list
## Curated from the following article:
## https://pubmed.ncbi.nlm.nih.gov/31039256/

dict_late = dict(gene_name=['GRN', 'TMEM106B', 'ABCC9', 'KCNMB2', 'APOE'])

## One row per listed gene name, each tagged with the disease label "LATE"
df_late = pd.DataFrame(dict_late).assign(disease="LATE")

In [10]:
## ALS & FTD gene list
## Curated from the following article:
## https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8048219/

dict_als_ftd = dict(gene_name=[
    'GRN', 'MAPT', 'FUS', 'SOD1', 'TARDBP', 'C9orf72', 'TBK1', 'VCP', 'BTNL2', 'C4orf27', 'CTSC',
    'DPP6', 'HLA-DRA', 'HLA-DRB5', 'HLA-DQA2', 'IMMP2L', 'IRF2', 'MIR548AP', 'OLFM1', 'RAB38', 'RERG',
    'TMEM106B', 'UNC13A', 'C21orf2', 'DPP6', 'FGGY', 'ITPR2', 'KIF5A', 'MOBP', 'SARM', 'SCFD1', 'UNC13A',
])

## One row per listed gene name, each tagged with the disease label "ALS/FTD"
df_als_ftd = pd.DataFrame(dict_als_ftd).assign(disease="ALS/FTD")

In [11]:
## Autism (ASD) gene list
## Curated from the following article:
## https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7250485/

dict_autism = dict(gene_name=[
    'ASH1L', 'POGZ', 'SCN2A', 'FOXP1', 'SLC6A1', 'CTNNB1', 'ANK2', 'ARID1B', 'SYNGAP1',
    'GIGYF1', 'PTEN', 'DEAF1', 'KMT5B', 'GRIN2B', 'MED13L', 'CHD8', 'CHD2', 'KDM6B', 'ANKRD11',
    'TLK2', 'DSCAM', 'SHANK3', 'DYRK1A', 'ADNP',
])

## One row per listed gene name, each tagged with the disease label "ASD"
df_autism = pd.DataFrame(dict_autism).assign(disease="ASD")

In [12]:
## Create PTSD dictionary (disabled: the code below is commented out, so PTSD genes are not included in df_brain_disease)
## List of curated genes made from the following articles:
## https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8917986/#SD1
## https://www.nature.com/articles/s41588-020-00767-x


#dict_ptsd = {"gene_name": ['DCAF5', 'EXD2', 'FAM120A', 'FOXP2', 'GALNT16', 'PHF2', 'METTL15', 'AUTS2',
#                           'MAD1L1', 'CNTN6', 'BMP2', 'PACRG', 'PARK2']}

#df_ptsd = pd.DataFrame(data=dict_ptsd)

#df_ptsd["disease"] = "PTSD"

In [13]:
## Substance use disorders (SUD) gene list
## Curated from the following article:
## https://www.nature.com/articles/s44220-023-00034-y#MOESM1

dict_sud = dict(gene_name=[
    'PDE4B', 'GTF3C2', 'IFT172', 'GCKR', 'C2orf16', 'PLCL2', 'PRKAR2A', 'SLC25A20', 'ARIH2', 'P4HTM',
    'WDR6', 'QRICH1', 'QARS', 'USP19', 'C3orf84', 'CCDC36', 'RP11-3B7.1', 'USP4', 'RHOA', 'NICN1',
    'DAG1', 'ADD1', 'NOP14', 'BANK1', 'PPP6C', 'SOX6', 'BDNF', 'MTCH2', 'ZDHHC5', 'TMX2', 'TMX2-CTNND1',
    'C11orf31', 'RP11-691N7.6', 'CTNND1', 'ANKK1', 'DRD2', 'HS6ST3', 'ARID4A', 'PPP1R13B', 'SEMA6D',
    'FTO', 'C20orf112',
])

## One row per listed gene name, each tagged with the disease label "SUD"
df_sud = pd.DataFrame(dict_sud).assign(disease="SUD")

In [15]:
## Stack the per-disease tables into one dataframe with all brain diseases
## (each table keeps its own row index, so index labels repeat)
df_brain_disease = pd.concat(
    [
        df_parkinsons,
        df_schizophrenia,
        df_mood,
        df_late,
        df_als_ftd,
        df_autism,
        df_sud,
    ]
)

In [16]:
## Only keep genes with a gene name that matches the reference genome:
## the inner join attaches the reference gene_id and discards unmatched names

df_brain_disease = pd.merge(
    df_brain_disease,
    orig_ref_gene[["gene_name", "gene_id"]],
    how="inner",
    on="gene_name",
).drop_duplicates()

In [17]:
## Load medically relevant genes (tab-separated table with a header row)
med_relevant_genes = pd.read_csv(
    "../../references/medically_relevant_genes.tsv",
    delimiter="\t",
)

In [18]:
## Load AD genes (tab-separated table with a header row)
ad_relevant_genes = pd.read_csv(
    "../../references/AD_gwas_genes.tsv",
    delimiter="\t",
)

In [19]:
## Tag every AD gene with the disease label "AD"
ad_relevant_genes = ad_relevant_genes.assign(disease="AD")

In [20]:
## Append the AD genes below the other brain disease genes
df_brain_disease = pd.concat(
    [
        df_brain_disease,
        ad_relevant_genes,
    ]
)

In [21]:
## Show every row whose gene name occurs more than once (genes listed for several diseases)
df_brain_disease.loc[df_brain_disease.duplicated(subset="gene_name", keep=False)]

Unnamed: 0,gene_name,disease,gene_id
5,MAPT,PD,ENSG00000186868
6,MAPT,ALS/FTD,ENSG00000186868
11,BCKDK,PD,ENSG00000103507
24,DYRK1A,PD,ENSG00000157540
25,DYRK1A,ASD,ENSG00000157540
37,HLA-DRA,PD,ENSG00000204287
38,HLA-DRA,ALS/FTD,ENSG00000204287
39,HLA-DRB5,PD,ENSG00000198502
40,HLA-DRB5,ALS/FTD,ENSG00000198502
92,ACE,MDD,ENSG00000159640


In [22]:
## Annotate gene names involved in multiple diseases
## Every row of a listed gene receives the same combined label, so its
## per-disease rows become identical and can be de-duplicated afterwards.
df_brain_disease.loc[df_brain_disease["gene_name"] == "MAPT", "disease"] = "PD + ALS/FTD + AD"
df_brain_disease.loc[df_brain_disease["gene_name"].isin(["BCKDK"]), "disease"] = "PD + AD"
df_brain_disease.loc[df_brain_disease["gene_name"].isin(["DYRK1A"]), "disease"] = "PD + ASD"
df_brain_disease.loc[df_brain_disease["gene_name"].isin(["HLA-DRA", "HLA-DRB5"]), "disease"] = "PD + ALS/FTD"
df_brain_disease.loc[df_brain_disease["gene_name"].isin(["TMEM106B", "GRN"]), "disease"] = "LATE + ALS/FTD + AD"
## APOE appears in both the LATE and the mood-disorder gene lists, so its label
## keeps MDD; a second assignment that reset it to "LATE + AD" was removed.
df_brain_disease.loc[df_brain_disease["gene_name"].isin(["APOE"]), "disease"] = "LATE + AD + MDD"
df_brain_disease.loc[df_brain_disease["gene_name"].isin(["ACE"]), "disease"] = "AD + MDD"
df_brain_disease.loc[df_brain_disease["gene_name"].isin(["HS6ST3"]), "disease"] = "MDD + SUD"
## NOTE(review): TARDBP is only in the ALS/FTD list of this notebook; the LATE
## part of this label is a manual annotation - confirm it is intended.
df_brain_disease.loc[df_brain_disease["gene_name"].isin(["TARDBP"]), "disease"] = "ALS/FTD + LATE"

In [23]:
## Collapse rows that are now fully identical (same gene, same combined label)
df_brain_disease = df_brain_disease.drop_duplicates()

In [24]:
## Build a table holding only the identifiers (gene_id, gene_name) of the brain disease genes
df_brain_disease_IDS = df_brain_disease.loc[:, ["gene_id", "gene_name"]].drop_duplicates()

In [25]:
## Extend the medically relevant genes with all brain disease genes,
## then remove rows that occur in both tables
med_relevant_genes_final = pd.concat(
    [med_relevant_genes, df_brain_disease_IDS]
).drop_duplicates()

In [26]:
## Save new medically relevant genes list including the brain disease genes
## (write the merged table, not the unmodified input that was loaded from disk)
med_relevant_genes_final.to_csv("../../references/medically_relevant_genes_02-04-2023_UPDATED.tsv",
                                sep="\t", index=False)

In [27]:
## Write both brain disease tables as tab-separated files without the row index

## Gene names with their disease labels and gene ids
df_brain_disease.to_csv(
    "../../references/brain_disease_genes_with_disease.tsv",
    sep="\t",
    index=False,
)

## Identifier-only table (gene_id, gene_name)
df_brain_disease_IDS.to_csv(
    "../../references/brain_disease_genes_only_IDs.tsv",
    sep="\t",
    index=False,
)