In [1]:
## Load libraries
import pandas as pd
import numpy as np
import mygene

## Display all rows of pandas dataframes
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Opt into the future behavior for silent downcasting
pd.set_option('future.no_silent_downcasting', True)

In [2]:
## Search scopes and returned fields shared by the gene_id and the gene_symbol queries
_MYGENE_SCOPES = 'symbol,alias,name,ensembl.gene,ensembl.transcript'
_MYGENE_FIELDS = 'symbol,ensembl.gene'


def _first_ensembl_gene(value):
    """Return the first Ensembl gene ID stored in a raw `ensembl` hit field, or pd.NA if there is none."""
    ## A single mapping is treated like a one-element list; anything that is not a list/tuple/dict yields pd.NA
    entries = [value] if isinstance(value, dict) else value
    if isinstance(entries, (list, tuple)):
        for entry in entries:
            if isinstance(entry, dict) and entry.get("gene"):
                return entry["gene"]
    return pd.NA


def _query_gene_lookup(mg, terms, key_column, prefix):
    """Query `terms` with `mg.querymany` and return one row per term: key, Ensembl ID, official symbol."""
    out_columns = [key_column, f"from_{prefix}_ensembl_id", f"from_{prefix}_official_gene_symbol"]

    ## Nothing to look up: skip the query and hand back an empty table that still merges cleanly
    if not terms:
        return pd.DataFrame(columns=out_columns)

    hits = mg.querymany(terms, scopes=_MYGENE_SCOPES, fields=_MYGENE_FIELDS, species='human',
                        as_dataframe=True, df_index=False, returnall=False)

    ## Guarantee every column used below exists (missing ones are created empty).
    ## NOTE(review): presumably `ensembl` / `ensembl.gene` / `symbol` are only present in the result
    ## when at least one hit carries them -- confirm against the mygene client documentation
    hits = hits.reindex(columns=["query", "_score", "symbol", "ensembl", "ensembl.gene"])

    ## Delete duplicates, only keep the highest scoring hit for each query term
    hits = (hits.sort_values(by=['query', '_score'], ascending=[True, False])
                .drop_duplicates(subset='query', keep='first'))

    ## Prefer 'ensembl.gene'; where it is missing, fall back to the first gene listed in 'ensembl'
    fallback_gene = hits["ensembl"].map(_first_ensembl_gene)
    primary_gene = hits["ensembl.gene"].astype(object)
    ensembl_gene = primary_gene.where(primary_gene.notna(), fallback_gene)

    ## Only keep relevant columns, already renamed for the merge
    lookup = pd.DataFrame({
        out_columns[0]: hits["query"],
        out_columns[1]: ensembl_gene,
        out_columns[2]: hits["symbol"],
    })

    ## Standardize all NaN-like values to pd.NA
    lookup = lookup.replace({np.nan: pd.NA, None: pd.NA})

    return lookup.drop_duplicates()


def unify_gene_ids_and_symbols(df):
    """Attach a unified Ensembl gene ID and official gene symbol to every row of `df`.

    The `gene_id` column (version suffix removed, non-'ENS' values blanked) and the
    `gene_symbol` column are each queried through `mygene.MyGeneInfo().querymany`
    (species='human'). The result found via `gene_id` takes precedence; the result found
    via `gene_symbol` fills in whatever is still missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `gene_id` and `gene_symbol`. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with `ensembl_id` and `official_symbol` as the first two columns,
        followed by the remaining input columns. `gene_id` and `gene_symbol` are removed,
        rows for which neither an Ensembl ID nor a symbol was found are dropped, and
        exact duplicate rows are dropped.
    """

    ## Make a copy to use in function
    dff = df.copy()

    ## Take out . from ensembl ids (keep only the part before the first ".")
    dff["gene_id"] = dff["gene_id"].str.split(".", n=1).str[0]

    ## Standardize all NaN-like values to pd.NA
    dff = dff.replace({np.nan: pd.NA, None: pd.NA})

    ## Set gene ids that are not ensembl IDs to NA
    dff['gene_id'] = dff['gene_id'].apply(lambda x: x if pd.isna(x) or str(x).startswith('ENS') else pd.NA)

    ## Get all the unique gene_id and gene_symbols
    df_ids = dff["gene_id"].dropna().drop_duplicates().to_list()
    df_symbols = dff["gene_symbol"].dropna().drop_duplicates().to_list()

    ## Query by gene_ids and gene_symbols
    mg = mygene.MyGeneInfo()
    results_id_query = _query_gene_lookup(mg, df_ids, key_column="gene_id", prefix="id")
    results_symbol_query = _query_gene_lookup(mg, df_symbols, key_column="gene_symbol", prefix="symbol")

    ## Merge with original dataframe
    dff = dff.merge(results_id_query, how="left", on="gene_id")
    dff = dff.merge(results_symbol_query, how="left", on="gene_symbol")

    ## Combine results from both approaches (id first, symbol as fallback)
    dff['ensembl_id'] = dff['from_id_ensembl_id'].fillna(dff['from_symbol_ensembl_id'])
    dff['official_symbol'] = dff['from_id_official_gene_symbol'].fillna(dff['from_symbol_official_gene_symbol'])

    ## Drop intermediate columns
    dff = dff.drop(columns=["from_id_ensembl_id", "from_id_official_gene_symbol", "from_symbol_ensembl_id",
                            "from_symbol_official_gene_symbol", "gene_id", "gene_symbol"])

    ## Drop anything that is missing both ensembl_id and official_symbol
    dff = dff.dropna(subset=["ensembl_id", "official_symbol"], how="all")

    ## Drop Duplicates
    dff = dff.drop_duplicates()

    ## Make ensembl id and official symbol the first two columns
    lead_columns = ["ensembl_id", "official_symbol"]
    return dff[lead_columns + [col for col in dff.columns if col not in lead_columns]].copy()

In [3]:
## Import Data: the "all studies, AD vs healthy control" DEG overlap table (per the file name)
df = pd.read_csv(
    filepath_or_buffer="../../data/raw_data/overlap_analysis/all_studies_ad_vs_healthy_control_DEG_overlap.csv"
)

In [4]:
## Fix name for Marques-Coelho and Fisher studies (label found in the data -> label used from here on)
study_label_fixes = {
    "Marques-Coelho": "Marques-Coelho et al.",
    "Fischer et al.": "Fisher et al.",
}
for old_label, new_label in study_label_fixes.items():
    df.loc[df["study"] == old_label, "study"] = new_label

In [5]:
## Import the more thorough quality assessment information from table 3
df_qa = pd.read_excel(io="../../tables/table_3.xlsx")

In [6]:
## Fix name for Panitch study: table 3 spells it "Panich et al."
is_misspelled_panitch = df_qa["Study"] == "Panich et al."
df_qa.loc[is_misspelled_panitch, "Study"] = "Panitch et al."

In [7]:
## Only keep studies with DGE information, i.e. studies that also appear in the "study" column of df
has_dge_information = df_qa["Study"].isin(df["study"])
df_qa = df_qa[has_dge_information].copy()

In [8]:
### Drop studies that do not fit the inclusion criteria for the pathway meta-analysis. See table 2 for details on quality assessment scoring

## A study is kept only if ALL of the following hold:
##   - it has DGE information (it appears in df["study"])
##   - quality assessment average score > 1.5
##   - statistical rigor quality assessment score >= 2
##   - AD diagnosis criteria quality assessment score = 3
passes_inclusion_criteria = (
    df_qa["Study"].isin(df["study"])
    & (df_qa["Average_score"] > 1.5)
    & (df_qa["Statistical_rigor"] >= 2)
    & (df_qa["AD_diagnosis_criteria"] == 3)
)

df_qa = df_qa.loc[passes_inclusion_criteria].copy()

In [9]:
## Show how many studies we began with (number of distinct values in the "study" column)
print("There were", str(df["study"].nunique()), "studies with differential gene expression gene lists available to begin with")

There were with differential gene expression gene lists available to begin with 17 studies to begin with


In [10]:
## Show dropped studies: every study in df that is no longer listed in the filtered df_qa
is_dropped = ~df["study"].isin(df_qa["Study"])
dropped_studies = df.loc[is_dropped, "study"].unique()

print("The studies:\n\n", str(dropped_studies),
      "\n\nwere removed because they did not pass filtering thresholds for pathway meta-analysis:\nStudy quality assessment average score > 1.5\nStatistical rigor quality assessment score >= 2\nAD diagnostic criteria quality assessment score = 3")

The studies:

 ['Cho et al.' 'Felsky et al.' 'Guennewig et al.' 'Jia et al.' 'Lee et al.'
 'Luo et al.' 'Mills et al.' 'van Rooij et al.'] 

were removed because they did not pass filtering thresholds for pathway meta-analysis:
Study quality assessment average score > 1.5
Statistical rigor quality assessment score >= 2
AD diagnostic criteria quality assessment score = 3


In [11]:
## Drop studies not passing filtering criteria (keep only studies still listed in df_qa)
df = df.loc[df["study"].isin(df_qa["Study"])].copy()

print("There were", str(df["study"].nunique()), "studies that passed pathway meta-analysis filtering")

There were 9 studies that passsed pathway meta-analysis filtering


In [12]:
## Remove under represented brain regions or brain regions that were not specified well
hippocampal_studies = ["Annese et al.", "Magistry et al."]

print("The studies [Annese et al., Magistry et al.] were excluded from the pathway meta-analysis because there were only two hippocampal studies/datasets\nleft after filtering")

is_hippocampal_study = df["study"].isin(hippocampal_studies)
df = df.loc[~is_hippocampal_study].copy()

print("\nThere were", str(df["study"].nunique()), "studies left after filtering out the brain regions represented in less than 3 studies/datasets")

The studies [Annese et al., Magistry et al.] were excluded from the pathway meta-analysis because there were only two hippocampal studies/datasets
left after filtering

There were 7 studies left after filtering out the brain regions represented in less than 3 studies/datasets


In [13]:
## Add specifier to study tag of Marques-Coelho data since we are dealing with 6 different datasets
## Mapping: brain_region label in the data -> suffix appended to the study name of the matching rows
## NOTE(review): the BA44 region label is spelled "MSBB" while its suffix is spelled "MSSB" like the
## other datasets; both spellings are kept exactly as they were
dataset_suffix_by_region = {
    "Temporal Lobe (MAYO)": " (MAYO)",
    "Frontal Lobe (MSSB BA10)": " (MSSB BA10)",
    "Temporal Lobe (MSSB BA22)": " (MSSB BA22)",
    "Temporal Lobe (MSSB BA36)": " (MSSB BA36)",
    "Frontal Lobe (MSBB BA44)": " (MSSB BA44)",
    "Frontal Lobe (ROSMAP)": " (ROSMAP)",
}

for region_label, dataset_suffix in dataset_suffix_by_region.items():
    df.loc[df["brain_region"] == region_label, "study"] = df["study"] + dataset_suffix

In [14]:
## Convert the names: collapse every study-specific brain region label into Hippocampus / Temporal Lobe /
## Frontal Lobe / Parietal Lobe. Labels are matched exactly, which is why variants that differ only by
## trailing whitespace (a regular space or \xa0) are listed separately
brain_region_to_lobe = {
    'hippocampal CA1 region': 'Hippocampus',
    "anterior temporal lobe\xa0": "Temporal Lobe",
    "superior temporal gyrus": "Temporal Lobe",
    "temporal cortex": "Temporal Lobe",
    'prefrontal cortex': "Frontal Lobe",
    "precuneus": "Parietal Lobe",
    "Temporal Cortex (BA20)": "Temporal Lobe",
    "Temporal Lobe (Superior Temporal Lobe)": "Temporal Lobe",
    "Temporal Lobe (MAYO)": "Temporal Lobe",
    "Frontal Lobe (MSSB BA10)": "Frontal Lobe",
    'Temporal Lobe (MSSB BA22)': "Temporal Lobe",
    'Temporal Lobe (MSSB BA36)': "Temporal Lobe",
    'Frontal Lobe (MSBB BA44)': "Frontal Lobe",
    'Frontal Lobe (ROSMAP)': "Frontal Lobe",
    'Frontal Lobe (Dorsolateral pre-frontal cortex)': "Frontal Lobe",
    'Middle temporal gyrus': "Temporal Lobe",
    'Hippocampus (dentate gyrus and cornu amonis)': "Hippocampus",
    "superior temporal gyrus ": "Temporal Lobe",
}

df['brain_region'] = df['brain_region'].replace(brain_region_to_lobe)

In [15]:
## Das et al. was dropped because the analysis was not performed on "general" brain tissue; it focused instead
## on laser captured subsections nearer to or farther from pathology
##
## Panitch et al. was dropped because they don't provide results for only their original cohort. They only
## provide results for a meta-analysis including ROSMAP and MAYO data that is also present in Marques-Coelho et al.
studies_failing_other_criteria = ["Das et al.", "Panitch et al."]

df = df.loc[~df["study"].isin(studies_failing_other_criteria)].copy()

In [16]:
## NOTE(review): the counts in this message are hard-coded and are not derived from df
remaining_summary = "There were 5 studies and 10 datasets left after filtering out studies that did not fit other inclusion criteria"
print(remaining_summary)

There were 5 studies and 10 datasets left after filtering out studies that did not fit other inclusion criteria


In [17]:
## Remove extra information from ensembl ids: keep only the text before the first "."
gene_id_pieces = df["gene_id"].str.split(".", n=1, expand=True)
df["gene_id"] = gene_id_pieces[0]

In [18]:
## Create master dataframe: df with ensembl_id / official_symbol in place of gene_id / gene_symbol
master_df = unify_gene_ids_and_symbols(df=df)

71 input query terms found no hit:	['ENSG00000204282', 'ENSG00000239665', 'ENSG00000213029', 'ENSG00000255823', 'ENSG00000183729', 'ENS
2045 input query terms found dup hits:	[('NUTM2A', 2), ('PPEF1', 3), ('OSTN', 2), ('FRMPD2', 2), ('HCRTR2', 2), ('PPDPF', 5), ('RAB7B', 2),
636 input query terms found no hit:	['4-Mar', '8-Mar', 'AC009063.2', 'AL162171.1', 'AC007546.3', 'AC118549.1', 'AL031056.1', 'AL049840.3


In [19]:
## Number of entries lost due to not finding gene_name or gene_id in the database
## NOTE(review): unify_gene_ids_and_symbols also drops duplicate rows, so those are included in this count
n_entries_lost = df.shape[0] - master_df.shape[0]
print("We lost", str(n_entries_lost), "entries due to not finding a gene_name or gene_id in the database")

We lost 346 entries due to not finding a gene_name or gene_id in the database


In [20]:
## Create the gene_name column: the ensembl_id, unless that is NA, then fall back to the official gene symbol
## NOTE(review): the earlier comment on this cell described the opposite precedence (official symbol first,
## ensembl_id as fallback) -- confirm which one the overlap analysis expects. The precedence is unchanged here.
master_df['gene_name'] = master_df['ensembl_id'].fillna(master_df['official_symbol'])

## Make it the first column. Selecting it by name (instead of rotating the last column to the front) keeps
## the column order stable if this cell is re-run, and .copy() returns an independent frame for later edits
master_df = master_df[['gene_name'] + [col for col in master_df.columns if col != 'gene_name']].copy()

In [21]:
## Make the study names file-name friendly: spaces become underscores; periods and parentheses are removed.
## regex=False makes every pattern a literal string regardless of the pandas default for `regex`; as regular
## expressions '.' would match every character and '(' / ')' would not be valid patterns
master_df['study'] = (
    master_df['study']
    .str.replace(' ', '_', regex=False)
    .str.replace('.', '', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

In [22]:
# Separate by brain region (independent copies, one per lobe)
is_temporal_lobe = master_df["brain_region"] == "Temporal Lobe"
is_frontal_lobe = master_df["brain_region"] == "Frontal Lobe"

df_temporal = master_df[is_temporal_lobe].copy()

df_frontal = master_df[is_frontal_lobe].copy()

In [23]:
## Separate up and downregulated genes
## Explicit comparisons against True / False are kept on purpose: a row whose flag is neither value
## ends up in neither subset

temporal_is_up = df_temporal["is_upregulated_in_AD"] == True
temporal_is_down = df_temporal["is_upregulated_in_AD"] == False
df_temporal_up_in_AD = df_temporal[temporal_is_up].copy()
df_temporal_down_in_AD = df_temporal[temporal_is_down].copy()

frontal_is_up = df_frontal["is_upregulated_in_AD"] == True
frontal_is_down = df_frontal["is_upregulated_in_AD"] == False
df_frontal_up_in_AD = df_frontal[frontal_is_up].copy()
df_frontal_down_in_AD = df_frontal[frontal_is_down].copy()

In [25]:
## Separate each brain region by study and save it into a CSV file for each study + brain region combo

## Directory receiving one gene list per study + brain region + direction
OVERLAP_INPUT_DIR = "../../data/processed_data/overlap_analysis/overlap_analysis_input"


def save_gene_lists_by_study(genes, region_tag, direction_tag):
    """Write the `gene_name` column of each study in `genes` to its own CSV (no index, no header).

    Files are named `<study>_<region_tag>_<direction_tag>.csv` inside OVERLAP_INPUT_DIR.
    """
    for study in genes['study'].unique():
        study_gene_names = genes.loc[genes['study'] == study, "gene_name"]
        study_gene_names.to_csv(f"{OVERLAP_INPUT_DIR}/{study}_{region_tag}_{direction_tag}.csv", index=False, header=False)


## Plain loops instead of set comprehensions: the writes are side effects, there is no result to collect
for direction_tag, temporal_genes, frontal_genes in (
    ("up_in_AD", df_temporal_up_in_AD, df_frontal_up_in_AD),
    ("down_in_AD", df_temporal_down_in_AD, df_frontal_down_in_AD),
):
    save_gene_lists_by_study(temporal_genes, "temporal_lobe", direction_tag)
    save_gene_lists_by_study(frontal_genes, "frontal_lobe", direction_tag)

{None}