In [1]:
## Load libraries
import pandas as pd
import numpy as np
import mygene

## Display all rows of pandas dataframes
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Opt into the future behavior for silent downcasting
pd.set_option('future.no_silent_downcasting', True)

In [2]:
def unify_gene_ids_and_symbols(df):

    ## Make a copy to use in function
    dff = df.copy()
    
    ## Take out . from ensembl ids
    dff["gene_id"] = dff["gene_id"].str.split(".", expand=True)[0]

    # Standardize all NaN-like values to pd.NA
    dff = dff.replace({np.nan: pd.NA, None: pd.NA})

    ## Set gene ids that are not ensembl IDs to NA
    dff['gene_id'] = dff['gene_id'].apply(lambda x: x if pd.isna(x) or str(x).startswith('ENS') else pd.NA)

    ## Get all the unique gene_id and gene_symbols
    df_ids = dff["gene_id"].copy().drop_duplicates(inplace=False).dropna(inplace=False).to_list()
    df_symbols = dff["gene_symbol"].copy().drop_duplicates(inplace=False).dropna(inplace=False).to_list()

    ## Query by gene_ids and gene_symbols
    mg = mygene.MyGeneInfo()
    results_id_query = mg.querymany(df_ids, scopes='symbol,alias,name,ensembl.gene,ensembl.transcript', fields='symbol,ensembl.gene', species='human', as_dataframe=True, df_index=False, returnall=False)
    results_symbol_query = mg.querymany(df_symbols, scopes='symbol,alias,name,ensembl.gene,ensembl.transcript', fields='symbol,ensembl.gene', species='human', as_dataframe=True, df_index=False, returnall=False)

    ## Delete duplicates, only keep highest value
    results_id_query = results_id_query.sort_values(by=['query', '_score'], ascending=[True, False]).drop_duplicates(subset='query', keep='first')
    results_symbol_query = results_symbol_query.sort_values(by=['query', '_score'], ascending=[True, False]).drop_duplicates(subset='query', keep='first')
    
    # Ensure that the column 'ensembl' is of string type, which allows the use of string methods
    results_id_query["ensembl"] = results_id_query["ensembl"].astype(str)
    results_symbol_query["ensembl"] = results_symbol_query["ensembl"].astype(str)
    
    # Split the strings and handle NaNs by chaining the .str methods
    results_id_query["first_ensembl"] = results_id_query["ensembl"].str.split(":", expand=True)[1].str.split("'", expand=True)[1]
    results_symbol_query["first_ensembl"] = results_symbol_query["ensembl"].str.split(":", expand=True)[1].str.split("'", expand=True)[1]

    
    # Standardize all NaN-like values to pd.NA
    results_id_query = results_id_query.replace({np.nan: pd.NA, None: pd.NA, "nan": pd.NA})
    results_symbol_query = results_symbol_query.replace({np.nan: pd.NA, None: pd.NA, "nan": pd.NA})
    
    # Fill NA values in 'ensembl.gene' column with values from 'first_ensembl' column
    results_id_query['ensembl.gene'] = results_id_query['ensembl.gene'].copy().fillna(results_id_query['first_ensembl'], inplace=False)
    results_symbol_query['ensembl.gene'] = results_symbol_query['ensembl.gene'].copy().fillna(results_symbol_query['first_ensembl'], inplace=False)

    ## Only keep relevant columns
    results_id_query = results_id_query[["query", "ensembl.gene", "symbol"]].copy()
    results_symbol_query = results_symbol_query[["query", "ensembl.gene", "symbol"]].copy()

    ## Rename columns
    results_id_query.columns = ["gene_id", "from_id_ensembl_id", "from_id_official_gene_symbol"]
    results_symbol_query.columns = ["gene_symbol", "from_symbol_ensembl_id", "from_symbol_official_gene_symbol"]

    results_id_query.drop_duplicates(inplace=True)
    results_symbol_query.drop_duplicates(inplace=True)

    
    ## Merge with original dataframe
    dff = dff.merge(results_id_query, how="left", on="gene_id")
    dff = dff.merge(results_symbol_query, how="left", on="gene_symbol")

    ## Combine results from both approaches (symbol and id)
    dff['ensembl_id'] = dff['from_id_ensembl_id'].copy().fillna(dff['from_symbol_ensembl_id'], inplace=False)
    dff['official_symbol'] = dff['from_id_official_gene_symbol'].copy().fillna(dff['from_symbol_official_gene_symbol'], inplace=False)

    ## Drop intermediate columns
    dff.drop(columns=["from_id_ensembl_id", "from_id_official_gene_symbol", "from_symbol_ensembl_id", "from_symbol_official_gene_symbol", "gene_id", "gene_symbol"], inplace=True)

    ## Drop anything that is missing both ensembl_id and official_symbol
    dff.dropna(subset=["ensembl_id", "official_symbol"], how="all", inplace=True)
    
    ## Drop Duplicates
    dff.drop_duplicates(inplace=True)
    
    ## Make ensembl id and official symbol the first two columns 
    dff = dff[dff.columns[-2:].tolist() + dff.columns[:-2].tolist()].copy()

    return dff

In [3]:
## Import Data
df = pd.read_csv("../../data/raw_data/overlap_analysis/all_studies_ad_vs_healthy_control_DEG_overlap.csv")

In [4]:
## Show how many studies we began with
print("There were", str(df["study"].nunique()), "studies to begin with")

There were 17 studies to begin with


In [5]:
## Drop all studies with quality score less than or equal to 1.5
print("The study", str(df.loc[df["quality_assessment"] > 1.5]["study"].unique()[0]), "was removed because the its quality score was below or at 1.5")

df = df.loc[df["quality_assessment"] > 1.5].copy()

print("There were", str(df["study"].nunique()), "studies left after filtering out those with quality score <= 1.5")

The study Annese et al. was removed because the its quality score was below or at 1.5
There were 16 studies left after filtering out those with quality score <= 1.5


In [6]:
## Remove under represented brain regions or brain regions that were not specified well
print("The study", str(df.loc[df["brain_region"]=="Cortex tissues"]["study"].unique()[0]), "was removed because the brain region 'Cortex Tissues' was not specific enough")
print("\nThe primary visual cortex data from study", str(df.loc[df["brain_region"]=="primary visual cortex"]["study"].unique()[0]), "was removed because the brain region 'visual cortex' was only represented in one study")
print("\nThe Entorhinal Cortex data from study", str(df.loc[df["brain_region"]=="Entorhinal Cortex"]["study"].unique()[0]), "was removed because the brain region 'visual cortex' was only represented in one study")

df = df.loc[df["brain_region"] != "Cortex tissues"].copy()
df = df.loc[df["brain_region"] != "primary visual cortex"].copy()
df = df.loc[df["brain_region"] != "Entorhinal Cortex"].copy()

print("\nThere were", str(df["study"].nunique()), "studies left after filtering out the brain regions represented in only one study")

The study Lee et al. was removed because the brain region 'Cortex Tissues' was not specific enough

The primary visual cortex data from study Guennewig et al. was removed because the brain region 'visual cortex' was only represented in one study

The Entorhinal Cortex data from study Jia et al. was removed because the brain region 'visual cortex' was only represented in one study

There were 14 studies left after filtering out the brain regions represented in only one study


In [7]:
## Add specifier to some of the data from the isoform usage paper

df.loc[df["brain_region"] == "Temporal Lobe (MAYO)", "study"] = df["study"] + " (MAYO)"
df.loc[df["brain_region"] == "Frontal Lobe (MSSB BA10)", "study"] = df["study"] + " (MSSB BA10)"
df.loc[df["brain_region"] == "Temporal Lobe (MSSB BA22)", "study"] = df["study"] + " (MSSB BA22)"
df.loc[df["brain_region"] == "Temporal Lobe (MSSB BA36)", "study"] = df["study"] + " (MSSB BA36)"
df.loc[df["brain_region"] == "Frontal Lobe (MSBB BA44)", "study"] = df["study"] + " (MSSB BA44)"
df.loc[df["brain_region"] == "Frontal Lobe (ROSMAP)", "study"] = df["study"] + " (ROSMAP)"

In [8]:
## Convert the names

df['brain_region'] = df['brain_region'].replace({'hippocampal CA1 region': 'Hippocampus', "anterior temporal lobe\xa0": "Temporal Lobe", "superior temporal gyrus": "Temporal Lobe",
                           "temporal cortex": "Temporal Lobe", 'prefrontal cortex': "Frontal Lobe", "precuneus": "Parietal Lobe", "Temporal Cortex (BA20)": "Temporal Lobe",
                           "Temporal Lobe (Superior Temporal Lobe)": "Temporal Lobe", "Temporal Lobe (MAYO)": "Temporal Lobe", "Frontal Lobe (MSSB BA10)": "Frontal Lobe",
                           'Temporal Lobe (MSSB BA22)': "Temporal Lobe", 'Temporal Lobe (MSSB BA36)': "Temporal Lobe", 'Frontal Lobe (MSBB BA44)': "Frontal Lobe", 'Frontal Lobe (ROSMAP)': "Frontal Lobe",
                           'Frontal Lobe (Dorsolateral pre-frontal cortex)': "Frontal Lobe", 'Middle temporal gyrus': "Temporal Lobe",
                           'Hippocampus (dentate gyrus and cornu amonis)': "Hippocampus", "superior temporal gyrus ": "Temporal Lobe"}, inplace=False)

In [9]:
## Create master dataframe
master_df = unify_gene_ids_and_symbols(df)

1 input query terms found dup hits:	[('ENSG00000230373', 2)]
267 input query terms found no hit:	['ENSG00000231865', 'ENSG00000268759', 'ENSG00000197846', 'ENSG00000176034', 'ENSG00000264868', 'ENS
3495 input query terms found dup hits:	[('CYCS', 10), ('GPCPD1', 3), ('PRICKLE2', 7), ('PITPNA', 2), ('WDR7', 3), ('FARSA', 2), ('YPEL5', 8
1825 input query terms found no hit:	['KIAA1045', 'LOC100652824chr2:202937977-203061886', 'gen-01', 'LOC728730', '45355', 'LOC283070', 'L


In [10]:
## Number of entries lost due to not finding gene_name or gene_id in the database
print("We lost", str(df.shape[0] - master_df.shape[0]), "entries due to not finding a gene_name or gene_id in the database")

We lost 724 entries due to not finding a gene_name or gene_id in the database


In [11]:
## Create column that is the official gene symbol, unless that is NA, then make it the ensembl_id
master_df['gene_name'] = master_df['official_symbol'].fillna(master_df['ensembl_id']).copy()

## Make it the first column
master_df = master_df[master_df.columns[-1:].tolist() + master_df.columns[:-1].tolist()]

In [12]:
## Replace all the spaces with underscores in the study column to make it easier to save later
master_df['study'] = master_df['study'].str.replace(' ', '_')
master_df['study'] = master_df['study'].str.replace('.', '')
master_df['study'] = master_df['study'].str.replace('(', '')
master_df['study'] = master_df['study'].str.replace(')', '')

In [13]:
# Separate by study and brain region

df_temporal = master_df.loc[master_df["brain_region"] == "Temporal Lobe"].copy()

df_parietal = master_df.loc[master_df["brain_region"] == "Parietal Lobe"].copy()

df_hippocampus = master_df.loc[master_df["brain_region"] == "Hippocampus"].copy()

df_frontal = master_df.loc[master_df["brain_region"] == "Frontal Lobe"].copy()

In [14]:
## Separate them by study

temporal_studies = {study: df_temporal[df_temporal['study'] == study] for study in df_temporal['study'].unique()}

parietal_studies = {study: df_parietal[df_parietal['study'] == study] for study in df_parietal['study'].unique()}

hippocampus_studies = {study: df_hippocampus[df_hippocampus['study'] == study] for study in df_hippocampus['study'].unique()}

frontal_studies = {study: df_frontal[df_frontal['study'] == study] for study in df_frontal['study'].unique()}

In [15]:
## Separate each brain region by study and save it into a CSV file for each study + brain region combo

{df_temporal[df_temporal['study'] == study].to_csv(f"../../data/processed_data/overlap_analysis/overlap_analysis_input/{study}_temporal_lobe.csv", index=False) for study in df_temporal['study'].unique()}

{df_parietal[df_parietal['study'] == study].to_csv(f"../../data/processed_data/overlap_analysis/overlap_analysis_input/{study}_parietal_lobe.csv", index=False) for study in df_parietal['study'].unique()}

{df_hippocampus[df_hippocampus['study'] == study].to_csv(f"../../data/processed_data/overlap_analysis/overlap_analysis_input/{study}_hippocampus.csv", index=False) for study in df_hippocampus['study'].unique()}

{df_frontal[df_frontal['study'] == study].to_csv(f"../../data/processed_data/overlap_analysis/overlap_analysis_input/{study}_frontal_lobe.csv", index=False) for study in df_frontal['study'].unique()}

{None}