In [1]:
## Load libraries
import pandas as pd
import numpy as np
import mygene
import os

## Display all rows of pandas dataframes, and do not truncate long cell contents
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Opt into the future pandas behavior (no silent downcasting of object columns),
# which also silences the related FutureWarning
pd.set_option('future.no_silent_downcasting', True)

In [2]:
def _first_ensembl_gene(value):
    """Return the first Ensembl gene ID of a multi-gene ``ensembl`` entry.

    ``value`` is expected to be a list of ``{"gene": ...}`` dicts (or a single
    such dict). Anything else, e.g. NaN for a missing entry, yields ``None``.
    """
    entries = [value] if isinstance(value, dict) else value
    if isinstance(entries, (list, tuple)):
        for entry in entries:
            if isinstance(entry, dict) and entry.get("gene"):
                return entry["gene"]
    return None


def _best_mygene_hits(mg, terms):
    """Query MyGene for ``terms`` and keep the best-scoring hit per term.

    Returns a dataframe with the columns ``query``, ``ensembl.gene`` and
    ``symbol`` (one row per queried term, missing values as ``pd.NA``).
    No request is sent when ``terms`` is empty.
    """
    wanted = ["query", "ensembl.gene", "symbol"]
    if not terms:
        return pd.DataFrame(columns=wanted)

    hits = mg.querymany(terms, scopes='symbol,alias,name,ensembl.gene,ensembl.transcript', fields='symbol,ensembl.gene', species='human', as_dataframe=True, df_index=False, returnall=False).copy()

    # The response only contains a column when at least one hit carries that
    # field, so add whatever is absent instead of failing with a KeyError.
    for column in ("query", "_score", "ensembl", "ensembl.gene", "symbol"):
        if column not in hits.columns:
            hits[column] = np.nan

    ## Delete duplicates, only keep highest value
    hits = hits.sort_values(by=['query', '_score'], ascending=[True, False]).drop_duplicates(subset='query', keep='first').copy()

    # Hits that map to several Ensembl genes have no 'ensembl.gene' value; take
    # the first gene listed in their 'ensembl' entry instead.
    first_gene = hits["ensembl"].map(_first_ensembl_gene)
    hits["ensembl.gene"] = hits["ensembl.gene"].astype(object).fillna(first_gene)

    # Only keep relevant columns and standardize all NaN-like values to pd.NA
    best = hits[wanted].astype(object)
    return best.where(best.notna(), pd.NA).drop_duplicates()


def unify_gene_ids_and_symbols(df):
    """Map the gene IDs and gene symbols of ``df`` to Ensembl IDs and official symbols.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``gene_id`` and ``gene_symbol``; any other
        columns are carried through. ``df`` itself is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which ``gene_id``/``gene_symbol`` are replaced by
        ``ensembl_id`` and ``official_symbol`` (placed first). The result of the
        ``gene_id`` lookup takes precedence over the ``gene_symbol`` lookup.
        Rows for which neither value could be determined are dropped, as are
        duplicated rows and rows repeating an (ensembl_id, official_symbol) pair.

    Notes
    -----
    Queries the MyGene web service, so it needs network access.
    """

    ## Make a copy to use in function
    dff = df.copy()

    ## Get all the unique gene_id and gene_symbols
    df_ids = dff["gene_id"].drop_duplicates().dropna().to_list()
    df_symbols = dff["gene_symbol"].drop_duplicates().dropna().to_list()

    ## Query by gene_ids and gene_symbols
    mg = mygene.MyGeneInfo()
    id_hits = _best_mygene_hits(mg, df_ids)
    # When both columns hold the same terms (one column was copied from the
    # other) a second, identical request would only cost time.
    if df_symbols == df_ids:
        symbol_hits = id_hits.copy()
    else:
        symbol_hits = _best_mygene_hits(mg, df_symbols)

    ## Merge with original dataframe
    lookups = (
        ("gene_id", id_hits, "from_id_ensembl_id", "from_id_official_gene_symbol"),
        ("gene_symbol", symbol_hits, "from_symbol_ensembl_id", "from_symbol_official_gene_symbol"),
    )
    for key, hits, ensembl_column, symbol_column in lookups:
        if hits.empty:
            # Nothing was looked up for this key: there is nothing to merge
            dff[ensembl_column] = pd.NA
            dff[symbol_column] = pd.NA
        else:
            renamed = hits.rename(columns={"query": key, "ensembl.gene": ensembl_column, "symbol": symbol_column})
            dff = dff.merge(renamed, how="left", on=key)

    ## Combine results from both approaches (symbol and id)
    dff['ensembl_id'] = dff['from_id_ensembl_id'].fillna(dff['from_symbol_ensembl_id'])
    dff['official_symbol'] = dff['from_id_official_gene_symbol'].fillna(dff['from_symbol_official_gene_symbol'])

    ## Drop intermediate columns
    dff = dff.drop(columns=["from_id_ensembl_id", "from_id_official_gene_symbol", "from_symbol_ensembl_id", "from_symbol_official_gene_symbol", "gene_id", "gene_symbol"])

    ## Drop anything that is missing both ensembl_id and official_symbol
    dff = dff.dropna(subset=["ensembl_id", "official_symbol"], how="all")

    ## Drop duplicates (identical rows first, then repeated gene identities)
    dff = dff.drop_duplicates()
    dff = dff.drop_duplicates(subset=["ensembl_id", "official_symbol"])

    ## Make ensembl id and official symbol the first two columns
    return dff[dff.columns[-2:].tolist() + dff.columns[:-2].tolist()].copy()

In [3]:
# Define the directory containing the CSV files
directory = '../../data/raw_data/meta-analysis/'

# The files are listed explicitly, in the order in which the later cells index
# `dataframes` by position (column renaming, sample sizes, output names).
# os.listdir() returns entries in arbitrary, filesystem-dependent order, so
# relying on it could silently attach the wrong metadata to a study.
csv_filenames = [
    "miller_et_al_parietal_white_matter.csv",
    "das_et_al_plaque_vs_control.csv",
    "king_et_al_BA17_synaptoneurosome.csv",
    "miller_et_al_temporal_cortex.csv",
    "marques-coelho_et_al.csv",
    "das_et_al_tangle_vs_control.csv",
    "king_et_al_BA17_total_brain_homogenate.csv",
    "fischer_et_al.csv",
    "felsky_et_al.csv",
    "miller_et_al_parietal_cortex.csv",
    "miller_et_al_hippocampus.csv",
    "das_et_al_peri-plaque_vs_control.csv",
    "van_rooij_et_al.csv",
    "das_et_al_distant_vs_control.csv",
    "king_et_al_BA20_synaptoneurosome.csv",
    "king_et_al_BA20_total_brain_homogenate.csv",
]

# Fail before loading anything if an expected input is absent
missing_files = [name for name in csv_filenames if not os.path.isfile(os.path.join(directory, name))]
if missing_files:
    raise FileNotFoundError(f"Missing input CSV file(s) in {directory}: {missing_files}")

# List to store dataframes
dataframes = []

# Load each file, in the fixed order defined above
for filename in csv_filenames:
    ## Print filename for records
    print("Opening file names:", filename,"\n")
    # Construct the full path to the file
    file_path = os.path.join(directory, filename)
    ## Read the CSV file into a dataframe
    df = pd.read_csv(file_path)
    ## Append the dataframe to the list
    dataframes.append(df)

Opening file names: miller_et_al_parietal_white_matter.csv 

Opening file names: das_et_al_plaque_vs_control.csv 

Opening file names: king_et_al_BA17_synaptoneurosome.csv 

Opening file names: miller_et_al_temporal_cortex.csv 

Opening file names: marques-coelho_et_al.csv 

Opening file names: das_et_al_tangle_vs_control.csv 

Opening file names: king_et_al_BA17_total_brain_homogenate.csv 

Opening file names: fischer_et_al.csv 

Opening file names: felsky_et_al.csv 

Opening file names: miller_et_al_parietal_cortex.csv 

Opening file names: miller_et_al_hippocampus.csv 

Opening file names: das_et_al_peri-plaque_vs_control.csv 

Opening file names: van_rooij_et_al.csv 

Opening file names: das_et_al_distant_vs_control.csv 

Opening file names: king_et_al_BA20_synaptoneurosome.csv 

Opening file names: king_et_al_BA20_total_brain_homogenate.csv 



In [4]:
## Fix column names manually

# Column layout shared by the four das_et_al files
_das_columns = ['gene_id', 'gene_symbol', 'logFC', 'CI.L', 'CI.R', 'AveExpr', 't',
                'P.Value', 'adj.P.Val', 'B']


def _king_columns(contrast):
    """Column layout of a king_et_al file whose statistics are prefixed with `contrast`."""
    statistics = [contrast + suffix for suffix in ('.l2fc', '.pval', '.padj', '.raw_l2fc')]
    trailing = ['Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10']
    return ['gene_id', 'gene_symbol'] + statistics + trailing


# Position in `dataframes` -> full list of column names for that file
_column_names_by_position = {
    0: ['gene_symbol',
        'AD vs Control: Parietal White Matter - Log2(fold change) - RIN Corrected',
        'AD vs Control: White Matter - SVA P-value -  RIN corrected'],
    1: _das_columns,
    2: _king_columns('BA17_SNp_HA_vs_AD'),
    3: ['gene_symbol',
        'AD vs Control: Temporal Cortex - Log2(fold change) - RIN Corrected',
        'AD vs Control: Temporal Cortex - SVA P-value -  RIN corrected',
        'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5'],
    4: ['transcript_name', 'dataset', 'gene_id', 'gene_symbol', 'logFC', 'gene.padj',
        'transcript.log2FC', 'transcript.padj', 'gene_biotype', 'iso_biotype',
        'dtu.dIF', 'dtu.gene.padj', 'dtu.isoform.padj', 'dtu.ofdr.gene',
        'dtu.ofdr.transcript', 'threshDT', 'threshG'],
    5: _das_columns,
    6: _king_columns('BA17_THp_HA_vs_AD'),
    7: ['gene_id', 'entrez_id', 'gene_symbol', 'logFC', 'AveExpr', 't', 'P.Value',
        'adj.P.Val'],
    8: ['gene_id', 'gene_symbol', 'logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B',
        'se', 'chr', 'N'],
    9: ['gene_symbol',
        'AD vs Control: Parietal Cortex - Log2(fold change) - RIN Corrected',
        'AD vs Control: Parietal Cortex - SVA P-value -  RIN corrected'],
    10: ['gene_symbol',
         'AD vs Control: Hippocampus - Log2(fold change) - RIN Corrected',
         'AD vs Control: Hippocampus - SVA P-value -  RIN corrected'],
    11: _das_columns,
    12: ['gene_symbol', 'logFC', 'logCPM', 'PValue', 'FDR', 'UP/DOWN', 'DE Score'],
    13: _das_columns,
    14: _king_columns('BA20_SNp_HA_vs_AD'),
    15: _king_columns('BA20_THp_HA_vs_AD'),
}

# Apply the names in positional order (a length mismatch raises for that file)
for _position, _names in _column_names_by_position.items():
    dataframes[_position].columns = list(_names)

In [5]:
## Remove unnamed columns

# The frames are modified in place, so the list keeps holding the same objects
for df in dataframes:
    unnamed_columns = [name for name in df.columns if name.startswith('Unnamed:')]
    df.drop(columns=unnamed_columns, inplace=True)

In [6]:
## Only keep the relevant columns

# Position in `dataframes` -> columns to keep. The last two entries are always
# the effect-size column and the p-value column; they are renamed to the shared
# names "log2_fold_change" and "p_value". Leading columns keep their names.
# NOTE(review): for position 4 (marques-coelho) the column stored as "p_value"
# is 'gene.padj' -- judging by its name an adjusted p-value, whereas the other
# studies use their unadjusted p-value columns. Confirm this is intended.
columns_to_keep = {
    0: ['gene_symbol', 'AD vs Control: Parietal White Matter - Log2(fold change) - RIN Corrected',
        'AD vs Control: White Matter - SVA P-value -  RIN corrected'],
    1: ['gene_id', 'gene_symbol', 'logFC', 'P.Value'],
    2: ['gene_id', 'gene_symbol', 'BA17_SNp_HA_vs_AD.l2fc', 'BA17_SNp_HA_vs_AD.pval'],
    3: ['gene_symbol', 'AD vs Control: Temporal Cortex - Log2(fold change) - RIN Corrected',
        'AD vs Control: Temporal Cortex - SVA P-value -  RIN corrected'],
    4: ['dataset', 'gene_id', 'gene_symbol', 'logFC', 'gene.padj'],
    5: ['gene_id', 'gene_symbol', 'logFC', 'P.Value'],
    6: ['gene_id', 'gene_symbol', 'BA17_THp_HA_vs_AD.l2fc', 'BA17_THp_HA_vs_AD.pval'],
    7: ['gene_id', 'gene_symbol', 'logFC', 'P.Value'],
    8: ['gene_id', 'gene_symbol', 'logFC', 'P.Value'],
    9: ['gene_symbol', 'AD vs Control: Parietal Cortex - Log2(fold change) - RIN Corrected',
        'AD vs Control: Parietal Cortex - SVA P-value -  RIN corrected'],
    10: ['gene_symbol', 'AD vs Control: Hippocampus - Log2(fold change) - RIN Corrected',
         'AD vs Control: Hippocampus - SVA P-value -  RIN corrected'],
    11: ['gene_id', 'gene_symbol', 'logFC', 'P.Value'],
    12: ['gene_symbol', 'logFC', 'PValue'],
    13: ['gene_id', 'gene_symbol', 'logFC', 'P.Value'],
    14: ['gene_id', 'gene_symbol', 'BA20_SNp_HA_vs_AD.l2fc', 'BA20_SNp_HA_vs_AD.pval'],
    15: ['gene_id', 'gene_symbol', 'BA20_THp_HA_vs_AD.l2fc', 'BA20_THp_HA_vs_AD.pval'],
}

for position, kept in columns_to_keep.items():
    # Always work on an explicit copy so that no frame is left as a selection
    # of its parent (previously the frame at position 3 was the only one
    # selected without .copy()).
    trimmed = dataframes[position][kept].copy()
    trimmed.columns = kept[:-2] + ["log2_fold_change", "p_value"]
    dataframes[position] = trimmed

In [7]:
## Pop the main dataframe and further process it
# The marques-coelho frame is the only one with a "dataset" column. Locating it
# by that column (instead of a fixed position) means that re-running this cell
# raises an error rather than silently removing a different study from the list.
marques_coelho_positions = [position for position, frame in enumerate(dataframes)
                            if "dataset" in frame.columns]
if len(marques_coelho_positions) != 1:
    raise RuntimeError(
        "Expected exactly one dataframe with a 'dataset' column in `dataframes`, "
        f"found {len(marques_coelho_positions)} (was this cell already run?)"
    )
marques_coelho = dataframes.pop(marques_coelho_positions[0])

In [8]:
## Separate different datasets in marques_coelho study.
# NOTE(review): the "BM" codes presumably refer to Brodmann areas; if so, BM10
# and BM44 would be frontal and BM22 and BM36 temporal, the opposite of the
# suffixes used in these names (and in the output file names) -- TODO confirm.
def _marques_coelho_subset(dataset_label):
    """Return a copy of the marques_coelho rows whose "dataset" equals `dataset_label`."""
    in_dataset = marques_coelho["dataset"] == dataset_label
    return marques_coelho.loc[in_dataset].copy()


marques_coelho_MAYO_temporal = _marques_coelho_subset("MAYO")
marques_coelho_MSBB_BM10_temporal = _marques_coelho_subset("MSBB BM10")
marques_coelho_MSBB_BM22_frontal = _marques_coelho_subset("MSBB BM22")
marques_coelho_MSBB_BM36_frontal = _marques_coelho_subset("MSBB BM36")
marques_coelho_MSBB_BM44_temporal = _marques_coelho_subset("MSBB BM44")
marques_coelho_ROSMAP_frontal = _marques_coelho_subset("ROSMAP")

In [9]:
## Create list with dataframes for marques-coelho
# Later cells pair this list by position with their own lists of output file
# names and sample sizes, so the order here matters.
marques_coelho_dataframes = [
    marques_coelho_MAYO_temporal,
    marques_coelho_MSBB_BM10_temporal,
    marques_coelho_MSBB_BM22_frontal,
    marques_coelho_MSBB_BM36_frontal,
    marques_coelho_MSBB_BM44_temporal,
    marques_coelho_ROSMAP_frontal,
]

In [10]:
## STANDARDIZE NON MARQUES-COELHO RESULTS

## Loop through dataframes, replacing each one with its standardized version
for i, df in enumerate(dataframes):

    ## Standardize missing values (this also creates a new frame, so the
    ## assignments below do not touch the original object)
    df = df.replace({np.nan: pd.NA, None: pd.NA})

    if "gene_id" not in df.columns and "gene_symbol" not in df.columns:
        raise KeyError(f"dataframes[{i}] has neither a 'gene_id' nor a 'gene_symbol' column")

    if "gene_id" in df.columns:
        ## Keep only the part of the id before the first "." (drops the version suffix)
        df["gene_id"] = df["gene_id"].str.split(".", expand=True)[0]
        ## Set gene ids that are not ensembl IDs to NA
        df['gene_id'] = df['gene_id'].apply(lambda x: x if pd.isna(x) or str(x).startswith('ENS') else pd.NA)

    # If there is no 'gene_symbol' column, create it as a copy of 'gene_id'
    if 'gene_symbol' not in df.columns:
        df['gene_symbol'] = df['gene_id'].copy()

    # If there is no 'gene_id' column, create it as a copy of 'gene_symbol'
    # (the symbols are then also looked up as if they were ids)
    if 'gene_id' not in df.columns:
        df['gene_id'] = df['gene_symbol'].copy()

    # Ensure 'gene_id' is the first column and 'gene_symbol' is the second column
    columns = ['gene_id', 'gene_symbol'] + [col for col in df.columns if col not in ['gene_id', 'gene_symbol']]
    dataframes[i] = df[columns].copy()
    

In [11]:
## DO SAME THING FOR MARQUES COELHO DATAFRAME

## Loop through dataframes, replacing each one with its standardized version
for i, df in enumerate(marques_coelho_dataframes):

    ## Drop dataset column. Not done in place, and tolerant of the column being
    ## gone already, so the per-dataset frames are left untouched and the cell
    ## can be re-run.
    df = df.drop(columns="dataset", errors="ignore")

    ## Standardize missing values
    df = df.replace({np.nan: pd.NA, None: pd.NA})

    if "gene_id" not in df.columns and "gene_symbol" not in df.columns:
        raise KeyError(f"marques_coelho_dataframes[{i}] has neither a 'gene_id' nor a 'gene_symbol' column")

    if "gene_id" in df.columns:
        ## Keep only the part of the id before the first "." (drops the version suffix)
        df["gene_id"] = df["gene_id"].str.split(".", expand=True)[0]
        ## Set gene ids that are not ensembl IDs to NA
        df['gene_id'] = df['gene_id'].apply(lambda x: x if pd.isna(x) or str(x).startswith('ENS') else pd.NA)

    # If there is no 'gene_symbol' column, create it as a copy of 'gene_id'
    if 'gene_symbol' not in df.columns:
        df['gene_symbol'] = df['gene_id'].copy()

    # If there is no 'gene_id' column, create it as a copy of 'gene_symbol'
    # (the symbols are then also looked up as if they were ids)
    if 'gene_id' not in df.columns:
        df['gene_id'] = df['gene_symbol'].copy()

    # Ensure 'gene_id' is the first column and 'gene_symbol' is the second column
    columns = ['gene_id', 'gene_symbol'] + [col for col in df.columns if col not in ['gene_id', 'gene_symbol']]
    marques_coelho_dataframes[i] = df[columns].copy()

In [12]:
## Now standardize gene_symbol and gene_id for all the dataframes that are MARQUES-COELHO

## Loop through dataframes (each iteration queries MyGene, so this is slow)
for i, df in enumerate(marques_coelho_dataframes):

    ## Print the position for records
    print(i)
    ## Create master dataframe
    master_df = unify_gene_ids_and_symbols(df)

    # Add placeholder columns should either identifier column be absent
    if 'official_symbol' not in master_df.columns:
        master_df['official_symbol'] = "NA_symbol"
    if 'ensembl_id' not in master_df.columns:
        master_df['ensembl_id'] = "NA_id"

    ## Create gene_name as "<official symbol>|<ensembl id>".
    ## A missing part is replaced by its placeholder first: concatenating with
    ## NA would otherwise turn the whole gene_name into NA for genes that have
    ## only one of the two identifiers.
    symbol_part = master_df['official_symbol'].fillna("NA_symbol")
    id_part = master_df['ensembl_id'].fillna("NA_id")
    master_df['gene_name'] = symbol_part + "|" + id_part

    ## Make it the first column
    master_df = master_df[master_df.columns[-1:].tolist() + master_df.columns[:-1].tolist()]

    marques_coelho_dataframes[i] = master_df

0



KeyboardInterrupt



In [None]:
## Get output file names FOR MARQUES-COELHO (same order as marques_coelho_dataframes)
output_names = ["marques_coelho_MAYO_temporal.tsv", 
                "marques_coelho_MSBB_BM10_temporal.tsv",
                "marques_coelho_MSBB_BM22_frontal.tsv", 
                "marques_coelho_MSBB_BM36_frontal.tsv",
                "marques_coelho_MSBB_BM44_temporal.tsv", 
                "marques_coelho_ROSMAP_frontal.tsv"]

## Get sample sizes (same order as marques_coelho_dataframes)
sample_sizes = [160, 233, 238, 232, 227, 403]


## Now add information and save

## Initialize output directory
output_dir = "../../data/processed_data/meta-analysis/meta-analysis_input_marques-coelho/"

# The three lists are paired by position: refuse to write anything if they disagree
if not (len(marques_coelho_dataframes) == len(output_names) == len(sample_sizes)):
    raise ValueError(
        f"Expected one output name and one sample size per dataframe, got "
        f"{len(marques_coelho_dataframes)} dataframes, {len(output_names)} names "
        f"and {len(sample_sizes)} sample sizes"
    )

os.makedirs(output_dir, exist_ok=True)

## Loop through the marques-coelho dataframes. (This used to loop over
## `dataframes`, the longer list of the other studies, and so ran past the
## end of the marques-coelho lists.)
for i, (output_name, sample_size) in enumerate(zip(output_names, sample_sizes)):

    marques_coelho_dataframes[i] = marques_coelho_dataframes[i][["gene_name", "log2_fold_change", "p_value"]].copy()

    ## Add sample sizes
    marques_coelho_dataframes[i]["sample_size"] = sample_size
    marques_coelho_dataframes[i]["ref_allele"] = "Control"
    marques_coelho_dataframes[i]["non_ref_allele"] = "AD"

    ## Create output path
    output_path = output_dir + output_name

    marques_coelho_dataframes[i].to_csv(output_path, index=False, sep="\t")

In [16]:
## Now standardize gene_symbol and gene_id for all the dataframes that are NON MARQUES-COELHO

## Loop through dataframes (each iteration queries MyGene, so this is slow)
for i, df in enumerate(dataframes):

    ## Print the position for records
    print(i)
    ## Create master dataframe
    master_df = unify_gene_ids_and_symbols(df)

    # Add placeholder columns should either identifier column be absent
    if 'official_symbol' not in master_df.columns:
        master_df['official_symbol'] = "NA_symbol"
    if 'ensembl_id' not in master_df.columns:
        master_df['ensembl_id'] = "NA_id"

    ## Create gene_name as "<official symbol>|<ensembl id>".
    ## A missing part is replaced by its placeholder first: concatenating with
    ## NA would otherwise turn the whole gene_name into NA for genes that have
    ## only one of the two identifiers.
    symbol_part = master_df['official_symbol'].fillna("NA_symbol")
    id_part = master_df['ensembl_id'].fillna("NA_id")
    master_df['gene_name'] = symbol_part + "|" + id_part

    ## Make it the first column
    master_df = master_df[master_df.columns[-1:].tolist() + master_df.columns[:-1].tolist()]

    dataframes[i] = master_df

0


3 input query terms found dup hits:	[('ENSG00000230373', 2), ('ENSG00000268674', 3), ('ENSG00000284926', 2)]
299 input query terms found no hit:	['ENSG00000130489', 'ENSG00000182230', 'ENSG00000241978', 'ENSG00000285258', 'ENSG00000168078', 'ENS
8346 input query terms found dup hits:	[('M6PR', 3), ('FKBP4', 2), ('SEMA3F', 2), ('CFTR', 10), ('CYP51A1', 2), ('SLC7A2', 2), ('PDK4', 3),
5266 input query terms found no hit:	['AC021097.2', 'AC008013.1', 'AL078645.2', 'AC011473.4', 'AC139677.2', 'AL391994.1', 'AL138847.1', '


1


7 input query terms found dup hits:	[('ENSG00000230373', 2), ('ENSG00000267635', 2), ('ENSG00000268674', 3), ('ENSG00000277176', 5), ('E
334 input query terms found no hit:	['ENSG00000130489', 'ENSG00000182230', 'ENSG00000241978', 'ENSG00000285258', 'ENSG00000168078', 'ENS
9579 input query terms found dup hits:	[('M6PR', 3), ('FKBP4', 2), ('SEMA3F', 2), ('CFTR', 10), ('CYP51A1', 2), ('SLC7A2', 2), ('PDK4', 3),
7290 input query terms found no hit:	['AC008033.1', 'AC021097.2', 'AC008013.1', 'AL078645.2', 'AC011473.4', 'AC139677.2', 'AL391994.1', '


2


6 input query terms found dup hits:	[('ENSG00000230373', 2), ('ENSG00000267635', 2), ('ENSG00000268674', 3), ('ENSG00000277176', 5), ('E
332 input query terms found no hit:	['ENSG00000130489', 'ENSG00000182230', 'ENSG00000241978', 'ENSG00000285258', 'ENSG00000168078', 'ENS
9492 input query terms found dup hits:	[('M6PR', 3), ('FKBP4', 2), ('SEMA3F', 2), ('CFTR', 10), ('CYP51A1', 2), ('SLC7A2', 2), ('PDK4', 3),
7194 input query terms found no hit:	['AC008033.1', 'AC021097.2', 'AC008013.1', 'AL078645.2', 'AC011473.4', 'AC139677.2', 'AL391994.1', '


3


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f4430643820>>
Traceback (most recent call last):
  File "/conda/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 

KeyboardInterrupt



In [9]:
## Get output file names for NON MARQUES-COELHO (same order as dataframes)
output_names = ["miller_et_al_parietal_white_matter_ad_vs_control_RIN_corrected_processed.tsv", 
                "das_et_al_superior_temporal_gyrus_ad_plaque_ad_vs_control.tsv",
                "king_et_al_BA17_visual_cortex_synaptoneurosome_ad_vs_ha.tsv", 
                "miller_et_al_temporal_cortex_ad_vs_control_RIN_corrected_processed.tsv",
                "das_et_al_superior_temporal_gyrus_ad_tangle_vs_control.tsv", 
                "king_et_al_BA17_visual_cortex_total_brain_homogenate_ad_vs_ha.tsv",
                "fischer_et_al_prefrontal_cortex_ad_vs_control.tsv", 
                "felsky_et_al_temporal_cortex_ad_vs_control.tsv", 
                "miller_et_al_parietal_cortex_ad_vs_control_RIN_corrected_processed.tsv",
                "miller_et_al_hippocampus_ad_vs_control_RIN_corrected_processed.tsv", 
                "das_et_al_superior_temporal_gyrus_ad_peri-plaque_vs_control.tsv",
                "van_rooij_et_al_hippocampus_ad_vs_control.tsv", 
                "das_et_al_superior_temporal_gyrus_ad_distant_vs_control.tsv",
                "king_et_al_BA20_temporal_cortex_synaptoneurosome_ad_vs_ha.tsv",
                "king_et_al_BA20_temporal_cortex_total_brain_homogenate_ad_vs_ha.tsv"]

## Get sample sizes (same order as dataframes)
sample_sizes = [106, 18, 31, 106, 18 ,31, 29, 39, 106, 106, 18, 28, 18, 31 ,31]


## Now add information and save

## Initialize output directory
output_dir = "../../data/processed_data/meta-analysis/meta-analysis_input/"

# The three lists are paired by position: refuse to write anything if they disagree
if not (len(dataframes) == len(output_names) == len(sample_sizes)):
    raise ValueError(
        f"Expected one output name and one sample size per dataframe, got "
        f"{len(dataframes)} dataframes, {len(output_names)} names "
        f"and {len(sample_sizes)} sample sizes"
    )

os.makedirs(output_dir, exist_ok=True)

# NOTE(review): the King et al. inputs come from columns named "*_HA_vs_AD.l2fc";
# confirm their fold-change direction agrees with ref_allele="Control" /
# non_ref_allele="AD" as written below.

## Loop through dataframes
for i, (output_name, sample_size) in enumerate(zip(output_names, sample_sizes)):

    dataframes[i] = dataframes[i][["gene_name", "log2_fold_change", "p_value"]].copy()

    ## Add sample sizes
    dataframes[i]["sample_size"] = sample_size
    dataframes[i]["ref_allele"] = "Control"
    dataframes[i]["non_ref_allele"] = "AD"

    ## Create output path
    output_path = output_dir + output_name

    dataframes[i].to_csv(output_path, index=False, sep="\t")