In [1]:
## Load libraries
import pandas as pd
import numpy as np
import mygene
import os

## Display all rows of pandas dataframes
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Opt into the future behavior for silent downcasting
pd.set_option('future.no_silent_downcasting', True)

In [2]:
# mygene query settings shared by the gene_id lookup and the gene_symbol lookup
_MYGENE_SCOPES = 'symbol,alias,name,ensembl.gene,ensembl.transcript'
_MYGENE_FIELDS = 'symbol,ensembl.gene'


def _query_mygene(mg, terms):
    """Run one mygene batch query for ``terms``; return the raw hit table, or None when ``terms`` is empty."""
    if not terms:
        # Nothing to look up: skip the call instead of post-processing an empty response
        return None
    return mg.querymany(terms, scopes=_MYGENE_SCOPES, fields=_MYGENE_FIELDS, species='human',
                        as_dataframe=True, df_index=False, returnall=False)


def _first_ensembl_gene(value):
    """Return the first gene ID found in a raw ``ensembl`` cell (a dict or a list of dicts), else None."""
    entries = [value] if isinstance(value, dict) else value
    if not isinstance(entries, (list, tuple)):
        # NaN / missing cell: this hit has no nested ensembl annotation
        return None
    for entry in entries:
        gene = entry.get("gene") if isinstance(entry, dict) else None
        if isinstance(gene, (list, tuple)):
            gene = gene[0] if gene else None
        if gene:
            return gene
    return None


def _best_hit_table(hits, key_column, prefix):
    """Reduce raw mygene hits to one row per query term: [key_column, <prefix>_ensembl_id, <prefix>_official_gene_symbol]."""
    out_columns = [key_column, prefix + "_ensembl_id", prefix + "_official_gene_symbol"]
    if hits is None or "query" not in hits.columns:
        return pd.DataFrame(columns=out_columns, dtype=object)

    # A response only contains the fields that at least one hit carried, so make sure every
    # column used below exists (absent ones are added as all-missing). reindex returns a new
    # frame, which also keeps the caller's hit table untouched.
    hits = hits.reindex(columns=["query", "_score", "symbol", "ensembl.gene", "ensembl"])

    ## Delete duplicates, only keep the highest-scoring hit per query term
    best = (hits.sort_values(by=['query', '_score'], ascending=[True, False])
                .drop_duplicates(subset='query', keep='first'))

    # Hits whose 'ensembl' field is a list of genes have no 'ensembl.gene' value;
    # fall back to the first gene of that list.
    fallback_gene = best["ensembl"].map(_first_ensembl_gene)

    table = best[["query", "ensembl.gene", "symbol"]].astype(object)
    table["ensembl.gene"] = table["ensembl.gene"].where(table["ensembl.gene"].notna(), fallback_gene)

    # Standardize all NaN-like values (NaN, None, the literal string "nan") to pd.NA
    table = table.mask(table.isna() | table.eq("nan"), pd.NA)

    table.columns = out_columns
    return table


def unify_gene_ids_and_symbols(df):
    """Attach mygene-resolved Ensembl IDs and official symbols to a dataframe.

    Every unique non-missing value of ``gene_id`` and of ``gene_symbol`` is queried
    against mygene (human; scopes symbol, alias, name, ensembl.gene, ensembl.transcript),
    and the highest-scoring hit per term is kept. The annotation obtained through
    ``gene_id`` takes precedence; the one obtained through ``gene_symbol`` fills the gaps.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``gene_id`` and ``gene_symbol``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which ``gene_id`` and ``gene_symbol`` are replaced by
        ``ensembl_id`` and ``official_symbol`` (placed first). Rows where both are
        missing are dropped, and only the first row of each
        (``ensembl_id``, ``official_symbol``) pair is kept.

    Raises
    ------
    KeyError
        If ``gene_id`` or ``gene_symbol`` is absent from ``df``.
    """

    absent = [col for col in ("gene_id", "gene_symbol") if col not in df.columns]
    if absent:
        raise KeyError(f"unify_gene_ids_and_symbols requires column(s) {absent}")

    ## Make a copy to use in function
    dff = df.copy()

    ## Get all the unique gene_id and gene_symbols
    df_ids = dff["gene_id"].drop_duplicates().dropna().to_list()
    df_symbols = dff["gene_symbol"].drop_duplicates().dropna().to_list()

    ## Query by gene_ids and gene_symbols
    mg = mygene.MyGeneInfo()
    id_hits = _query_mygene(mg, df_ids)
    # When one column is a copy of the other, both lists are identical: reuse the response
    # rather than sending the same batch query a second time.
    symbol_hits = id_hits if df_symbols == df_ids else _query_mygene(mg, df_symbols)

    id_table = _best_hit_table(id_hits, "gene_id", "from_id")
    symbol_table = _best_hit_table(symbol_hits, "gene_symbol", "from_symbol")

    ## Merge with original dataframe
    dff = dff.merge(id_table, how="left", on="gene_id")
    dff = dff.merge(symbol_table, how="left", on="gene_symbol")

    ## Combine results from both approaches (id first, symbol as fallback)
    dff['ensembl_id'] = dff['from_id_ensembl_id'].fillna(dff['from_symbol_ensembl_id'])
    dff['official_symbol'] = dff['from_id_official_gene_symbol'].fillna(dff['from_symbol_official_gene_symbol'])

    ## Drop intermediate columns
    dff = dff.drop(columns=["from_id_ensembl_id", "from_id_official_gene_symbol",
                            "from_symbol_ensembl_id", "from_symbol_official_gene_symbol",
                            "gene_id", "gene_symbol"])

    ## Drop anything that is missing both ensembl_id and official_symbol
    dff = dff.dropna(subset=["ensembl_id", "official_symbol"], how="all")

    ## Keep the first row per gene. This also removes fully identical rows, so no separate
    ## whole-row de-duplication is needed.
    ## NOTE(review): when several input rows resolve to the same gene, which measurement
    ## survives depends only on row order - confirm that is acceptable.
    dff = dff.drop_duplicates(subset=["ensembl_id", "official_symbol"])

    ## Make ensembl id and official symbol the first two columns
    leading = ["ensembl_id", "official_symbol"]
    return dff[leading + [col for col in dff.columns if col not in leading]].copy()

In [3]:
# Define the directory containing the CSV files
directory = '../../data/raw_data/meta-analysis/'

# Input files, in the exact order the later cells rely on: column names, output file names
# and sample sizes are all assigned by position in `dataframes`. os.listdir() returns entries
# in arbitrary order, so iterating over it could silently pair a dataset with another
# dataset's metadata on a different machine or filesystem.
input_filenames = [
    "miller_et_al_parietal_white_matter.csv",
    "das_et_al_plaque_vs_control.csv",
    "king_et_al_BA17_synaptoneurosome.csv",
    "miller_et_al_temporal_cortex.csv",
    "das_et_al_tangle_vs_control.csv",
    "king_et_al_BA17_total_brain_homogenate.csv",
    "fischer_et_al.csv",
    "felsky_et_al.csv",
    "miller_et_al_parietal_cortex.csv",
    "miller_et_al_hippocampus.csv",
    "das_et_al_peri-plaque_vs_control.csv",
    "van_rooij_et_al.csv",
    "das_et_al_distant_vs_control.csv",
    "king_et_al_BA20_synaptoneurosome.csv",
    "king_et_al_BA20_total_brain_homogenate.csv",
]

# List to store dataframes
dataframes = []

# Load each expected file; a missing file raises FileNotFoundError instead of shifting positions
for filename in input_filenames:
    ## Print filename for records
    print("Opening file names:", filename,"\n")
    # Construct the full path to the file
    file_path = os.path.join(directory, filename)
    ## Read the CSV file into a dataframe
    df = pd.read_csv(file_path)
    ## Append the dataframe to the list
    dataframes.append(df)

Opening file names: miller_et_al_parietal_white_matter.csv 

Opening file names: das_et_al_plaque_vs_control.csv 

Opening file names: king_et_al_BA17_synaptoneurosome.csv 

Opening file names: miller_et_al_temporal_cortex.csv 

Opening file names: das_et_al_tangle_vs_control.csv 

Opening file names: king_et_al_BA17_total_brain_homogenate.csv 

Opening file names: fischer_et_al.csv 

Opening file names: felsky_et_al.csv 

Opening file names: miller_et_al_parietal_cortex.csv 

Opening file names: miller_et_al_hippocampus.csv 

Opening file names: das_et_al_peri-plaque_vs_control.csv 

Opening file names: van_rooij_et_al.csv 

Opening file names: das_et_al_distant_vs_control.csv 

Opening file names: king_et_al_BA20_synaptoneurosome.csv 

Opening file names: king_et_al_BA20_total_brain_homogenate.csv 



In [4]:
## Fix column names manually: one list of names per dataframe, applied by position


def king_columns(tag):
    """Column names of a King et al. table for one region/preparation tag (e.g. 'BA17_SNp')."""
    return (['gene_id', 'gene_symbol']
            + [f'{tag}_HA_vs_AD.{suffix}' for suffix in ('l2fc', 'pval', 'padj', 'raw_l2fc')]
            + [f'Unnamed: {n}' for n in range(6, 11)])


## The four Das et al. tables share one layout
das_columns = ['gene_id', 'gene_symbol', 'logFC', 'CI.L', 'CI.R', 'AveExpr', 't',
               'P.Value', 'adj.P.Val', 'B']

manual_column_names = [
    # 0: Miller et al., parietal white matter
    ['gene_symbol',
     'AD vs Control: Parietal White Matter - Log2(fold change) - RIN Corrected',
     'AD vs Control: White Matter - SVA P-value -  RIN corrected'],
    # 1: Das et al., plaque vs control
    list(das_columns),
    # 2: King et al., BA17 synaptoneurosome
    king_columns('BA17_SNp'),
    # 3: Miller et al., temporal cortex (has three trailing unnamed columns)
    ['gene_symbol',
     'AD vs Control: Temporal Cortex - Log2(fold change) - RIN Corrected',
     'AD vs Control: Temporal Cortex - SVA P-value -  RIN corrected',
     'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5'],
    # 4: Das et al., tangle vs control
    list(das_columns),
    # 5: King et al., BA17 total brain homogenate
    king_columns('BA17_THp'),
    # 6: Fischer et al.
    ['gene_id', 'entrez_id', 'gene_symbol', 'logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val'],
    # 7: Felsky et al.
    ['gene_id', 'gene_symbol', 'logFC', 'AveExpr', 't', 'P.Value', 'adj.P.Val', 'B',
     'se', 'chr', 'N'],
    # 8: Miller et al., parietal cortex
    ['gene_symbol',
     'AD vs Control: Parietal Cortex - Log2(fold change) - RIN Corrected',
     'AD vs Control: Parietal Cortex - SVA P-value -  RIN corrected'],
    # 9: Miller et al., hippocampus
    ['gene_symbol',
     'AD vs Control: Hippocampus - Log2(fold change) - RIN Corrected',
     'AD vs Control: Hippocampus - SVA P-value -  RIN corrected'],
    # 10: Das et al., peri-plaque vs control
    list(das_columns),
    # 11: van Rooij et al.
    ['gene_symbol', 'logFC', 'logCPM', 'PValue', 'FDR', 'UP/DOWN', 'DE Score'],
    # 12: Das et al., distant vs control
    list(das_columns),
    # 13: King et al., BA20 synaptoneurosome
    king_columns('BA20_SNp'),
    # 14: King et al., BA20 total brain homogenate
    king_columns('BA20_THp'),
]

for position, names in enumerate(manual_column_names):
    dataframes[position].columns = names

In [5]:
## Remove unnamed columns (the frames are modified in place, so the list keeps the same objects)

for frame in dataframes:
    unnamed = [name for name in frame.columns if name.startswith('Unnamed:')]
    frame.drop(columns=unnamed, inplace=True)

In [6]:
## Only keep the relevant columns

## For every dataframe (by position): the source columns to keep, in order. The last two
## entries are always the log2 fold change and the p-value columns; whatever precedes them
## are the gene identifier columns, which keep their names.
limma_columns = ['gene_id', 'gene_symbol', 'logFC', 'P.Value']

columns_to_keep = [
    # 0: Miller et al., parietal white matter
    ['gene_symbol',
     'AD vs Control: Parietal White Matter - Log2(fold change) - RIN Corrected',
     'AD vs Control: White Matter - SVA P-value -  RIN corrected'],
    # 1: Das et al., plaque vs control
    limma_columns,
    # 2: King et al., BA17 synaptoneurosome
    ['gene_id', 'gene_symbol', 'BA17_SNp_HA_vs_AD.l2fc', 'BA17_SNp_HA_vs_AD.pval'],
    # 3: Miller et al., temporal cortex
    ['gene_symbol',
     'AD vs Control: Temporal Cortex - Log2(fold change) - RIN Corrected',
     'AD vs Control: Temporal Cortex - SVA P-value -  RIN corrected'],
    # 4: Das et al., tangle vs control
    limma_columns,
    # 5: King et al., BA17 total brain homogenate
    ['gene_id', 'gene_symbol', 'BA17_THp_HA_vs_AD.l2fc', 'BA17_THp_HA_vs_AD.pval'],
    # 6: Fischer et al.
    limma_columns,
    # 7: Felsky et al.
    limma_columns,
    # 8: Miller et al., parietal cortex
    ['gene_symbol',
     'AD vs Control: Parietal Cortex - Log2(fold change) - RIN Corrected',
     'AD vs Control: Parietal Cortex - SVA P-value -  RIN corrected'],
    # 9: Miller et al., hippocampus
    ['gene_symbol',
     'AD vs Control: Hippocampus - Log2(fold change) - RIN Corrected',
     'AD vs Control: Hippocampus - SVA P-value -  RIN corrected'],
    # 10: Das et al., peri-plaque vs control
    limma_columns,
    # 11: van Rooij et al.
    ['gene_symbol', 'logFC', 'PValue'],
    # 12: Das et al., distant vs control
    limma_columns,
    # 13: King et al., BA20 synaptoneurosome
    ['gene_id', 'gene_symbol', 'BA20_SNp_HA_vs_AD.l2fc', 'BA20_SNp_HA_vs_AD.pval'],
    # 14: King et al., BA20 total brain homogenate
    ['gene_id', 'gene_symbol', 'BA20_THp_HA_vs_AD.l2fc', 'BA20_THp_HA_vs_AD.pval'],
]

for position, kept in enumerate(columns_to_keep):
    ## Always work on an explicit copy so the rename below never touches a view of the wider frame
    trimmed = dataframes[position][kept].copy()
    ## Identifier columns keep their names; the two statistics get the shared names
    trimmed.columns = kept[:-2] + ["log2_fold_change", "p_value"]
    dataframes[position] = trimmed

In [7]:
def keep_if_ensembl(value):
    """Return `value` when it is missing or starts with 'ENS'; otherwise return pd.NA."""
    return value if pd.isna(value) or str(value).startswith('ENS') else pd.NA


## Walk the list by position so every cleaned dataframe can be stored back in its slot
for position, frame in enumerate(dataframes):

    ## Standardize missing values
    frame = frame.replace({np.nan: pd.NA, None: pd.NA})

    if "gene_id" in frame.columns:
        ## Take out . from ensembl ids (keep only the part before the first dot)
        frame["gene_id"] = frame["gene_id"].str.split(".", expand=True)[0]
        ## Set gene ids that are not ensembl IDs to NA
        frame['gene_id'] = frame['gene_id'].apply(keep_if_ensembl)

    # If there is no 'gene_symbol' column, create it as a copy of 'gene_id'
    if 'gene_symbol' not in frame.columns:
        frame['gene_symbol'] = frame['gene_id'].copy()

    # If there is no 'gene_id' column, create it as a copy of 'gene_symbol'
    if 'gene_id' not in frame.columns:
        frame['gene_id'] = frame['gene_symbol'].copy()

    # Put 'gene_id' first and 'gene_symbol' second, followed by the remaining columns
    remaining = [col for col in frame.columns if col not in ['gene_id', 'gene_symbol']]
    dataframes[position] = frame[['gene_id', 'gene_symbol'] + remaining].copy()
    

In [8]:
## Now standardize gene_symbol and gene_id for all the dataframes

## Loop through dataframes, replacing each one with its annotated version
for i, df in enumerate(dataframes):

    ## Print the position so the mygene log output can be matched to a dataset
    print(i)

    ## Resolve Ensembl IDs and official symbols through mygene
    master_df = unify_gene_ids_and_symbols(df)

    ## Build gene_name as "<official_symbol>|<ensembl_id>".
    ## Rows are kept when only one of the two is known, so a missing half is replaced by a
    ## placeholder; concatenating with a missing value would otherwise make the whole
    ## gene_name missing. (The placeholders used to be written to the input `df`, which is
    ## discarded, so they never reached gene_name.)
    symbol_part = master_df['official_symbol'].fillna("NA_symbol")
    id_part = master_df['ensembl_id'].fillna("NA_id")
    master_df['gene_name'] = symbol_part + "|" + id_part

    ## Make it the first column
    master_df = master_df[master_df.columns[-1:].tolist() + master_df.columns[:-1].tolist()]

    dataframes[i] = master_df

0


5030 input query terms found dup hits:	[('A2M', 4), ('A2ML1', 5), ('AARSD1', 3), ('ABCA11P', 2), ('ABCA3', 3), ('ABCA9', 2), ('ABCB10', 10)
1750 input query terms found no hit:	['CRIPAK', 'ERCC6-PGBD3', 'FAM231D', 'FBXO22-AS1', 'FLJ10038', 'FLJ23867', 'FLJ27354', 'FLJ31104', '
5030 input query terms found dup hits:	[('A2M', 4), ('A2ML1', 5), ('AARSD1', 3), ('ABCA11P', 2), ('ABCA3', 3), ('ABCA9', 2), ('ABCB10', 10)
1750 input query terms found no hit:	['CRIPAK', 'ERCC6-PGBD3', 'FAM231D', 'FBXO22-AS1', 'FLJ10038', 'FLJ23867', 'FLJ27354', 'FLJ31104', '


1


65 input query terms found no hit:	['ENSG00000270168', 'ENSG00000257151', 'ENSG00000221995', 'ENSG00000237548', 'ENSG00000243444', 'ENS
4365 input query terms found dup hits:	[('SPN', 3), ('ITGB2', 2), ('KLHL6', 3), ('PIK3R5', 3), ('LILRA2', 3), ('NCF2', 2), ('LILRB1', 4), (
772 input query terms found no hit:	['AC116667.1', 'AC119674.1', 'AL683813.1', 'AC145285.2', 'AC108062.1', 'AL121672.1', 'AL355974.2', '


2


10 input query terms found dup hits:	[('ENSG00000249738', 2), ('ENSG00000230373', 2), ('ENSG00000278903', 3), ('ENSG00000188660', 2), ('E
401 input query terms found no hit:	['ENSG00000228106', 'ENSG00000259834', 'ENSG00000272301', 'ENSG00000267034', 'ENSG00000261409', 'ENS
11480 input query terms found dup hits:	[('DPP10', 5), ('C1QTNF1', 3), ('SPIN1', 2), ('CORO7', 2), ('OXR1', 4), ('KCNIP4', 2), ('RASA1', 2),
19840 input query terms found no hit:	['AC079834.2', 'AC090578.1', 'AL139125.1', 'AC104083.1', 'AC015712.2', 'AC124804.1', 'AC104024.2', '


3


5030 input query terms found dup hits:	[('A2M', 4), ('A2ML1', 5), ('AARSD1', 3), ('ABCA11P', 2), ('ABCA3', 3), ('ABCA9', 2), ('ABCB10', 10)
1750 input query terms found no hit:	['CRIPAK', 'ERCC6-PGBD3', 'FAM231D', 'FBXO22-AS1', 'FLJ10038', 'FLJ23867', 'FLJ27354', 'FLJ31104', '
5030 input query terms found dup hits:	[('A2M', 4), ('A2ML1', 5), ('AARSD1', 3), ('ABCA11P', 2), ('ABCA3', 3), ('ABCA9', 2), ('ABCB10', 10)
1750 input query terms found no hit:	['CRIPAK', 'ERCC6-PGBD3', 'FAM231D', 'FBXO22-AS1', 'FLJ10038', 'FLJ23867', 'FLJ27354', 'FLJ31104', '


4


65 input query terms found no hit:	['ENSG00000270168', 'ENSG00000204092', 'ENSG00000221995', 'ENSG00000237548', 'ENSG00000257151', 'ENS
4365 input query terms found dup hits:	[('UCKL1', 2), ('DDIT4', 2), ('PSMA5', 2), ('STAT1', 2), ('FXYD2', 2), ('KANSL1', 2), ('GTF2IRD1', 2
772 input query terms found no hit:	['AC119674.1', 'AC018557.2', 'AL121672.1', 'AF064858.1', 'AC000403.1', 'AC010618.3', 'AP001043.1', '


5


10 input query terms found dup hits:	[('ENSG00000249738', 2), ('ENSG00000230373', 2), ('ENSG00000278903', 3), ('ENSG00000188660', 2), ('E
401 input query terms found no hit:	['ENSG00000228106', 'ENSG00000259834', 'ENSG00000272301', 'ENSG00000267034', 'ENSG00000261409', 'ENS
11480 input query terms found dup hits:	[('DPP10', 5), ('C1QTNF1', 3), ('SPIN1', 2), ('CORO7', 2), ('OXR1', 4), ('KCNIP4', 2), ('RASA1', 2),
19840 input query terms found no hit:	['AC079834.2', 'AC090578.1', 'AL139125.1', 'AC104083.1', 'AC015712.2', 'AC124804.1', 'AC104024.2', '


6


14 input query terms found no hit:	['ENSG00000228439', 'ENSG00000270672', 'ENSG00000213865', 'ENSG00000189144', 'ENSG00000180525', 'ENS
3646 input query terms found dup hits:	[('ITGB8', 2), ('MKLN1', 3), ('STAG1', 3), ('MPI', 2), ('LIFR', 2), ('SP3', 4), ('PKN2', 2), ('FKBP1
20 input query terms found no hit:	['8-Mar', '4-Mar', '10-Sep', '4-Sep', '9-Mar', '3-Mar', '2-Mar', 'LOC102724488', '1-Sep', '6-Sep', '


7


1 input query terms found dup hits:	[('ENSG00000230373', 2)]
124 input query terms found no hit:	['ENSG00000213240', 'ENSG00000263013', 'ENSG00000187695', 'ENSG00000189144', 'ENSG00000261490', 'ENS
4688 input query terms found dup hits:	[('DUSP5', 3), ('GABPB1', 5), ('GEM', 10), ('BACE2', 3), ('FREM2', 3), ('GRTP1', 3), ('LINC00598', 2
1586 input query terms found no hit:	['AC025257.1', 'AP002884.1', 'AC104162.1', 'AL137003.2', 'AC092807.3', 'AC139713.2', 'AC016026.1', '


8


5030 input query terms found dup hits:	[('A2M', 4), ('A2ML1', 5), ('AARSD1', 3), ('ABCA11P', 2), ('ABCA3', 3), ('ABCA9', 2), ('ABCB10', 10)
1750 input query terms found no hit:	['CRIPAK', 'ERCC6-PGBD3', 'FAM231D', 'FBXO22-AS1', 'FLJ10038', 'FLJ23867', 'FLJ27354', 'FLJ31104', '
5030 input query terms found dup hits:	[('A2M', 4), ('A2ML1', 5), ('AARSD1', 3), ('ABCA11P', 2), ('ABCA3', 3), ('ABCA9', 2), ('ABCB10', 10)
1750 input query terms found no hit:	['CRIPAK', 'ERCC6-PGBD3', 'FAM231D', 'FBXO22-AS1', 'FLJ10038', 'FLJ23867', 'FLJ27354', 'FLJ31104', '


9


5030 input query terms found dup hits:	[('A2M', 4), ('A2ML1', 5), ('AARSD1', 3), ('ABCA11P', 2), ('ABCA3', 3), ('ABCA9', 2), ('ABCB10', 10)
1750 input query terms found no hit:	['CRIPAK', 'ERCC6-PGBD3', 'FAM231D', 'FBXO22-AS1', 'FLJ10038', 'FLJ23867', 'FLJ27354', 'FLJ31104', '
5030 input query terms found dup hits:	[('A2M', 4), ('A2ML1', 5), ('AARSD1', 3), ('ABCA11P', 2), ('ABCA3', 3), ('ABCA9', 2), ('ABCB10', 10)
1750 input query terms found no hit:	['CRIPAK', 'ERCC6-PGBD3', 'FAM231D', 'FBXO22-AS1', 'FLJ10038', 'FLJ23867', 'FLJ27354', 'FLJ31104', '


10


65 input query terms found no hit:	['ENSG00000180525', 'ENSG00000270168', 'ENSG00000257151', 'ENSG00000256427', 'ENSG00000237548', 'ENS
4365 input query terms found dup hits:	[('DDIT4', 2), ('FOXD1', 3), ('LINC00484', 2), ('NRSN2', 2), ('IDI1', 7), ('DPY19L1', 6), ('UCKL1', 
772 input query terms found no hit:	['AC117500.2', 'AC010618.3', 'AC119674.1', 'AC116667.1', 'AC102953.2', 'AC145285.2', 'AC108062.1', '


11


3990 input query terms found dup hits:	[('A2M', 4), ('A2ML1', 5), ('AARSD1', 3), ('ABCA3', 3), ('ABCA9', 2), ('ABCB10', 10), ('ABCC3', 2), 
121 input query terms found no hit:	['AC002472.13', 'AC004381.6', 'AC005003.1', 'AC006547.14', 'AC006946.15', 'AC007375.1', 'AC007390.5'
3990 input query terms found dup hits:	[('A2M', 4), ('A2ML1', 5), ('AARSD1', 3), ('ABCA3', 3), ('ABCA9', 2), ('ABCB10', 10), ('ABCC3', 2), 
121 input query terms found no hit:	['AC002472.13', 'AC004381.6', 'AC005003.1', 'AC006547.14', 'AC006946.15', 'AC007375.1', 'AC007390.5'


12


65 input query terms found no hit:	['ENSG00000259855', 'ENSG00000237548', 'ENSG00000273576', 'ENSG00000237721', 'ENSG00000230839', 'ENS
4365 input query terms found dup hits:	[('UCKL1', 2), ('LINC00484', 2), ('DDIT4', 2), ('FIBCD1', 3), ('IDI1', 7), ('EHMT1', 2), ('KANSL1', 
772 input query terms found no hit:	['AC119674.1', 'AL121672.1', 'AC010618.3', 'AL583810.2', 'AP000808.1', 'AC092747.4', 'AL360012.1', '


13


10 input query terms found dup hits:	[('ENSG00000249738', 2), ('ENSG00000230373', 2), ('ENSG00000278903', 3), ('ENSG00000188660', 2), ('E
401 input query terms found no hit:	['ENSG00000228106', 'ENSG00000259834', 'ENSG00000272301', 'ENSG00000267034', 'ENSG00000261409', 'ENS
11480 input query terms found dup hits:	[('DPP10', 5), ('C1QTNF1', 3), ('SPIN1', 2), ('CORO7', 2), ('OXR1', 4), ('KCNIP4', 2), ('RASA1', 2),
19840 input query terms found no hit:	['AC079834.2', 'AC090578.1', 'AL139125.1', 'AC104083.1', 'AC015712.2', 'AC124804.1', 'AC104024.2', '


14


10 input query terms found dup hits:	[('ENSG00000249738', 2), ('ENSG00000230373', 2), ('ENSG00000278903', 3), ('ENSG00000188660', 2), ('E
401 input query terms found no hit:	['ENSG00000228106', 'ENSG00000259834', 'ENSG00000272301', 'ENSG00000267034', 'ENSG00000261409', 'ENS
11480 input query terms found dup hits:	[('DPP10', 5), ('C1QTNF1', 3), ('SPIN1', 2), ('CORO7', 2), ('OXR1', 4), ('KCNIP4', 2), ('RASA1', 2),
19840 input query terms found no hit:	['AC079834.2', 'AC090578.1', 'AL139125.1', 'AC104083.1', 'AC015712.2', 'AC124804.1', 'AC104024.2', '


In [9]:
## Get output file names (same order as `dataframes`)
output_names = ["miller_et_al_parietal_white_matter_ad_vs_control_RIN_corrected_processed.tsv", 
                "das_et_al_superior_temporal_gyrus_ad_plaque_ad_vs_control.tsv",
                "king_et_al_BA17_visual_cortex_synaptoneurosome_ad_vs_ha.tsv", 
                "miller_et_al_temporal_cortex_ad_vs_control_RIN_corrected_processed.tsv",
                "das_et_al_superior_temporal_gyrus_ad_tangle_vs_control.tsv", 
                "king_et_al_BA17_visual_cortex_total_brain_homogenate_ad_vs_ha.tsv",
                "fischer_et_al_prefrontal_cortex_ad_vs_control.tsv", 
                "felsky_et_al_temporal_cortex_ad_vs_control.tsv", 
                "miller_et_al_parietal_cortex_ad_vs_control_RIN_corrected_processed.tsv",
                "miller_et_al_hippocampus_ad_vs_control_RIN_corrected_processed.tsv", 
                "das_et_al_superior_temporal_gyrus_ad_peri-plaque_vs_control.tsv",
                "van_rooij_et_al_hippocampus_ad_vs_control.tsv", 
                "das_et_al_superior_temporal_gyrus_ad_distant_vs_control.tsv",
                "king_et_al_BA20_temporal_cortex_synaptoneurosome_ad_vs_ha.tsv",
                "king_et_al_BA20_temporal_cortex_total_brain_homogenate_ad_vs_ha.tsv"]

## Get sample sizes (same order as `dataframes`)
sample_sizes = [106, 18, 31, 106, 18, 31, 29, 39, 106, 106, 18, 28, 18, 31, 31]

## Output directory
output_dir = "../../data/processed_data/meta-analysis/meta-analysis_input/"

## The three lists are matched purely by position: stop before writing anything if they disagree
if not (len(dataframes) == len(output_names) == len(sample_sizes)):
    raise ValueError(
        f"Expected one output name and one sample size per dataframe, got "
        f"{len(dataframes)} dataframes, {len(output_names)} names, {len(sample_sizes)} sample sizes"
    )

## Make sure the output directory exists before the first write
os.makedirs(output_dir, exist_ok=True)

## Now add information and save
for i, (output_name, sample_size) in enumerate(zip(output_names, sample_sizes)):

    dataframes[i] = dataframes[i][["gene_name", "log2_fold_change", "p_value"]].copy()

    ## Add sample size and the labels of the two groups being compared
    ## NOTE(review): the King et al. source columns are named "HA_vs_AD" - confirm that their
    ## fold changes point in the same direction as the Control -> AD labels used here.
    dataframes[i]["sample_size"] = sample_size
    dataframes[i]["ref_allele"] = "Control"
    dataframes[i]["non_ref_allele"] = "AD"

    ## Create output path
    output_path = os.path.join(output_dir, output_name)

    ## Save as tab-separated text without the index
    dataframes[i].to_csv(output_path, index=False, sep="\t")