In [1]:
## Load libraries
import pandas as pd
import numpy as np
import mygene
import os
from scipy.stats import false_discovery_control

## Display all rows of pandas dataframes
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Opt into the future behavior for silent downcasting
pd.set_option('future.no_silent_downcasting', True)

In [2]:
def fdr_correction(df, pval_column):
    """
    Perform Benjamini-Hochberg FDR correction on the specified p-value column of a DataFrame.

    The adjusted value for the p-value of rank i (ascending, 1-based) among n
    tests is min over all ranks j >= i of (p_j * n / j), capped at 1. The
    running minimum is what keeps the adjusted values monotone: a larger raw
    p-value can never receive a smaller adjusted p-value.

    Missing p-values (NaN) are not counted as tests and stay NaN in the output.

    Parameters:
    df (pd.DataFrame): The input DataFrame. It is modified in place.
    pval_column (str): The name of the column containing p-values to be corrected.

    Returns:
    pd.DataFrame: The same DataFrame with an additional column named
    ``<pval_column>_FDR_corrected`` containing the FDR corrected p-values.
    """
    pvals = df[pval_column].to_numpy(dtype=float, na_value=np.nan)

    # Start from all-NaN so rows without a usable p-value remain missing
    corrected_pvals = np.full(pvals.shape, np.nan)

    # Only non-missing p-values take part in the correction
    is_valid = ~np.isnan(pvals)
    n = int(is_valid.sum())

    if n > 0:
        valid_pvals = pvals[is_valid]

        # Ascending order of the p-values (stable, so ties keep input order)
        order = np.argsort(valid_pvals, kind="stable")
        ranks = np.arange(1, n + 1)

        # Raw Benjamini-Hochberg scaling: p * n / rank
        scaled = valid_pvals[order] * n / ranks

        # Step-up: running minimum taken from the largest p-value downwards
        monotone = np.minimum.accumulate(scaled[::-1])[::-1]

        # Ensure that the adjusted p-values are not greater than 1
        monotone = np.minimum(monotone, 1.0)

        # Reverse the sorting so values line up with the original rows
        adjusted = np.empty(n, dtype=float)
        adjusted[order] = monotone
        corrected_pvals[is_valid] = adjusted

    # Add the corrected p-values as a new column
    corrected_column_name = pval_column + '_FDR_corrected'
    df[corrected_column_name] = corrected_pvals

    return df

In [3]:
## Read the raw supplementary table (CSV) into a dataframe
raw_table_path = "../../data/raw_data/supplementary_table_1.csv"
df = pd.read_csv(raw_table_path)

In [4]:
## Remove the columns that are not carried into the processed tables
## (each label is listed once)
unused_columns = [
    "TXNAME",
    "transcript.log2FC",
    "transcript.padj",
    "iso_biotype",
    "dtu.dIF",
    "dtu.gene.padj",
    "dtu.ofdr.gene",
    "dtu.ofdr.transcript",
    "threshDT",
    "dtu.isoform.padj",
    "threshG",
    "gene_biotype",
]
df = df.drop(columns=unused_columns)

In [5]:
## Keep only the part of GENEID that precedes the first "."
## (presumably this strips a version suffix from the gene ID -- confirm)
geneid_pieces = df["GENEID"].str.split(".", expand=True)
df["GENEID"] = geneid_pieces[0]

In [6]:
## Rename the remaining columns and move the gene fields to the front
## NOTE: the rename is positional, so it relies on exactly five columns being
## left in this order after the drop above
renamed_columns = ["dataset", "gene_id", "gene_name", "log2_fold_change", "FDR_adjusted_p_value"]
output_order = ["gene_id", "gene_name", "log2_fold_change", "FDR_adjusted_p_value", "dataset"]

df.columns = renamed_columns
df = df.loc[:, output_order].copy()

In [7]:
## Create the reference / non-reference allele columns (constant for every row)
df = df.assign(ref_allele="AD", non_ref_allele="Control")

In [8]:
## Replace gene_name with "<gene_id>|<gene_name>" and remove the gene_id column
## (pop() returns the gene_id values and deletes that column in one step)
df["gene_name"] = df.pop("gene_id") + "|" + df["gene_name"]

In [9]:
## Split the combined table into one independent copy per dataset label
## NOTE(review): the variable names spell "MSSB"/"BA" while the dataset labels
## are "MSBB BM.."; the names are kept because later cells refer to them
dataset_label = df["dataset"]

df_MAYO_TL = df[dataset_label == "MAYO"].copy()
df_MSSB_BA10_FL = df[dataset_label == "MSBB BM10"].copy()
df_MSSB_BA22_TL = df[dataset_label == "MSBB BM22"].copy()
df_ROSMAP_FL = df[dataset_label == "ROSMAP"].copy()

In [10]:
## Attach each dataset's sample size (weight) as a constant column
## (the sizes are hard-coded here; their source is not shown in this notebook)
for cohort_frame, cohort_sample_size in (
    (df_MAYO_TL, 160),
    (df_MSSB_BA10_FL, 233),
    (df_MSSB_BA22_TL, 238),
    (df_ROSMAP_FL, 403),
):
    cohort_frame["sample_size"] = cohort_sample_size

In [11]:
## Remove the dataset label column from every per-dataset table (in place)
for cohort_frame in (df_MAYO_TL, df_MSSB_BA10_FL, df_MSSB_BA22_TL, df_ROSMAP_FL):
    cohort_frame.drop(columns=["dataset"], inplace=True)

In [12]:
## Remove fully duplicated rows from every per-dataset table (in place)
for cohort_frame in (df_MAYO_TL, df_MSSB_BA10_FL, df_MSSB_BA22_TL, df_ROSMAP_FL):
    cohort_frame.drop_duplicates(inplace=True)

In [13]:
## Remove rows containing any missing value from every per-dataset table (in place)
for cohort_frame in (df_MAYO_TL, df_MSSB_BA10_FL, df_MSSB_BA22_TL, df_ROSMAP_FL):
    cohort_frame.dropna(inplace=True)

In [14]:
## Output dataset
## Each per-dataset table is written as a tab-separated file without the index.
## The destination directory is created first, because to_csv raises an OSError
## when the folder is missing (e.g. on a fresh checkout).
for cohort_frame, output_path in (
    (df_MAYO_TL, "../../data/processed_data/temporal_lobe/MAYO_TL.tsv"),
    (df_MSSB_BA10_FL, "../../data/processed_data/frontal_lobe/MSSB_BA10_FL.tsv"),
    (df_MSSB_BA22_TL, "../../data/processed_data/temporal_lobe/MSSB_BA22_TL.tsv"),
    (df_ROSMAP_FL, "../../data/processed_data/frontal_lobe/ROSMAP_FL.tsv"),
):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    cohort_frame.to_csv(output_path, sep="\t", index=False)