In [150]:
import pandas as pd
import numpy as np
import os

from scipy.stats import combine_pvalues

In [151]:
def fisher_method_threshold(p_values, min_p_value = 1e-16):
    """Combine p-values with Fisher's method after flooring them at ``min_p_value``.

    Fisher's statistic sums ``log(p)``, so a p-value of exactly 0 would make
    the statistic infinite. Every value is therefore raised to at least
    ``min_p_value`` before combining.

    Missing values (NaN) are ignored instead of being propagated, so a single
    missing entry in a group does not turn the whole combined result into NaN.

    Parameters
    ----------
    p_values : array-like of float
        One-dimensional collection of p-values (list, NumPy array, pandas
        Series, ...).
    min_p_value : float, default 1e-16
        Lower bound applied to every p-value before combining.

    Returns
    -------
    float
        The Fisher-combined p-value, or NaN when there is no non-missing
        p-value to combine.
    """
    p_values = np.asarray(p_values, dtype=float)
    # np.maximum propagates NaN, so missing values must be removed explicitly.
    p_values = p_values[~np.isnan(p_values)]
    if p_values.size == 0:
        return np.nan
    p_values = np.maximum(p_values, min_p_value)
    _, combined_p_value = combine_pvalues(p_values, method='fisher')
    return combined_p_value

In [152]:
# Project table, one row per project; the 'proj', 'sample_abbr' and
# 'matched_tissue' columns are used to build the result file names.
proj_codes_df = pd.read_csv('proj_codes.csv')
# Spaces in the tissue label become underscores before it is placed in a file name.
proj_codes_df['matched_tissue'] = proj_codes_df['matched_tissue'].str.replace(' ','_')

# NOTE(review): machine-specific absolute path - adjust when running elsewhere.
base_dir = '/Users/samlokanc/projects/Lab/IEI/IEI 2024/Results'


def _dea_csv_path(row):
    """Return the DEA result CSV path for one row of the project table."""
    csv_name = f"subset_DEGs_{row['proj']}_{row['sample_abbr']}_GTEX_{row['matched_tissue']}.csv"
    return os.path.join(base_dir, row['proj'], 'DEA', csv_name)


file_paths_DEA = proj_codes_df.apply(_dea_csv_path, axis = 1)

# Read every DEA table and tag its rows with the project code it came from.
DEA_dfs = [
    pd.read_csv(csv_path).assign(proj = proj_code)
    for proj_code, csv_path in zip(proj_codes_df['proj'], file_paths_DEA)
]

In [153]:
# Stack the per-project DEA tables into one long table with a fresh index.
DEA_data = pd.concat(DEA_dfs, ignore_index = True)

# How many rows each gene symbol occurs in, most frequent first.
gene_counts = (
    DEA_data['Gene.symbol']
    .value_counts()
    .reset_index()
    .set_axis(['Gene.symbol', 'count'], axis = 1)
)

# Per gene: Fisher-combined adjusted p-value and mean log2 fold change.
grouped_DEA_data = (
    DEA_data
    .groupby('Gene.symbol')
    .agg(
        combined_p_values = ('adj.P.Val', fisher_method_threshold),
        average_log2fc = ('logFC', 'mean'),
    )
    .reset_index()
)

# Per gene: the distinct project codes in which it was reported.
project_codes = (
    DEA_data
    .groupby('Gene.symbol')['proj']
    .apply(lambda projects: list(projects.unique()))
    .reset_index()
)

# Join counts, combined statistics and project lists on the gene symbol.
DEA_result_df = (
    gene_counts
    .merge(grouped_DEA_data, on = 'Gene.symbol')
    .merge(project_codes, on = 'Gene.symbol')
)

# Split genes by the sign of their mean log2FC and rank by effect size.
# No p-value cutoff is applied in this cell.
mean_log2fc = DEA_result_df['average_log2fc']
top_upregulated = DEA_result_df.loc[mean_log2fc > 0].sort_values(by = 'average_log2fc', ascending = False)
top_downregulated = DEA_result_df.loc[mean_log2fc < 0].sort_values(by = 'average_log2fc', ascending = True)

In [154]:
def _go_csv_path(row):
    """Return the GO result CSV path for one row of the project table."""
    csv_name = f"top_file_{row['proj']}_{row['sample_abbr']}_GTEX_{row['matched_tissue']}.csv"
    return os.path.join(base_dir, row['proj'], 'GO', csv_name)


# One GO result file per row of the project table.
file_paths_GO = proj_codes_df.apply(_go_csv_path, axis = 1)

# Read every GO table and tag its rows with the project code it came from.
GO_dfs = [
    pd.read_csv(csv_path).assign(proj = proj_code)
    for proj_code, csv_path in zip(proj_codes_df['proj'], file_paths_GO)
]

In [211]:
# Stack the per-project GO tables into one long table with a fresh index.
GO_data = pd.concat(GO_dfs, ignore_index = True)

# How many rows each GO term occurs in, most frequent first.
term_counts = (
    GO_data['term']
    .value_counts()
    .reset_index()
    .set_axis(['term', 'count'], axis = 1)
)

# Per (term, category): Fisher-combined over- and under-representation p-values.
grouped_GO_data = (
    GO_data
    .groupby(['term','category'])
    .agg(
        combined_overrepresented_pvalue = ('over_represented_pvalue', fisher_method_threshold),
        combined_underrepresented_pvalue = ('under_represented_pvalue', fisher_method_threshold),
    )
    .reset_index()
)

# Per term: the distinct project codes in which it was reported.
project_codes = (
    GO_data
    .groupby('term')['proj']
    .apply(lambda projects: list(projects.unique()))
    .reset_index()
)

# Join counts, combined p-values and project lists on the term.
# NOTE(review): counts and project lists are keyed on 'term' alone while the
# p-values are grouped by ('term', 'category'); a term occurring under more
# than one category would share one count/project list across its rows -
# confirm this is intended.
GO_result_df = (
    term_counts
    .merge(grouped_GO_data, on = 'term')
    .merge(project_codes, on = 'term')
)

# Keep GO terms whose combined p-value is below 0.05, smallest p-value first.
over_p = GO_result_df['combined_overrepresented_pvalue']
under_p = GO_result_df['combined_underrepresented_pvalue']
top_overrepresented = GO_result_df.loc[over_p < 0.05].sort_values(by = 'combined_overrepresented_pvalue', ascending = True)
top_underrepresented = GO_result_df.loc[under_p < 0.05].sort_values(by = 'combined_underrepresented_pvalue', ascending = True)

In [212]:
# Write each result table to a CSV in the working directory, without the index.
for result_table, csv_name in [
    (top_upregulated, 'top_upregulated.csv'),
    (top_downregulated, 'top_downregulated.csv'),
    (top_overrepresented, 'top_overrepresented.csv'),
    (top_underrepresented, 'top_underrepresented.csv'),
]:
    result_table.to_csv(csv_name, index = False)

In [220]:
# Under-represented terms with a count of at least 10, shown as a
# comma-separated string of their 'category' values.
frequent_underrepresented = top_underrepresented.loc[top_underrepresented['count'].ge(10)]
UR = ', '.join(frequent_underrepresented['category'].astype(str))
UR

'GO:0005622, GO:0043229, GO:0043231, GO:0005737'

In [236]:
# Look up FANCG among the genes with a positive mean log2FC.
top_upregulated.loc[top_upregulated['Gene.symbol'].eq('FANCG')]


Unnamed: 0,Gene.symbol,count,combined_p_values,average_log2fc,proj
39,FANCG,28,3.0174729999999996e-26,0.269445,"[TCGA-LAML, TCGA-STAD, TCGA-SKCM, TCGA-BRCA, T..."


In [237]:
# Look up FANCG among the genes with a negative mean log2FC.
top_downregulated.loc[top_downregulated['Gene.symbol'].eq('FANCG')]

Unnamed: 0,Gene.symbol,count,combined_p_values,average_log2fc,proj
