In [1]:
import pandas as pd
import os 
import numpy as np
from scipy import stats
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import fisher_exact


# Read the real dataset and the 10 synthetic datasets

In [2]:
# real = pd.read_parquet("data/synth/real.parquet")
# all = pd.read_parquet("data/synth/all.parquet")

# # sourcery skip: avoid-builtin-shadow
# real_genes = set(real.gene_id)    
# all_genes = set(all.gene_id)

# # details
# print(f"Real genes: {len(real_genes)}")
# print(f"All genes: {len(all_genes)}")
# print(f"Real genes not in all: {len(real_genes - all_genes)}")
# print(f"Crossection: {len(real_genes & all_genes)}")

# # taking the intersection
# common_genes = list(set(real.gene_id)  & set(all.gene_id))
# real = real[real.gene_id.isin(common_genes)]
# all = all[all.gene_id.isin(common_genes)]

# # dropping not_found gene ids
# real = real[real.gene_id != "not_found"]
# all = all[all.gene_id != "not_found"]

# real_2 = real.groupby(['gene_id', 'is_gene_upregulated']).size().unstack(fill_value=0)
# synth_2 = all.groupby(['gene_id', 'is_gene_upregulated']).size().unstack(fill_value=0).div(10)
# values = pd.merge(real_2, synth_2, on="gene_id", suffixes=["_real", "_synth"])
# values.to_csv("data/synth/values.csv", index=True)

In [3]:
# Column labels as stored in the CSV -> descriptive names used downstream.
# The False_/True_ prefixes presumably come from the `is_gene_upregulated`
# flag in the commented-out preprocessing cell; "gene_id" maps to itself.
rename_dict = {
    "gene_id": "gene_id",
    "False_real": "downregulated_real",
    "True_real": "upregulated_real",
    "False_synth": "downregulated_synth",
    "True_synth": "upregulated_synth"
}

# Load the per-gene count table and apply the renaming in a single chain.
values = pd.read_csv("data/synth/values.csv").rename(columns=rename_dict)

In [4]:
def calculate_log2_odds_ratio(a, b, c, d, k=0.5):
    """Return the log2 odds ratio of a 2x2 table after adding ``k`` to every cell.

    The table is laid out as ``[[a, b], [c, d]]``, so the smoothed odds ratio is
    ``((a + k) * (d + k)) / ((b + k) * (c + k))``.

    Only element-wise arithmetic and ``np.log2`` are used, so the arguments may
    be scalars or array-likes of matching shape.

    Parameters
    ----------
    a, b, c, d : number or array-like
        The four cells of the contingency table.
    k : float, default 0.5
        Smoothing constant added to each cell.

    Returns
    -------
    Same kind as the inputs (scalar or array-like): the log2 odds ratio.
    """
    numerator = (a + k) * (d + k)
    denominator = (b + k) * (c + k)
    return np.log2(numerator / denominator)

# Additive smoothing with k=0.5 per cell (Haldane-Anscombe correction, which
# corresponds to a Jeffreys prior); "Laplace" smoothing would be k=1.
# calculate_log2_odds_ratio uses only element-wise arithmetic, so it is applied
# to whole columns at once instead of row by row through DataFrame.apply.
values['log2_odds_ratio'] = calculate_log2_odds_ratio(
    values['upregulated_real'],
    values['downregulated_real'],
    values['upregulated_synth'],
    values['downregulated_synth'],
)

In [5]:
def perform_fisher_test_vectorized(df, pseudocount=0.5):
    """Run Fisher's exact test per row and add Benjamini-Hochberg adjusted p-values.

    Every row of ``df`` supplies one 2x2 contingency table::

        [[upregulated_real,  downregulated_real],
         [upregulated_synth, downregulated_synth]]

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the four count columns shown above.  The frame is
        modified in place (two columns are added or overwritten).
    pseudocount : float, default 0.5
        Added to every cell before the cells are truncated to integers.
        Fisher's exact test works on integer counts, so the fractional part
        never reaches the test: with the default 0.5 the net effect is that
        each non-negative count is rounded to the nearest integer (halves up).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``p_value`` (raw p-value from
        ``fisher_exact`` with its default alternative) and ``p_adj``
        (``fdr_bh`` correction over all rows).
    """
    count_columns = [
        'upregulated_real', 'downregulated_real',
        'upregulated_synth', 'downregulated_synth',
    ]

    # One row of four cells -> one 2x2 table.  The integer cast is made
    # explicit here; SciPy's fisher_exact converts its table to int64 anyway,
    # so passing the float table would truncate silently with the same result.
    tables = (
        (df[count_columns].to_numpy(dtype=float) + pseudocount)
        .astype(np.int64)
        .reshape(-1, 2, 2)
    )

    n_tests = len(tables)
    if n_tests == 0:
        # Nothing to test: add empty result columns and skip multipletests,
        # which is not meaningful (and errors out) for zero p-values.
        df['p_value'] = np.zeros(0)
        df['p_adj'] = np.zeros(0)
        return df

    p_values = np.empty(n_tests)
    for i, table in enumerate(tables):
        _, p_values[i] = fisher_exact(table)

    df['p_value'] = p_values
    df['p_adj'] = multipletests(p_values, method='fdr_bh')[1]

    return df
def add_z_score(df):
    """Add a ``z_score`` column standardising ``log2_odds_ratio`` across rows.

    The z-score is ``(value - mean) / std`` using the column's own mean and
    pandas' default (sample) standard deviation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``log2_odds_ratio`` column.  Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the ``z_score`` column added.
    """
    log2or = df['log2_odds_ratio']

    # Centre on the mean, then scale by the spread of the same column.
    centered = log2or - log2or.mean()
    df['z_score'] = centered / log2or.std()

    return df


# Per-gene Fisher test with BH correction, followed by z-scores of the
# log2 odds ratios.  Both helpers modify and return the frame they receive.
df = add_z_score(perform_fisher_test_vectorized(values))

# Cut-offs for the (currently disabled) significance filter below.
# 0.32 is roughly log2(1.25), i.e. about a 25% change in odds.
p_threshold = 0.05
log2_threshold = 0.32

# df = df[(abs(df['log2_odds_ratio']) > log2_threshold) & (df['p_adj'] < p_threshold)]

In [6]:
import pandas as pd
from pyensembl import EnsemblRelease

def get_gene_details(df, gene_id_column, release=75):
    """Annotate ``df`` with Ensembl gene name, biotype and chromosome.

    Each distinct ID in ``gene_id_column`` is looked up once in the given
    Ensembl release and the annotations are left-joined back onto ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one Ensembl gene ID per row in ``gene_id_column``.
    gene_id_column : str
        Name of the column holding the gene IDs; also used as the join key.
    release : int, default 75
        Ensembl release number handed to ``EnsemblRelease``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows of ``df`` in their original order (the merge
        produces a fresh integer index) plus the columns ``gene_name``,
        ``biotype`` and ``chromosome``.  IDs for which the lookup raises
        ``ValueError`` keep their row and get missing values in those columns.
    """
    # Initialize Ensembl data for the specified release
    ensembl = EnsemblRelease(release)
    detail_columns = [gene_id_column, 'gene_name', 'biotype', 'chromosome']

    def fetch_details(gene_id):
        """Return the annotation record for one ID, or None if the lookup fails."""
        try:
            gene = ensembl.gene_by_id(gene_id)
        except ValueError:
            # Unknown ID: no record, so the left merge leaves the row unannotated.
            return None
        return {
            # Key on the ID that was queried so the join always matches `df`.
            gene_id_column: gene_id,
            'gene_name': gene.gene_name,
            'biotype': gene.biotype,
            'chromosome': gene.contig,
        }

    # One lookup per distinct ID: keeps the details table unique on the join
    # key, so repeated IDs in `df` cannot multiply rows in the merge.
    unique_ids = df[gene_id_column].drop_duplicates()
    records = [rec for rec in map(fetch_details, unique_ids) if rec is not None]

    # Explicit columns so the frame has the right shape even with no matches.
    details_df = pd.DataFrame(records, columns=detail_columns)

    # Join on the caller-supplied column rather than a hard-coded 'gene_id'.
    return pd.merge(df, details_df, on=gene_id_column, how='left')
# Annotate every tested gene with its Ensembl gene name, biotype and chromosome.
result = get_gene_details(df, 'gene_id')


In [7]:
# Drop the raw count columns; only the derived statistics are needed from here on.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# idempotent: re-running it once the columns are gone is a no-op, not a KeyError.
result = result.drop(
    columns=['upregulated_synth', 'downregulated_synth', 'upregulated_real', 'downregulated_real'],
    errors='ignore',
)

In [8]:
import pandas as pd
import numpy as np
import gseapy as gp
import matplotlib.pyplot as plt

# Build a pre-ranked gene list from `result` and run GSEA against KEGG.

# One seed for both the tie-breaking jitter and the GSEA permutations, so a
# fresh "Restart & Run All" reproduces the same ranking and enrichment table.
GSEA_SEED = 123

# -log10(0) is infinite: replace exact-zero adjusted p-values with half of the
# smallest non-zero one.
min_pvalue = result['p_adj'][result['p_adj'] > 0].min() / 2
result['p_adj'] = result['p_adj'].replace(0, min_pvalue)

# Ranking metric: effect size (log2 odds ratio) weighted by significance.
result['rank'] = result['log2_odds_ratio'] * (-np.log10(result['p_adj']))
result['rank'] = result['rank'].clip(lower=-1e10, upper=1e10)  # Clip extreme values

# Break ties with negligible jitter drawn from a seeded generator
# (an unseeded np.random.random call gave a different order on every run).
tie_breaker = np.random.default_rng(GSEA_SEED).random(len(result)) * 1e-10
result['rank'] = result['rank'] + tie_breaker

# Sort by rank, strongest positive score first
df_sorted = result.sort_values('rank', ascending=False)

# Series of ranks indexed by gene name.  Genes whose Ensembl lookup failed have
# no gene_name, and different gene IDs can share one name; keep a single,
# top-ranked entry per name so the ranking has unique, non-missing labels.
ranked_genes = (
    df_sorted
    .dropna(subset=['gene_name'])
    .drop_duplicates(subset='gene_name', keep='first')
    .set_index('gene_name')['rank']
)

# Run preranked GSEA
pre_res = gp.prerank(rnk=ranked_genes,
                     gene_sets='KEGG_2021_Human',
                     threads=4,
                     permutation_num=1000,
                     min_size=10,
                     max_size=1000,
                     outdir='gsea_results/prerank_result',
                     format='png',
                     seed=GSEA_SEED)


# Get the results, ordered from highest to lowest normalised enrichment score
results = pre_res.res2d
sorted_results = results.sort_values('NES', ascending=False)

In [9]:
# Display the enrichment table, ordered from highest to lowest NES.
sorted_results

Unnamed: 0,Name,Term,ES,NES,NOM p-val,FDR q-val,FWER p-val,Tag %,Gene %,Lead_genes
0,prerank,Glyoxylate and dicarboxylate metabolism,0.57868,1.651448,0.047619,0.008535,0.006,2/13,2.18%,HAO2;ACAT1
298,prerank,Mannose type O-glycan biosynthesis,-0.270059,-0.454663,0.996929,0.999366,1.0,2/11,29.79%,FKTN;FUT9
297,prerank,Antigen processing and presentation,-0.347592,-0.623736,0.989,0.995351,1.0,8/30,39.94%,HLA-DPA1;KIR3DL1;KLRD1;HLA-F;CD4;CANX;HSP90AA1...
296,prerank,Glycolysis / Gluconeogenesis,-0.364853,-0.653153,0.977978,0.99539,1.0,11/29,45.51%,PGK1;MINPP1;HK1;ADH1B;HKDC1;LDHB;ALDH3B2;ALDH9...
295,prerank,Ascorbate and aldarate metabolism,-0.383196,-0.658629,0.956566,0.997968,1.0,8/14,40.46%,UGT2B7;UGT1A8;KL;RGN;UGT1A6;UGT2A3;ALDH9A1;UGT...
...,...,...,...,...,...,...,...,...,...,...
5,prerank,Glutamatergic synapse,-0.697959,-1.302775,0.0,0.618783,0.927,36/83,18.82%,GNGT1;GNAQ;PLCB4;GRIA2;CACNA1C;GRIN2B;PRKCA;SL...
4,prerank,Taste transduction,-0.725356,-1.309327,0.007,0.697805,0.9,20/36,21.57%,PLCB4;CHRM3;PDE1C;CACNA1C;ASIC2;PDE1A;GABRA1;S...
3,prerank,African trypanosomiasis,-0.767753,-1.349436,0.014056,0.457877,0.709,3/19,1.09%,GNAQ;PLCB4;PRKCA
2,prerank,Folate biosynthesis,-0.859278,-1.444292,0.010215,0.0999,0.18,3/10,7.53%,GPHN;PAH;AKR1C3
