In [1]:
import os
import sqlite3
import warnings
from contextlib import closing

import numpy as np
import pandas as pd
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests


db_path = "results.db"
# sqlite3's own connection context manager only commits or rolls back the
# transaction; wrapping the connection in closing() is what releases the
# database handle when the block exits.
with closing(sqlite3.connect(db_path)) as conn:
    # Preview the first 10 rows of each table.
    real = pd.read_sql_query("SELECT * FROM real LIMIT 10", conn)
    synth = pd.read_sql_query("SELECT * FROM synth LIMIT 10", conn)

In [2]:
def get_filtered_data(db_path, table_name, threshold=0.5):
    """Load the rows of ``table_name`` whose prediction shift exceeds ``threshold``.

    A row is returned only if ``gene_id != 'not_found'``,
    ``vcf_id != 'PD4120a'`` and
    ``ABS(wt_prediction - mut_prediction) > threshold``.

    Parameters
    ----------
    db_path : str
        Path of the SQLite database file.
    table_name : str
        Table to query. SQLite cannot bind identifiers as parameters, so the
        name is interpolated into the SQL text -- pass trusted names only.
    threshold : float, default 0.5
        Exclusive lower bound on ``ABS(wt_prediction - mut_prediction)``.
        Sent to SQLite as a bound parameter.

    Returns
    -------
    pandas.DataFrame
        The thirteen selected columns, cast to the dtypes in ``dtype_dict``.

    Notes
    -----
    Prints the row count and the (shallow) memory footprint as a side effect.
    """
    query = f"""
    SELECT
        id,
        wt_prediction,
        mut_prediction,
        pred_difference,
        vcf_id,
        mirna_accession,
        gene_id,
        is_intron,
        mutation_context,
        is_gene_upregulated,
        mutsig,
        gene_name,
        cancer_type
    FROM {table_name}
    WHERE gene_id != 'not_found'
    AND vcf_id != 'PD4120a'
    AND ABS(wt_prediction - mut_prediction) > ?
    """

    # Compact dtypes: narrow numerics and categoricals for the repeated strings.
    dtype_dict = {
        'id': 'int32',
        'wt_prediction': 'float32',
        'mut_prediction': 'float32',
        'pred_difference': 'float32',
        'vcf_id': 'category',
        'mirna_accession': 'category',
        'gene_id': 'category',
        'is_intron': 'bool',
        'mutation_context': 'category',
        'is_gene_upregulated': 'bool',
        'mutsig': 'category',
        'gene_name': 'category',
        'cancer_type': 'category'
    }

    # `with sqlite3.connect(...)` alone would leave the connection open;
    # closing() guarantees the handle is released even if the query fails.
    with closing(sqlite3.connect(db_path)) as conn:
        df = pd.read_sql_query(
            query,
            conn,
            params=(float(threshold),),
            dtype=dtype_dict
        )

    print(f"Total rows: {len(df)}")
    print(f"Memory usage: {df.memory_usage().sum() / 1024**2:.2f} MB")

    return df

def get_gene_regulation_counts(db_path, table_name, threshold=0.5):
    """Count up- and down-regulated rows per gene in one table.

    Only rows with ``vcf_id != 'PD4120a'``, ``gene_id != 'not_found'`` and
    ``ABS(wt_prediction - mut_prediction) > threshold`` are counted. Rows
    whose ``is_gene_upregulated`` is NULL match neither filter and so add to
    neither count.

    Parameters
    ----------
    db_path : str
        Path of the SQLite database file.
    table_name : str
        Table to query. SQLite cannot bind identifiers as parameters, so the
        name is interpolated into the SQL text -- pass trusted names only.
    threshold : float, default 0.5
        Exclusive lower bound on ``ABS(wt_prediction - mut_prediction)``.
        Sent to SQLite as a bound parameter.

    Returns
    -------
    pandas.DataFrame
        One row per gene with columns ``gene_id``, ``upregulated`` and
        ``downregulated``.

    Notes
    -----
    Prints the number of genes returned as a side effect.
    """
    query = f"""
    SELECT
        gene_id,
        COUNT(*) FILTER (WHERE is_gene_upregulated = TRUE) as upregulated,
        COUNT(*) FILTER (WHERE is_gene_upregulated = FALSE) as downregulated
    FROM {table_name}
    WHERE vcf_id != 'PD4120a'
    AND gene_id != 'not_found'
    AND ABS(wt_prediction - mut_prediction) > ?
    GROUP BY gene_id
    """

    # `with sqlite3.connect(...)` alone would leave the connection open;
    # closing() guarantees the handle is released even if the query fails.
    with closing(sqlite3.connect(db_path)) as conn:
        counts_df = pd.read_sql_query(query, conn, params=(float(threshold),))

    print(f"Total unique genes: {len(counts_df)}")
    return counts_df

def calculate_log2_odds_ratio(a, b, c, d, k=0.5):
    """Return log2 of the odds ratio of a 2x2 table after adding ``k`` to every cell.

    The value is ``log2(((a + k) * (d + k)) / ((b + k) * (c + k)))``. The
    caller in this file passes the cells in the order upregulated_real,
    downregulated_real, upregulated_synth, downregulated_synth. Scalars and
    array-likes that support ``+``, ``*`` and ``/`` both work.
    """
    concordant = (a + k) * (d + k)
    discordant = (b + k) * (c + k)
    return np.log2(concordant / discordant)

def shrink_log2_odds(values, prior_scale=1.0, min_count=10):
    """Shrink per-row log2 odds ratios toward their count-weighted mean.

    For every row the raw log2 odds ratio (pseudocount 0.5) is blended with a
    global mean: ``w * raw + (1 - w) * prior_mean`` where
    ``w = 1 - exp(-total_count / min_count)`` and ``prior_mean`` is the
    ``w``-weighted average of the raw values. Rows with few observations are
    therefore pulled hardest toward the mean.

    Parameters
    ----------
    values : pandas.DataFrame
        Needs the columns ``upregulated_real``, ``downregulated_real``,
        ``upregulated_synth`` and ``downregulated_synth``.
    prior_scale : float, default 1.0
        Kept for backward compatibility. It never influenced the returned
        values (it only fed an intermediate that was discarded).
    min_count : float, default 10
        Scale of the exponential weight; larger values shrink more strongly.

    Returns
    -------
    pandas.Series
        Shrunken log2 odds ratios, indexed like ``values``.

    Raises
    ------
    ZeroDivisionError
        From ``np.average`` when every weight is zero (all totals are zero).
    """
    up_real = values['upregulated_real']
    down_real = values['downregulated_real']
    up_synth = values['upregulated_synth']
    down_synth = values['downregulated_synth']

    total_counts = up_real + down_real + up_synth + down_synth

    # The helper is plain arithmetic plus np.log2, so whole columns can be
    # passed at once instead of applying it row by row.
    raw_log2_odds = calculate_log2_odds_ratio(
        up_real, down_real, up_synth, down_synth, k=0.5
    )

    weights = 1 - np.exp(-total_counts / min_count)
    prior_mean = np.average(raw_log2_odds, weights=weights)

    return weights * raw_log2_odds + (1 - weights) * prior_mean


In [None]:
real_counts = get_gene_regulation_counts(db_path, "real", 0.5)
synth_counts = get_gene_regulation_counts(db_path, "synth", 0.5)

# Inner join: only genes with qualifying rows in BOTH tables are kept.
counts = pd.merge(real_counts, synth_counts, how="inner", on="gene_id", suffixes=["_real", "_synth"])
# NOTE(review): presumably the synthetic table holds 10x as many samples as
# the real one, hence the rescaling -- confirm the factor before reusing it.
counts["upregulated_synth"] = counts["upregulated_synth"] / 10
counts["downregulated_synth"] = counts["downregulated_synth"] / 10

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("results/sql", exist_ok=True)
real_counts.to_csv("results/sql/real_postsql.csv", index=False)
synth_counts.to_csv("results/sql/synth_postsql.csv", index=False)
counts.to_csv("results/sql/merged_postsql.csv", index=False)

In [32]:
# Reload the merged per-gene counts from the CSV written by the export cell,
# so the analysis below can start here without re-querying the database.
counts = pd.read_csv("results/sql/merged_postsql.csv")

# Disabled: per-gene log2 odds ratio and its shrunken variant. Nothing else in
# this notebook creates the 'log2_odds_ratio' column that add_z_score reads.
# counts['log2_odds_ratio'] = counts.apply(lambda row: calculate_log2_odds_ratio(
#     row['upregulated_real'], 
#     row['downregulated_real'], 
#     row['upregulated_synth'], 
#     row['downregulated_synth']
# ), axis=1)

# counts['shrunk_log2_odds'] = shrink_log2_odds(counts)

In [6]:
def perform_fisher_test_vectorized(df, pseudocount=0.05):
    """Fisher's exact test per row (real vs. synthetic counts) with BH correction.

    Each row yields the 2x2 table
    ``[[upregulated_real, downregulated_real],
    [upregulated_synth, downregulated_synth]]``. Despite the name, the test
    itself runs once per row in a Python loop; only the table construction is
    vectorised.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the four count columns named above.
    pseudocount : float, default 0.05
        Added to every cell before the tables are converted to integers.
        The conversion truncates toward zero, so a pseudocount in [0, 1)
        leaves whole-number counts unchanged, and fractional cells lose
        their fractional part (33.7 + 0.05 is tested as 33).

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, modified in place, with two new columns:
        ``p_value`` (two-sided Fisher p-value) and ``p_adj``
        (Benjamini-Hochberg adjusted p-value).
    """
    cells = np.array([
        [df['upregulated_real'] + pseudocount, df['downregulated_real'] + pseudocount],
        [df['upregulated_synth'] + pseudocount, df['downregulated_synth'] + pseudocount]
    ]).transpose((2, 0, 1))  # shape (n_rows, 2, 2)

    # Fisher's exact test is defined on integer counts. Converting here, once,
    # makes the truncation visible instead of leaving it to fisher_exact.
    # NOTE(review): perform_fisher_test_cancer rounds the synthetic counts
    # before this step; decide whether truncation is really wanted here.
    tables = cells.astype(np.int64)

    p_values = np.zeros(len(df))
    for i, table in enumerate(tables):
        _, p_values[i] = fisher_exact(table)

    df['p_value'] = p_values
    if len(p_values) > 0:
        df['p_adj'] = multipletests(p_values, method='fdr_bh')[1]
    else:
        # An empty frame has no p-values to adjust; skip the multiple-testing
        # call and emit an empty column instead.
        df['p_adj'] = p_values.copy()

    return df
def add_z_score(df):
    """Add a ``z_score`` column standardising ``log2_odds_ratio``.

    The z-score is ``(value - mean) / std`` using the column's own mean and
    pandas' default (sample) standard deviation. ``df`` is modified in place
    and the same object is returned.
    """
    log2or = df['log2_odds_ratio']
    centre = log2or.mean()
    spread = log2or.std()

    df['z_score'] = (log2or - centre) / spread

    return df


# The function adds p_value / p_adj to `counts` in place and returns the same
# object, so `df` and `counts` refer to one DataFrame after this line.
df = perform_fisher_test_vectorized(counts)
# Disabled: add_z_score reads a 'log2_odds_ratio' column, which is only
# produced by code that is currently commented out.
# df = add_z_score(df)

In [33]:
def perform_fisher_test_cancer(df, pseudocount=0.01):
    """
    Performs Fisher's exact test comparing real cancer mutation effects vs synthetic background

    Each row yields the 2x2 table
    [[upregulated_real, downregulated_real],
     [round(upregulated_synth), round(downregulated_synth)]].
    The odds ratio returned by fisher_exact is discarded; only the p-value is kept.

    Parameters:
    df: DataFrame with columns [upregulated_real, downregulated_real, upregulated_synth, downregulated_synth]
    pseudocount: added to every cell before the tables are converted to
        integers. The conversion truncates toward zero, so a value in [0, 1)
        leaves whole-number counts unchanged.

    Returns:
    The same DataFrame object, modified in place, with two new columns:
    p_value (two-sided Fisher p-value) and p_adj (Benjamini-Hochberg adjusted).
    """
    # Synthetic counts may be fractional, so they are rounded to the nearest
    # integer (np.round rounds halves to even) before the pseudocount is added.
    cells = np.array([
        [df['upregulated_real'] + pseudocount, df['downregulated_real'] + pseudocount],
        [np.round(df['upregulated_synth']) + pseudocount, np.round(df['downregulated_synth']) + pseudocount]
    ]).transpose((2, 0, 1))  # shape (n_rows, 2, 2)

    p_values = np.zeros(len(df))

    # All warnings raised while converting and testing are silenced, as before.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # Fisher's exact test is defined on integer counts. Converting here
        # makes the truncation visible instead of leaving it to fisher_exact.
        tables = cells.astype(np.int64)
        for i, table in enumerate(tables):
            _, p_values[i] = fisher_exact(table)

    df['p_value'] = p_values
    if len(p_values) > 0:
        df['p_adj'] = multipletests(p_values, method='fdr_bh')[1]
    else:
        # An empty frame has no p-values to adjust; skip the multiple-testing
        # call and emit an empty column instead.
        df['p_adj'] = p_values.copy()

    return df

# `warnings`, which perform_fisher_test_cancer uses, is imported with the
# other modules in the notebook's first cell.
counts = perform_fisher_test_cancer(counts)

In [34]:
# Show the per-gene table with the p_value / p_adj columns added by the Fisher test.
counts

Unnamed: 0,gene_id,upregulated_real,downregulated_real,upregulated_synth,downregulated_synth,p_value,p_adj
0,ENSG00000000003,11,2,33.7,4.1,0.637677,1.000000
1,ENSG00000000005,11,2,10.8,3.8,0.654589,1.000000
2,ENSG00000000419,20,0,22.5,4.7,0.062737,1.000000
3,ENSG00000000457,29,16,40.0,13.4,0.271180,1.000000
4,ENSG00000000460,258,50,188.5,74.7,0.000527,0.186283
...,...,...,...,...,...,...,...
28227,ENSG00000273442,0,1,0.1,0.5,1.000000,1.000000
28228,ENSG00000273471,4,0,2.8,6.6,0.069930,1.000000
28229,ENSG00000273472,5,0,3.9,0.0,1.000000,1.000000
28230,ENSG00000273481,0,2,2.1,1.8,0.466667,1.000000


In [35]:
# Genes whose Benjamini-Hochberg adjusted p-value is below 0.05.
counts[counts.p_adj < 0.05]

Unnamed: 0,gene_id,upregulated_real,downregulated_real,upregulated_synth,downregulated_synth,p_value,p_adj
371,ENSG00000018189,42,62,96.2,38.3,1.606308e-06,0.004845006
657,ENSG00000049618,325,53,325.0,118.8,6.930279e-06,0.01505043
674,ENSG00000050628,226,154,305.1,108.8,2.336874e-05,0.03879331
1595,ENSG00000084093,10,15,51.0,7.0,1.548207e-05,0.03122071
1962,ENSG00000093167,108,13,97.1,49.5,5.835759e-06,0.01372959
1972,ENSG00000094914,2,19,30.9,10.6,1.71614e-06,0.004845006
2815,ENSG00000104324,641,136,593.2,213.3,2.094238e-05,0.03695284
4175,ENSG00000114650,10,25,54.2,19.8,2.008089e-05,0.03695284
4468,ENSG00000116754,41,0,40.0,18.5,2.502905e-05,0.03879331
5848,ENSG00000129116,191,164,383.7,146.6,1.928242e-08,0.0001088762
