In [4]:

import dask.dataframe as dd
import duckdb
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests
from scripts.pyensembl_operations import import_pyensembl
import sqlite3
pd.set_option('display.max_columns', None)   # Show all columns


In [5]:
# DuckDB table expression for the combined real-sample CSV; it is a SQL
# fragment meant to be interpolated into a FROM clause, not a filesystem path.
real_files = "read_csv_auto('results/dec7_combined.csv')" 


In [6]:
# Echo the configured source expression as a quick sanity check.
real_files

"read_csv_auto('results/dec7_combined.csv')"

In [2]:
def import_libraries():
    """Import the notebook's dependencies and hand them back to the caller.

    As a side effect, pandas is configured to display every column.

    Returns:
        tuple: ``(dd, duckdb, pd, np, sns, plt, sqlite3, import_pyensembl)``,
        in exactly that order.
    """
    # Out-of-core dataframe and SQL engines.
    import dask.dataframe as dd
    import duckdb

    # In-memory numerics.
    import pandas as pd
    import numpy as np

    # Plotting.
    import seaborn as sns
    import matplotlib.pyplot as plt

    # Project helper and the stdlib SQLite driver.
    from scripts.pyensembl_operations import import_pyensembl
    import sqlite3

    pd.set_option('display.max_columns', None)

    libraries = (dd, duckdb, pd, np, sns, plt, sqlite3, import_pyensembl)
    return libraries

def initialize_resources():
    """Create the pyensembl handle and the DuckDB source expressions.

    Returns:
        tuple: ``(g37, real_files, synth_files)`` where ``g37`` is the value
        returned by ``import_pyensembl(37)`` and the other two are SQL
        table-expression strings for the real CSV and the synthetic parquet
        files respectively.
    """
    synth_files = "read_parquet('results/processed/synth/**/*.parquet')"
    real_files = "read_csv_auto('results/dec7_combined.csv')"
    return import_pyensembl(37), real_files, synth_files

def pivot_counts(df):
    """Count rows per gene and regulation direction, one output row per gene.

    Args:
        df: Lazy dataframe with ``gene_id`` and ``is_gene_upregulated``
            columns; the grouped/size/reset_index result must expose
            ``.compute()``.

    Returns:
        DataFrame with columns ``gene_id``, ``downreg_count`` and
        ``upreg_count``; gene/direction combinations that never occur are 0.
    """
    group_keys = ['gene_id', 'is_gene_upregulated']

    # After reset_index() the unnamed size column carries the label 0.
    long_counts = df.groupby(group_keys).size().reset_index().compute()

    wide_counts = long_counts.pivot(
        index='gene_id',
        columns='is_gene_upregulated',
        values=0,
    )
    wide_counts = wide_counts.fillna(0).reset_index()

    # Positional rename: relies on the pivoted columns arriving as
    # (not-upregulated, upregulated) and on both directions being present.
    wide_counts.columns = ['gene_id', 'downreg_count', 'upreg_count']
    return wide_counts

def get_common_genes(real_files, synth_files, duckdb):
    """Return the gene_ids present in both the real and the synthetic data.

    Args:
        real_files: DuckDB table expression for the real data.
        synth_files: DuckDB table expression for the synthetic data.
        duckdb: Object exposing ``sql(query).df()`` (the duckdb module).

    Returns:
        set: gene_ids found in both sources, ignoring rows whose vcf_id is
        'PD4120a' or whose gene_id is 'not_found'.
    """
    # One template serves both sources; only the FROM target differs.
    query_template = """
        SELECT DISTINCT gene_id
        FROM {source}
        WHERE vcf_id != 'PD4120a' 
        AND gene_id != 'not_found'
    """

    # The real source is queried first, then the synthetic one.
    genes_in_real, genes_in_synth = (
        set(duckdb.sql(query_template.format(source=source)).df()['gene_id'])
        for source in (real_files, synth_files)
    )
    return genes_in_real & genes_in_synth

def load_and_process_data(dd):
    """Read the real and synthetic parquet data and drop excluded rows.

    Args:
        dd: Object exposing ``read_parquet(path)`` (dask.dataframe).

    Returns:
        tuple: ``(real, synth)`` frames without rows whose vcf_id is
        'PD4120a' or whose gene_id is "not_found".
    """
    def drop_excluded(frame):
        # Same two filters, applied in the same order, for either dataset.
        frame = frame[frame.vcf_id != 'PD4120a']
        return frame[frame.gene_id != "not_found"]

    sources = (
        "results/dec7_optimized.parquet",
        "results/processed/synth/**/*.parquet",
    )
    real, synth = (dd.read_parquet(path) for path in sources)

    return drop_excluded(real), drop_excluded(synth)

def process_common_genes(real, synth, common_genes, synth_scale=10):
    """Build per-gene up/down counts for real and synthetic data and merge them.

    Args:
        real: Lazy frame of real observations (passed to ``pivot_counts``).
        synth: Lazy frame of synthetic observations (passed to ``pivot_counts``).
        common_genes: Collection of gene_ids to keep in both tables.
        synth_scale: Divisor applied to the synthetic counts. Defaults to 10,
            the value previously hard-coded (presumably the number of
            synthetic replicates per real sample -- confirm).

    Returns:
        DataFrame with one row per gene kept from the real table and the
        columns ``gene_id``, ``downreg_count_real``, ``upreg_count_real``,
        ``downreg_count_synth`` and ``upreg_count_synth``. The real counts are
        cast to int; the synthetic counts are scaled floats.
    """
    count_cols = ['upreg_count', 'downreg_count']

    result_real = pivot_counts(real)
    result_synth = pivot_counts(synth)

    result_real = result_real[result_real.gene_id.isin(common_genes)]
    # Explicit copy: the scaling below must write to an independent frame and
    # not to a slice of the pivot table (avoids SettingWithCopyWarning).
    result_synth = result_synth[result_synth.gene_id.isin(common_genes)].copy()

    # Bring synthetic counts onto the same scale as the real ones.
    result_synth[count_cols] = result_synth[count_cols] / synth_scale

    results_merged = pd.merge(
        result_real,
        result_synth,
        on='gene_id',
        how='left',
        suffixes=('_real', '_synth'),
    )
    # pivot/fillna may have produced floats; the real counts are whole numbers.
    results_merged["downreg_count_real"] = results_merged["downreg_count_real"].astype(int)
    results_merged["upreg_count_real"] = results_merged["upreg_count_real"].astype(int)

    return results_merged

def calculate_log2_odds_ratio(a, b, c, d, k=0.5):
    """Return log2 of the smoothed odds ratio ``(a+k)(d+k) / ((b+k)(c+k))``.

    Args:
        a, b, c, d: The four cell counts of the 2x2 table.
        k: Smoothing constant added to every cell before the ratio is taken.
    """
    main_diagonal = (a + k) * (d + k)
    off_diagonal = (b + k) * (c + k)
    return np.log2(main_diagonal / off_diagonal)

def shrink_log2_odds(values, prior_scale=1.0, min_count=10):
    """Shrink per-gene log2 odds ratios toward their weighted mean.

    Each gene's raw log2 odds ratio (smoothing constant 0.5) is blended with
    the weighted average over all genes. The blending weight is
    ``1 - exp(-total_count / min_count)``, so genes with few observations are
    pulled more strongly toward the overall mean.

    Args:
        values: DataFrame with the columns ``upreg_count_real``,
            ``downreg_count_real``, ``upreg_count_synth`` and
            ``downreg_count_synth``.
        prior_scale: Accepted for backward compatibility; it does not
            influence the returned values.
        min_count: Count scale controlling how quickly the weight approaches 1.

    Returns:
        Series of shrunken log2 odds ratios, indexed like ``values``.
    """
    up_real = values['upreg_count_real']
    down_real = values['downreg_count_real']
    up_synth = values['upreg_count_synth']
    down_synth = values['downreg_count_synth']

    total_counts = up_real + down_real + up_synth + down_synth

    # The helper is purely element-wise, so whole columns can be passed in
    # directly without a slow row-by-row apply.
    raw_log2_odds = calculate_log2_odds_ratio(
        up_real, down_real, up_synth, down_synth, k=0.5
    )

    weights = 1 - np.exp(-total_counts / min_count)
    prior_mean = np.average(raw_log2_odds, weights=weights)

    return weights * raw_log2_odds + (1 - weights) * prior_mean

def perform_fisher_test_vectorized(df, pseudocount=0.5):
    """Run Fisher's exact test per row and add raw and BH-adjusted p-values.

    Each row yields the 2x2 table
    ``[[upreg_real, downreg_real], [upreg_synth, downreg_synth]]`` with
    ``pseudocount`` added to every cell. The tests themselves are run one row
    at a time.

    Args:
        df: DataFrame holding the four count columns. It is modified in place:
            the columns ``p_value`` and ``p_adj`` are added.
        pseudocount: Constant added to every cell of every table.

    Returns:
        The same ``df`` object that was passed in.
    """
    # NOTE(review): Fisher's exact test is defined on integer counts, while
    # these tables can be fractional (pseudocount, scaled synthetic counts).
    # Confirm how scipy treats non-integer cells before relying on the result.
    count_columns = [
        'upreg_count_real', 'downreg_count_real',
        'upreg_count_synth', 'downreg_count_synth',
    ]
    n_rows = len(df)
    # Row i becomes [[up_real, down_real], [up_synth, down_synth]].
    tables = (df[count_columns].to_numpy() + pseudocount).reshape(n_rows, 2, 2)

    p_values = np.array(
        [p_value for _, p_value in map(fisher_exact, tables)],
        dtype=float,
    )

    df['p_value'] = p_values
    df['p_adj'] = multipletests(p_values, method='fdr_bh')[1]
    return df

def add_z_score(df, column='log2_odds_ratio', out_column='z_score'):
    """Standardise a column and store the resulting z-scores in ``df``.

    Args:
        df: DataFrame containing ``column``. It is modified in place.
        column: Name of the column to standardise. Defaults to
            ``'log2_odds_ratio'``, the name that was previously hard-coded.
        out_column: Name of the column that receives the z-scores. Defaults to
            ``'z_score'``.

    Returns:
        The same ``df`` object, with ``out_column`` added or overwritten.

    Raises:
        KeyError: If ``column`` is not present in ``df``.
    """
    scores = df[column]
    # Series.std() is the sample standard deviation; a constant or single-row
    # column therefore yields NaN z-scores, not an exception.
    df[out_column] = (scores - scores.mean()) / scores.std()
    return df

def add_gene_information(df, g37, sqlite_path='data/mirscribe.db', alpha=0.05):
    """Annotate genes with names, biotypes, a significance flag and cancer roles.

    Args:
        df: DataFrame with ``gene_id`` and ``p_adj`` columns. It is modified
            in place: ``gene_name``, ``biotype`` and ``is_significant`` are
            added before the merge.
        g37: Object whose ``genes()`` yields items exposing ``gene_id``,
            ``gene_name`` and ``biotype``.
        sqlite_path: Path of the SQLite database containing a ``genes`` table.
        alpha: Threshold on ``p_adj`` below which a gene is flagged as
            significant. Defaults to 0.05, the value previously hard-coded.

    Returns:
        A new DataFrame: ``df`` left-joined on ``gene_id`` with the selected
        annotation columns of the ``genes`` table.
    """
    # Single pass over the annotation to fill both lookups.
    gene_names = {}
    biotypes = {}
    for gene in g37.genes():
        gene_names[gene.gene_id] = gene.gene_name
        biotypes[gene.gene_id] = gene.biotype

    df["gene_name"] = df["gene_id"].map(gene_names)
    df["biotype"] = df["gene_id"].map(biotypes)
    df["is_significant"] = df['p_adj'] < alpha

    sqlite_conn = sqlite3.connect(sqlite_path)
    try:
        genes = pd.read_sql('SELECT * FROM genes', sqlite_conn)
    finally:
        # Close the connection even if the read fails.
        sqlite_conn.close()

    cols_to_merge = ['gene_id', 'is_oncogene_oncokb', 'is_tsupp_oncokb',
                     'is_brca_driver', 'tier_cosmic', 'is_hallmark_cosmic',
                     'is_tsupp_cosmic', 'is_oncogene_cosmic', 'is_oncogene_consensus',
                     'is_tsupp_consensus', 'cancer_gene_role']

    return pd.merge(df, genes[cols_to_merge], how="left", on="gene_id")


In [3]:
# The libraries come from the notebook's import cell; the helper-based
# alternative below is left disabled.
# dd, duckdb, pd, np, sns, plt, sqlite3, import_pyensembl = import_libraries()
g37, real_files, synth_files = initialize_resources()

# Load and process data
# NOTE(review): common genes are derived from the CSV expression in
# `real_files`, while the counts are read from results/dec7_optimized.parquet
# inside load_and_process_data -- confirm both hold the same records.
real, synth = load_and_process_data(dd)
common_genes = get_common_genes(real_files, synth_files, duckdb)
results_merged = process_common_genes(real, synth, common_genes)

# Calculate statistics
# perform_fisher_test_vectorized adds p_value / p_adj to the frame it is given
# and returns that same object, so `df` and `results_merged` are one DataFrame here.
results_merged['shrunk_log2_odds'] = shrink_log2_odds(results_merged)
df = perform_fisher_test_vectorized(results_merged)

# Add gene information
# From here on `df` is the new merged frame returned by add_gene_information.
df = add_gene_information(df, g37)

INFO:pyensembl.sequence_data:Loaded sequence dictionary from /home/nazif/thesis/data/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /home/nazif/thesis/data/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /home/nazif/thesis/data/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.pep.all.fa.gz.pickle


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

OSError: [Errno 28] No space left on device

In [None]:
# Keep only the genes flagged as significant (p_adj below the cut-off).
# `is_significant` is already a boolean column, so it serves as the mask
# directly; comparing it with `== True` is redundant.
df = df[df["is_significant"]]

In [None]:
# Preview the first rows of the current `df`.
df.head()