In [1]:
import dask.dataframe as dd
import duckdb
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests
from scripts.pyensembl_operations import import_pyensembl
import sqlite3

# Load the Ensembl annotation through the project's pyensembl helper
# (called with 37; the loader's log output reports GRCh37 / Ensembl 75).
g37 = import_pyensembl(37)

# DuckDB table expressions that get interpolated into the FROM clause of the
# SQL queries. Each one must use the reader matching the on-disk format:
# the real data is a single CSV, the synthetic data is a parquet glob.
real_files = "read_csv_auto('{}')".format('results/dec7_combined.csv')
synth_files = "read_parquet('{}')".format('results/processed/synth/**/*.parquet')


INFO:pyensembl.sequence_data:Loaded sequence dictionary from /home/nazif/thesis/data/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /home/nazif/thesis/data/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /home/nazif/thesis/data/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.pep.all.fa.gz.pickle


In [2]:
def _regulation_counts_query(source):
    """Build the per-gene up/down-regulation count query for one data source.

    Parameters
    ----------
    source : str
        A DuckDB table expression (e.g. a ``read_csv_auto(...)`` or
        ``read_parquet(...)`` call) that is placed in the FROM clause.

    Returns
    -------
    str
        SQL that excludes rows with vcf_id 'PD4120a' and gene_id 'not_found',
        groups by gene_id, and counts rows where ``is_gene_upregulated`` is
        TRUE (``upregulated``) and FALSE (``downregulated``).
    """
    return f"""
    SELECT
        gene_id,
        COUNT(*) FILTER (WHERE is_gene_upregulated = TRUE) as upregulated,
        COUNT(*) FILTER (WHERE is_gene_upregulated = FALSE) as downregulated
    FROM {source}
    WHERE vcf_id != 'PD4120a'
    AND gene_id != 'not_found'
    GROUP BY gene_id
"""


# One query template for both datasets, so the real and synthetic counts are
# guaranteed to be computed with exactly the same filters.
query_real = _regulation_counts_query(real_files)
query_synth = _regulation_counts_query(synth_files)

result_real = duckdb.sql(query_real).df()
result_synth = duckdb.sql(query_synth).df()


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [None]:
# Open the real (CSV) and synthetic (parquet) datasets as dask DataFrames.
real = dd.read_csv("results/dec7_combined.csv")
synth = dd.read_parquet("results/processed/synth/**/*.parquet")

# Keep every row except those whose vcf_id is PD4120a.
real = real[real["vcf_id"] != 'PD4120a']
synth = synth[synth["vcf_id"] != 'PD4120a']


# Taking the intersection and merging the DataFrames together

In [None]:
# Genes that have counts in both the real and the synthetic results.
common_genes = list(set(result_real.gene_id) & set(result_synth.gene_id))

# Number of synthetic runs the synthetic counts are averaged over.
n_synth_runs = 10

# Derive new frames rather than overwriting result_real / result_synth.
# Reassigning the inputs would make this cell non-idempotent: every re-run
# would divide the synthetic counts by 10 again. Using .assign() on the
# filtered frame also avoids assigning columns on a slice
# (pandas SettingWithCopyWarning).
real_common = result_real[
    result_real.gene_id.isin(common_genes) & (result_real.gene_id != "not_found")
]
synth_common = result_synth[
    result_synth.gene_id.isin(common_genes) & (result_synth.gene_id != "not_found")
]

# Average the synthetic counts over the synthetic runs.
synth_avg = synth_common.assign(
    upregulated=synth_common["upregulated"] / n_synth_runs,
    downregulated=synth_common["downregulated"] / n_synth_runs,
)

# One row per gene, with *_real and *_synth count columns side by side.
values = pd.merge(real_common, synth_avg, on="gene_id", suffixes=("_real", "_synth"))


# steps

drop PD4120a

drop gene_id == not_found

take the intersection of real and synth

take avg of 10 synths


# log2odds

In [None]:
def calculate_log2_odds_ratio(a, b, c, d, k=0.5):
    """Return the smoothed log2 odds ratio of a 2x2 contingency table.

    The table is laid out as [[a, b], [c, d]]. The constant ``k`` is added
    to every cell before the ratio is formed, so the result stays finite
    when a cell is zero.

    Parameters
    ----------
    a, b, c, d : number or array-like
        The four cells of the contingency table.
    k : float, default 0.5
        Smoothing constant added to each cell.

    Returns
    -------
    log2(((a + k) * (d + k)) / ((b + k) * (c + k)))
    """
    numerator = (a + k) * (d + k)
    denominator = (b + k) * (c + k)
    return np.log2(numerator / denominator)

# Additive smoothing with k=0.5 (the function's default) keeps the ratio
# finite for genes with a zero count in any cell.
# The helper only uses arithmetic and np.log2, so it is applied to whole
# columns at once instead of row by row with DataFrame.apply(axis=1): same
# values, much faster, and it still yields a column when the frame is empty.
values['log2_odds_ratio'] = calculate_log2_odds_ratio(
    values['upregulated_real'],
    values['downregulated_real'],
    values['upregulated_synth'],
    values['downregulated_synth'],
)

# Preview only; avoid rendering the whole table in the notebook.
values.head()

# fisher's exact

In [None]:
def perform_fisher_test_vectorized(df, pseudocount=0.5):
    """Run Fisher's exact test for every row and add raw and adjusted p-values.

    For each row the 2x2 table
    ``[[upregulated_real, downregulated_real],
       [upregulated_synth, downregulated_synth]]``
    is built with ``pseudocount`` added to every cell.

    ``scipy.stats.fisher_exact`` operates on integer counts and converts its
    input to int64. That conversion is done explicitly here (truncation), so
    the behaviour is visible: with the default pseudocount of 0.5 every
    non-negative cell is effectively rounded to the nearest integer
    (halves round up), which matters for the averaged synthetic counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the four count columns named above.
    pseudocount : float, default 0.5
        Value added to each cell before the integer conversion.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with two extra columns: ``p_value`` (two-sided
        Fisher p-value) and ``p_adj`` (Benjamini-Hochberg adjusted p-value
        from ``multipletests(..., method='fdr_bh')``). The input frame is
        left unmodified.
    """
    count_columns = [
        'upregulated_real', 'downregulated_real',
        'upregulated_synth', 'downregulated_synth',
    ]

    # Work on a copy so the caller's frame is not mutated as a side effect.
    result = df.copy()

    # Nothing to test: return the expected columns without calling the
    # multiple-testing correction on an empty array.
    if result.empty:
        result['p_value'] = np.array([], dtype=float)
        result['p_adj'] = np.array([], dtype=float)
        return result

    # Shape (n_rows, 2, 2): row i is [[up_real, down_real], [up_synth, down_synth]].
    tables = (
        (result[count_columns].to_numpy(dtype=float) + pseudocount)
        .astype(np.int64)
        .reshape(-1, 2, 2)
    )

    p_values = np.zeros(len(result))
    for i, contingency in enumerate(tables):
        _, p_values[i] = fisher_exact(contingency)

    result['p_value'] = p_values
    result['p_adj'] = multipletests(p_values, method='fdr_bh')[1]

    return result
def add_z_score(df):
    """Add a ``z_score`` column standardising ``log2_odds_ratio``.

    The z-score is ``(log2_odds_ratio - mean) / std`` computed over all rows
    of ``df``, using pandas' default ``Series.std`` (sample standard
    deviation, ddof=1). With fewer than two rows, or when every value is
    identical, the resulting z-scores are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``log2_odds_ratio`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the ``z_score`` column added. The input frame
        is left unmodified.
    """
    # Copy first so the caller's frame is not mutated as a side effect.
    result = df.copy()

    log2_or = result['log2_odds_ratio']
    result['z_score'] = (log2_or - log2_or.mean()) / log2_or.std()

    return result


# Fisher p-values (raw and adjusted) first, then z-scores of the log2 odds ratios.
df = values.pipe(perform_fisher_test_vectorized).pipe(add_z_score)
df.head()

# adding pyensembl stuff

In [None]:
# Build gene_id -> gene_name and gene_id -> biotype lookups in a single pass,
# so g37.genes() is called (and iterated) only once instead of twice.
gene_names = {}
biotypes = {}
for gene in g37.genes():
    gene_names[gene.gene_id] = gene.gene_name
    biotypes[gene.gene_id] = gene.biotype

# Add both columns to the DataFrame; gene_ids missing from the lookup map to NaN.
df["gene_name"] = df["gene_id"].map(gene_names)
df["biotype"] = df["gene_id"].map(biotypes)

df.head()


# further augments

In [None]:
# Flag genes whose adjusted p-value is below the 0.05 threshold.
df["is_significant"] = df['p_adj'] < 0.05

# Load the genes and mirnas tables from the project database.
# try/finally guarantees the connection is closed even if a read fails.
sqlite_conn = sqlite3.connect('data/mirscribe.db')
try:
    genes = pd.read_sql('SELECT * FROM genes', sqlite_conn)
    mirnas = pd.read_sql('SELECT * FROM mirnas', sqlite_conn)
finally:
    sqlite_conn.close()


In [None]:
# Preview the genes table rather than rendering the whole frame.
genes.head()

In [None]:


# Annotation columns taken from the genes table, joined on gene_id.
# Each column is listed exactly once: a repeated name would select that
# column twice from `genes`.
cols_to_merge = ['gene_id', 'is_oncogene_oncokb', 'is_tsupp_oncokb',
       'is_brca_driver', 'tier_cosmic', 'is_hallmark_cosmic',
       'is_tsupp_cosmic', 'is_oncogene_cosmic', 'is_oncogene_consensus',
       'is_tsupp_consensus', 'cancer_gene_role']

# Left join keeps every row of df; genes absent from the table get NaN annotations.
df = pd.merge(df, genes[cols_to_merge], how="left", on="gene_id")

df.head()

In [None]:
# Write the annotated per-gene table to disk; the row index is not written.
df.to_csv(path_or_buf="results/dec7_final_data.csv", index=False)
