In [31]:
import os
import sqlite3

import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

PSEUDOCOUNT = 0.1

In [2]:
# Columns selected from the `genes` table and merged onto the data by `gene_id`.
cols_to_merge = [
    'gene_id',
    'is_oncogene_oncokb',
    'is_tsupp_oncokb',
    'is_driver_intogen',
    'tier_cosmic',
    'is_hallmark_cosmic',
    'is_tsupp_cosmic',
    'is_oncogene_cosmic',
    'is_oncogene_consensus',
    'is_tsupp_consensus',
    'is_gene_of_interest',
    'cancer_gene_role',
]

In [3]:

# Lazy Dask reads of the real and synthetic result sets. `gene_id` is cast to
# a pyarrow-backed string so it has the same dtype as the annotation table's
# key used in the merges below.
real = dd.read_parquet("results/processed/real/0/*.parquet").astype({'gene_id': 'string[pyarrow]'})
synth = dd.read_parquet("results/processed/synth/**/*.parquet").astype({'gene_id': 'string[pyarrow]'})

# Load the gene annotation columns from SQLite into pandas.
# `with sqlite3.connect(...)` only commits/rolls back the transaction; it does
# NOT close the connection, so the handle is closed explicitly here.
conn = sqlite3.connect('data/mirscribe_backup.db')
try:
    genes = pd.read_sql(f"SELECT {','.join(cols_to_merge)} FROM genes", conn)
finally:
    conn.close()
genes['gene_id'] = genes['gene_id'].astype('string[pyarrow]')

# Wrap the annotation table as a single-partition Dask frame for the joins.
genes_dd = dd.from_pandas(genes[cols_to_merge], npartitions=1)

# Lazy left joins: every row of real/synth is kept, and rows whose gene_id has
# no match in `genes` get missing values in the annotation columns.
real = real.merge(genes_dd, on='gene_id', how='left')
synth = synth.merge(genes_dd, on='gene_id', how='left')

In [None]:
# from scripts.pyensembl_operations import import_pyensembl
# g37 = import_pyensembl(37)

# # Create dictionaries for both gene names and biotypes
# gene_names = {gene.gene_id: gene.gene_name for gene in g37.genes()}
# biotypes = {gene.gene_id: gene.biotype for gene in g37.genes()}

# # Add both columns to the DataFrame
# real["gene_name"] = real["gene_id"].map(gene_names)
# real["biotype"] = real["gene_id"].map(biotypes)

# synth["gene_name"] = synth["gene_id"].map(gene_names)
# synth["biotype"] = synth["gene_id"].map(biotypes)

In [None]:
# real_2 = (real[real['is_oncogene_consensus'] == True]
#           .groupby(['gene_id', 'is_gene_upregulated'])
#           .size()
#           .to_frame('count')  # use to_frame instead of reset_index(name=...)
#           .reset_index()      # then reset_index separately
#           .compute())

# pivoted = real_2.pivot(index='gene_id', columns='is_gene_upregulated', values='count').fillna(0).astype(int)

# with np.errstate(divide='raise', invalid='ignore'):
#     upreg_downreg_ratio = (pivoted[True] + PSEUDOCOUNT) / (pivoted[False] + PSEUDOCOUNT)
#     upreg_all_ratio = (pivoted[True]) / (pivoted[True] + pivoted[False])

# pivoted['upreg_downreg_ratio'] = upreg_downreg_ratio.round(3)


In [33]:
def calculate_gene_regulation_ratios(df, filter_column='is_oncogene_consensus', pseudocount=0.1):
    """Count up-/down-regulated rows per gene and compute their ratio.

    Rows where ``df[filter_column] == True`` are grouped by ``gene_id`` and
    ``is_gene_upregulated`` and counted. The counts are pivoted to one row per
    gene, and the ratio ``(up + pseudocount) / (down + pseudocount)`` is added,
    rounded to 3 decimals.

    Parameters
    ----------
    df : dask or pandas DataFrame
        Must contain ``gene_id``, ``is_gene_upregulated`` and ``filter_column``.
        If the grouped result exposes ``.compute()`` (a lazy Dask frame) it is
        materialised; a plain pandas frame is used as-is.
    filter_column : str
        Name of the flag column; only rows comparing equal to ``True`` are kept.
    pseudocount : float
        Added to both counts so a zero down-regulated count does not divide by
        zero.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``gene_id`` with integer count columns ``False`` and
        ``True`` (always both, in that order) plus ``upreg_downreg_ratio``.
    """
    # Get counts. Rows whose flag is missing do not compare equal to True and
    # are therefore excluded by the mask.
    counts = (df[df[filter_column] == True]
              .groupby(['gene_id', 'is_gene_upregulated'])
              .size()
              .to_frame('count')
              .reset_index())
    if hasattr(counts, 'compute'):
        counts = counts.compute()

    # Pivot to one row per gene. Reindexing the columns guarantees both the
    # False and True count columns exist even when the filtered data contains
    # only one regulation direction (otherwise the lookups below raise
    # KeyError).
    pivoted = (
        counts.pivot(
            index='gene_id',
            columns='is_gene_upregulated',
            values='count',
        )
        .reindex(columns=[False, True])
        .fillna(0)
        .astype(int)
    )

    # Calculate the up/down ratio.
    # NOTE(review): pandas may apply its own floating-point error handling
    # inside Series arithmetic — confirm this errstate guard has an effect.
    with np.errstate(divide='raise', invalid='ignore'):
        upreg_downreg_ratio = (pivoted[True] + pseudocount) / (pivoted[False] + pseudocount)

    pivoted['upreg_downreg_ratio'] = upreg_downreg_ratio.round(3)

    return pivoted

# Usage:
# result = calculate_gene_regulation_ratios(real, pseudocount=0.1)


In [None]:
# Per-gene up/down regulation tables for the real data, one per consensus flag
# (oncogene first, then tsupp).
oncogene_ratio_real, tsupp_ratio_real = (
    calculate_gene_regulation_ratios(real, filter_column=flag)
    for flag in ('is_oncogene_consensus', 'is_tsupp_consensus')
)

# Same two tables for the synthetic data.
oncogene_ratio_synth, tsupp_ratio_synth = (
    calculate_gene_regulation_ratios(synth, filter_column=flag)
    for flag in ('is_oncogene_consensus', 'is_tsupp_consensus')
)


In [39]:
# Persist the four ratio tables so later cells/sessions can reuse them without
# recomputing. (`os` is imported in the notebook's import cell.)

# Create a 'pickle' directory if it doesn't exist
pickle_folder = 'pickle'
os.makedirs(pickle_folder, exist_ok=True)

# One .pkl per table, named after the variable that holds it.
ratio_tables = {
    'oncogene_ratio_real': oncogene_ratio_real,
    'tsupp_ratio_real': tsupp_ratio_real,
    'oncogene_ratio_synth': oncogene_ratio_synth,
    'tsupp_ratio_synth': tsupp_ratio_synth,
}
for pickle_name, ratio_table in ratio_tables.items():
    ratio_table.to_pickle(os.path.join(pickle_folder, f'{pickle_name}.pkl'))


In [49]:
# Join real and synthetic tables on gene_id (pandas' default inner join),
# add the plain difference real - synth of the up/down ratios (no log is
# taken), and sort by that difference: oncogenes largest-first.
oncogene_details = (
    oncogene_ratio_real
    .merge(oncogene_ratio_synth, on='gene_id', suffixes=('_real', '_synth'))
    .assign(ratio_diff=lambda d: d['upreg_downreg_ratio_real'] - d['upreg_downreg_ratio_synth'])
    .sort_values(by='ratio_diff', ascending=False)
)

# Same construction for the tsupp tables, sorted smallest-first.
tsupp_details = (
    tsupp_ratio_real
    .merge(tsupp_ratio_synth, on='gene_id', suffixes=('_real', '_synth'))
    .assign(ratio_diff=lambda d: d['upreg_downreg_ratio_real'] - d['upreg_downreg_ratio_synth'])
    .sort_values(by='ratio_diff')
)

# Mann-Whitney U test comparing the oncogene ratios, real vs. synthetic.
# NOTE(review): both samples come from the same merged rows (one pair per
# gene), while Mann-Whitney U treats them as independent samples — confirm
# this is the intended test.
real_ratios = oncogene_details['upreg_downreg_ratio_real']
synth_ratios = oncogene_details['upreg_downreg_ratio_synth']
mw_statistic, mw_p_value = stats.mannwhitneyu(real_ratios, synth_ratios)

In [52]:
# Rows of tsupp_details whose real up/down ratio exceeds 0.32.
# NOTE(review): 0.32 is an unexplained cutoff — consider a named constant.
tsupp_details.loc[tsupp_details['upreg_downreg_ratio_real'] > 0.32]

is_gene_upregulated,False_real,True_real,upreg_downreg_ratio_real,False_synth,True_synth,upreg_downreg_ratio_synth,ratio_diff
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000134086,65,218,3.350,521,4174,8.010,-4.660
ENSG00000132781,70,112,1.599,390,2027,5.196,-3.597
ENSG00000103241,5,3,0.608,109,414,3.796,-3.188
ENSG00000078399,53,44,0.831,611,2401,3.929,-3.098
ENSG00000184557,15,11,0.735,163,561,3.440,-2.705
...,...,...,...,...,...,...,...
ENSG00000227507,0,12,121.000,20,171,8.512,112.488
ENSG00000105722,0,13,131.000,80,531,6.630,124.370
ENSG00000153487,0,17,171.000,361,868,2.404,168.596
ENSG00000125285,0,24,241.000,18,300,16.580,224.420


In [50]:
# Rows where the real ratio exceeds 0.32 but the synthetic ratio does not,
# ordered by the real ratio, largest first.
# NOTE(review): 0.32 is an unexplained cutoff — consider a named constant.
(
    tsupp_details
    .loc[
        (tsupp_details['upreg_downreg_ratio_real'] > 0.32)
        & ~(tsupp_details['upreg_downreg_ratio_synth'] > 0.32)
    ]
    .sort_values(by="upreg_downreg_ratio_real", ascending=False)
)


is_gene_upregulated,False_real,True_real,upreg_downreg_ratio_real,False_synth,True_synth,upreg_downreg_ratio_synth,ratio_diff
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
