In [None]:
import dask.dataframe as dd
import pandas as pd
import pickle
from pathlib import Path

# Open the real and synthetic result sets from their parquet shards.
# Later cells call .compute() on these frames to materialise results.
real, synth = (
    dd.read_parquet(pattern)
    for pattern in (
        "results/560_real/*.parquet",
        "results/5600_synth/1/*.parquet",
    )
)


In [None]:
# import dask.dataframe as dd
# from dask.distributed import Client, LocalCluster

# # More conservative memory settings
# cluster = LocalCluster(
#     n_workers=6,  # Using 6 cores instead of 8 to be safer
#     threads_per_worker=1,  # 1 thread per worker
#     memory_limit='2GB'  # 2GB per worker (12GB total, leaving 4GB for system/overhead)
# )
# client = Client(cluster)
# print(f"Dashboard link: {client.dashboard_link}")

# # Now compute with controlled memory usage
# real_genes = real['gene_id'].unique().compute(scheduler='distributed')


In [2]:
def checkpoint(variable, name=None, folder="checkpoints", save=False):
    """
    Save or load a variable to/from a pickle file.

    The checkpoint lives at ``<folder>/<name>.pkl``.

    Args:
        variable: The object to save. Ignored when loading (pass None).
        name (str): Base name of the checkpoint file. Required: the name of
            the caller's variable cannot be recovered from inside this
            function, so it must be given explicitly.
        folder (str): Directory for checkpoint files. Created, including any
            missing parent directories, when saving.
        save (bool): If True, save variable. If False, load variable.

    Returns:
        The loaded object when loading, or None if no checkpoint file exists.
        Always None when saving.

    Raises:
        ValueError: If ``name`` is None or empty (this previously produced a
            file literally called ``None.pkl``).
    """
    if not name:
        raise ValueError("checkpoint() requires an explicit, non-empty `name`")

    directory = Path(folder)
    filepath = directory / f"{name}.pkl"

    if save:
        # Only saving needs the directory; loading must not create folders.
        directory.mkdir(parents=True, exist_ok=True)
        with open(filepath, 'wb') as f:
            pickle.dump(variable, f)
        print(f"Saved checkpoint: {filepath}")
        return None

    try:
        with open(filepath, 'rb') as f:
            # SECURITY: unpickling can execute arbitrary code. Only load
            # checkpoint files that were written by this notebook.
            data = pickle.load(f)
    except FileNotFoundError:
        # Covers both a missing file and a missing checkpoint folder.
        print(f"No checkpoint found: {filepath}")
        return None

    print(f"Loaded checkpoint: {filepath}")
    return data

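# Usage examples:
# Save: checkpoint(my_variable, "my_variable", save=True)
# Load: my_variable = checkpoint(None, "my_variable")
#       if my_variable is None:
#           my_variable = default_value
# (Avoid `checkpoint(...) or default_value`: it discards valid falsy results
#  and raises "truth value is ambiguous" for pandas/numpy objects.)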


In [3]:
# Collect the distinct gene_id values of each dataset, real first, then synthetic.
real_genes, synth_genes = (
    frame['gene_id'].unique().compute()
    for frame in (real, synth)
)

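# Optional: cache the computed gene lists (uncomment to use).
# checkpoint(real_genes, "real_genes2", save=True)
# checkpoint(synth_genes, "synth_genes", save=True)

# Optional: restore the gene lists from cache instead of recomputing.
# NOTE: the save above writes "real_genes2" while the load below reads
# "real_genes"; make the names match before relying on the round trip.
# real_genes = checkpoint(None, "real_genes")
# synth_genes = checkpoint(None, "synth_genes")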

In [4]:
# Deduplicate each gene collection so set algebra can be used below
real_genes_set = set(real_genes)
synth_genes_set = set(synth_genes)

# Genes present in both datasets (may still contain the 'not_found' placeholder)
overlap = real_genes_set & synth_genes_set

# Report the sizes before any filtering
print("Initial gene counts:")
print(f"Real genes: {len(real_genes_set)}")
print(f"Synth genes: {len(synth_genes_set)}")
print(f"Intersection: {len(overlap)}")

# Keep every shared gene except the 'not_found' placeholder, as a list
common_genes = [gene for gene in overlap if gene != 'not_found']

Initial gene counts:
Real genes: 30747
Synth genes: 31120
Intersection: 27528


In [5]:

# Build the long-format contingency counts for both datasets with one shared recipe
def _regulation_counts(frame, gene_ids):
    """
    Count rows per (gene_id, is_gene_upregulated) pair.

    Args:
        frame: Dataframe with 'gene_id' and 'is_gene_upregulated' columns.
        gene_ids: Gene identifiers to keep; all other rows are dropped first.

    Returns:
        The computed result with columns 'gene_id', 'is_gene_upregulated'
        and 'count' (one row per observed pair).
    """
    subset = frame[frame['gene_id'].isin(gene_ids)]
    pair_sizes = subset.groupby(['gene_id', 'is_gene_upregulated']).size()
    return pair_sizes.to_frame('count').reset_index().compute()


real_grouped = _regulation_counts(real, common_genes)

synth_grouped = _regulation_counts(synth, common_genes)

In [22]:
# Reshape the long counts to one row per gene, one column per regulation flag.
# Flag values that never occur for a gene are filled with 0.
pivot_spec = dict(
    index='gene_id',
    columns='is_gene_upregulated',
    values='count',
)
real_pivot = real_grouped.pivot(**pivot_spec).fillna(0)
synth_pivot = synth_grouped.pivot(**pivot_spec).fillna(0)

# Alternative kept for reference: scale the synthetic counts down by 10
# (presumably to offset a larger synthetic dataset; confirm before using).
# synth_pivot = synth_grouped.pivot(
#     index='gene_id',
#     columns='is_gene_upregulated',
#     values='count'
# ).fillna(0).div(10)


# Tag each column with its dataset of origin so the merged table is unambiguous
real_pivot.columns = [f'{col}_real' for col in real_pivot.columns]
synth_pivot.columns = [f'{col}_synth' for col in synth_pivot.columns]


# Join on the gene_id index, keeping genes present in both tables
values = real_pivot.merge(
    synth_pivot,
    left_index=True,
    right_index=True,
)

In [26]:
# Total number of rows in the synthetic dataset (sanity check on its size).
len(synth)

74656605

In [23]:
# Show the merged per-gene counts: False/True regulation flags for real and synthetic data.
values

Unnamed: 0_level_0,False_real,True_real,False_synth,True_synth
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000000003,78,153,194,526
ENSG00000000005,43,211,63,268
ENSG00000000419,75,388,238,417
ENSG00000000457,381,628,283,588
ENSG00000000460,1112,4073,1740,2706
...,...,...,...,...
ENSG00000273471,22,111,279,167
ENSG00000273472,0,26,0,8
ENSG00000273481,25,5,18,203
ENSG00000273492,161,914,364,847
