In [None]:
import dask.dataframe as dd
import duckdb
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests
from scripts.pyensembl_operations import import_pyensembl
import sqlite3

# Initialize pyensembl
g37 = import_pyensembl(37)

# For DuckDB queries
real_files = "read_csv_auto('results/dec7_combined.csv')" 
synth_files = "read_parquet('results/processed/synth/**/*.parquet')"

INFO:pyensembl.sequence_data:Loaded sequence dictionary from /home/nazif/thesis/data/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /home/nazif/thesis/data/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /home/nazif/thesis/data/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.pep.all.fa.gz.pickle


In [5]:
query_real_genes = f"""
    SELECT DISTINCT gene_id
    FROM {real_files}
    WHERE vcf_id != 'PD4120a' 
    AND gene_id != 'not_found'
"""

genes_in_real = set(duckdb.sql(query_real_genes).df()['gene_id'])

query_synth_genes = f"""
    SELECT DISTINCT gene_id
    FROM {synth_files}
    WHERE vcf_id != 'PD4120a' 
    AND gene_id != 'not_found'
"""

genes_in_synth = set(duckdb.sql(query_synth_genes).df()['gene_id'])

# Get common genes
common_genes = genes_in_real.intersection(genes_in_synth)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [7]:
real = dd.read_csv("results/dec7_combined.csv")
synth = dd.read_parquet("results/processed/synth/**/*.parquet")

# Filter out PD4120a
real = real[real.vcf_id != 'PD4120a']
synth = synth[synth.vcf_id != 'PD4120a']
# filter_out not_found
real = real[real.gene_id != "not_found"]
synth = synth[synth.gene_id != "not_found"]

# filter for common genes
real_common = real[real.gene_id.isin(common_genes)]
synth_common = synth[synth.gene_id.isin(common_genes)]

In [22]:
# First compute the grouped counts
result_real = (real_common.groupby(['gene_id', 'is_gene_upregulated'])
               .size()
               .reset_index()
               .compute())

# Now that we have it as a pandas DataFrame, we can pivot
result_real = (result_real.pivot(index='gene_id', 
                                columns='is_gene_upregulated', 
                                values=0)
               .fillna(0)
               .reset_index())

# Rename columns
result_real.columns = ['gene_id', 'False', 'True']


In [23]:
result_real

Unnamed: 0,gene_id,False,True
0,ENSG00000000003,78.0,203.0
1,ENSG00000000005,59.0,232.0
2,ENSG00000000419,75.0,388.0
3,ENSG00000000457,397.0,735.0
4,ENSG00000000460,1281.0,4376.0
...,...,...,...
31325,ENSG00000273472,0.0,26.0
31326,ENSG00000273477,2.0,16.0
31327,ENSG00000273481,25.0,5.0
31328,ENSG00000273489,1.0,3.0


In [11]:
real_common.groupby('gene_id')['is_gene_upregulated'].value_counts().unstack(fill_value=0).reset_index()

AttributeError: 'Series' object has no attribute 'unstack'

In [10]:
(real_common
    .groupby('gene_id')
    .agg({
        'is_gene_upregulated': [
            ('upregulated', lambda x: (x == True).sum()),
            ('downregulated', lambda x: (x == False).sum())
        ]
    })).reset_index()

ValueError: unknown aggregate ('upregulated', <function <lambda> at 0x740cb401c6