In [1]:
import dask.dataframe as dd
import pandas as pd
import sqlite3

In [None]:
# Column dtypes, declared once and reused by the loading code.

# dtypes passed to pd.read_csv for the combined predictions CSV.
DTYPE_REAL = dict(
    id="int32",
    pred_difference="float32",
    vcf_id="category",
    mirna_accession="category",
    gene_id="category",
    is_intron="bool",
    is_gene_upregulated="bool",
    mutsig="category",
    cancer_type="category",
)

# dtypes applied to the columns selected from the `mirnas` table.
DTYPE_MIRNA = dict(
    mirna_accession="str",
    mirna_family="category",
    upregulated_in_brca="bool",
    downregulated_in_brca="bool",
)

# dtypes applied to the columns selected from the `genes` table.
DTYPE_GENES = dict(
    gene_id="str",
    is_oncogene_consensus="bool",
    is_tsupp_consensus="bool",
    is_brca_driver="bool",
)

# Load the combined predictions, typing each column on read via DTYPE_REAL.
real = pd.read_csv(
    filepath_or_buffer="results/dec7_combined_final.csv",
    dtype=DTYPE_REAL,
)

In [None]:


# Drop sample PD4120a because it doesn't have mutsig probabilities.
real = real[real.vcf_id != "PD4120a"]

# sqlite3's `with connection:` block only commits/rolls back the transaction;
# it does NOT close the connection. Close it explicitly so the database handle
# is released as soon as both lookup tables are loaded.
conn = sqlite3.connect('data/mirscribe.db')
try:
    # Select only needed columns from SQL - reduces memory
    mirnas = pd.read_sql_query("""
        SELECT mirna_accession, mirna_family, 
               upregulated_in_brca, downregulated_in_brca 
        FROM mirnas""", conn).astype(DTYPE_MIRNA)

    genes = pd.read_sql_query("""
        SELECT gene_id, is_oncogene_consensus, is_tsupp_consensus, is_brca_driver
        FROM genes""", conn).astype(DTYPE_GENES)
finally:
    conn.close()

# Merge in sequence: miRNA annotations first, then gene annotations.
real = real.merge(mirnas, how="left", on="mirna_accession")
real = real.merge(genes, how="left", on="gene_id")

# Rows whose gene_id has no match in `genes` come out of the left merge with
# NaN in the gene flag columns. NaN is truthy, so a bare .astype(bool) would
# turn "no annotation" into True; fill with False first for every gene flag
# (including is_brca_driver) before casting to the final bool dtype.
for flag in ("is_oncogene_consensus", "is_tsupp_consensus", "is_brca_driver"):
    real[flag] = real[flag].fillna(False).astype(bool)

# Final dtypes before export: the lookup tables carry the merge keys as str,
# so cast the merged key columns back to category.
real["mirna_accession"] = real["mirna_accession"].astype("category")
real["gene_id"] = real["gene_id"].astype("category")



In [20]:
# Prefix the miRNA regulation flags so their subject is explicit in the export.
rename_dict = dict(
    upregulated_in_brca="is_mirna_upregulated_in_brca",
    downregulated_in_brca="is_mirna_downregulated_in_brca",
)

real = real.rename(rename_dict, axis="columns")

In [21]:
# Display the per-column dtypes of `real` as a sanity check before export.
real.dtypes

id                                   int32
pred_difference                    float32
vcf_id                            category
mirna_accession                   category
gene_id                           category
is_intron                             bool
is_gene_upregulated                   bool
mutsig                            category
cancer_type                       category
mirna_family                      category
is_mirna_upregulated_in_brca          bool
is_mirna_downregulated_in_brca        bool
is_oncogene_consensus                 bool
is_tsupp_consensus                    bool
is_brca_driver                        bool
dtype: object

In [22]:
# Persist the typed, annotated frame as Parquet.
real.to_parquet(path="results/dec7_optimized.parquet")
