In [1]:
import pandas as pd
from scripts.pyensembl_operations import import_pyensembl
import sqlite3
pd.set_option('display.max_columns', None)   # Show all columns


# Initialize pyensembl
g37 = import_pyensembl(37)

INFO:pyensembl.sequence_data:Loaded sequence dictionary from /home/nazif/thesis/data/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /home/nazif/thesis/data/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /home/nazif/thesis/data/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.pep.all.fa.gz.pickle


In [None]:
def determine_result(row):
    """
    Determines if gene regulation supports or contradicts its cancer role
    """
    if pd.isna(row['cancer_gene_role']):
        return 'unknown'
    
    if row['cancer_gene_role'] == 'dual_role':
        return 'supports'
    elif row['cancer_gene_role'] == 'oncogene' and row['is_gene_upregulated']:
        return 'supports'
    elif row['cancer_gene_role'] == 'tumor_suppressor' and not row['is_gene_upregulated']:
        return 'supports'
    else:
        return 'contradicts'


def load_and_merge_gene_data(counts_file_path, db_path='data/mirscribe.db'):

    # Read counts data
    df1 = pd.read_csv(counts_file_path)
    
    # Connect to SQLite and get gene/mirna data
    with sqlite3.connect(db_path) as sqlite_conn:
        genes = pd.read_sql('SELECT * FROM genes', sqlite_conn)
        mirnas = pd.read_sql('SELECT * FROM mirnas', sqlite_conn)
    
    # Columns to merge from genes table
    cols_to_merge_genes = [
        'gene_id', 'gene_name', 
        'is_oncogene_oncokb', 'is_tsupp_oncokb',
        'is_brca_driver', 'tier_cosmic', 
        'is_hallmark_cosmic',
        'is_tsupp_cosmic', 'is_oncogene_cosmic', 
        'is_oncogene_consensus',
        'is_tsupp_consensus', 'cancer_gene_role'
    ]
    
    # Merge with genes data
    df1 = pd.merge(df1, genes[cols_to_merge_genes], how="left", on="gene_id")
    
    # Add derived columns
    df1["is_gene_upregulated"] = df1["log2_odds_ratio"] > 0
    df1["is_brca_driver"] = df1["is_brca_driver"].astype("bool")
    df1['result'] = df1.apply(determine_result, axis=1)
    
    return df1


In [15]:
df1 = load_and_merge_gene_data("results/last/035/counts_sig134_adj0_filter0.10.csv")
df2 = load_and_merge_gene_data("results/last/035/counts_sig3654_adj755_filter0.25.csv")
df3 = load_and_merge_gene_data("results/last/035/counts_sig8194_adj4839.csv")


In [22]:
df1[df1.is_brca_driver & df1.is_significant][["gene_name", "is_gene_upregulated", "cancer_gene_role", "result"]].sort_values(by="result", ascending=False)

Unnamed: 0,gene_name,is_gene_upregulated,cancer_gene_role,result
2591,UBR5,True,oncogene,supports
4532,PIK3CA,True,oncogene,supports
599,ARID1B,True,tumor_suppressor,contradicts
8647,BRAF,False,oncogene,contradicts
13431,FAT4,True,tumor_suppressor,contradicts


In [23]:
df2[df2.is_brca_driver & df2.is_significant_adj][["gene_name", "is_gene_upregulated", "cancer_gene_role", "result"]].sort_values(by="result", ascending=False)

Unnamed: 0,gene_name,is_gene_upregulated,cancer_gene_role,result
139,ETV1,True,oncogene,supports
2891,UBR5,True,oncogene,supports
664,ARID1B,True,tumor_suppressor,contradicts
1017,ATP2B3,True,tumor_suppressor,contradicts
1503,HSP90AA1,False,oncogene,contradicts
15980,FAT4,True,tumor_suppressor,contradicts


In [24]:
df3[df3.is_brca_driver & df3.is_significant_adj][["gene_name", "is_gene_upregulated", "cancer_gene_role", "result"]].sort_values(by="result", ascending=False)


Unnamed: 0,gene_name,is_gene_upregulated,cancer_gene_role,result
140,ETV1,True,oncogene,supports
2900,UBR5,True,oncogene,supports
16373,ARHGEF12,False,dual_role,supports
13973,ERBB4,False,dual_role,supports
10490,JAK1,False,dual_role,supports
6719,NOTCH2,False,dual_role,supports
8202,SETDB1,False,dual_role,supports
750,KMT2C,False,tumor_suppressor,supports
932,SPEN,False,tumor_suppressor,supports
3989,MED23,False,neither,contradicts
