In [None]:
import pandas as pd
import sqlite3
import os
import re

pd.set_option('display.max_columns', None)   # Show all columns


FOLDER_PATH = "results/050/"

INFO:pyensembl.sequence_data:Loaded sequence dictionary from /home/nazif/thesis/data/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /home/nazif/thesis/data/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /home/nazif/thesis/data/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.pep.all.fa.gz.pickle


In [6]:
def load_and_merge_dataframes(folder_path, db_path="data/mirscribe.db"):
    # Get list of all CSV files in the folder
    csv_files = glob(os.path.join(folder_path, "*.csv"))

    # First read the genes data from SQLite
    with sqlite3.connect(db_path) as sqlite_conn:
        genes = pd.read_sql('SELECT * FROM genes', sqlite_conn)
        mirnas = pd.read_sql('SELECT * FROM mirnas', sqlite_conn)

    # Columns to merge from genes table
    cols_to_merge_genes = ['gene_id','gene_name', 'is_oncogene_oncokb', 'is_tsupp_oncokb',
           'is_brca_driver', 'tier_cosmic', 'is_hallmark_cosmic',
           'is_tsupp_cosmic', 'is_oncogene_cosmic', 'is_oncogene_consensus',
           'is_tsupp_consensus', 'cancer_gene_role']

    # Function to extract dataframe name from filename
    def get_df_name(filename):
        # Extract the part between 'counts_' and '.csv'
        match = re.search(r'counts_(.*?)\.csv', filename)
        if match:
            return f"df_{match.group(1)}"
        else:
            return f"df_{os.path.splitext(os.path.basename(filename))[0]}"

    dataframes = {}

    # Read each CSV file and merge with genes data
    for file in csv_files:
        df_name = get_df_name(file)
        # Read CSV
        df = pd.read_csv(file)
        # Merge with genes data
        df = pd.merge(df, genes[cols_to_merge_genes], 
                     how="left", 
                     on="gene_id")
        print(f"Loaded and merged {file} as {df_name}")
        dataframes[df_name] = df

    return dataframes

In [3]:
dfs = load_and_merge_dataframes(FOLDER_PATH)

NameError: name 'load_and_merge_dataframes' is not defined

In [48]:
for name, df in dfs.items():
    print(f"\n=== {name} Cancer Gene Role Counts ===")
    print(df[df.is_significant].cancer_gene_role.value_counts())
    print(f"Total significant genes: {df.is_significant.sum()}")



=== df_sig36_adj0_filter0.10 Cancer Gene Role Counts ===
cancer_gene_role
neither     34
oncogene     2
Name: count, dtype: int64
Total significant genes: 36

=== df_sig567_adj4_filter0.25 Cancer Gene Role Counts ===
cancer_gene_role
neither             544
tumor_suppressor     12
oncogene              9
dual_role             2
Name: count, dtype: int64
Total significant genes: 567

=== df_sig1306_adj21 Cancer Gene Role Counts ===
cancer_gene_role
neither             1252
tumor_suppressor      27
oncogene              21
dual_role              6
Name: count, dtype: int64
Total significant genes: 1306


In [53]:
for name, df in dfs.items():
    sig_brca = df[df['is_significant'] & df['is_brca_driver']]
    print(f"\n=== {name} DF ===")
    print(f"Total significant BRCA drivers: {len(sig_brca)}")
    print(f"Total significant genes: {df['is_significant'].sum()}")


=== df_sig36_adj0_filter0.10 DF ===
Total significant BRCA drivers: 1
Total significant genes: 36

=== df_sig567_adj4_filter0.25 DF ===
Total significant BRCA drivers: 4
Total significant genes: 567

=== df_sig1306_adj21 DF ===
Total significant BRCA drivers: 6
Total significant genes: 1306


In [61]:
for name, df in dfs.items():
    print(f"\n=== {name} ===")
    filtered_df = df[df['is_significant'] & df['is_brca_driver']][["gene_name", "log2_odds_ratio", "cancer_gene_role"]]
    filtered_df = filtered_df.sort_values('log2_odds_ratio', ascending=False)
    
    # Add new column based on conditions
    filtered_df['supports_role'] = filtered_df.apply(lambda row: 
        "Supports" if (row['cancer_gene_role'] == 'oncogene' and row['log2_odds_ratio'] > 0) or 
                      (row['cancer_gene_role'] == 'tumor_suppressor' and row['log2_odds_ratio'] < 0) 
        else "Contradicts", axis=1)
    
    print(filtered_df)
    print(f"\nTotal rows: {len(filtered_df)}")
    print(f"Supports: {(filtered_df['supports_role'] == 'Supports').sum()}")
    print(f"Contradicts: {(filtered_df['supports_role'] == 'Contradicts').sum()}")
    print("-" * 50)



=== df_sig36_adj0_filter0.10 ===
     gene_name  log2_odds_ratio cancer_gene_role supports_role
2285      UBR5         4.523104         oncogene      Supports

Total rows: 1
Supports: 1
Contradicts: 0
--------------------------------------------------

=== df_sig567_adj4_filter0.25 ===
     gene_name  log2_odds_ratio  cancer_gene_role supports_role
2797      UBR5         1.802573          oncogene      Supports
648     ARID1B         1.450559  tumor_suppressor   Contradicts
136       ETV1         1.378080          oncogene      Supports
1464  HSP90AA1        -1.558361          oncogene   Contradicts

Total rows: 4
Supports: 2
Contradicts: 2
--------------------------------------------------

=== df_sig1306_adj21 ===
      gene_name  log2_odds_ratio  cancer_gene_role supports_role
657      ARID1B         1.156983  tumor_suppressor   Contradicts
2846       UBR5         1.139366          oncogene      Supports
12407      PTEN         0.908239  tumor_suppressor   Contradicts
15510      FA