In [1]:
import os
import json
import pandas as pd
from glob import glob
import sqlite3
import pandas as pd
import re

In [2]:
def load_and_merge_dataframes(folder_path, db_path="data/mirscribe.db"):
    # Get list of all CSV files in the folder
    csv_files = glob(os.path.join(folder_path, "*.csv"))

    # First read the genes data from SQLite
    with sqlite3.connect(db_path) as sqlite_conn:
        genes = pd.read_sql('SELECT * FROM genes', sqlite_conn)
        mirnas = pd.read_sql('SELECT * FROM mirnas', sqlite_conn)

    # Columns to merge from genes table
    cols_to_merge_genes = ['gene_id','gene_name', 'is_oncogene_oncokb', 'is_tsupp_oncokb',
           'is_brca_driver', 'tier_cosmic', 'is_hallmark_cosmic',
           'is_tsupp_cosmic', 'is_oncogene_cosmic', 'is_oncogene_consensus',
           'is_tsupp_consensus', 'cancer_gene_role']

    # Function to extract dataframe name from filename
    def get_df_name(filename):
        # Extract the part between 'counts_' and '.csv'
        match = re.search(r'counts_(.*?)\.csv', filename)
        if match:
            return f"df_{match.group(1)}"
        else:
            return f"df_{os.path.splitext(os.path.basename(filename))[0]}"

    dataframes = {}

    # Read each CSV file and merge with genes data
    for file in csv_files:
        df_name = get_df_name(file)
        # Read CSV
        df = pd.read_csv(file)
        # Merge with genes data
        df = pd.merge(df, genes[cols_to_merge_genes], 
                     how="left", 
                     on="gene_id")
        print(f"Loaded and merged {file} as {df_name}")
        dataframes[df_name] = df

    return dataframes


def process_folder(folder_path):
    dfs = load_and_merge_dataframes(folder_path)
    results = {}

    for name, df in dfs.items():
        results[name] = {
            "cancer_gene_role_counts": df[df.is_significant].cancer_gene_role.value_counts().to_dict(),
            "total_significant_genes": int(df.is_significant.sum()),
            "significant_brca_drivers": {
                "total": int(len(df[df['is_significant'] & df['is_brca_driver']])),
                "genes": []
            }
        }

        # Get filtered dataframe with raw values
        filtered_df = df[df['is_significant'] & df['is_brca_driver']][
            ["gene_name", "log2_odds_ratio", "cancer_gene_role"]]
        filtered_df = filtered_df.sort_values('log2_odds_ratio', ascending=False)
        
        # Store raw values in the JSON
        results[name]["significant_brca_drivers"]["genes"] = [
            {
                "gene_name": row["gene_name"],
                "log2_odds_ratio": float(row["log2_odds_ratio"]),  # Convert to float for JSON serialization
                "cancer_gene_role": row["cancer_gene_role"]
            }
            for _, row in filtered_df.iterrows()
        ]

    return results


def crawl_and_compile(base_path):
    results = []

    for folder in os.listdir(base_path):
        folder_path = os.path.join(base_path, folder)
        if os.path.isdir(folder_path):
            json_file = os.path.join(folder_path, f"{folder}_analysis_results.json")  # Look for individual JSONs
            if os.path.exists(json_file):
                with open(json_file, 'r') as f:
                    data = json.load(f)
                    for df_name, df_data in data.items():
                        row = {
                            'folder': folder,
                            'dataframe': df_name,
                            'total_significant_genes': df_data['total_significant_genes'],
                            'significant_brca_drivers': df_data['significant_brca_drivers']['total'],
                        }
                        
                        # Add cancer gene role counts
                        for role, count in df_data['cancer_gene_role_counts'].items():
                            row[f'role_{role}'] = count
                        
                        # Process BRCA driver genes
                        brca_genes = df_data['significant_brca_drivers']['genes']
                        
                        # Calculate statistics about log2_odds_ratio
                        if brca_genes:
                            log2_odds = [gene['log2_odds_ratio'] for gene in brca_genes]
                            row['mean_log2_odds'] = np.mean(log2_odds)
                            row['median_log2_odds'] = np.median(log2_odds)
                            row['max_log2_odds'] = max(log2_odds)
                            row['min_log2_odds'] = min(log2_odds)
                            
                            # Count positive and negative log2_odds for each role
                            for role in set(gene['cancer_gene_role'] for gene in brca_genes):
                                role_genes = [g for g in brca_genes if g['cancer_gene_role'] == role]
                                pos_odds = sum(1 for g in role_genes if g['log2_odds_ratio'] > 0)
                                neg_odds = sum(1 for g in role_genes if g['log2_odds_ratio'] < 0)
                                row[f'{role}_positive_odds'] = pos_odds
                                row[f'{role}_negative_odds'] = neg_odds
                        
                        results.append(row)

    return pd.DataFrame(results)




def main(base_path="results"):  # Default value added for convenience
    folders = [f for f in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, f))]

    # Dictionary to store all results
    all_results = {}

    for folder in folders:
        folder_path = os.path.join(base_path, folder)
        results = process_folder(folder_path)
        all_results[folder] = results
        
        # Save individual folder results to JSON
        output_file = os.path.join(folder_path, f"{folder}_analysis_results.json")
        with open(output_file, 'w') as f:
            json.dump(results, f, indent=2)
        
        print(f"Results for {folder} saved to {output_file}")
    
    # Save combined results to JSON
    combined_output = os.path.join(base_path, "combined_analysis_results.json")
    with open(combined_output, 'w') as f:
        json.dump(all_results, f, indent=2)
    
    print(f"Combined results saved to {combined_output}")
    
    return all_results


In [3]:
base_path = "results/last/"  # Adjust this to your base results directory
results = main(base_path)  # or whatever your base path is


Loaded and merged results/last/050/counts_sig36_adj0_filter0.10.csv as df_sig36_adj0_filter0.10
Loaded and merged results/last/050/counts_sig567_adj4_filter0.25.csv as df_sig567_adj4_filter0.25
Loaded and merged results/last/050/counts_sig1306_adj21.csv as df_sig1306_adj21
Results for 050 saved to results/last/050/050_analysis_results.json
Loaded and merged results/last/040/counts_sig134_adj0_filter0.10.csv as df_sig134_adj0_filter0.10
Loaded and merged results/last/040/counts_sig2489_adj202_filter0.25.csv as df_sig2489_adj202_filter0.25
Loaded and merged results/last/040/counts_sig5573_adj2148.csv as df_sig5573_adj2148
Results for 040 saved to results/last/040/040_analysis_results.json
Loaded and merged results/last/045/counts_sig1368_adj26_filter0.25.csv as df_sig1368_adj26_filter0.25
Loaded and merged results/last/045/counts_sig86_adj0_filter0.10.csv as df_sig86_adj0_filter0.10
Loaded and merged results/last/045/counts_sig3155_adj421.csv as df_sig3155_adj421
Results for 045 saved to

In [4]:
import numpy as np

In [5]:
# Usage
final_df = crawl_and_compile(base_path)


In [8]:
final_df.sort_values(
    by="folder"
)

Unnamed: 0,folder,dataframe,total_significant_genes,significant_brca_drivers,role_neither,role_oncogene,mean_log2_odds,median_log2_odds,max_log2_odds,min_log2_odds,oncogene_positive_odds,oncogene_negative_odds,role_tumor_suppressor,role_dual_role,tumor_suppressor_positive_odds,tumor_suppressor_negative_odds,dual_role_positive_odds,dual_role_negative_odds,neither_positive_odds,neither_negative_odds
9,35,df_sig134_adj0_filter0.10,134,5,123,6,1.144111,1.797718,1.918932,-1.16432,2,1,4.0,1.0,2.0,0.0,,,,
10,35,df_sig8194_adj4839,8194,39,7896,127,0.060487,0.153496,1.695839,-1.200675,4,7,127.0,44.0,14.0,3.0,1.0,6.0,2.0,2.0
11,35,df_sig3654_adj755_filter0.25,3654,21,3518,59,0.201311,0.272333,1.630796,-1.456572,3,4,58.0,19.0,7.0,1.0,1.0,3.0,1.0,1.0
3,40,df_sig134_adj0_filter0.10,134,5,123,6,1.144111,1.797718,1.918932,-1.16432,2,1,4.0,1.0,2.0,0.0,,,,
4,40,df_sig2489_adj202_filter0.25,2489,14,2388,47,0.4476,0.735696,1.768542,-1.427446,2,4,43.0,11.0,6.0,0.0,1.0,0.0,1.0,0.0
5,40,df_sig5573_adj2148,5573,29,5365,91,0.099306,0.182465,1.418874,-1.237437,3,5,89.0,28.0,10.0,2.0,0.0,5.0,2.0,2.0
6,45,df_sig1368_adj26_filter0.25,1368,9,1307,27,0.604982,0.932441,1.915521,-1.482393,2,2,28.0,6.0,4.0,0.0,,,1.0,0.0
7,45,df_sig86_adj0_filter0.10,86,2,81,3,1.904414,1.904414,2.266558,1.54227,1,0,2.0,,1.0,0.0,,,,
8,45,df_sig3155_adj421,3155,16,3024,57,0.14748,0.461872,1.321334,-1.353683,3,5,59.0,15.0,5.0,1.0,0.0,1.0,1.0,0.0
0,50,df_sig36_adj0_filter0.10,36,1,34,2,4.523104,4.523104,4.523104,4.523104,1,0,,,,,,,,
