In [5]:
#!/usr/bin/env python3
"""
Generate summary table using FINAL setup from 006_OverlapFinalTable.ipynb:
- High degree outliers: 20th/80th percentile (not 5th/95th)
- High-mortality sinks: Z-score top 20%
- High-mortality bridges: Z-score top 5% with 10% min diff
"""

import pandas as pd
import numpy as np
import networkx as nx
from pathlib import Path

# Directories
# DATA_DIR: input CSVs read by this script (ICD code list, prevalence table,
#           per-sex / per-age adjacency matrices).
# OUTPUT_DIR: precomputed Z-score CSVs (sinks, bridges) are read from here and
#             the generated summary .tex / .csv files are written here.
DATA_DIR = Path('/mnt/user-data/uploads')
OUTPUT_DIR = Path('/mnt/user-data/outputs')

def count_outliers_20_80_percentile():
    """Count high degree outliers per sex and age group (80th-percentile cut).

    For each of the 16 sex / age-group networks the adjacency matrix is loaded
    from ``DATA_DIR`` and turned into a graph.  Every node that has at least
    one edge, a matching row in the ICD table and a positive 2014 prevalence
    is scored with ``log10(degree / prevalence)``.  A node is counted as a
    high degree outlier when its score is greater than or equal to the 80th
    percentile of the scores within the same network.

    Files read from ``DATA_DIR``:
        - ``ICD10_Diagnoses_All.csv`` (columns ``diagnose_id``, ``icd_code``)
        - ``Prevalence_Sex_Age_Year_ICD.csv`` (columns ``sex``, ``Age_Group``,
          ``year``, ``icd_code``, ``p``)
        - ``Adj_Matrix_{sex}_ICD_age_{1..8}.csv`` (space separated, no header)

    Returns:
        pandas.DataFrame: one row per (sex, age group) with the columns
        ``Sex`` ('Female' / 'Male'), ``Age_Group`` (1-8) and ``Count``.
        ``Count`` is 0 when no node of a network could be scored.

    Raises:
        FileNotFoundError: propagated from ``pd.read_csv`` when an input file
        is missing.
    """
    # Function-scope import so the function does not rely on module state.
    import networkx as nx

    all_counts = []

    # Load ICD codes and build the diagnose_id -> icd_code lookup once.
    # drop_duplicates keeps the first row per id, i.e. the same row that a
    # per-node `frame[frame.diagnose_id == id].iloc[0]` scan would return.
    icd_df = pd.read_csv(DATA_DIR / 'ICD10_Diagnoses_All.csv')
    icd_by_id = (
        icd_df.drop_duplicates('diagnose_id')
        .set_index('diagnose_id')['icd_code']
        .to_dict()
    )

    # Load prevalence; the year filter is the same for every network.
    df_prev = pd.read_csv(DATA_DIR / 'Prevalence_Sex_Age_Year_ICD.csv')
    prev_2014 = df_prev[df_prev['year'] == 2014]

    # Age mapping (numeric group used in file names -> label used in the
    # prevalence table)
    age_map = {1: '0-9', 2: '10-19', 3: '20-29', 4: '30-39',
               5: '40-49', 6: '50-59', 7: '60-69', 8: '70-79'}

    for gender in ['Female', 'Male']:
        for age_group, age_str in age_map.items():
            print(f"  Processing {gender} age {age_group} (20th/80th percentile)...")

            # Load adjacency matrix and create the graph
            adj_path = DATA_DIR / f'Adj_Matrix_{gender}_ICD_age_{age_group}.csv'
            A = pd.read_csv(adj_path, sep=' ', header=None).values
            G = nx.from_numpy_array(A)

            # Prevalence for this age-sex stratum, keyed by ICD code
            prev_subset = prev_2014[
                (prev_2014['sex'] == gender) &
                (prev_2014['Age_Group'] == age_str)
            ]
            prev_dict = dict(zip(prev_subset['icd_code'], prev_subset['p']))

            # Score every connected node with a known, positive prevalence
            log_ratios = []
            for node in range(len(A)):
                degree = G.degree(node)
                if degree <= 0:
                    continue
                # Matrix index is 0-based, diagnose_id is looked up as index + 1
                diagnose_id = node + 1
                if diagnose_id not in icd_by_id:
                    continue
                prev = prev_dict.get(icd_by_id[diagnose_id], 0)
                if prev > 0:
                    log_ratios.append(np.log10(degree / prev))

            if log_ratios:
                scores = pd.Series(log_ratios)
                # Only the upper (80th percentile) threshold is needed: the
                # table reports high degree outliers, not low ones.
                upper_bound = scores.quantile(0.80)
                count = int((scores >= upper_bound).sum())
            else:
                count = 0

            all_counts.append({
                'Sex': gender,
                'Age_Group': age_group,
                'Count': count
            })

    return pd.DataFrame(all_counts)

def _latex_row(label, pivot, ages=range(1, 9), sexes=('Female', 'Male')):
    """Return one table row: label, then one cell per (sex, age) in column order."""
    cells = []
    for sex in sexes:
        for age in ages:
            # A missing age row OR a missing sex column both mean "no entries",
            # so the cell is 0 rather than a KeyError.
            if age in pivot.index and sex in pivot.columns:
                cells.append(f" & {pivot.loc[age, sex]}")
            else:
                cells.append(" & 0")
    return label + "".join(cells) + " \\\\\n"


def generate_latex_table(df_outliers, df_sinks, df_bridges):
    """Generate the LaTeX summary table of critical nodes and edges.

    Each argument is a DataFrame with the columns ``Sex`` ('Female' / 'Male'),
    ``Age_Group`` (1-8) and ``Count``, holding at most one row per
    (``Age_Group``, ``Sex``) pair.  The table has three rows (high degree
    outliers, high-mortality sinks, high-mortality bridges) and sixteen value
    columns: age groups 1-8 for Female followed by age groups 1-8 for Male.
    Combinations that are absent from an input, including a sex that does not
    occur in it at all, are rendered as 0.

    Args:
        df_outliers: counts of high degree outliers.
        df_sinks: counts of high-mortality sinks.
        df_bridges: counts of high-mortality bridges.

    Returns:
        str: the complete ``table`` environment as LaTeX source.

    Raises:
        ValueError: propagated from ``DataFrame.pivot`` when an input contains
        duplicate (``Age_Group``, ``Sex``) pairs.
    """
    # Pivot data to an Age_Group x Sex grid of integer counts
    outliers_pivot = df_outliers.pivot(index='Age_Group', columns='Sex', values='Count').fillna(0).astype(int)
    sinks_pivot = df_sinks.pivot(index='Age_Group', columns='Sex', values='Count').fillna(0).astype(int)
    bridges_pivot = df_bridges.pivot(index='Age_Group', columns='Sex', values='Count').fillna(0).astype(int)

    latex = """\\begin{table}[htbp]
\\centering
\\caption{Summary of identified critical disease nodes and edges across age groups in female and male comorbidity networks.}
\\label{tab:summary_critical_nodes}
\\begin{tabular}{lcccccccccccccccc}
\\toprule
& \\multicolumn{8}{c}{\\textbf{Female}} & \\multicolumn{8}{c}{\\textbf{Male}} \\\\
\\cmidrule(lr){2-9} \\cmidrule(lr){10-17}
\\textbf{Age Group} & 0-9 & 10-19 & 20-29 & 30-39 & 40-49 & 50-59 & 60-69 & 70-79 & 0-9 & 10-19 & 20-29 & 30-39 & 40-49 & 50-59 & 60-69 & 70-79 \\\\
\\midrule
"""

    # Row 1: High degree outliers (80th percentile)
    latex += _latex_row("High degree outliers (80\\textsuperscript{th} p)", outliers_pivot)

    # Row 2: High-mortality sinks (Z-score top 20%)
    latex += _latex_row("High-mortality sinks (Z-score 20\\textsuperscript{th} p)", sinks_pivot)

    # Row 3: High-mortality bridges (Z-score top 5%)
    latex += _latex_row("High-mortality bridges (Z-score 5\\textsuperscript{th} p)", bridges_pivot)

    latex += """\\bottomrule
\\end{tabular}
\\end{table}
"""

    return latex

def main():
    """Build the final-setup summary table and write it as LaTeX and CSV.

    Steps: count high degree outliers from the raw networks, load the
    precomputed sink and bridge CSVs from ``OUTPUT_DIR`` and count their rows
    per (Sex, Age_Group), render the LaTeX table, then save
    ``summary_table_FINAL_SETUP.tex`` and ``summary_table_FINAL_SETUP.csv``
    to ``OUTPUT_DIR`` and print per-type totals.  If either precomputed CSV
    is missing, an error line is printed and the function returns early
    without writing anything.
    """
    rule = "=" * 80

    print(rule)
    print("SUMMARY TABLE - FINAL SETUP (from 006_OverlapFinalTable.ipynb)")
    print(rule)
    print("\nUsing:")
    print("  - High degree outliers: 20th/80th percentile")
    print("  - High-mortality sinks: Z-score top 20%")
    print("  - High-mortality bridges: Z-score top 5%")

    # Step 1: outliers are computed here from the adjacency matrices
    print("\n1. Counting high degree outliers (80th percentile)...")
    df_outliers = count_outliers_20_80_percentile()
    print(f"   Total: {df_outliers['Count'].sum()}")

    # Steps 2 and 3: sinks and bridges come from existing files that already
    # contain only the selected (top 20% / top 5%) entries, so counting rows
    # per stratum is enough.
    precomputed = (
        ("\n2. Loading high-mortality sinks (Z-score top 20%)...",
         'high_mortality_sinks_ZSCORE.csv'),
        ("\n3. Loading high-mortality bridges (Z-score top 5%)...",
         'bridge_edges_mortality_ZSCORE.csv'),
    )
    loaded = []
    for banner, filename in precomputed:
        print(banner)
        source = OUTPUT_DIR / filename
        if not source.exists():
            print("   ERROR: File not found")
            return
        per_stratum = (
            pd.read_csv(source)
            .groupby(['Sex', 'Age_Group'])
            .size()
            .reset_index(name='Count')
        )
        print(f"   Total: {per_stratum['Count'].sum()}")
        loaded.append(per_stratum)
    df_sinks, df_bridges = loaded

    # Step 4: render and save the LaTeX table
    print("\n4. Generating LaTeX table...")
    latex = generate_latex_table(df_outliers, df_sinks, df_bridges)

    tex_file = OUTPUT_DIR / 'summary_table_FINAL_SETUP.tex'
    tex_file.write_text(latex)
    print(f"   ✓ Saved to: {tex_file}")

    # Long-format CSV: the three count tables stacked, tagged by type
    df_all = pd.concat(
        [
            df_outliers.assign(Type='High degree outliers (80th p)'),
            df_sinks.assign(Type='High-mortality sinks (Z-score 20th p)'),
            df_bridges.assign(Type='High-mortality bridges (Z-score 5th p)'),
        ],
        ignore_index=True,
    )

    csv_file = OUTPUT_DIR / 'summary_table_FINAL_SETUP.csv'
    df_all.to_csv(csv_file, index=False)
    print(f"   ✓ Saved to: {csv_file}")

    # Per-type totals, overall and split by sex
    print("\n" + rule)
    print("FINAL SUMMARY")
    print(rule)
    for type_name in df_all['Type'].unique():
        of_type = df_all[df_all['Type'] == type_name]
        by_sex = {
            sex: of_type.loc[of_type['Sex'] == sex, 'Count'].sum()
            for sex in ('Female', 'Male')
        }
        print(f"\n{type_name}:")
        print(f"  Total: {of_type['Count'].sum()} (Female: {by_sex['Female']}, Male: {by_sex['Male']})")

    print("\n" + rule)
    print("✓ COMPLETE")
    print(rule)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

GENERATING SUMMARY TABLE FROM NOTEBOOK OUTPUTS
High-mortality sinks (Z-score top 20%): 89 total
High-mortality bridges (Z-score top 5%, 10% min diff): 411 total
High degree outliers (95th percentile): 777 total

✓ LaTeX table saved to: outputs/summary_table_critical_nodes_FIXED.tex
✓ Data CSV saved to: outputs/summary_table_critical_nodes_FIXED.csv

SUMMARY

High degree outliers (95th p):
  Total: 777 (Female: 386, Male: 391)

High-mortality sinks (Z-score 20th p):
  Total: 89 (Female: 46, Male: 43)

High-mortality bridges (Z-score 5th p):
  Total: 411 (Female: 210, Male: 201)

✓ COMPLETE
