In [5]:
#!/usr/bin/env python3
"""
High Betweenness + High Mortality Nodes Analysis
Identifies critical bridge diseases with high mortality rates
"""

import pandas as pd
import numpy as np
import networkx as nx
from pathlib import Path

# Directories
DATA_DIR = Path('Data/')
OUTPUT_DIR = Path('outputs/')

def load_network_with_mortality(gender, age_group):
    """Load network and assign betweenness and mortality to each node"""
    
    # Load adjacency matrix
    adj_path = DATA_DIR / f'Adj_Matrix_{gender}_ICD_age_{age_group}.csv'
    A = pd.read_csv(adj_path, sep=' ', header=None).values
    
    # Create graph
    G = nx.from_numpy_array(A)
    
    # Calculate betweenness centrality
    betweenness = nx.betweenness_centrality(G, weight=None)
    
    # Load ICD codes
    icd_df = pd.read_csv(DATA_DIR / 'ICD10_Diagnoses_All.csv')
    
    # Load mortality data
    if gender == 'Female':
        mortality_df = pd.read_csv(DATA_DIR / 'mortality_diag_Female.csv')
    else:
        mortality_df = pd.read_csv(DATA_DIR / 'mortality_diag_Male.csv')
    
    # Filter mortality for this age group
    mortality_age = mortality_df[mortality_df['age_10'] == age_group]
    mortality_dict = dict(zip(mortality_age['icd_code'], mortality_age['mortality']))
    
    # Build results
    results = []
    for node in range(len(A)):
        degree = G.degree(node)
        if degree > 0:  # Only connected nodes
            # Get ICD code
            icd_row = icd_df[icd_df['diagnose_id'] == node + 1]
            if len(icd_row) > 0:
                icd_code = icd_row.iloc[0]['icd_code']
                descr = icd_row.iloc[0]['descr']
                
                # Get betweenness and mortality
                bet = betweenness.get(node, 0)
                mort = mortality_dict.get(icd_code, 0)
                
                results.append({
                    'Sex': gender,
                    'Age_Group': age_group,
                    'ICD_Code': icd_code,
                    'Description_GER': descr,
                    'Degree': degree,
                    'Betweenness': bet,
                    'Mortality': mort
                })
    
    return pd.DataFrame(results)

def identify_high_betweenness_high_mortality(df_all):
    """Identify nodes with high betweenness AND high mortality"""
    
    print("Identifying nodes with high betweenness and high mortality...")
    
    all_high = []
    
    # Process each sex-age group separately
    for sex in ['Female', 'Male']:
        for age_group in sorted(df_all['Age_Group'].unique()):
            subset = df_all[
                (df_all['Sex'] == sex) & 
                (df_all['Age_Group'] == age_group)
            ].copy()
            
            if len(subset) == 0:
                continue
            
            # Calculate thresholds (90th percentile for both)
            bet_threshold = subset['Betweenness'].quantile(0.80)
            mort_threshold = subset['Mortality'].quantile(0.80)
            
            # Identify nodes with BOTH high betweenness AND high mortality
            high_nodes = subset[
                (subset['Betweenness'] >= bet_threshold) & 
                (subset['Mortality'] >= mort_threshold)
            ].copy()
            
            if len(high_nodes) > 0:
                high_nodes['Betweenness_Percentile'] = 90
                high_nodes['Mortality_Percentile'] = 90
                all_high.append(high_nodes)
    
    return pd.concat(all_high, ignore_index=True)

def add_english_descriptions(df):
    """Add English descriptions"""
    
    # Load English descriptions
    eng_df = pd.read_csv(DATA_DIR / 'DiagAll_Eng__2_.csv')
    icd_to_eng = dict(zip(eng_df['Code'], eng_df['ShortDescription']))
    
    df['Description_Eng'] = df['ICD_Code'].map(icd_to_eng)
    df['Description_Eng'] = df['Description_Eng'].fillna(df['Description_GER'])
    
    return df

def generate_latex_table(df):
    """Generate LaTeX table"""
    
    # Sort by sex, age, and betweenness
    age_order = {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8}
    df['age_num'] = df['Age_Group'].map(age_order)
    df = df.sort_values(['Sex', 'age_num', 'Betweenness'], ascending=[True, True, False])
    
    # Map age group to range
    age_map = {1: '0-9', 2: '10-19', 3: '20-29', 4: '30-39',
               5: '40-49', 6: '50-59', 7: '60-69', 8: '70-79'}
    df['Age_Range'] = df['Age_Group'].map(age_map)
    
    latex = """\\begin{longtable}{llllrrr}
\\caption{Diseases with High Betweenness and High Mortality} \\label{tab:high_bet_mort} \\\\
\\toprule
Sex & Age & ICD Code & Description & Degree & Betweenness & Mortality \\\\
\\midrule
\\endfirsthead

\\multicolumn{7}{c}{\\tablename\\ \\thetable\\ -- Continued from previous page} \\\\
\\toprule
Sex & Age & ICD Code & Description & Degree & Betweenness & Mortality \\\\
\\midrule
\\endhead

\\midrule
\\multicolumn{7}{r}{Continued on next page} \\\\
\\endfoot

\\bottomrule
\\endlastfoot

"""
    
    for idx, row in df.iterrows():
        desc = str(row['Description_Eng'])[:40]
        desc = desc.replace('&', '\\&').replace('_', '\\_').replace('%', '\\%')
        
        latex += f"{row['Sex']} & {row['Age_Range']} & {row['ICD_Code']} & {desc} & "
        latex += f"{row['Degree']} & {row['Betweenness']:.4f} & {row['Mortality']:.4f} \\\\\n"
        
        # Add midrule after age group change
        if idx < len(df) - 1:
            next_row = df.iloc[idx + 1]
            if (row['Sex'] != next_row['Sex']) or (row['Age_Group'] != next_row['Age_Group']):
                latex += "\\midrule\n"
    
    latex += """\\end{longtable}
"""
    
    return latex

def print_summary(df):
    """Print summary statistics"""
    
    print("\n" + "="*80)
    print("SUMMARY: HIGH BETWEENNESS + HIGH MORTALITY NODES")
    print("="*80)
    
    print(f"\nTotal nodes identified: {len(df)}")
    
    for sex in ['Female', 'Male']:
        sex_data = df[df['Sex'] == sex]
        print(f"\n{sex}: {len(sex_data)} nodes")
        
        age_map = {1: '0-9', 2: '10-19', 3: '20-29', 4: '30-39',
                   5: '40-49', 6: '50-59', 7: '60-69', 8: '70-79'}
        
        for age in sorted(sex_data['Age_Group'].unique()):
            age_data = sex_data[sex_data['Age_Group'] == age]
            age_str = age_map[age]
            print(f"  {age_str}: {len(age_data)} nodes")
            
            # Show top 3
            if len(age_data) > 0:
                top3 = age_data.nlargest(3, 'Betweenness')
                for _, node in top3.iterrows():
                    print(f"    - {node['ICD_Code']:4} (Bet={node['Betweenness']:.4f}, Mort={node['Mortality']:.4f})")

def main():
    """Main execution"""
    
    print("="*80)
    print("HIGH BETWEENNESS + HIGH MORTALITY ANALYSIS")
    print("="*80)
    
    # Load data for all sex-age groups
    print("\nLoading network data and calculating betweenness...")
    all_data = []
    
    for gender in ['Female', 'Male']:
        for age_group in range(1, 9):
            print(f"  Processing {gender} age {age_group}...")
            df = load_network_with_mortality(gender, age_group)
            all_data.append(df)
    
    df_all = pd.concat(all_data, ignore_index=True)
    print(f"\nTotal nodes analyzed: {len(df_all)}")
    
    # Identify high betweenness + high mortality nodes
    df_high = identify_high_betweenness_high_mortality(df_all)
    print(f"Nodes with high betweenness AND high mortality: {len(df_high)}")
    
    # Add English descriptions
    print("\nAdding English descriptions...")
    df_high = add_english_descriptions(df_high)
    
    # Generate LaTeX table
    print("\nGenerating LaTeX table...")
    latex = generate_latex_table(df_high)
    
    # Save outputs
    print("\nSaving outputs...")
    
    # Save LaTeX
    tex_file = OUTPUT_DIR / 'high_betweenness_mortality.tex'
    with open(tex_file, 'w') as f:
        f.write(latex)
    print(f"✓ LaTeX table saved to: {tex_file}")
    
    # Save CSV
    csv_file = OUTPUT_DIR / 'high_betweenness_mortality.csv'
    df_high.to_csv(csv_file, index=False)
    print(f"✓ Data CSV saved to: {csv_file}")
    
    # Print summary
    print_summary(df_high)
    
    print("\n" + "="*80)
    print("✓ ANALYSIS COMPLETE")
    print("="*80)

if __name__ == '__main__':
    main()

HIGH BETWEENNESS + HIGH MORTALITY ANALYSIS

Loading network data and calculating betweenness...
  Processing Female age 1...
  Processing Female age 2...
  Processing Female age 3...
  Processing Female age 4...
  Processing Female age 5...
  Processing Female age 6...
  Processing Female age 7...
  Processing Female age 8...
  Processing Male age 1...
  Processing Male age 2...
  Processing Male age 3...
  Processing Male age 4...
  Processing Male age 5...
  Processing Male age 6...
  Processing Male age 7...
  Processing Male age 8...

Total nodes analyzed: 3927
Identifying nodes with high betweenness and high mortality...
Nodes with high betweenness AND high mortality: 130

Adding English descriptions...

Generating LaTeX table...

Saving outputs...
✓ LaTeX table saved to: outputs/high_betweenness_mortality.tex
✓ Data CSV saved to: outputs/high_betweenness_mortality.csv

SUMMARY: HIGH BETWEENNESS + HIGH MORTALITY NODES

Total nodes identified: 130

Female: 65 nodes
  0-9: 4 nodes
 

In [13]:
#!/usr/bin/env python3
"""
High Betweenness + High Mortality Nodes Analysis (Z-Score Method)
Identifies critical "high-mortality sinks" using manuscript methodology
"""

import pandas as pd
import numpy as np
import networkx as nx
from pathlib import Path

# Directories
DATA_DIR = Path('Data/')
OUTPUT_DIR = Path('outputs/')

def load_network_with_mortality(gender, age_group):
    """Load network and assign betweenness and mortality to each node"""
    
    # Load adjacency matrix
    adj_path = DATA_DIR / f'Adj_Matrix_{gender}_ICD_age_{age_group}.csv'
    A = pd.read_csv(adj_path, sep=' ', header=None).values
    
    # Create graph
    G = nx.from_numpy_array(A)
    
    # Calculate betweenness centrality
    betweenness = nx.betweenness_centrality(G, weight=None)
    
    # Load ICD codes
    icd_df = pd.read_csv(DATA_DIR / 'ICD10_Diagnoses_All.csv')
    
    # Load mortality data
    if gender == 'Female':
        mortality_df = pd.read_csv(DATA_DIR / 'mortality_diag_Female.csv')
    else:
        mortality_df = pd.read_csv(DATA_DIR / 'mortality_diag_Male.csv')
    
    # Filter mortality for this age group
    mortality_age = mortality_df[mortality_df['age_10'] == age_group]
    mortality_dict = dict(zip(mortality_age['icd_code'], mortality_age['mortality']))
    
    # Build results
    results = []
    for node in range(len(A)):
        degree = G.degree(node)
        if degree > 0:  # Only connected nodes
            # Get ICD code
            icd_row = icd_df[icd_df['diagnose_id'] == node + 1]
            if len(icd_row) > 0:
                icd_code = icd_row.iloc[0]['icd_code']
                descr = icd_row.iloc[0]['descr']
                
                # Get betweenness and mortality
                bet = betweenness.get(node, 0)
                mort = mortality_dict.get(icd_code, 0)
                
                results.append({
                    'Sex': gender,
                    'Age_Group': age_group,
                    'ICD_Code': icd_code,
                    'Description_GER': descr,
                    'Degree': degree,
                    'Betweenness': bet,
                    'Mortality': mort
                })
    
    return pd.DataFrame(results)

def identify_high_mortality_sinks_zscore(df_all, top_percent=20):
    """
    Identify high-mortality sinks using Z-score product method (manuscript methodology)
    
    As described in manuscript:
    - Compute z-scores for betweenness and mortality per sex-age group
    - Calculate product: z(betweenness) × z(mortality)
    - Select top X% of positive z-score products
    - Calculate geometric mean: sqrt(z_betweenness × z_mortality) for ranking
    
    Parameters:
    - top_percent: Percentage of top z-score products to select (default 40%)
    """
    
    print(f"\nZ-SCORE METHOD (Manuscript):")
    print(f"  - Computing z(betweenness) × z(mortality)")
    print(f"  - Selecting top {top_percent}% of z-score products")
    print(f"  - Only considering positive z-scores")
    
    all_high = []
    
    # Process each sex-age group separately
    for sex in ['Female', 'Male']:
        for age_group in sorted(df_all['Age_Group'].unique()):
            subset = df_all[
                (df_all['Sex'] == sex) & 
                (df_all['Age_Group'] == age_group)
            ].copy()
            
            if len(subset) == 0:
                continue
            
            # Calculate z-scores for betweenness
            bet_mean = subset['Betweenness'].mean()
            bet_std = subset['Betweenness'].std()
            if bet_std > 0:
                subset['z_betweenness'] = (subset['Betweenness'] - bet_mean) / bet_std
            else:
                subset['z_betweenness'] = 0
            
            # Calculate z-scores for mortality
            mort_mean = subset['Mortality'].mean()
            mort_std = subset['Mortality'].std()
            if mort_std > 0:
                subset['z_mortality'] = (subset['Mortality'] - mort_mean) / mort_std
            else:
                subset['z_mortality'] = 0
            
            # Calculate z-score product
            subset['z_product'] = subset['z_betweenness'] * subset['z_mortality']
            
            # Calculate geometric mean (as mentioned in manuscript for reporting)
            # Only for positive z-scores
            subset['z_geom_mean'] = np.where(
                (subset['z_betweenness'] > 0) & (subset['z_mortality'] > 0),
                np.sqrt(subset['z_betweenness'] * subset['z_mortality']),
                0
            )
            
            # Filter: positive z-scores and top X% of product
            threshold_percentile = 100 - top_percent
            z_threshold = subset['z_product'].quantile(threshold_percentile / 100)
            
            high_nodes = subset[
                (subset['z_betweenness'] > 0) &
                (subset['z_mortality'] > 0) &
                (subset['z_product'] >= z_threshold)
            ].copy()
            
            if len(high_nodes) > 0:
                all_high.append(high_nodes)
    
    return pd.concat(all_high, ignore_index=True) if len(all_high) > 0 else pd.DataFrame()

def add_english_descriptions(df):
    """Add English descriptions"""
    
    # Load English descriptions
    eng_df = pd.read_csv(DATA_DIR / 'DiagAll_Eng__2_.csv')
    icd_to_eng = dict(zip(eng_df['Code'], eng_df['ShortDescription']))
    
    df['Description_Eng'] = df['ICD_Code'].map(icd_to_eng)
    df['Description_Eng'] = df['Description_Eng'].fillna(df['Description_GER'])
    
    return df

def generate_latex_table(df):
    """Generate LaTeX table"""
    
    # Sort by sex, age, and geometric mean (descending)
    age_order = {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8}
    df['age_num'] = df['Age_Group'].map(age_order)
    df = df.sort_values(['Sex', 'age_num', 'z_geom_mean'], ascending=[True, True, False])
    
    # Map age group to range
    age_map = {1: '0-9', 2: '10-19', 3: '20-29', 4: '30-39',
               5: '40-49', 6: '50-59', 7: '60-69', 8: '70-79'}
    df['Age_Range'] = df['Age_Group'].map(age_map)
    
    latex = """\\begin{longtable}{llllrrrr}
\\caption{High-Mortality Sinks: Nodes with High Betweenness and High Mortality (Z-Score Method)} \\label{tab:high_mort_sinks} \\\\
\\toprule
Sex & Age & ICD & Description & Degree & Betweenness & Mortality & Z-Product \\\\
\\midrule
\\endfirsthead

\\multicolumn{8}{c}{\\tablename\\ \\thetable\\ -- Continued from previous page} \\\\
\\toprule
Sex & Age & ICD & Description & Degree & Betweenness & Mortality & Z-Product \\\\
\\midrule
\\endhead

\\midrule
\\multicolumn{8}{r}{Continued on next page} \\\\
\\endfoot

\\bottomrule
\\endlastfoot

"""
    
    for idx, row in df.iterrows():
        # Truncate description
        desc = str(row['Description_Eng'])[:35]
        desc = desc.replace('&', '\\&').replace('_', '\\_').replace('%', '\\%')
        
        latex += f"{row['Sex']} & {row['Age_Range']} & {row['ICD_Code']} & {desc} & "
        latex += f"{row['Degree']} & {row['Betweenness']:.5f} & "
        latex += f"{row['Mortality']:.4f} & {row['z_product']:.3f} \\\\\n"
        
        # Add midrule after age group change
        if idx < len(df) - 1:
            next_row = df.iloc[idx + 1]
            if (row['Sex'] != next_row['Sex']) or (row['Age_Group'] != next_row['Age_Group']):
                latex += "\\midrule\n"
    
    latex += """\\end{longtable}
"""
    
    return latex

def print_summary(df):
    """Print summary statistics"""
    
    print("\n" + "="*80)
    print("SUMMARY: HIGH-MORTALITY SINKS (Z-SCORE METHOD)")
    print("="*80)
    
    print(f"\nTotal nodes: {len(df)}")
    
    age_map = {1: '0-9', 2: '10-19', 3: '20-29', 4: '30-39',
               5: '40-49', 6: '50-59', 7: '60-69', 8: '70-79'}
    
    for sex in ['Female', 'Male']:
        sex_data = df[df['Sex'] == sex]
        print(f"\n{sex}: {len(sex_data)} nodes")
        
        for age in sorted(sex_data['Age_Group'].unique()):
            age_data = sex_data[sex_data['Age_Group'] == age]
            age_str = age_map[age]
            print(f"  {age_str}: {len(age_data)} nodes")
            
            # Show top 3 by geometric mean
            if len(age_data) > 0:
                top3 = age_data.nlargest(3, 'z_geom_mean')
                for _, node in top3.iterrows():
                    print(f"    - {node['ICD_Code']:4} (Z-GeoMean={node['z_geom_mean']:.3f}, "
                          f"Bet={node['Betweenness']:.5f}, Mort={node['Mortality']:.4f})")
    
    # Overall top 10
    print("\n" + "="*80)
    print("TOP 10 HIGH-MORTALITY SINKS (ALL AGES)")
    print("="*80)
    
    top10 = df.nlargest(10, 'z_geom_mean')
    for i, (idx, node) in enumerate(top10.iterrows(), 1):
        age_str = age_map[node['Age_Group']]
        desc = str(node['Description_Eng'])[:40]
        print(f"\n{i}. {node['ICD_Code']} - {desc}")
        print(f"   {node['Sex']} Age {age_str}")
        print(f"   Z-GeoMean: {node['z_geom_mean']:.3f} | Betweenness: {node['Betweenness']:.5f} | Mortality: {node['Mortality']:.4f}")

def main():
    """Main execution"""
    
    print("="*80)
    print("HIGH-MORTALITY SINKS ANALYSIS (Z-SCORE METHOD)")
    print("="*80)
    print("\nUsing manuscript methodology: z(betweenness) × z(mortality)")
    
    # Load data for all sex-age groups
    print("\nLoading network data and calculating betweenness...")
    all_data = []
    
    for gender in ['Female', 'Male']:
        for age_group in range(1, 9):
            print(f"  Processing {gender} age {age_group}...")
            df = load_network_with_mortality(gender, age_group)
            all_data.append(df)
    
    df_all = pd.concat(all_data, ignore_index=True)
    print(f"\nTotal nodes analyzed: {len(df_all)}")
    
    # Identify high-mortality sinks using Z-score method
    df_high = identify_high_mortality_sinks_zscore(df_all, top_percent=20)
    
    if len(df_high) == 0:
        print("\nNo high-mortality sinks found!")
        return
    
    print(f"\nHigh-mortality sinks identified: {len(df_high)}")
    
    # Add English descriptions
    print("\nAdding English descriptions...")
    df_high = add_english_descriptions(df_high)
    
    # Generate LaTeX table
    print("\nGenerating LaTeX table...")
    latex = generate_latex_table(df_high)
    
    # Save outputs
    print("\nSaving outputs...")
    
    # Save LaTeX
    tex_file = OUTPUT_DIR / 'high_mortality_sinks_ZSCORE.tex'
    with open(tex_file, 'w') as f:
        f.write(latex)
    print(f"✓ LaTeX table saved to: {tex_file}")
    
    # Save CSV
    csv_file = OUTPUT_DIR / 'high_mortality_sinks_ZSCORE.csv'
    df_high.to_csv(csv_file, index=False)
    print(f"✓ Data CSV saved to: {csv_file}")
    
    # Print summary
    print_summary(df_high)
    
    print("\n" + "="*80)
    print("✓ ANALYSIS COMPLETE")
    print("="*80)

if __name__ == '__main__':
    main()

HIGH-MORTALITY SINKS ANALYSIS (Z-SCORE METHOD)

Using manuscript methodology: z(betweenness) × z(mortality)

Loading network data and calculating betweenness...
  Processing Female age 1...
  Processing Female age 2...
  Processing Female age 3...
  Processing Female age 4...
  Processing Female age 5...
  Processing Female age 6...
  Processing Female age 7...
  Processing Female age 8...
  Processing Male age 1...
  Processing Male age 2...
  Processing Male age 3...
  Processing Male age 4...
  Processing Male age 5...
  Processing Male age 6...
  Processing Male age 7...
  Processing Male age 8...

Total nodes analyzed: 3927

Z-SCORE METHOD (Manuscript):
  - Computing z(betweenness) × z(mortality)
  - Selecting top 20% of z-score products
  - Only considering positive z-scores

High-mortality sinks identified: 89

Adding English descriptions...

Generating LaTeX table...

Saving outputs...
✓ LaTeX table saved to: outputs/high_mortality_sinks_ZSCORE.tex
✓ Data CSV saved to: outputs/

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
