In [3]:
#!/usr/bin/env python3
"""
Find intersection of:
1. Degree outliers (high degree relative to prevalence)
2. High mortality + high betweenness nodes (Z-score method)

Per age group and sex
"""

import pandas as pd
import numpy as np
import networkx as nx
from pathlib import Path

 # Directories
DATA_DIR = Path('Data/')
OUTPUT_DIR = Path('outputs/')

def load_degree_outliers():
    """Load the degree outliers from previous analysis"""
    
    # Check if outliers file exists
    outliers_file = OUTPUT_DIR / 'outliers_data_FINAL.csv'
    
    if not outliers_file.exists():
        print("ERROR: outliers_data_FINAL.csv not found!")
        print("Please run the degree outlier analysis first.")
        return None
    
    df = pd.read_csv(outliers_file)
    
    # Convert Age_Group format from "age_1" to numeric 1
    df['Age_Group'] = df['Age_Group'].str.replace('age_', '').astype(int)
    
    # Filter for high degree outliers only (outlier_type = high_degree)
    df_high = df[df['outlier_type'] == 'high_degree'].copy()
    
    print(f"Loaded {len(df_high)} high degree outliers")
    
    return df_high

def calculate_high_mortality_betweenness_nodes_zscore(top_percent=40):
    """
    Calculate nodes with high mortality AND high betweenness using Z-score product method
    
    As described in manuscript:
    - Compute z-scores for betweenness and mortality per sex-age group
    - Calculate product: z(betweenness) × z(mortality)
    - Select top X% of positive z-score products
    
    Parameters:
    - top_percent: Percentage of top z-score products to select (default 40%)
    """
    
    print(f"\nZ-SCORE METHOD (Manuscript approach):")
    print(f"  - Computing z(betweenness) × z(mortality)")
    print(f"  - Selecting top {top_percent}% of z-score products")
    print(f"  - Only considering positive z-scores\n")
    
    all_nodes = []
    
    # Load ICD codes
    icd_df = pd.read_csv(DATA_DIR / 'ICD10_Diagnoses_All.csv')
    
    for gender in ['Female', 'Male']:
        # Load mortality
        if gender == 'Female':
            mortality_df = pd.read_csv(DATA_DIR / 'mortality_diag_Female.csv')
        else:
            mortality_df = pd.read_csv(DATA_DIR / 'mortality_diag_Male.csv')
        
        for age_group in range(1, 9):
            print(f"  Processing {gender} age {age_group}...")
            
            # Load adjacency matrix
            adj_path = DATA_DIR / f'Adj_Matrix_{gender}_ICD_age_{age_group}.csv'
            A = pd.read_csv(adj_path, sep=' ', header=None).values
            
            # Create graph
            G = nx.from_numpy_array(A)
            
            # Calculate betweenness
            betweenness = nx.betweenness_centrality(G, weight=None)
            
            # Get mortality for this age
            mortality_age = mortality_df[mortality_df['age_10'] == age_group]
            mortality_dict = dict(zip(mortality_age['icd_code'], mortality_age['mortality']))
            
            # Build node data
            for node in range(len(A)):
                degree = G.degree(node)
                if degree > 0:  # Only connected nodes
                    # Get ICD code
                    icd_row = icd_df[icd_df['diagnose_id'] == node + 1]
                    if len(icd_row) > 0:
                        icd_code = icd_row.iloc[0]['icd_code']
                        descr = icd_row.iloc[0]['descr']
                        
                        bet = betweenness.get(node, 0)
                        mort = mortality_dict.get(icd_code, 0)
                        
                        all_nodes.append({
                            'Sex': gender,
                            'Age_Group': age_group,
                            'ICD_Code': icd_code,
                            'Description': descr,
                            'Degree': degree,
                            'Betweenness': bet,
                            'Mortality': mort
                        })
    
    df_all = pd.DataFrame(all_nodes)
    
    # Calculate z-scores and filter per group
    high_nodes = []
    
    for sex in ['Female', 'Male']:
        for age_group in range(1, 9):
            subset = df_all[
                (df_all['Sex'] == sex) & 
                (df_all['Age_Group'] == age_group)
            ].copy()
            
            if len(subset) == 0:
                continue
            
            # Calculate z-scores for betweenness
            bet_mean = subset['Betweenness'].mean()
            bet_std = subset['Betweenness'].std()
            if bet_std > 0:
                subset['z_betweenness'] = (subset['Betweenness'] - bet_mean) / bet_std
            else:
                subset['z_betweenness'] = 0
            
            # Calculate z-scores for mortality
            mort_mean = subset['Mortality'].mean()
            mort_std = subset['Mortality'].std()
            if mort_std > 0:
                subset['z_mortality'] = (subset['Mortality'] - mort_mean) / mort_std
            else:
                subset['z_mortality'] = 0
            
            # Calculate z-score product
            subset['z_product'] = subset['z_betweenness'] * subset['z_mortality']
            
            # Calculate geometric mean for ranking
            subset['z_geom_mean'] = np.where(
                (subset['z_betweenness'] > 0) & (subset['z_mortality'] > 0),
                np.sqrt(subset['z_betweenness'] * subset['z_mortality']),
                0
            )
            
            # Filter: positive z-scores and top X% of product
            threshold_percentile = 100 - top_percent
            z_threshold = subset['z_product'].quantile(threshold_percentile / 100)
            
            high_subset = subset[
                (subset['z_betweenness'] > 0) &
                (subset['z_mortality'] > 0) &
                (subset['z_product'] >= z_threshold)
            ].copy()
            
            if len(high_subset) > 0:
                high_nodes.append(high_subset)
    
    df_high = pd.concat(high_nodes, ignore_index=True) if len(high_nodes) > 0 else pd.DataFrame()
    
    print(f"Found {len(df_high)} nodes with high mortality & betweenness")
    
    return df_high

def find_intersection(df_outliers, df_high_mort_bet):
    """Find intersection of degree outliers and high mortality/betweenness nodes"""
    
    print("\n" + "="*80)
    print("FINDING INTERSECTION")
    print("="*80)
    
    # Create unique identifiers
    df_outliers['node_id'] = df_outliers['Sex'] + '_' + df_outliers['Age_Group'].astype(str) + '_' + df_outliers['ICD_Code']
    df_high_mort_bet['node_id'] = df_high_mort_bet['Sex'] + '_' + df_high_mort_bet['Age_Group'].astype(str) + '_' + df_high_mort_bet['ICD_Code']
    
    # Find intersection
    intersection_ids = set(df_outliers['node_id']) & set(df_high_mort_bet['node_id'])
    
    print(f"\nDegree outliers (high): {len(df_outliers)}")
    print(f"High mortality + betweenness (Z-score): {len(df_high_mort_bet)}")
    print(f"Intersection: {len(intersection_ids)}")
    
    # Get full data for intersection
    df_intersection = df_high_mort_bet[df_high_mort_bet['node_id'].isin(intersection_ids)].copy()
    
    # Merge with outlier data to get log ratio
    outlier_data = df_outliers[['node_id', 'Log_ratio', 'Prevalence']].copy()
    outlier_data = outlier_data.rename(columns={'Log_ratio': 'Log_Ratio'})
    df_intersection = df_intersection.merge(outlier_data, on='node_id', how='left')
    
    # Sort by sex, age, and z_geom_mean
    df_intersection = df_intersection.sort_values(['Sex', 'Age_Group', 'z_geom_mean'], ascending=[True, True, False])
    
    return df_intersection

def generate_latex_table(df):
    """Generate LaTeX table"""
    
    # Map age group
    age_map = {1: '0-9', 2: '10-19', 3: '20-29', 4: '30-39',
               5: '40-49', 6: '50-59', 7: '60-69', 8: '70-79'}
    df['Age_Range'] = df['Age_Group'].map(age_map)
    
    latex = """\\begin{longtable}{llllrrrrr}
\\caption{Critical Nodes: High Degree Outliers with High Mortality and Betweenness (Z-Score Method)} \\label{tab:critical_nodes} \\\\
\\toprule
Sex & Age & ICD & Description & Degree & Betweenness & Mortality & Z-Product & Log Ratio \\\\
\\midrule
\\endfirsthead

\\multicolumn{9}{c}{\\tablename\\ \\thetable\\ -- Continued from previous page} \\\\
\\toprule
Sex & Age & ICD & Description & Degree & Betweenness & Mortality & Z-Product & Log Ratio \\\\
\\midrule
\\endhead

\\midrule
\\multicolumn{9}{r}{Continued on next page} \\\\
\\endfoot

\\bottomrule
\\endlastfoot

"""
    
    for idx, row in df.iterrows():
        # Truncate description
        desc = str(row['Description_Eng'] if 'Description_Eng' in row else row['Description'])[:30]
        desc = desc.replace('&', '\\&').replace('_', '\\_').replace('%', '\\%')
        
        latex += f"{row['Sex']} & {row['Age_Range']} & {row['ICD_Code']} & {desc} & "
        latex += f"{row['Degree']} & {row['Betweenness']:.5f} & "
        latex += f"{row['Mortality']:.4f} & {row['z_product']:.3f} & {row['Log_Ratio']:.2f} \\\\\n"
        
        # Add midrule after age group change
        if idx < len(df) - 1:
            next_row = df.iloc[idx + 1]
            if (row['Sex'] != next_row['Sex']) or (row['Age_Group'] != next_row['Age_Group']):
                latex += "\\midrule\n"
    
    latex += """\\end{longtable}
"""
    
    return latex

def print_summary(df):
    """Print summary statistics"""
    
    print("\n" + "="*80)
    print("SUMMARY: CRITICAL NODES (INTERSECTION)")
    print("="*80)
    
    print(f"\nTotal critical nodes: {len(df)}")
    
    age_map = {1: '0-9', 2: '10-19', 3: '20-29', 4: '30-39',
               5: '40-49', 6: '50-59', 7: '60-69', 8: '70-79'}
    
    for sex in ['Female', 'Male']:
        sex_data = df[df['Sex'] == sex]
        print(f"\n{sex}: {len(sex_data)} nodes")
        
        for age in sorted(sex_data['Age_Group'].unique()):
            age_data = sex_data[sex_data['Age_Group'] == age]
            age_str = age_map[age]
            print(f"  {age_str}: {len(age_data)} nodes")
            
            # Show top 3 by z_geom_mean
            if len(age_data) > 0:
                top3 = age_data.nlargest(3, 'z_geom_mean')
                for _, node in top3.iterrows():
                    print(f"    - {node['ICD_Code']:4} (Z-GeoMean={node['z_geom_mean']:.3f}, "
                          f"Bet={node['Betweenness']:.5f}, Mort={node['Mortality']:.4f}, LogRatio={node['Log_Ratio']:.2f})")

def add_english_descriptions(df):
    """Add English descriptions"""
    
    eng_df = pd.read_csv(DATA_DIR / 'DiagAll_Eng__2_.csv')
    icd_to_eng = dict(zip(eng_df['Code'], eng_df['ShortDescription']))
    
    df['Description_Eng'] = df['ICD_Code'].map(icd_to_eng)
    df['Description_Eng'] = df['Description_Eng'].fillna(df['Description'])
    
    return df

def main():
    """Main execution"""
    
    print("="*80)
    print("INTERSECTION ANALYSIS: DEGREE OUTLIERS × HIGH MORTALITY/BETWEENNESS")
    print("="*80)
    print("\nUsing Z-score method (manuscript methodology)")
    
    # Load degree outliers
    print("\n1. Loading degree outliers...")
    df_outliers = load_degree_outliers()
    
    if df_outliers is None:
        return
    
    # Calculate high mortality/betweenness nodes using Z-score
    print("\n2. Calculating high mortality & betweenness nodes (Z-score method)...")
    df_high_mort_bet = calculate_high_mortality_betweenness_nodes_zscore(top_percent=40)
    
    if len(df_high_mort_bet) == 0:
        print("\nNo high mortality/betweenness nodes found!")
        return
    
    # Find intersection
    print("\n3. Finding intersection...")
    df_intersection = find_intersection(df_outliers, df_high_mort_bet)
    
    if len(df_intersection) == 0:
        print("\nNo intersection found!")
        return
    
    # Add English descriptions
    print("\n4. Adding English descriptions...")
    df_intersection = add_english_descriptions(df_intersection)
    
    # Generate LaTeX table
    print("\n5. Generating LaTeX table...")
    latex = generate_latex_table(df_intersection)
    
    # Save outputs
    print("\n6. Saving outputs...")
    
    tex_file = OUTPUT_DIR / 'critical_nodes_intersection_ZSCORE.tex'
    with open(tex_file, 'w') as f:
        f.write(latex)
    print(f"✓ LaTeX table saved to: {tex_file}")
    
    csv_file = OUTPUT_DIR / 'critical_nodes_intersection_ZSCORE.csv'
    df_intersection.to_csv(csv_file, index=False)
    print(f"✓ Data CSV saved to: {csv_file}")
    
    # Print summary
    print_summary(df_intersection)
    
    print("\n" + "="*80)
    print("✓ ANALYSIS COMPLETE")
    print("="*80)

if __name__ == '__main__':
    main()

INTERSECTION ANALYSIS: DEGREE OUTLIERS × HIGH MORTALITY/BETWEENNESS

Using Z-score method (manuscript methodology)

1. Loading degree outliers...
Loaded 201 high degree outliers

2. Calculating high mortality & betweenness nodes (Z-score method)...

Z-SCORE METHOD (Manuscript approach):
  - Computing z(betweenness) × z(mortality)
  - Selecting top 40% of z-score products
  - Only considering positive z-scores

  Processing Female age 1...
  Processing Female age 2...
  Processing Female age 3...
  Processing Female age 4...
  Processing Female age 5...
  Processing Female age 6...
  Processing Female age 7...
  Processing Female age 8...
  Processing Male age 1...
  Processing Male age 2...
  Processing Male age 3...
  Processing Male age 4...
  Processing Male age 5...
  Processing Male age 6...
  Processing Male age 7...
  Processing Male age 8...
Found 98 nodes with high mortality & betweenness

3. Finding intersection...

FINDING INTERSECTION

Degree outliers (high): 201
High morta

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [18]:
#!/usr/bin/env python3
"""
Outlier Detection - Exact Replication of Notebook Method
Uses the exact same approach as graph.ipynb
"""

import pandas as pd
import numpy as np
import networkx as nx
from pathlib import Path

 # Directories
DATA_DIR = Path('Data/')
OUTPUT_DIR = Path('outputs/')

def create_graph_data(gender, age_group):
    """Load network and prevalence data (replicating notebook's create_graph function)"""
    # Load adjacency matrix
    adj_path = DATA_DIR / f'Adj_Matrix_{gender}_ICD_age_{age_group}.csv'
    A = pd.read_csv(adj_path, sep=' ', header=None).values
    
    # Create graph
    G = nx.from_numpy_array(A)
    
    # Load ICD codes
    df_icd = pd.read_csv(DATA_DIR / 'ICD10_Diagnoses_All.csv')
    
    # Load prevalence data
    df_prev = pd.read_csv(DATA_DIR / 'Prevalence_Sex_Age_Year_ICD.csv')
    age_dict = {1: '0-9', 2: '10-19', 3: '20-29', 4: '30-39',
                5: '40-49', 6: '50-59', 7: '60-69', 8: '70-79'}
    age_col = age_dict[age_group]
    
    # Filter prevalence for this sex, age, and year 2014
    df_prev = df_prev[(df_prev['Age_Group'] == age_col) & 
                      (df_prev['sex'] == gender) & 
                      (df_prev['year'] == 2014)]
    
    # Create prevalence dictionary
    prevalence_dict = df_prev.set_index('icd_code')['p'].to_dict()
    
    # Build dataframe with degree and prevalence
    results = []
    for node in range(len(A)):
        degree = G.degree(node)
        if degree > 0:  # Only connected nodes
            # Get ICD code for this node
            icd_row = df_icd[df_icd['diagnose_id'] == node + 1]
            if len(icd_row) > 0:
                icd_code = icd_row.iloc[0]['icd_code']
                descr = icd_row.iloc[0]['descr']
                
                # Get prevalence (default to 0 if missing)
                prevalence = prevalence_dict.get(icd_code, 0)
                
                results.append({
                    'Sex': gender,
                    'Age_Group': f'age_{age_group}',
                    'Age_Range': age_col,
                    'ICD_Code': icd_code,
                    'ICD_Description': descr,
                    'Degree': degree,
                    'Prevalence': prevalence
                })
    
    return pd.DataFrame(results)

def deg_prev_df():
    """Replicate the deg_prev_df function from notebook"""
    all_data = []
    
    for gender in ['Male', 'Female']:
        for age_group in range(1, 9):
            print(f'Loading {gender} age_{age_group}')
            df = create_graph_data(gender, age_group)
            all_data.append(df)
    
    return pd.concat(all_data, ignore_index=True)

def modified_zscore(x, median, mad):
    """Modified z-score as in notebook"""
    return 0.6745 * (x - median) / mad

def find_outliers(df):
    """
    Exact replication of notebook's find_outliers function (Cell 15)
    Uses 5th and 95th percentiles per sex-age group
    """
    # Remove zero prevalence
    df = df[df['Prevalence'] > 0].copy()
    
    # Calculate ratio and log ratio
    df['Ratio'] = df['Degree'] / df['Prevalence']
    df['Log_ratio'] = df['Ratio'].apply(lambda x: np.log10(x) if x > 0 else np.nan)
    
    df_out = pd.DataFrame(columns=df.columns.tolist() + ['Deviation', 'Outlier'])
    
    # Process each sex-age group separately
    for sex in ['Male', 'Female']:
        for age_id in df['Age_Group'].unique():
            df_subset = df[(df['Sex'] == sex) & (df['Age_Group'] == age_id)].copy()
            
            if len(df_subset) == 0:
                continue
            
            # Calculate percentiles (exact method from notebook)
            lower_bound = df_subset['Log_ratio'].quantile(0.2)
            upper_bound = df_subset['Log_ratio'].quantile(0.80)
            
            # Modified z-score
            median = df_subset['Log_ratio'].median()
            mad = (df_subset['Log_ratio'] - median).abs().median()
            df_subset['Deviation'] = df_subset['Log_ratio'].apply(
                lambda x: modified_zscore(x, median, mad)
            )
            
            # Mark outliers
            df_subset['Outlier'] = (
                (df_subset['Log_ratio'] < lower_bound) | 
                (df_subset['Log_ratio'] > upper_bound)
            )
            
            df_out = pd.concat([df_out, df_subset], ignore_index=True)
    
    return df_out

def main():
    """Main analysis pipeline"""
    print("="*80)
    print("OUTLIER DETECTION - EXACT NOTEBOOK REPLICATION")
    print("="*80)
    
    # Step 1: Build degree-prevalence dataframe
    print("\nStep 1: Building degree-prevalence dataframe...")
    df_deg_prev = deg_prev_df()
    print(f"Total records: {len(df_deg_prev)}")
    
    # Save raw data
    raw_file = OUTPUT_DIR / 'Degree_Prevalence_ICD_raw_EXACT.csv'
    df_deg_prev.to_csv(raw_file, index=False)
    print(f"Raw data saved to: {raw_file}")
    
    # Step 2: Find outliers
    print("\nStep 2: Finding outliers using 5th/95th percentile method...")
    df_outliers = find_outliers(df_deg_prev)
    
    # Save processed data
    processed_file = OUTPUT_DIR / 'Degree_Prevalence_ICD_EXACT.csv'
    df_outliers.to_csv(processed_file, index=False)
    print(f"Processed data saved to: {processed_file}")
    
    # Step 3: Filter to only outliers
    df_outliers_only = df_outliers[df_outliers['Outlier'] == True].copy()
    
    outliers_file = OUTPUT_DIR / 'Outliers_EXACT.csv'
    df_outliers_only.to_csv(outliers_file, index=False)
    print(f"Outliers saved to: {outliers_file}")
    
    # Summary statistics
    print("\n" + "="*80)
    print("SUMMARY")
    print("="*80)
    print(f"\nTotal diseases analyzed: {len(df_outliers)}")
    print(f"Total outliers identified: {len(df_outliers_only)}")
    print(f"  Female outliers: {len(df_outliers_only[df_outliers_only['Sex'] == 'Female'])}")
    print(f"  Male outliers: {len(df_outliers_only[df_outliers_only['Sex'] == 'Male'])}")
    
    # Check F17 specifically
    print("\n" + "="*80)
    print("F17 (Nicotine Dependence) Check")
    print("="*80)
    f17_all = df_outliers[df_outliers['ICD_Code'] == 'F17']
    f17_outliers = df_outliers_only[df_outliers_only['ICD_Code'] == 'F17']
    
    print(f"\nF17 total entries: {len(f17_all)}")
    print(f"F17 outliers: {len(f17_outliers)}")
    
    if len(f17_outliers) > 0:
        print("\nF17 outlier entries:")
        print(f17_outliers[['Sex', 'Age_Range', 'Degree', 'Prevalence', 'Log_ratio', 'Outlier']])
    
    # Specifically check Male 10-19
    f17_male_10_19 = df_outliers[
        (df_outliers['Sex'] == 'Male') & 
        (df_outliers['Age_Range'] == '10-19') & 
        (df_outliers['ICD_Code'] == 'F17')
    ]
    
    print(f"\nF17 Male 10-19:")
    if len(f17_male_10_19) > 0:
        print(f17_male_10_19[['Degree', 'Prevalence', 'Log_ratio', 'Outlier']])
    else:
        print("  NOT FOUND (missing from 2014 data)")
    
    print("\n" + "="*80)
    print("Analysis complete!")
    print("="*80)

if __name__ == '__main__':
    main()

OUTLIER DETECTION - EXACT NOTEBOOK REPLICATION

Step 1: Building degree-prevalence dataframe...
Loading Male age_1
Loading Male age_2
Loading Male age_3
Loading Male age_4
Loading Male age_5
Loading Male age_6
Loading Male age_7
Loading Male age_8
Loading Female age_1
Loading Female age_2
Loading Female age_3
Loading Female age_4
Loading Female age_5
Loading Female age_6
Loading Female age_7
Loading Female age_8
Total records: 3927
Raw data saved to: outputs/Degree_Prevalence_ICD_raw_EXACT.csv

Step 2: Finding outliers using 5th/95th percentile method...
Processed data saved to: outputs/Degree_Prevalence_ICD_EXACT.csv
Outliers saved to: outputs/Outliers_EXACT.csv

SUMMARY

Total diseases analyzed: 3880
Total outliers identified: 1551
  Female outliers: 771
  Male outliers: 780

F17 (Nicotine Dependence) Check

F17 total entries: 13
F17 outliers: 13

F17 outlier entries:
         Sex Age_Range Degree  Prevalence  Log_ratio Outlier
262     Male     20-29     30     0.00012   5.397940    

  df_out = pd.concat([df_out, df_subset], ignore_index=True)


In [22]:
#!/usr/bin/env python3
"""
Find intersection of:
1. Degree outliers (high degree relative to prevalence)
2. High mortality + high betweenness nodes (Z-score method)

Per age group and sex
"""

import pandas as pd
import numpy as np
import networkx as nx
from pathlib import Path
 # Directories
DATA_DIR = Path('Data/')
OUTPUT_DIR = Path('outputs/')

def load_degree_outliers():
    """Load the degree outliers from previous analysis (95th percentile)"""
    
    # Use the EXACT file with all 95th percentile outliers
    outliers_file = OUTPUT_DIR / 'Outliers_EXACT.csv'
    
    if not outliers_file.exists():
        print("ERROR: Outliers_EXACT.csv not found!")
        print("Please run the degree outlier analysis first (outlier_detection_exact.py)")
        return None
    
    df = pd.read_csv(outliers_file)
    
    # Convert Age_Group format from "age_1" to numeric 1
    df['Age_Group'] = df['Age_Group'].str.replace('age_', '').astype(int)
    
    # Filter for high degree outliers only (above 95th percentile, positive deviation)
    df_high = df[(df['Outlier'] == True) & (df['Deviation'] > 0)].copy()
    
    # Rename for consistency
    df_high = df_high.rename(columns={'ICD_Description': 'Description_GER'})
    
    print(f"Loaded {len(df_high)} high degree outliers (95th percentile)")
    
    return df_high

def calculate_high_mortality_betweenness_nodes_zscore(top_percent=20):
    """
    Calculate nodes with high mortality AND high betweenness using Z-score product method
    
    As described in manuscript:
    - Compute z-scores for betweenness and mortality per sex-age group
    - Calculate product: z(betweenness) × z(mortality)
    - Select top X% of positive z-score products
    
    Parameters:
    - top_percent: Percentage of top z-score products to select (default 40%)
    """
    
    print(f"\nZ-SCORE METHOD (Manuscript approach):")
    print(f"  - Computing z(betweenness) × z(mortality)")
    print(f"  - Selecting top {top_percent}% of z-score products")
    print(f"  - Only considering positive z-scores\n")
    
    all_nodes = []
    
    # Load ICD codes
    icd_df = pd.read_csv(DATA_DIR / 'ICD10_Diagnoses_All.csv')
    
    for gender in ['Female', 'Male']:
        # Load mortality
        if gender == 'Female':
            mortality_df = pd.read_csv(DATA_DIR / 'mortality_diag_Female.csv')
        else:
            mortality_df = pd.read_csv(DATA_DIR / 'mortality_diag_Male.csv')
        
        for age_group in range(1, 9):
            print(f"  Processing {gender} age {age_group}...")
            
            # Load adjacency matrix
            adj_path = DATA_DIR / f'Adj_Matrix_{gender}_ICD_age_{age_group}.csv'
            A = pd.read_csv(adj_path, sep=' ', header=None).values
            
            # Create graph
            G = nx.from_numpy_array(A)
            
            # Calculate betweenness
            betweenness = nx.betweenness_centrality(G, weight=None)
            
            # Get mortality for this age
            mortality_age = mortality_df[mortality_df['age_10'] == age_group]
            mortality_dict = dict(zip(mortality_age['icd_code'], mortality_age['mortality']))
            
            # Build node data
            for node in range(len(A)):
                degree = G.degree(node)
                if degree > 0:  # Only connected nodes
                    # Get ICD code
                    icd_row = icd_df[icd_df['diagnose_id'] == node + 1]
                    if len(icd_row) > 0:
                        icd_code = icd_row.iloc[0]['icd_code']
                        descr = icd_row.iloc[0]['descr']
                        
                        bet = betweenness.get(node, 0)
                        mort = mortality_dict.get(icd_code, 0)
                        
                        all_nodes.append({
                            'Sex': gender,
                            'Age_Group': age_group,
                            'ICD_Code': icd_code,
                            'Description': descr,
                            'Degree': degree,
                            'Betweenness': bet,
                            'Mortality': mort
                        })
    
    df_all = pd.DataFrame(all_nodes)
    
    # Calculate z-scores and filter per group
    high_nodes = []
    
    for sex in ['Female', 'Male']:
        for age_group in range(1, 9):
            subset = df_all[
                (df_all['Sex'] == sex) & 
                (df_all['Age_Group'] == age_group)
            ].copy()
            
            if len(subset) == 0:
                continue
            
            # Calculate z-scores for betweenness
            bet_mean = subset['Betweenness'].mean()
            bet_std = subset['Betweenness'].std()
            if bet_std > 0:
                subset['z_betweenness'] = (subset['Betweenness'] - bet_mean) / bet_std
            else:
                subset['z_betweenness'] = 0
            
            # Calculate z-scores for mortality
            mort_mean = subset['Mortality'].mean()
            mort_std = subset['Mortality'].std()
            if mort_std > 0:
                subset['z_mortality'] = (subset['Mortality'] - mort_mean) / mort_std
            else:
                subset['z_mortality'] = 0
            
            # Calculate z-score product
            subset['z_product'] = subset['z_betweenness'] * subset['z_mortality']
            
            # Calculate geometric mean for ranking
            subset['z_geom_mean'] = np.where(
                (subset['z_betweenness'] > 0) & (subset['z_mortality'] > 0),
                np.sqrt(subset['z_betweenness'] * subset['z_mortality']),
                0
            )
            
            # Filter: positive z-scores and top X% of product
            threshold_percentile = 100 - top_percent
            z_threshold = subset['z_product'].quantile(threshold_percentile / 100)
            
            high_subset = subset[
                (subset['z_betweenness'] > 0) &
                (subset['z_mortality'] > 0) &
                (subset['z_product'] >= z_threshold)
            ].copy()
            
            if len(high_subset) > 0:
                high_nodes.append(high_subset)
    
    df_high = pd.concat(high_nodes, ignore_index=True) if len(high_nodes) > 0 else pd.DataFrame()
    
    print(f"Found {len(df_high)} nodes with high mortality & betweenness")
    
    return df_high

def find_intersection(df_outliers, df_high_mort_bet):
    """Find intersection of degree outliers and high mortality/betweenness nodes"""
    
    print("\n" + "="*80)
    print("FINDING INTERSECTION")
    print("="*80)
    
    # Create unique identifiers
    df_outliers['node_id'] = df_outliers['Sex'] + '_' + df_outliers['Age_Group'].astype(str) + '_' + df_outliers['ICD_Code']
    df_high_mort_bet['node_id'] = df_high_mort_bet['Sex'] + '_' + df_high_mort_bet['Age_Group'].astype(str) + '_' + df_high_mort_bet['ICD_Code']
    
    # Find intersection
    intersection_ids = set(df_outliers['node_id']) & set(df_high_mort_bet['node_id'])
    
    print(f"\nDegree outliers (high): {len(df_outliers)}")
    print(f"High mortality + betweenness (Z-score): {len(df_high_mort_bet)}")
    print(f"Intersection: {len(intersection_ids)}")
    
    # Get full data for intersection
    df_intersection = df_high_mort_bet[df_high_mort_bet['node_id'].isin(intersection_ids)].copy()
    
    # Merge with outlier data to get log ratio and prevalence
    outlier_data = df_outliers[['node_id', 'Log_ratio', 'Prevalence']].copy()
    outlier_data = outlier_data.rename(columns={'Log_ratio': 'Log_Ratio'})
    df_intersection = df_intersection.merge(outlier_data, on='node_id', how='left')
    
    # Sort by sex, age, and z_geom_mean
    df_intersection = df_intersection.sort_values(['Sex', 'Age_Group', 'z_geom_mean'], ascending=[True, True, False])
    
    return df_intersection

def generate_latex_table(df):
    """Generate LaTeX table"""
    
    # Map age group
    age_map = {1: '0-9', 2: '10-19', 3: '20-29', 4: '30-39',
               5: '40-49', 6: '50-59', 7: '60-69', 8: '70-79'}
    df['Age_Range'] = df['Age_Group'].map(age_map)
    
    latex = """\\begin{longtable}{llllrrrrr}
\\caption{Critical Nodes: High Degree Outliers with High Mortality and Betweenness (Z-Score Method)} \\label{tab:critical_nodes} \\\\
\\toprule
Sex & Age & ICD & Description & Degree & Betweenness & Mortality & Z-Product & Log Ratio \\\\
\\midrule
\\endfirsthead

\\multicolumn{9}{c}{\\tablename\\ \\thetable\\ -- Continued from previous page} \\\\
\\toprule
Sex & Age & ICD & Description & Degree & Betweenness & Mortality & Z-Product & Log Ratio \\\\
\\midrule
\\endhead

\\midrule
\\multicolumn{9}{r}{Continued on next page} \\\\
\\endfoot

\\bottomrule
\\endlastfoot

"""
    
    for idx, row in df.iterrows():
        # Truncate description
        desc = str(row['Description_Eng'] if 'Description_Eng' in row else row['Description'])[:30]
        desc = desc.replace('&', '\\&').replace('_', '\\_').replace('%', '\\%')
        
        latex += f"{row['Sex']} & {row['Age_Range']} & {row['ICD_Code']} & {desc} & "
        latex += f"{row['Degree']} & {row['Betweenness']:.5f} & "
        latex += f"{row['Mortality']:.4f} & {row['z_product']:.3f} & {row['Log_Ratio']:.2f} \\\\\n"
        
        # Add midrule after age group change
        if idx < len(df) - 1:
            next_row = df.iloc[idx + 1]
            if (row['Sex'] != next_row['Sex']) or (row['Age_Group'] != next_row['Age_Group']):
                latex += "\\midrule\n"
    
    latex += """\\end{longtable}
"""
    
    return latex

def print_summary(df):
    """Print summary statistics"""
    
    print("\n" + "="*80)
    print("SUMMARY: CRITICAL NODES (INTERSECTION)")
    print("="*80)
    
    print(f"\nTotal critical nodes: {len(df)}")
    
    age_map = {1: '0-9', 2: '10-19', 3: '20-29', 4: '30-39',
               5: '40-49', 6: '50-59', 7: '60-69', 8: '70-79'}
    
    for sex in ['Female', 'Male']:
        sex_data = df[df['Sex'] == sex]
        print(f"\n{sex}: {len(sex_data)} nodes")
        
        for age in sorted(sex_data['Age_Group'].unique()):
            age_data = sex_data[sex_data['Age_Group'] == age]
            age_str = age_map[age]
            print(f"  {age_str}: {len(age_data)} nodes")
            
            # Show top 3 by z_geom_mean
            if len(age_data) > 0:
                top3 = age_data.nlargest(3, 'z_geom_mean')
                for _, node in top3.iterrows():
                    print(f"    - {node['ICD_Code']:4} (Z-GeoMean={node['z_geom_mean']:.3f}, "
                          f"Bet={node['Betweenness']:.5f}, Mort={node['Mortality']:.4f}, LogRatio={node['Log_Ratio']:.2f})")

def add_english_descriptions(df):
    """Add English descriptions"""
    
    eng_df = pd.read_csv(DATA_DIR / 'DiagAll_Eng__2_.csv')
    icd_to_eng = dict(zip(eng_df['Code'], eng_df['ShortDescription']))
    
    df['Description_Eng'] = df['ICD_Code'].map(icd_to_eng)
    df['Description_Eng'] = df['Description_Eng'].fillna(df['Description'])
    
    return df

def main():
    """Main execution"""
    
    print("="*80)
    print("INTERSECTION ANALYSIS: DEGREE OUTLIERS × HIGH MORTALITY/BETWEENNESS")
    print("="*80)
    print("\nUsing Z-score method (manuscript methodology)")
    
    # Load degree outliers
    print("\n1. Loading degree outliers...")
    df_outliers = load_degree_outliers()
    
    if df_outliers is None:
        return
    
    # Calculate high mortality/betweenness nodes using Z-score
    print("\n2. Calculating high mortality & betweenness nodes (Z-score method)...")
    df_high_mort_bet = calculate_high_mortality_betweenness_nodes_zscore(top_percent=40)
    
    if len(df_high_mort_bet) == 0:
        print("\nNo high mortality/betweenness nodes found!")
        return
    
    # Find intersection
    print("\n3. Finding intersection...")
    df_intersection = find_intersection(df_outliers, df_high_mort_bet)
    
    if len(df_intersection) == 0:
        print("\nNo intersection found!")
        return
    
    # Add English descriptions
    print("\n4. Adding English descriptions...")
    df_intersection = add_english_descriptions(df_intersection)
    
    # Generate LaTeX table
    print("\n5. Generating LaTeX table...")
    latex = generate_latex_table(df_intersection)
    
    # Save outputs
    print("\n6. Saving outputs...")
    
    tex_file = OUTPUT_DIR / 'critical_nodes_intersection_ZSCORE.tex'
    with open(tex_file, 'w') as f:
        f.write(latex)
    print(f"✓ LaTeX table saved to: {tex_file}")
    
    csv_file = OUTPUT_DIR / 'critical_nodes_intersection_ZSCORE.csv'
    df_intersection.to_csv(csv_file, index=False)
    print(f"✓ Data CSV saved to: {csv_file}")
    
    # Print summary
    print_summary(df_intersection)
    
    print("\n" + "="*80)
    print("✓ ANALYSIS COMPLETE")
    print("="*80)

if __name__ == '__main__':
    main()

INTERSECTION ANALYSIS: DEGREE OUTLIERS × HIGH MORTALITY/BETWEENNESS

Using Z-score method (manuscript methodology)

1. Loading degree outliers...
Loaded 777 high degree outliers (95th percentile)

2. Calculating high mortality & betweenness nodes (Z-score method)...

Z-SCORE METHOD (Manuscript approach):
  - Computing z(betweenness) × z(mortality)
  - Selecting top 40% of z-score products
  - Only considering positive z-scores

  Processing Female age 1...
  Processing Female age 2...
  Processing Female age 3...
  Processing Female age 4...
  Processing Female age 5...
  Processing Female age 6...
  Processing Female age 7...
  Processing Female age 8...
  Processing Male age 1...
  Processing Male age 2...
  Processing Male age 3...
  Processing Male age 4...
  Processing Male age 5...
  Processing Male age 6...
  Processing Male age 7...
  Processing Male age 8...
Found 98 nodes with high mortality & betweenness

3. Finding intersection...

FINDING INTERSECTION

Degree outliers (hig

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
