In [9]:
#!/usr/bin/env python3
"""
Complete Standalone Script: Outlier Detection and LaTeX Table Generation
Reads raw data, detects outliers, generates Overleaf-ready tables
No dependency on intermediate output files
"""

import pandas as pd
import numpy as np
import networkx as nx
from pathlib import Path

# Directories (relative paths, so they resolve against the current working directory)
# DATA_DIR: location of the input CSV files read by the loader functions
DATA_DIR = Path('Data/')
# OUTPUT_DIR: destination of the generated .tex, .csv and instruction files
OUTPUT_DIR = Path('outputs/')

def format_prevalence_scientific(val):
    """Format a prevalence value as LaTeX inline-math scientific notation.

    Args:
        val: Prevalence as a number (may be 0 or NaN).

    Returns:
        ``"$< 10^{-5}$"`` when ``val`` is zero, NaN or below ``1e-5``;
        ``"$10^{e}$"`` when the mantissa is, or rounds to, a power of ten;
        otherwise ``"$m \\times 10^{e}$"`` with a one-decimal mantissa ``m``.
    """
    if val == 0 or pd.isna(val) or val < 1e-5:
        return "$< 10^{-5}$"

    exponent = int(np.floor(np.log10(abs(val))))
    mantissa = val / (10 ** exponent)
    if abs(mantissa - 1.0) < 0.01:
        return f"$10^{{{exponent}}}$"

    mantissa_text = f"{mantissa:.1f}"
    if float(mantissa_text) >= 10.0:
        # One-decimal rounding pushed the mantissa up to 10.0 (e.g. 9.97e-3);
        # carry it into the exponent instead of printing "10.0 x 10^{e}".
        return f"$10^{{{exponent + 1}}}$"
    return f"${mantissa_text} \\times 10^{{{exponent}}}$"

def load_network_and_prevalence(gender, age_group):
    """Load one sex/age network and pair each connected node with its prevalence.

    Reads the adjacency matrix for ``gender``/``age_group``, the ICD code
    table and the prevalence table from ``DATA_DIR``. A node is kept when it
    has degree > 0, a row in the ICD table (``diagnose_id == node + 1``) and a
    positive prevalence for year 2014 in the matching sex and age band.

    Args:
        gender: Sex label used in the adjacency file name and compared with
            the ``sex`` column of the prevalence table.
        age_group: Integer 1-8 selecting the age band (1 -> '0-9', ...,
            8 -> '70-79').

    Returns:
        DataFrame with one row per retained node and the columns ``Sex``,
        ``Age_Group``, ``Age_Range``, ``ICD_Code``, ``Degree``,
        ``Prevalence`` and ``Description_GER`` (taken from the ``descr``
        column). The frame has no columns when nothing is retained.

    Raises:
        KeyError: If ``age_group`` is not one of 1-8.
    """
    # Map age group number to string; done first so a bad value fails
    # before any file is read.
    age_dict = {1: '0-9', 2: '10-19', 3: '20-29', 4: '30-39',
                5: '40-49', 6: '50-59', 7: '60-69', 8: '70-79'}
    age_col = age_dict[age_group]

    # Load adjacency matrix and build the graph
    adj_path = DATA_DIR / f'Adj_Matrix_{gender}_ICD_age_{age_group}.csv'
    A = pd.read_csv(adj_path, sep=' ', header=None).values
    G = nx.from_numpy_array(A)

    # Load ICD codes and prevalence data
    icd_df = pd.read_csv(DATA_DIR / 'ICD10_Diagnoses_All.csv')
    prev_df = pd.read_csv(DATA_DIR / 'Prevalence_Sex_Age_Year_ICD.csv')

    # Filter prevalence for 2014 (exact notebook method)
    prev_2014 = prev_df[
        (prev_df['Age_Group'] == age_col) &
        (prev_df['sex'] == gender) &
        (prev_df['year'] == 2014)
    ]
    prevalence_dict = prev_2014.set_index('icd_code')['p'].to_dict()

    # One-time lookup diagnose_id -> (icd_code, descr). setdefault keeps the
    # first row per id, matching the previous ``iloc[0]`` selection, and
    # avoids scanning the whole ICD table once per node.
    icd_lookup = {}
    for diag_id, code, descr in zip(
        icd_df['diagnose_id'], icd_df['icd_code'], icd_df['descr']
    ):
        icd_lookup.setdefault(diag_id, (code, descr))

    results = []
    for node in range(len(A)):
        degree = G.degree(node)
        if degree <= 0:  # Only connected nodes
            continue

        # Node i corresponds to diagnose_id i + 1
        icd_entry = icd_lookup.get(node + 1)
        if icd_entry is None:
            continue
        icd_code, descr = icd_entry

        # Missing prevalence defaults to 0 and is therefore dropped below
        prevalence = prevalence_dict.get(icd_code, 0)
        if prevalence > 0:
            results.append({
                'Sex': gender,
                'Age_Group': f'age_{age_group}',
                'Age_Range': age_col,
                'ICD_Code': icd_code,
                'Degree': degree,
                'Prevalence': prevalence,
                'Description_GER': descr
            })

    return pd.DataFrame(results)

def detect_outliers(df_all):
    """Flag rows whose log10(Degree / Prevalence) lies outside the 5th-95th percentile band.

    The thresholds are computed separately for every ('Female' | 'Male',
    ``Age_Group``) combination present in ``df_all``.

    Note:
        ``df_all`` is modified in place: the columns ``Ratio`` and
        ``Log_ratio`` are added to it.

    Args:
        df_all: DataFrame with at least the columns ``Sex``, ``Age_Group``,
            ``Degree`` and ``Prevalence``.

    Returns:
        A new DataFrame (fresh 0..n-1 index) containing only the outlier
        rows, with the extra columns ``Ratio``, ``Log_ratio`` and
        ``Outlier``. It is empty, but has the same columns, when no row is
        flagged.
    """
    print("Detecting outliers using 5th/95th percentile method...")

    # Calculate ratio and log ratio; non-positive ratios become NaN
    df_all['Ratio'] = df_all['Degree'] / df_all['Prevalence']
    ratio = df_all['Ratio'].astype(float)
    df_all['Log_ratio'] = np.log10(ratio.where(ratio > 0))

    all_outliers = []

    # Process each sex-age group separately
    for sex in ['Female', 'Male']:
        for age_group in df_all['Age_Group'].unique():
            subset = df_all[
                (df_all['Sex'] == sex) &
                (df_all['Age_Group'] == age_group)
            ].copy()

            if subset.empty:
                continue

            # Calculate percentile thresholds
            lower_bound = subset['Log_ratio'].quantile(0.05)
            upper_bound = subset['Log_ratio'].quantile(0.95)

            # Mark outliers (strictly outside the band)
            subset['Outlier'] = (
                (subset['Log_ratio'] < lower_bound) |
                (subset['Log_ratio'] > upper_bound)
            )

            outliers = subset[subset['Outlier']]
            if not outliers.empty:
                all_outliers.append(outliers)

    if not all_outliers:
        # pd.concat([]) raises ValueError; return an empty frame with the
        # expected columns instead so callers can keep going.
        return (
            df_all.iloc[0:0]
            .assign(Outlier=pd.Series(dtype=bool))
            .reset_index(drop=True)
        )

    return pd.concat(all_outliers, ignore_index=True)

def add_english_descriptions(df_outliers):
    """Attach a ``Description_Eng`` column to the outlier table.

    English texts are looked up by ICD code in ``DiagAll_Eng__2_.csv``
    (columns ``Code`` and ``ShortDescription``) under ``DATA_DIR``. Codes
    without a match fall back to the row's ``Description_GER`` value.

    The column is written into ``df_outliers`` itself, and that same object
    is returned.
    """
    print("Adding English descriptions...")

    # Build the code -> English text lookup from the translation file
    translations = pd.read_csv(DATA_DIR / 'DiagAll_Eng__2_.csv')
    english_by_code = {
        code: text
        for code, text in zip(translations['Code'], translations['ShortDescription'])
    }

    # Translate, then patch the gaps with the German text
    df_outliers['Description_Eng'] = df_outliers['ICD_Code'].map(english_by_code)
    german_fallback = df_outliers['Description_GER']
    df_outliers['Description_Eng'] = df_outliers['Description_Eng'].fillna(german_fallback)

    return df_outliers

def select_top_outliers(df_outliers, n_high=20, n_low=10):
    """Select the top high- and low-ratio outliers per sex and age range.

    Within each ('Female' | 'Male', ``Age_Range``) group the rows are split
    at the group's median ``Log_ratio``: rows above the median compete for
    the ``n_high`` largest values ('high_degree'), the remaining rows for
    the ``n_low`` smallest values ('low_degree').

    Args:
        df_outliers: DataFrame with at least the columns ``Sex``,
            ``Age_Range`` and ``Log_ratio``.
        n_high: Maximum number of high-ratio rows kept per group.
        n_low: Maximum number of low-ratio rows kept per group.

    Returns:
        DataFrame sorted by sex, age band, type (high before low) and
        descending ``Log_ratio``, with the extra columns ``outlier_type``,
        ``age_num`` and ``type_order``. The index is a fresh 0..n-1 range so
        that labels and positions coincide. The frame is empty when no
        group is found.
    """
    print(f"Selecting top {n_high} high and top {n_low} low outliers per group...")

    results = []

    for sex in ['Female', 'Male']:
        sex_data = df_outliers[df_outliers['Sex'] == sex]

        for age_range in sorted(sex_data['Age_Range'].unique()):
            age_data = sex_data[sex_data['Age_Range'] == age_range]

            if age_data.empty:
                continue

            # Split by median to get high/low
            age_median = age_data['Log_ratio'].median()

            # Top N high degree outliers
            high_degree = (
                age_data[age_data['Log_ratio'] > age_median]
                .nlargest(n_high, 'Log_ratio')
                .copy()
            )
            high_degree['outlier_type'] = 'high_degree'

            # Top N low degree outliers
            low_degree = (
                age_data[age_data['Log_ratio'] <= age_median]
                .nsmallest(n_low, 'Log_ratio')
                .copy()
            )
            low_degree['outlier_type'] = 'low_degree'

            results.append(high_degree)
            results.append(low_degree)

    if not results:
        # pd.concat([]) raises ValueError; hand back an empty table with the
        # columns callers expect.
        return (
            df_outliers.iloc[0:0]
            .assign(
                outlier_type=pd.Series(dtype=object),
                age_num=pd.Series(dtype=float),
                type_order=pd.Series(dtype=float),
            )
            .reset_index(drop=True)
        )

    # Combine and sort
    table_data = pd.concat(results, ignore_index=True)

    table_data['age_num'] = table_data['Age_Range'].map({
        '0-9': 1, '10-19': 2, '20-29': 3, '30-39': 4,
        '40-49': 5, '50-59': 6, '60-69': 7, '70-79': 8
    })
    table_data['type_order'] = table_data['outlier_type'].map({
        'high_degree': 0, 'low_degree': 1
    })

    # Sorting permutes the index labels (the low block is reversed), so
    # reset them: consumers that pair labels with positions would otherwise
    # look at the wrong neighbouring row.
    table_data = table_data.sort_values(
        ['Sex', 'age_num', 'type_order', 'Log_ratio'],
        ascending=[True, True, True, False]
    ).reset_index(drop=True)

    return table_data

def generate_latex_table(table_data):
    """Render the outlier table as a LaTeX ``longtable``.

    Args:
        table_data: DataFrame with the columns ``Sex``, ``Age_Range``,
            ``ICD_Code``, ``Degree``, ``Prevalence``, ``Log_ratio`` and
            ``outlier_type``, already in display order.

    Returns:
        The complete ``longtable`` environment as a string. A ``\\midrule``
        is emitted between consecutive rows whose sex or age range differs.
    """
    print("Generating LaTeX longtable...")

    header = """\\begin{longtable}{lllrccc}
\\caption{Outlier Diseases by Age Group: Top 20 High-Degree and Top 10 Low-Degree Outliers} \\label{tab:outliers_extended} \\\\
\\toprule
Sex & Age & ICD Code & Degree & Prevalence & Log Ratio & Type \\\\
\\midrule
\\endfirsthead

\\multicolumn{7}{c}{\\tablename\\ \\thetable\\ -- Continued from previous page} \\\\
\\toprule
Sex & Age & ICD Code & Degree & Prevalence & Log Ratio & Type \\\\
\\midrule
\\endhead

\\midrule
\\multicolumn{7}{r}{Continued on next page} \\\\
\\endfoot

\\bottomrule
\\endlastfoot

"""

    parts = [header]
    last_pos = len(table_data) - 1

    # Enumerate positions explicitly: the index labels yielded by iterrows()
    # are not guaranteed to match row positions (e.g. after sorting), and
    # the look-ahead below is positional.
    for pos, (_, row) in enumerate(table_data.iterrows()):
        prev_formatted = format_prevalence_scientific(row['Prevalence'])
        type_label = 'High' if row['outlier_type'] == 'high_degree' else 'Low'

        parts.append(
            f"{row['Sex']} & {row['Age_Range']} & {row['ICD_Code']} & {row['Degree']} & {prev_formatted} & {row['Log_ratio']:.2f} & {type_label} \\\\\n"
        )

        # Add midrule after each sex / age group change
        if pos < last_pos:
            next_row = table_data.iloc[pos + 1]
            if (row['Sex'] != next_row['Sex']) or (row['Age_Range'] != next_row['Age_Range']):
                parts.append("\\midrule\n")

    parts.append("""\\end{longtable}
""")

    return "".join(parts)

def generate_icd_legend(table_data):
    """Render a two-column LaTeX ``longtable`` mapping ICD codes to descriptions.

    Args:
        table_data: DataFrame with the columns ``ICD_Code`` and
            ``Description_Eng``.

    Returns:
        The complete ``longtable`` environment as a string, one row per
        distinct ICD code in sorted order. Descriptions are escaped for
        LaTeX; a missing description is printed as ``Unknown``.
    """
    print("Generating ICD legend...")

    unique_codes = sorted(table_data['ICD_Code'].unique())

    header = """\\begin{longtable}{ll}
\\caption{ICD-10 Code Descriptions for Outlier Diseases} \\label{tab:icd_legend} \\\\
\\toprule
ICD Code & Description \\\\
\\midrule
\\endfirsthead

\\multicolumn{2}{c}{\\tablename\\ \\thetable\\ -- Continued from previous page} \\\\
\\toprule
ICD Code & Description \\\\
\\midrule
\\endhead

\\midrule
\\multicolumn{2}{r}{Continued on next page} \\\\
\\endfoot

\\bottomrule
\\endlastfoot

"""

    # Single-pass escaping of every LaTeX special character. A translate
    # table cannot double-escape, unlike chained replace() calls.
    latex_escapes = str.maketrans({
        '\\': '\\textbackslash{}',
        '&': '\\&',
        '%': '\\%',
        '$': '\\$',
        '#': '\\#',
        '_': '\\_',
        '{': '\\{',
        '}': '\\}',
        '~': '\\textasciitilde{}',
        '^': '\\textasciicircum{}',
    })

    # Create ICD to description mapping (last occurrence of a code wins)
    desc_map = dict(zip(table_data['ICD_Code'], table_data['Description_Eng']))

    parts = [header]
    for code in unique_codes:
        desc = desc_map.get(code, 'Unknown')
        if pd.isna(desc):
            # Avoid printing a literal "nan"/"None" in the legend
            desc = 'Unknown'
        parts.append(f"{code} & {str(desc).translate(latex_escapes)} \\\\\n")

    parts.append("""\\end{longtable}
""")

    return "".join(parts)

def print_statistics(table_data, df_all_outliers):
    """Write a per-sex, per-age-range breakdown of the final table to stdout.

    Args:
        table_data: Final table with the columns ``Sex``, ``Age_Range`` and
            ``outlier_type``.
        df_all_outliers: Collection of all detected outliers; only its
            length is reported.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("SUMMARY STATISTICS")
    print(banner)

    print(f"\nTotal outliers detected: {len(df_all_outliers)}")
    print(f"Rows in final table: {len(table_data)}")

    # One section per sex, one line per age range, then a totals line
    for sex in ('Female', 'Male'):
        print(f"\n{sex}:")
        rows_for_sex = table_data.loc[table_data['Sex'] == sex]
        high_total = 0
        low_total = 0

        for age_label in sorted(rows_for_sex['Age_Range'].unique()):
            group = rows_for_sex.loc[rows_for_sex['Age_Range'] == age_label]
            n_high = int((group['outlier_type'] == 'high_degree').sum())
            n_low = int((group['outlier_type'] == 'low_degree').sum())
            high_total += n_high
            low_total += n_low
            print(f"  {age_label:5}: {n_high:2} high + {n_low:2} low = {len(group):2} total")

        print(f"  {'TOTAL':5}: {high_total:2} high + {low_total:2} low = {len(rows_for_sex):2} total")

def main():
    """Run the whole pipeline: load, detect, annotate, select, render, save.

    Loads all sixteen sex/age networks, detects outliers, adds English
    descriptions, selects the rows for the table and writes four files into
    ``OUTPUT_DIR`` (main table, ICD legend, CSV and usage instructions).
    ``OUTPUT_DIR`` is created if it does not exist, and the text files are
    written as UTF-8.
    """
    print("\n" + "="*80)
    print("STANDALONE OUTLIER DETECTION AND TABLE GENERATION")
    print("="*80)

    # Step 1: Load all data
    print("\nStep 1: Loading network and prevalence data...")
    all_data = []

    for gender in ['Female', 'Male']:
        for age_group in range(1, 9):
            print(f"  Loading {gender} age {age_group}...")
            df = load_network_and_prevalence(gender, age_group)
            all_data.append(df)

    df_all = pd.concat(all_data, ignore_index=True)
    print(f"Total diseases with degree > 0 and prevalence > 0: {len(df_all)}")

    # Step 2: Detect outliers
    print("\nStep 2: Detecting outliers...")
    df_outliers = detect_outliers(df_all)
    print(f"Total outliers detected: {len(df_outliers)}")

    # Step 3: Add English descriptions
    print("\nStep 3: Adding English descriptions...")
    df_outliers = add_english_descriptions(df_outliers)

    # Step 4: Select top outliers for table
    print("\nStep 4: Selecting top outliers for table...")
    table_data = select_top_outliers(df_outliers, n_high=20, n_low=10)
    print(f"Rows in final table: {len(table_data)}")

    # Step 5: Generate LaTeX tables
    print("\nStep 5: Generating LaTeX tables...")
    latex_table = generate_latex_table(table_data)
    latex_legend = generate_icd_legend(table_data)

    # Step 6: Save outputs
    print("\nStep 6: Saving outputs...")

    # The output directory may not exist on a fresh checkout; without this
    # the first open() below fails with FileNotFoundError.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Save main table (explicit UTF-8 so the result does not depend on the
    # platform's default encoding)
    table_file = OUTPUT_DIR / 'outliers_table_FINAL.tex'
    with open(table_file, 'w', encoding='utf-8') as f:
        f.write(latex_table)
    print(f"✓ Main table saved to: {table_file}")

    # Save legend (descriptions may contain non-ASCII characters)
    legend_file = OUTPUT_DIR / 'icd_legend_FINAL.tex'
    with open(legend_file, 'w', encoding='utf-8') as f:
        f.write(latex_legend)
    print(f"✓ ICD legend saved to: {legend_file}")

    # Save CSV for reference
    csv_file = OUTPUT_DIR / 'outliers_data_FINAL.csv'
    table_data.to_csv(csv_file, index=False)
    print(f"✓ Data CSV saved to: {csv_file}")

    # Step 7: Print statistics
    print_statistics(table_data, df_outliers)

    # Step 8: Generate usage instructions
    instructions = """# Overleaf Usage Instructions

## Files Generated

1. **outliers_table_FINAL.tex** - Main outlier table
2. **icd_legend_FINAL.tex** - ICD code descriptions
3. **outliers_data_FINAL.csv** - Raw data (for reference)

## How to Use in Overleaf

### Step 1: Add to preamble
```latex
\\usepackage{longtable}
\\usepackage{booktabs}
```

### Step 2: Include tables
```latex
% Main outlier table
\\input{outliers_table_FINAL.tex}

% ICD legend (in appendix)
\\input{icd_legend_FINAL.tex}
```

## Table Details

- **Top 20 high-degree** outliers per sex-age group
- **Top 10 low-degree** outliers per sex-age group
- Automatic page breaks with repeated headers
- Horizontal lines separate age groups

## No wrapper needed!
longtable handles everything - don't use \\begin{table}
"""

    inst_file = OUTPUT_DIR / 'USAGE_INSTRUCTIONS.txt'
    with open(inst_file, 'w', encoding='utf-8') as f:
        f.write(instructions)
    print(f"✓ Instructions saved to: {inst_file}")

    print("\n" + "="*80)
    print("✓ ALL DONE!")
    print("="*80)
    print("\nFiles ready for download:")
    print("  1. outliers_table_FINAL.tex")
    print("  2. icd_legend_FINAL.tex")
    print("  3. outliers_data_FINAL.csv")
    print("  4. USAGE_INSTRUCTIONS.txt")
    print("\n" + "="*80)

# Run the full pipeline only when executed as a script, not when imported
if __name__ == '__main__':
    main()


STANDALONE OUTLIER DETECTION AND TABLE GENERATION

Step 1: Loading network and prevalence data...
  Loading Female age 1...
  Loading Female age 2...
  Loading Female age 3...
  Loading Female age 4...
  Loading Female age 5...
  Loading Female age 6...
  Loading Female age 7...
  Loading Female age 8...
  Loading Male age 1...
  Loading Male age 2...
  Loading Male age 3...
  Loading Male age 4...
  Loading Male age 5...
  Loading Male age 6...
  Loading Male age 7...
  Loading Male age 8...
Total diseases with degree > 0 and prevalence > 0: 3880

Step 2: Detecting outliers...
Detecting outliers using 5th/95th percentile method...
Total outliers detected: 403

Step 3: Adding English descriptions...
Adding English descriptions...

Step 4: Selecting top outliers for table...
Selecting top 20 high and top 10 low outliers per group...
Rows in final table: 340

Step 5: Generating LaTeX tables...
Generating LaTeX longtable...
Generating ICD legend...

Step 6: Saving outputs...
✓ Main table