# Disease NaN Summary Analysis

This notebook processes patient data to count NaN values per disease.
For each disease, a patient is counted as NaN only if ALL entity columns for that disease are NaN.


In [3]:
import pandas as pd
import numpy as np
from collections import defaultdict


In [None]:
def process_disease_nan_summary(input_csv, output_csv, chunksize=10000):
    # First pass: read header to identify disease groups
    print(f"Reading header from {input_csv}...")
    df_sample = pd.read_csv(input_csv, nrows=0)
    columns = df_sample.columns.tolist()
    
    # Group columns by disease ID (first part before the first dot)
    disease_entities = defaultdict(list)
    for col in columns:
        if col != 'Population_ID':
            disease_id = col.split('.')[0]
            disease_entities[disease_id].append(col)
    
    print(f"Found {len(disease_entities)} unique diseases")
    
    # Initialize counters for each disease
    disease_nan_count = {disease: 0 for disease in disease_entities.keys()}
    total_patients = 0
    
    # Process CSV in chunks
    print(f"Processing {input_csv} in chunks of {chunksize} rows...")
    for chunk_num, chunk in enumerate(pd.read_csv(input_csv, chunksize=chunksize)):
        if (chunk_num + 1) % 5 == 0:
            print(f"Processing chunk {chunk_num + 1}... (Patients so far: {total_patients})")
        
        # Count patients in this chunk
        total_patients += len(chunk)
        
        # For each disease, check if all entities are NaN for each patient
        for disease_id, entity_cols in disease_entities.items():
            # Check if all entity columns are NaN for each row
            all_nan_mask = chunk[entity_cols].isna().all(axis=1)
            disease_nan_count[disease_id] += all_nan_mask.sum()
    
    print(f"\nTotal patients processed: {total_patients}")
    
    # Create summary DataFrame
    summary_data = []
    for disease_id in sorted(disease_entities.keys(), key=int):
        nan_count = disease_nan_count[disease_id]
        nan_proportion = nan_count / total_patients if total_patients > 0 else 0
        summary_data.append({
            'Disease': disease_id,
            'NaN_Count': nan_count,
            'NaN_Proportion': nan_proportion
        })
    
    summary_df = pd.DataFrame(summary_data)
    summary_df.to_csv(output_csv, index=False)
    print(f"Summary saved to {output_csv}")
    
    return summary_df


## Process Essential Info


In [5]:
print("=" * 60)
print("Processing Essential Info")
print("=" * 60)

essential_summary = process_disease_nan_summary(
    'population_fieldID_completed_essential_new.csv',
    'nan_summary_essential_new.csv',
    chunksize=10000
)

print("\nFirst 10 rows of essential summary:")
display(essential_summary.head(10))


Processing Essential Info
Reading header from population_fieldID_completed_essential_new.csv...
Found 204 unique diseases
Processing population_fieldID_completed_essential_new.csv in chunks of 10000 rows...
Processing chunk 5... (Patients so far: 40000)
Processing chunk 10... (Patients so far: 90000)

Total patients processed: 115625
Summary saved to nan_summary_essential_new.csv

First 10 rows of essential summary:


Unnamed: 0,Disease,NaN_Count,NaN_Proportion
0,31,0,0.0
1,46,493,0.004264
2,47,453,0.003918
3,48,167,0.001444
4,49,177,0.001531
5,50,221,0.001911
6,51,213,0.001842
7,102,6049,0.052316
8,670,0,0.0
9,680,302,0.002612


## Process Detailed Info


In [6]:
print("\n" + "=" * 60)
print("Processing Detailed Info")
print("=" * 60)

detailed_summary = process_disease_nan_summary(
    'population_fieldID_completed_detailed_new.csv',
    'nan_summary_detailed_new.csv',
    chunksize=10000
)

print("\nFirst 10 rows of detailed summary:")
display(detailed_summary.head(10))



Processing Detailed Info
Reading header from population_fieldID_completed_detailed_new.csv...
Found 50 unique diseases
Processing population_fieldID_completed_detailed_new.csv in chunks of 10000 rows...
Processing chunk 5... (Patients so far: 40000)
Processing chunk 10... (Patients so far: 90000)

Total patients processed: 115625
Summary saved to nan_summary_detailed_new.csv

First 10 rows of detailed summary:


Unnamed: 0,Disease,NaN_Count,NaN_Proportion
0,757,52868,0.457237
1,767,52868,0.457237
2,777,52893,0.457453
3,796,56669,0.49011
4,806,52868,0.457237
5,816,52868,0.457237
6,826,52868,0.457237
7,845,29809,0.257808
8,874,4532,0.039196
9,894,19525,0.168865


## Process Minor Info


In [7]:
print("\n" + "=" * 60)
print("Processing Minor Info")
print("=" * 60)

minor_summary = process_disease_nan_summary(
    'population_fieldID_completed_minor _new.csv',
    'nan_summary_minor_new.csv',
    chunksize=10000
)

print("\nFirst 10 rows of minor summary:")
display(minor_summary.head(10))



Processing Minor Info
Reading header from population_fieldID_completed_minor _new.csv...
Found 41 unique diseases
Processing population_fieldID_completed_minor _new.csv in chunks of 10000 rows...
Processing chunk 5... (Patients so far: 40000)
Processing chunk 10... (Patients so far: 90000)

Total patients processed: 115625
Summary saved to nan_summary_minor_new.csv

First 10 rows of minor summary:


Unnamed: 0,Disease,NaN_Count,NaN_Proportion
0,78,45967,0.397552
1,1100,0,0.0
2,1548,0,0.0
3,1618,25178,0.217756
4,1628,8205,0.070962
5,1677,0,0.0
6,1717,0,0.0
7,1727,0,0.0
8,1737,0,0.0
9,1747,0,0.0


## Summary Statistics


In [9]:
print("\n" + "=" * 60)
print("Summary Statistics")
print("=" * 60)

print(f"\nEssential Info: {len(essential_summary)} diseases")
print(f"Detailed Info: {len(detailed_summary)} diseases")
print(f"Minor Info: {len(minor_summary)} diseases")



Summary Statistics

Essential Info: 204 diseases
Detailed Info: 50 diseases
Minor Info: 41 diseases
