In [None]:
from pathlib import Path

import pandas as pd


# Read the WHO indicator extract into `df` (all indicators; nothing is filtered at load time)
df = pd.read_csv(filepath_or_buffer='../data/who_obesity_data.csv')
# Trailing expression so the notebook displays (n_rows, n_columns)
df.shape

In [None]:
# Frequency tables (missing values included) for the two dimension columns
# and for the columns that name each dimension's type.
section_rule = "\n" + "="*50 + "\n"
reports = (
    ("Dim1 (Sex) Value counts:", 'Dim1'),
    ("Dim2 (Categories) - Value counts:", 'Dim2'),
    ("Dim1Type values:", 'Dim1Type'),
    ("Dim2Type values:", 'Dim2Type'),
)

for position, (caption, column) in enumerate(reports):
    if position > 0:
        # A rule goes between consecutive reports, never after the last one
        print(section_rule)
    print(caption)
    print(df[column].value_counts(dropna=False))

In [None]:

# For each dimension slot, list every non-missing dimension type together with
# the ten most frequent values recorded under that type.
banner = "=" * 60
dimension_slots = (
    ("DIM1 TYPES AND THEIR VALUES", 'Dim1Type', 'Dim1'),
    ("DIM2 TYPES AND THEIR VALUES", 'Dim2Type', 'Dim2'),
)

for slot_number, (heading, type_col, value_col) in enumerate(dimension_slots):
    if slot_number:
        # Vertical gap between the first report and the second
        print("\n\n")
    print(banner)
    print(heading)
    print(banner)

    for dim_type in df[type_col].dropna().unique():
        rows_of_type = df[type_col] == dim_type
        dim_values = df.loc[rows_of_type, value_col].value_counts().head(10)
        print(f"\n{dim_type}:")
        print(dim_values)
        print("-" * 40)

In [None]:
# Build the core indicator list:
#   1. keep only aggregate ("total") rows for every dimension type listed below,
#   2. rank indicators by the number of distinct SpatialDim values they cover,
#   3. export the top non-BMI indicators plus the single core BMI indicator.

# Settings for this step
CORE_INDICATOR_CODE = 'NCD_BMI_30A'  # the only BMI indicator kept in the export
TOP_N_INDICATORS = 200  # how many non-BMI indicators to keep
INDICATOR_LIST_CSV = Path('../data/metadata/who_indicators_obesity_core.csv')

# Define aggregate/total values for each dimension type.
# Every entry is checked against both Dim1 and Dim2; the section headings only
# indicate where each type was expected to appear.
aggregate_filters = {
    # Dim1 aggregates
    'SEX': ['SEX_BTSX'],  # Both sexes
    'RESIDENCEAREATYPE': ['RESIDENCEAREATYPE_TOTL'],  # Total (urban + rural)
    'AGEGROUP': ['AGEGROUP_YEARSALL'],  # All ages
    'WEALTHQUINTILE': ['WEALTHQUINTILE_TOTL'],  # Total wealth
    'EDUCATIONLEVEL': ['EDUCATIONLEVEL_TOTL'],  # Total education
    'SEVERITY': ['SEVERITY_TOTAL'],  # Total severity
    'HOUSEHOLDWEALTH': ['HOUSEHOLDWEALTH_TOTL'],  # Total wealth
    'WEALTHTERCILE': ['WEALTHTERCILE_TOTL'],  # Total wealth
    'ALCOHOLTYPE': ['ALCOHOLTYPE_SA_TOTAL'],  # Total alcohol
    'TB_TREATMENTTYPE': ['TB_TREATMENTTYPE_TB_TREATMENTTYPE_TOTL'],  # Total TB treatment
    'CONSUMPTIONTYPE': ['CONSUMPTIONTYPE_CONSUMPTION_TOTAL'],  # Total consumption
    'ASSISTIVETECHPRODUCT': ['ASSISTIVETECHPRODUCT_ASSISTIVETECH_TOTAL'],  # Total assistive tech

    # Dim2 aggregates
    'GHECAUSE': ['GHECAUSE_GHE000000'],  # All causes
    'ENVCAUSE': ['ENVCAUSE_ENVCAUSE000'],  # All environmental causes (likely)
}

# Values accepted for AGEGROUP when it is the Dim2 type (used instead of the
# AGEGROUP list above for the Dim2 pass)
dim2_agegroup_aggregates = ['AGEGROUP_AGEAll', 'AGEGROUP_YEARS18-PLUS']


def _keep_aggregate_rows(frame, type_col, value_col, allowed_by_type):
    """Return the rows of ``frame`` that are not a disaggregated breakdown.

    A row is kept when its ``type_col`` is not a key of ``allowed_by_type``
    (rows with a missing type are kept too), or when its ``value_col`` is one
    of the values listed for that type.

    Parameters
    ----------
    frame : pd.DataFrame
        Data containing ``type_col`` and ``value_col``.
    type_col : str
        Column naming the dimension type (e.g. ``'Dim1Type'``).
    value_col : str
        Column holding the dimension value (e.g. ``'Dim1'``).
    allowed_by_type : dict
        Maps a dimension type to the list of values to keep for it.

    Returns
    -------
    pd.DataFrame
        A new frame; ``frame`` itself is left untouched.
    """
    keep = pd.Series(True, index=frame.index)
    for dim_type, allowed_values in allowed_by_type.items():
        # Keep if not this dimension type, or if it is one of its aggregate values
        keep &= (frame[type_col] != dim_type) | frame[value_col].isin(allowed_values)
    return frame[keep]


# Dim2 uses the same table, except for its own AGEGROUP aggregate codes
dim2_filters = {**aggregate_filters, 'AGEGROUP': dim2_agegroup_aggregates}

# Boolean indexing already returns a new frame, so `df` is never modified and
# no explicit copy is needed.
df_filtered = _keep_aggregate_rows(df, 'Dim1Type', 'Dim1', aggregate_filters)
df_filtered = _keep_aggregate_rows(df_filtered, 'Dim2Type', 'Dim2', dim2_filters)

# Count distinct SpatialDim values (treated as countries here) per indicator,
# highest coverage first
indicator_coverage = (
    df_filtered
    .groupby(['IndicatorCode', 'IndicatorName'])['SpatialDim']
    .nunique()
    .reset_index(name='Country_Count')
    .sort_values('Country_Count', ascending=False)
)

records_before = len(df)
records_after = len(df_filtered)
# Guard the percentage so an empty input reports 0.0% instead of raising
# ZeroDivisionError.
reduction_pct = (
    (records_before - records_after) / records_before * 100 if records_before else 0.0
)

print(f"Total records before filtering: {records_before:,}")
print(f"Total records after filtering: {records_after:,}")
print(f"Reduction: {reduction_pct:.1f}%")
print(f"\nNumber of unique indicators: {indicator_coverage.shape[0]}")
print(f"\nTop {TOP_N_INDICATORS} indicators by country coverage:")

# Remove BMI indicators and add only the core BMI indicator back, as the first row(s)
is_bmi_indicator = indicator_coverage['IndicatorCode'].str.lower().str.contains('bmi')
core_indicator_rows = indicator_coverage[
    indicator_coverage['IndicatorCode'] == CORE_INDICATOR_CODE
]
if core_indicator_rows.empty:
    # Without this message the export would silently lack the core indicator.
    print(
        f"WARNING: indicator {CORE_INDICATOR_CODE} is not present after filtering; "
        "the exported list will not contain it."
    )

top200Indicators = pd.concat(
    [core_indicator_rows, indicator_coverage[~is_bmi_indicator].head(TOP_N_INDICATORS)]
).reset_index(drop=True)

# Lift the row limit only for this display so the whole list is visible
with pd.option_context('display.max_rows', None):
    display(top200Indicators)

# to_csv does not create missing directories, so make sure the target exists
INDICATOR_LIST_CSV.parent.mkdir(parents=True, exist_ok=True)
top200Indicators[['IndicatorCode', 'IndicatorName']].to_csv(INDICATOR_LIST_CSV, index=False)