# Feature Categorization & Outcome Engineering


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import re

# Silence every warning category for the rest of the session so library
# deprecation notices do not clutter the notebook output.
warnings.filterwarnings('ignore')

# Global plot appearance: matplotlib style sheet plus seaborn colour palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Notebook banner, emitted as a single write.
print("\n".join([
    "="*80,
    "FEATURE CATEGORIZATION & OUTCOME VARIABLE ENGINEERING",
    "Steps 2 & 3: Variable Grouping and Target Engineering",
    "="*80,
]))


In [None]:
def load_data(
    data_path='Data/raw/IMPUTED_DATA_WITH REDUCED_columns_21_09_2025.xlsx',
    dict_path='Data/external/PMNS F0 Mother, FAther & F1 Child Serial Final Data Dictionary (GCRF - MAS3)_20JULY2021-DEscr.xlsx',
):
    """Load the main dataset and the data dictionary from Excel files.

    Args:
        data_path: Path of the main dataset workbook. Defaults to the
            location this notebook has always read from.
        dict_path: Path of the data-dictionary workbook. Defaults to the
            location this notebook has always read from.

    Returns:
        A ``(df, dict_df)`` tuple of DataFrames, or ``(None, None)`` when
        either workbook cannot be read. The failure is reported on stdout
        rather than raised, so callers must check for ``None``.
    """
    print("\n📊 STEP 1: LOADING DATA")
    print("-" * 50)

    try:
        # Load the main dataset
        df = pd.read_excel(data_path)
        print("✅ Main dataset loaded successfully!")

        # Load the data dictionary
        dict_df = pd.read_excel(dict_path)
        print("✅ Data dictionary loaded successfully!")
    except Exception as e:
        # Deliberate best-effort boundary: any read problem (missing file,
        # missing Excel engine, corrupt workbook) is reported and signalled
        # to the caller through the (None, None) return value.
        print(f"❌ Error loading data: {e}")
        return None, None

    return df, dict_df


In [None]:
def examine_data_dictionary(dict_df):
    """Print an overview of the data dictionary and hand it back unchanged.

    The overview consists of the frame's shape, its column names, its first
    five rows and, for every object-dtype column, up to ten non-null sample
    values.
    """
    print("\n📊 STEP 2: EXAMINING DATA DICTIONARY")
    print("-" * 50)

    column_names = list(dict_df.columns)
    print(f"📏 Dictionary shape: {dict_df.shape}")
    print(f"📋 Dictionary columns: {column_names}")

    # Preview of the leading rows
    print("\n👀 First 5 rows of dictionary:")
    print(dict_df.head())

    # Sample the text-like columns to reveal naming patterns
    print("\n🔍 Variable name patterns in dictionary:")
    text_columns = [name for name in dict_df.columns
                    if dict_df[name].dtype == 'object']
    for name in text_columns:
        preview = dict_df[name].dropna().iloc[:10].tolist()
        print(f"  {name}: {preview}")

    return dict_df


In [None]:
def create_domain_mapping():
    """Build the prefix-based domain mapping used to group variables.

    Returns:
        A dict keyed by domain name. Each value is a dict with:
        ``'prefixes'`` (list of lower-case column-name prefixes),
        ``'description'`` (free text) and ``'examples'`` (list of free text).

    Note:
        Consumers in this file assign a column to the FIRST domain (in
        insertion order) that has a matching prefix, so a prefix must not be
        listed under two domains. ``'f0_f_'`` used to appear under both the
        maternal-clinical and the paternal domain, which made the paternal
        entry unreachable; it is now listed under the paternal domain only.
        NOTE(review): this assumes ``f0_f_`` denotes father variables, as the
        paternal domain's own description suggests - confirm against the
        data dictionary.
    """
    print("\n📊 STEP 3: CREATING DOMAIN MAPPING")
    print("-" * 50)

    # Define domain patterns based on the requirements
    domain_patterns = {
        'Maternal_socio_demographic': {
            'prefixes': ['f0_edu_', 'f0_occ_', 'f0_caste_', 'f0_m_age', 'f0_m_marital', 'f0_m_religion'],
            'description': 'Mother\'s education, occupation, caste, age, marital status, religion',
            'examples': ['mother\'s education', 'caste', 'occupation'],
        },
        'Maternal_clinical_biomarkers': {
            # Only mother-specific prefixes here; 'f0_f_' belongs to the
            # paternal domain below.
            'prefixes': ['f0_m_hem', 'f0_m_gluc', 'f0_m_vit', 'f0_m_fol', 'f0_m_ferr'],
            'description': 'Hematocrit, glucose, Vit B12, folate, ferritin, clinical measurements',
            'examples': ['hematocrit', 'glucose', 'Vit B12', 'folate', 'ferritin'],
        },
        'Paternal_socio_demographic_biomarkers': {
            'prefixes': ['f0_p_', 'f0_f_'],
            'description': 'Father\'s BMI, platelets, socio-demographic factors',
            'examples': ['father\'s BMI', 'platelets'],
        },
        'Household_environment': {
            'prefixes': ['f0_type_fly', 'f0_fly_size', 'f0_house', 'f0_sanitation', 'f0_water'],
            'description': 'Family type, size, household characteristics, environment',
            'examples': ['family type', 'family size'],
        },
        'Child_outcomes': {
            'prefixes': ['f1_bw', 'f1_', 'birthweight', 'gestational_age'],
            'description': 'Birthweight, derived LBW flag, child health outcomes',
            'examples': ['birthweight', 'LBW flag'],
        },
        'Maternal_pregnancy': {
            'prefixes': ['f0_m_preg', 'f0_m_gest', 'f0_m_antenatal', 'f0_m_delivery'],
            'description': 'Pregnancy-related variables, antenatal care, delivery',
            'examples': ['pregnancy complications', 'antenatal care'],
        },
        'Maternal_anthropometry': {
            'prefixes': ['f0_m_bmi', 'f0_m_height', 'f0_m_weight', 'f0_m_waist', 'f0_m_hip'],
            'description': 'Maternal body measurements, anthropometry',
            'examples': ['BMI', 'height', 'weight', 'waist circumference'],
        },
    }

    print("📋 Domain patterns defined:")
    for domain, info in domain_patterns.items():
        print(f"  {domain}: {info['description']}")
        print(f"    Prefixes: {info['prefixes']}")

    return domain_patterns


In [None]:
def _match_domain(col_lower, domain_patterns):
    """Return the first domain whose prefix list matches, else 'Uncategorized'."""
    for domain_name, info in domain_patterns.items():
        prefixes = tuple(prefix.lower() for prefix in info['prefixes'])
        if prefixes and col_lower.startswith(prefixes):
            return domain_name
    return 'Uncategorized'


def _infer_variable_type(series, n_unique):
    """Classify a column as 'Categorical' or 'Continuous' from dtype/cardinality."""
    dtype = series.dtype
    # Booleans count as numeric in pandas but are categorical for our purposes.
    if pd.api.types.is_bool_dtype(dtype) or not pd.api.types.is_numeric_dtype(dtype):
        return 'Categorical'
    # Integer codes with few distinct values are treated as categories.
    if pd.api.types.is_integer_dtype(dtype) and n_unique <= 10:
        return 'Categorical'
    return 'Continuous'


def _infer_unit(col_lower):
    """Guess a measurement unit from keywords in the lower-cased column name."""
    # Order matters: the first rule with a matching keyword wins. 'gest' is
    # checked before 'age' so that e.g. 'gestational_age' maps to weeks and
    # not to years.
    # NOTE(review): every '*weight*' column is labelled grams, which fits
    # birthweight; maternal weight columns may use another unit - confirm.
    unit_rules = (
        (('bmi',), 'kg/m²'),
        (('weight', 'bw'), 'grams'),
        (('height',), 'cm'),
        (('gest',), 'weeks'),
        (('age',), 'years'),
        (('gluc', 'glucose'), 'mg/dL'),
        (('hem', 'hematocrit'), '%'),
        (('vit', 'folate', 'ferritin'), 'ng/mL'),
    )
    for keywords, unit in unit_rules:
        if any(keyword in col_lower for keyword in keywords):
            return unit
    return 'count/score'


def categorize_variables(df, domain_patterns):
    """Categorize every column of ``df`` by domain, type and unit.

    Args:
        df: Dataset whose columns are to be described.
        domain_patterns: Mapping of domain name to a dict that has a
            ``'prefixes'`` list. A column is assigned to the first domain
            (in mapping order) with a prefix that the lower-cased column
            name starts with.

    Returns:
        DataFrame with one row per column of ``df`` and the columns
        ``Variable, Domain, Type, Unit, Missing_Count, Missing_Percent,
        Unique_Values, Data_Type``. The frame is empty (but has these
        columns) when ``df`` has no columns.
    """
    print("\n📊 STEP 4: CATEGORIZING VARIABLES")
    print("-" * 50)

    output_columns = ['Variable', 'Domain', 'Type', 'Unit', 'Missing_Count',
                      'Missing_Percent', 'Unique_Values', 'Data_Type']
    n_rows = len(df)
    variable_categories = []

    for col in df.columns:
        # Column labels read from Excel are not guaranteed to be strings.
        col_lower = str(col).lower()
        series = df[col]
        missing_count = int(series.isnull().sum())
        unique_values = series.nunique()

        variable_categories.append({
            'Variable': col,
            'Domain': _match_domain(col_lower, domain_patterns),
            'Type': _infer_variable_type(series, unique_values),
            'Unit': _infer_unit(col_lower),
            'Missing_Count': missing_count,
            # Guard the zero-row case instead of dividing by zero.
            'Missing_Percent': (missing_count / n_rows) * 100 if n_rows else 0.0,
            'Unique_Values': unique_values,
            'Data_Type': str(series.dtype),
        })

    # Explicit columns keep the summary below working for an empty frame.
    categorization_df = pd.DataFrame(variable_categories, columns=output_columns)

    # Display summary
    print(f"📊 Total variables categorized: {len(categorization_df)}")
    print(f"📊 Domain distribution:")
    for domain, count in categorization_df['Domain'].value_counts().items():
        print(f"  {domain}: {count} variables")

    print(f"📊 Type distribution:")
    for var_type, count in categorization_df['Type'].value_counts().items():
        print(f"  {var_type}: {count} variables")

    return categorization_df


In [None]:
def create_grouping_table(categorization_df):
    """Write the variable grouping table to CSV.

    Args:
        categorization_df: DataFrame containing at least the columns
            ``Variable``, ``Domain``, ``Type`` and ``Unit``.

    Returns:
        ``pathlib.Path`` of the written file,
        ``Data/processed/variable_grouping_table.csv`` (relative to the
        current working directory).
    """
    print("\n📊 STEP 5: CREATING VARIABLE GROUPING TABLE")
    print("-" * 50)

    # Create the output directory; parents=True so a missing 'Data' folder
    # does not abort the run.
    output_dir = Path('Data/processed')
    output_dir.mkdir(parents=True, exist_ok=True)

    # Select relevant columns for the grouping table
    grouping_table = categorization_df[['Variable', 'Domain', 'Type', 'Unit']].copy()

    # Save to CSV
    output_file = output_dir / 'variable_grouping_table.csv'
    grouping_table.to_csv(output_file, index=False)
    print(f"✅ Variable grouping table saved: {output_file}")

    # Display sample
    print("\n👀 Sample of grouping table:")
    print(grouping_table.head(10))

    return output_file


In [None]:
def examine_birthweight_variables(df):
    """Locate birthweight, gestational-age and sex columns and profile ``f1_bw``.

    Column detection is name based (case-insensitive):

    * birthweight: name contains ``bw`` or ``birthweight``;
    * gestational age: name contains ``gest`` or the stand-alone token
      ``ga`` (not embedded in a longer word);
    * sex: name contains ``sex`` or ``gender``.

    Gestational-age matching is deliberately stricter than a plain substring
    test for ``ga``/``age``: that would also select columns such as maternal
    age, whose values fall in the same numeric range as gestational weeks.

    Args:
        df: Dataset to inspect.

    Returns:
        ``None`` when ``df`` has no ``f1_bw`` column. Otherwise a dict with
        keys ``bw_vars``, ``ga_vars``, ``sex_vars`` (lists of column names),
        ``main_bw_var`` (``'f1_bw'``) and ``bw_stats`` (count, mean, std,
        min, max, extreme_low, extreme_high of the non-null values).
    """
    print("\n📊 STEP 6: EXAMINING BIRTHWEIGHT VARIABLES")
    print("-" * 50)

    # 'ga' only counts when it is not part of a longer alphanumeric run.
    ga_token = re.compile(r'(?<![a-z0-9])ga(?![a-z0-9])')
    named = [(col, str(col).lower()) for col in df.columns]

    # Find birthweight related variables
    bw_vars = [col for col, low in named if 'bw' in low or 'birthweight' in low]
    print(f"📊 Birthweight related variables found: {bw_vars}")

    # Find gestational age variables
    ga_vars = [col for col, low in named if 'gest' in low or ga_token.search(low)]
    print(f"📊 Gestational age related variables found: {ga_vars}")

    # Find sex variables
    sex_vars = [col for col, low in named if 'sex' in low or 'gender' in low]
    print(f"📊 Sex related variables found: {sex_vars}")

    # Examine main birthweight variable (f1_bw)
    if 'f1_bw' not in df.columns:
        print("⚠️  f1_bw variable not found!")
        return None

    bw_data = df['f1_bw'].dropna()
    bw_stats = {
        'count': len(bw_data),
        'mean': bw_data.mean(),
        'std': bw_data.std(),
        'min': bw_data.min(),
        'max': bw_data.max(),
        # Implausible-value screens (grams)
        'extreme_low': (bw_data < 1000).sum(),
        'extreme_high': (bw_data > 5000).sum(),
    }

    print(f"\n📊 f1_bw analysis:")
    print(f"  Non-null values: {bw_stats['count']}")
    print(f"  Mean: {bw_stats['mean']:.2f} grams")
    print(f"  Median: {bw_data.median():.2f} grams")
    print(f"  Std: {bw_stats['std']:.2f} grams")
    print(f"  Min: {bw_stats['min']:.2f} grams")
    print(f"  Max: {bw_stats['max']:.2f} grams")
    print(f"  Range: {bw_stats['min']:.2f} - {bw_stats['max']:.2f} grams")
    print(f"  Extreme low (< 1000g): {bw_stats['extreme_low']} cases")
    print(f"  Extreme high (> 5000g): {bw_stats['extreme_high']} cases")

    return {
        'bw_vars': bw_vars,
        'ga_vars': ga_vars,
        'sex_vars': sex_vars,
        'main_bw_var': 'f1_bw',
        'bw_stats': bw_stats,
    }


In [None]:
def create_lbw_flag(df, bw_var='f1_bw', threshold=2500):
    """Add a low-birthweight indicator column ``LBW_flag`` to ``df``.

    The flag is 1 where ``df[bw_var] < threshold`` and 0 otherwise. Rows with
    a missing birthweight get a missing flag (NaN) instead of being counted
    as normal birthweight. When no birthweight is missing the column is an
    integer column of 0/1.

    Args:
        df: Dataset; modified in place (the new column is added to it).
        bw_var: Name of the birthweight column.
        threshold: Cut-off below which a birth counts as low birthweight,
            in the same unit as ``bw_var`` (2500 g by default).

    Returns:
        The same ``df`` object with ``LBW_flag`` added, or ``None`` when
        ``bw_var`` is not a column of ``df``.
    """
    print("\n📊 STEP 7: CREATING LBW FLAG")
    print("-" * 50)

    if bw_var not in df.columns:
        print(f"❌ {bw_var} variable not found!")
        return None

    birthweight = df[bw_var]
    observed = birthweight.notna()

    flag = (birthweight < threshold).astype(int)
    if not observed.all():
        # NaN < threshold is False, which would silently label an unknown
        # birthweight as "normal"; keep it missing instead.
        flag = flag.astype(float).where(observed)
    df['LBW_flag'] = flag

    # Statistics are computed over rows with a known birthweight.
    n_valid = int(observed.sum())
    n_missing = len(df) - n_valid
    lbw_count = int(flag.sum())
    normal_count = n_valid - lbw_count
    lbw_percent = (lbw_count / n_valid) * 100 if n_valid else 0.0
    normal_percent = (normal_count / n_valid) * 100 if n_valid else 0.0

    print(f"📊 LBW flag created:")
    print(f"  LBW cases: {lbw_count} ({lbw_percent:.1f}%)")
    print(f"  Normal birthweight: {normal_count} ({normal_percent:.1f}%)")
    if n_missing:
        print(f"  Missing birthweight (flag left missing): {n_missing}")

    return df


In [None]:
def check_gestational_age_for_sga_lga(df, ga_vars):
    """Find a gestational-age column usable for SGA/LGA derivation.

    Candidates are examined in the order given. A candidate qualifies when
    it is numeric, has at least one non-null value, and all of its non-null
    values lie within 15-50 (taken to mean it is recorded in weeks).

    Args:
        df: Dataset to inspect.
        ga_vars: Candidate column names; names absent from ``df`` are ignored.

    Returns:
        The name of the first qualifying column, or ``None`` when there is
        no candidate or none qualifies.
    """
    print("\n📊 STEP 8: CHECKING FOR SGA/LGA DERIVATION")
    print("-" * 50)

    if not ga_vars:
        print("⚠️  No gestational age variables found for SGA/LGA derivation")
        return None

    # Check each GA variable
    for ga_var in ga_vars:
        if ga_var not in df.columns:
            continue

        # Candidates come from name matching, so they may be text columns;
        # those cannot be summarised numerically and are skipped.
        if not pd.api.types.is_numeric_dtype(df[ga_var]):
            print(f"⚠️  {ga_var} is not numeric - skipped")
            continue

        ga_data = df[ga_var].dropna()
        if len(ga_data) == 0:
            continue

        lowest, highest = ga_data.min(), ga_data.max()
        print(f"📊 {ga_var} analysis:")
        print(f"  Non-null values: {len(ga_data)}")
        print(f"  Mean: {ga_data.mean():.2f}")
        print(f"  Range: {lowest:.2f} - {highest:.2f}")

        # Check if values are in weeks (typical range 20-45)
        if lowest >= 15 and highest <= 50:
            print(f"  ✅ Appears to be in weeks - suitable for SGA/LGA derivation")
            return ga_var
        print(f"  ⚠️  Values outside typical gestational age range")

    return None


In [None]:
def create_birthweight_plots(df, bw_var='f1_bw'):
    """Save a 2x2 panel of birthweight plots to ``PLOTS/birthweight_analysis.png``.

    Panels: histogram with the 2500 g threshold line, box plot, pie chart of
    the ``LBW_flag`` distribution, and overlaid histograms by LBW status.

    Args:
        df: Dataset containing ``bw_var`` and the ``LBW_flag`` column.
        bw_var: Name of the birthweight column.

    Returns:
        None. Nothing is written when ``bw_var`` or ``LBW_flag`` is missing
        from ``df``; a message is printed instead.
    """
    print("\n📊 STEP 9: CREATING BIRTHWEIGHT PLOTS")
    print("-" * 50)

    if bw_var not in df.columns:
        print(f"❌ {bw_var} variable not found!")
        return
    if 'LBW_flag' not in df.columns:
        print("❌ LBW_flag variable not found! Create it before plotting.")
        return

    # Create plots directory
    plots_dir = Path('PLOTS')
    plots_dir.mkdir(parents=True, exist_ok=True)

    bw_data = df[bw_var].dropna()
    plt.figure(figsize=(12, 8))

    # 1. Histogram of birthweight
    plt.subplot(2, 2, 1)
    plt.hist(bw_data, bins=30, alpha=0.7, edgecolor='black')
    plt.axvline(2500, color='red', linestyle='--', label='LBW threshold (2500g)')
    plt.xlabel('Birthweight (grams)')
    plt.ylabel('Frequency')
    plt.title('Birthweight Distribution')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # 2. Box plot
    plt.subplot(2, 2, 2)
    plt.boxplot(bw_data)
    plt.ylabel('Birthweight (grams)')
    plt.title('Birthweight Box Plot')
    plt.grid(True, alpha=0.3)

    # 3. LBW flag distribution
    plt.subplot(2, 2, 3)
    status_labels = {0: 'Normal BW', 1: 'LBW'}
    # value_counts() orders by frequency, so labels must be derived from the
    # flag value rather than from position; reindex fixes the order to 0, 1.
    lbw_counts = df['LBW_flag'].value_counts().reindex([0, 1], fill_value=0)
    # Drop empty classes: a pie needs one label per non-empty wedge.
    present = lbw_counts[lbw_counts > 0]
    if not present.empty:
        plt.pie(present.values,
                labels=[status_labels[int(flag)] for flag in present.index],
                autopct='%1.1f%%', startangle=90)
    plt.title('Low Birthweight Distribution')

    # 4. Birthweight by LBW status
    plt.subplot(2, 2, 4)
    normal_bw = df[df['LBW_flag'] == 0][bw_var].dropna()
    lbw = df[df['LBW_flag'] == 1][bw_var].dropna()

    plt.hist([normal_bw, lbw], bins=20, alpha=0.7, label=['Normal BW', 'LBW'],
             color=['blue', 'red'], edgecolor='black')
    plt.xlabel('Birthweight (grams)')
    plt.ylabel('Frequency')
    plt.title('Birthweight by LBW Status')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(plots_dir / 'birthweight_analysis.png', dpi=300, bbox_inches='tight')
    plt.close()
    print("✅ Birthweight analysis plots saved")


In [None]:
def _bw_group_summary(label, group, bw_var):
    """Summarise birthweight and LBW prevalence for one subset of rows."""
    bw = group[bw_var].dropna()
    lbw_count = group['LBW_flag'].sum()
    return {
        'Group': label,
        'N': len(bw),
        'Mean_BW': bw.mean(),
        'SD_BW': bw.std(),
        'Min_BW': bw.min(),
        'Max_BW': bw.max(),
        'LBW_Count': lbw_count,
        # Percentage of all rows in the subset, including rows without a
        # birthweight value.
        'LBW_Percent': (lbw_count / len(group)) * 100 if len(group) else float('nan'),
    }


def create_descriptive_table(df, bw_var='f1_bw', sex_vars=None, ga_vars=None):
    """Build and save a descriptive birthweight table.

    The table has one ``Overall`` row, one ``Sex_<value>`` row per distinct
    value of every sex variable, and up to four ``GA_Q1``..``GA_Q4`` rows per
    gestational-age variable. The GA rows partition the data at that
    variable's quartiles: ``Q1`` is ``<= q25``, ``Q2`` is ``(q25, q50]``,
    ``Q3`` is ``(q50, q75]`` and ``Q4`` is ``> q75``. Subgroups without any
    birthweight value are omitted.

    Args:
        df: Dataset containing ``bw_var`` and ``LBW_flag``.
        bw_var: Name of the birthweight column.
        sex_vars: Optional list of sex column names; names absent from
            ``df`` are ignored.
        ga_vars: Optional list of gestational-age column names; names absent
            from ``df`` and non-numeric columns are ignored.

    Returns:
        The descriptive DataFrame (columns ``Group, N, Mean_BW, SD_BW,
        Min_BW, Max_BW, LBW_Count, LBW_Percent``). It is also written to
        ``Data/processed/birthweight_descriptive_table.csv``.
    """
    print("\n📊 STEP 10: CREATING DESCRIPTIVE TABLE")
    print("-" * 50)

    # Overall statistics are always reported.
    descriptive_stats = [_bw_group_summary('Overall', df, bw_var)]

    # By sex if available
    for sex_var in sex_vars or []:
        if sex_var not in df.columns:
            continue
        for sex_val in df[sex_var].dropna().unique():
            summary = _bw_group_summary(f'Sex_{sex_val}', df[df[sex_var] == sex_val], bw_var)
            if summary['N'] > 0:
                descriptive_stats.append(summary)

    # By gestational age if available
    for ga_var in ga_vars or []:
        if ga_var not in df.columns:
            continue
        ga = df[ga_var]
        # Quartiles are only defined for numeric data.
        if not pd.api.types.is_numeric_dtype(ga) or ga.dropna().empty:
            continue

        q25, q50, q75 = ga.dropna().quantile([0.25, 0.5, 0.75])
        # Non-overlapping, exhaustive quartile bands.
        quartile_masks = (
            ('Q1', ga <= q25),
            ('Q2', (ga > q25) & (ga <= q50)),
            ('Q3', (ga > q50) & (ga <= q75)),
            ('Q4', ga > q75),
        )
        for q_name, mask in quartile_masks:
            summary = _bw_group_summary(f'GA_{q_name}', df[mask], bw_var)
            if summary['N'] > 0:
                descriptive_stats.append(summary)

    # Create DataFrame
    descriptive_df = pd.DataFrame(descriptive_stats)

    # Round numeric columns
    numeric_cols = ['Mean_BW', 'SD_BW', 'Min_BW', 'Max_BW', 'LBW_Percent']
    for col in numeric_cols:
        if col in descriptive_df.columns:
            descriptive_df[col] = descriptive_df[col].round(2)

    # Save to CSV; parents=True so a missing 'Data' folder does not abort.
    output_dir = Path('Data/processed')
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / 'birthweight_descriptive_table.csv'
    descriptive_df.to_csv(output_file, index=False)

    print(f"✅ Descriptive table saved: {output_file}")
    print("\n👀 Descriptive table:")
    print(descriptive_df)

    return descriptive_df


In [None]:
def main():
    """Run feature categorization followed by outcome variable engineering.

    Stops silently after the load step when the data could not be loaded.
    Outcome engineering (LBW flag, plots, descriptive table) only runs when
    the birthweight examination succeeds, and the closing summary lists only
    the deliverables that were actually produced.
    """
    # Load data
    df, dict_df = load_data()
    if df is None:
        return

    # Step 2: Feature Categorization
    examine_data_dictionary(dict_df)
    domain_patterns = create_domain_mapping()
    categorization_df = categorize_variables(df, domain_patterns)
    grouping_table_file = create_grouping_table(categorization_df)

    # Step 3: Outcome Variable Engineering
    outcomes_created = False
    bw_analysis = examine_birthweight_variables(df)
    if bw_analysis:
        df = create_lbw_flag(df)
        check_gestational_age_for_sga_lga(df, bw_analysis['ga_vars'])
        create_birthweight_plots(df)
        create_descriptive_table(df, sex_vars=bw_analysis['sex_vars'],
                                 ga_vars=bw_analysis['ga_vars'])
        outcomes_created = True

    # Final summary
    print("\n" + "="*80)
    print("FEATURE CATEGORIZATION & OUTCOME ENGINEERING COMPLETE")
    print("="*80)
    print("✅ Deliverables created:")
    print(f"  - Variable grouping table: {grouping_table_file}")
    if outcomes_created:
        print("  - Birthweight descriptive table: Data/processed/birthweight_descriptive_table.csv")
        print("  - Birthweight analysis plots: PLOTS/birthweight_analysis.png")
        print("  - LBW_flag variable added to dataset")
    else:
        # Do not advertise files that were never written.
        print("⚠️  Outcome engineering skipped: birthweight variable not found")
    print("="*80)


In [None]:
# Run the full pipeline only when this file is executed as a script (or as a
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()
