In [1]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
import warnings
warnings.filterwarnings('ignore')

print(" Libraries imported successfully")

 Libraries imported successfully


In [None]:
def data_preprocessing(df, exclude_features, target_strategy='binary'):
    """
    Phase 1.2: Data Preprocessing for Rwanda DHS Dataset
    
    Parameters:
    -----------
    df : pandas.DataFrame
        Raw dataset
    exclude_features : list
        Features to exclude due to data leakage
    target_strategy : str
        'binary' - exclude never-had-sex cases
        'three_class' - encode as Early/Late/Never
    """
    
    print("="*80)
    print("PHASE 1.2: DATA PREPROCESSING")
    print("="*80)
    
    # Create working copy
    df_processed = df.copy()
    
    print(f"Starting dataset shape: {df_processed.shape}")
    print(f"Excluded features: {exclude_features}")
    print(f"Target strategy: {target_strategy}")
    
    # ================================================================
    # 1.2.1 HANDLE MISSING VALUES
    # ================================================================
    
    print("\n" + "="*60)
    print("1.2.1 MISSING VALUE HANDLING")
    print("="*60)
    
    # Analyze missing patterns before processing
    missing_before = df_processed.isnull().sum()
    missing_pct_before = (missing_before / len(df_processed)) * 100
    
    print("Missing data summary (>1% missing):")
    high_missing = missing_pct_before[missing_pct_before > 1].sort_values(ascending=False)
    for var, pct in high_missing.items():
        count = missing_before[var]
        print(f"  {var}: {count:,} ({pct:.1f}%)")
    
    # Handle target variable based on strategy
    print(f"\n1.2.1.1 TARGET VARIABLE PROCESSING ({target_strategy})")
    print("-" * 40)
    
    target_before = df_processed['early_sexual_debut'].value_counts(dropna=False)
    print("Target variable before processing:")
    for val, count in target_before.items():
        pct = (count / len(df_processed)) * 100
        print(f"  {val}: {count:,} ({pct:.1f}%)")
    
    if target_strategy == 'binary':
        # Exclude cases where early_sexual_debut is NaN (never had sex)
        df_processed = df_processed.dropna(subset=['early_sexual_debut'])
        print(f"\nAfter excluding never-had-sex cases: {df_processed.shape[0]:,} records")
        
        # Convert to integer for efficiency
        df_processed['early_sexual_debut'] = df_processed['early_sexual_debut'].astype(int)
        
    elif target_strategy == 'three_class':
        # Encode NaN as 2 (Never), 1.0 as 1 (Early), 0.0 as 0 (Late)
        df_processed['early_sexual_debut'] = df_processed['early_sexual_debut'].fillna(2)
        df_processed['early_sexual_debut'] = df_processed['early_sexual_debut'].astype(int)
        print("Three-class encoding: 0=Late debut, 1=Early debut, 2=Never had sex")
    
    target_after = df_processed['early_sexual_debut'].value_counts(dropna=False)
    print("Target variable after processing:")
    for val, count in target_after.items():
        pct = (count / len(df_processed)) * 100
        print(f"  {val}: {count:,} ({pct:.1f}%)")
    
    # Remove excluded features
    print(f"\n1.2.1.2 REMOVING LEAKAGE FEATURES")
    print("-" * 40)
    
    features_to_remove = [f for f in exclude_features if f in df_processed.columns]
    print(f"Removing features: {features_to_remove}")
    
    df_processed = df_processed.drop(columns=features_to_remove)
    print(f"Dataset shape after feature removal: {df_processed.shape}")
    
    # Impute missing values for other variables
    print(f"\n1.2.1.3 IMPUTATION STRATEGY")
    print("-" * 40)
    
    # Identify variable types for appropriate imputation
    numeric_vars = df_processed.select_dtypes(include=[np.number]).columns.tolist()
    numeric_vars = [var for var in numeric_vars if var != 'early_sexual_debut']
    
    categorical_vars = df_processed.select_dtypes(include=['object']).columns.tolist()
    
    # Remove ID variables from imputation
    id_vars = ['caseid', 'household_id', 'v001', 'v002']
    numeric_vars = [var for var in numeric_vars if var not in id_vars]
    categorical_vars = [var for var in categorical_vars if var not in id_vars]
    
    print(f"Numeric variables for imputation: {len(numeric_vars)}")
    print(f"Categorical variables for imputation: {len(categorical_vars)}")
    
    # Numeric imputation - use median for robustness
    if len(numeric_vars) > 0:
        numeric_imputer = SimpleImputer(strategy='median')
        df_processed[numeric_vars] = numeric_imputer.fit_transform(df_processed[numeric_vars])
        print("Applied median imputation to numeric variables")
    
    # Categorical imputation - use mode
    if len(categorical_vars) > 0:
        categorical_imputer = SimpleImputer(strategy='most_frequent')
        df_processed[categorical_vars] = categorical_imputer.fit_transform(df_processed[categorical_vars])
        print("Applied mode imputation to categorical variables")
    
    # ================================================================
    # 1.2.2 FEATURE ENGINEERING
    # ================================================================
    
    print("\n" + "="*60)
    print("1.2.2 FEATURE ENGINEERING")
    print("="*60)
    
    # Create binary indicators from categorical variables
    print("\n1.2.2.1 BINARY INDICATORS FROM CATEGORICAL VARIABLES")
    print("-" * 50)
    
    # Education level (v106) - Primary/Secondary/Higher vs None
    if 'v106' in df_processed.columns:
        df_processed['has_education'] = (df_processed['v106'] > 0).astype(int)
        df_processed['has_secondary_plus'] = (df_processed['v106'] >= 2).astype(int)
        print("Created education indicators: has_education, has_secondary_plus")
    
    # Religion (v130) - Major religions
    if 'v130' in df_processed.columns:
        df_processed['is_catholic'] = (df_processed['v130'] == 1).astype(int)
        df_processed['is_protestant'] = (df_processed['v130'] == 2).astype(int)
        df_processed['is_muslim'] = (df_processed['v130'] == 4).astype(int)
        print("Created religion indicators: is_catholic, is_protestant, is_muslim")
    
    # Marital status (v501) - Ever married
    if 'v501' in df_processed.columns:
        df_processed['ever_married'] = (df_processed['v501'] > 0).astype(int)
        df_processed['currently_married'] = (df_processed['v501'] == 1).astype(int)
        print("Created marital indicators: ever_married, currently_married")
    
    # Employment (v714)
    if 'v714' in df_processed.columns:
        df_processed['is_employed'] = df_processed['v714'].astype(int)
        print("Created employment indicator: is_employed")
    
    # Household assets
    asset_vars = ['hv206', 'hv207', 'hv208']  # electricity, radio, TV
    asset_names = ['has_electricity', 'has_radio', 'has_tv']
    
    for asset_var, asset_name in zip(asset_vars, asset_names):
        if asset_var in df_processed.columns:
            df_processed[asset_name] = df_processed[asset_var].astype(int)
    
    # Asset count
    if all(var in df_processed.columns for var in asset_names):
        df_processed['total_assets'] = df_processed[asset_names].sum(axis=1)
        print("Created asset indicators and total_assets count")
    
    # Scale continuous variables
    print("\n1.2.2.2 SCALING CONTINUOUS VARIABLES")
    print("-" * 40)
    
    continuous_to_scale = ['v107', 'v191', 'hv271']  # education years, wealth scores
    continuous_present = [var for var in continuous_to_scale if var in df_processed.columns]
    
    if len(continuous_present) > 0:
        scaler = StandardScaler()
        
        # Create scaled versions
        scaled_names = [f"{var}_scaled" for var in continuous_present]
        df_processed[scaled_names] = scaler.fit_transform(df_processed[continuous_present])
        
        print(f"Scaled variables: {continuous_present}")
        print(f"New scaled variables: {scaled_names}")
        
        # Store scaler for later use
        scaling_info = {
            'scaler': scaler,
            'original_vars': continuous_present,
            'scaled_vars': scaled_names
        }
    else:
        scaling_info = None
    
    # Generate interaction terms for key predictors
    print("\n1.2.2.3 INTERACTION TERMS")
    print("-" * 30)
    
    interactions_created = []
    
    # Age Ã— Education interaction
    if 'v012' in df_processed.columns and 'v107' in df_processed.columns:
        df_processed['age_education_interaction'] = df_processed['v012'] * df_processed['v107']
        interactions_created.append('age_education_interaction')
    
    # Age Ã— Wealth interaction  
    if 'v012' in df_processed.columns and 'v191' in df_processed.columns:
        df_processed['age_wealth_interaction'] = df_processed['v012'] * df_processed['v191']
        interactions_created.append('age_wealth_interaction')
    
    # Education Ã— Wealth interaction
    if 'v107' in df_processed.columns and 'v191' in df_processed.columns:
        df_processed['education_wealth_interaction'] = df_processed['v107'] * df_processed['v191']
        interactions_created.append('education_wealth_interaction')
    
    # Urban Ã— Education interaction
    if 'v102' in df_processed.columns and 'v107' in df_processed.columns:
        df_processed['urban_education_interaction'] = df_processed['v102'] * df_processed['v107']
        interactions_created.append('urban_education_interaction')
    
    print(f"Created interaction terms: {interactions_created}")
    
    # ================================================================
    # 1.2.3 GEOGRAPHIC STRATIFICATION
    # ================================================================
    
    print("\n" + "="*60)
    print("1.2.3 GEOGRAPHIC STRATIFICATION")
    print("="*60)
    
    # Create region indicators
    if 'v101' in df_processed.columns:
        print("\n1.2.3.1 REGIONAL INDICATORS")
        print("-" * 30)
        
        region_names = {1: 'kigali', 2: 'south', 3: 'west', 4: 'north', 5: 'east'}
        
        for region_code, region_name in region_names.items():
            col_name = f'region_{region_name}'
            df_processed[col_name] = (df_processed['v101'] == region_code).astype(int)
        
        region_distribution = df_processed['v101'].value_counts().sort_index()
        print("Regional distribution:")
        for region_code, count in region_distribution.items():
            region_name = region_names.get(region_code, f'Unknown_{region_code}')
            pct = (count / len(df_processed)) * 100
            print(f"  {region_name.title()}: {count:,} ({pct:.1f}%)")
        
        print(f"Created regional dummy variables: {list(region_names.values())}")
    
    # Urban/rural stratification
    if 'v102' in df_processed.columns:
        print("\n1.2.3.2 URBAN/RURAL STRATIFICATION")
        print("-" * 35)
        
        df_processed['is_urban'] = (df_processed['v102'] == 1).astype(int)
        df_processed['is_rural'] = (df_processed['v102'] == 2).astype(int)
        
        urban_rural_dist = df_processed['v102'].value_counts()
        print("Urban/Rural distribution:")
        print(f"  Urban: {urban_rural_dist.get(1, 0):,} ({urban_rural_dist.get(1, 0)/len(df_processed)*100:.1f}%)")
        print(f"  Rural: {urban_rural_dist.get(2, 0):,} ({urban_rural_dist.get(2, 0)/len(df_processed)*100:.1f}%)")
        
        print("Created urban/rural indicators: is_urban, is_rural")
    
    # ================================================================
    # 1.2.4 FINAL PREPROCESSING SUMMARY
    # ================================================================
    
    print("\n" + "="*60)
    print("1.2.4 PREPROCESSING SUMMARY")
    print("="*60)
    
    # Data shape summary
    print(f"Final dataset shape: {df_processed.shape}")
    print(f"Records processed: {df_processed.shape[0]:,}")
    print(f"Features available: {df_processed.shape[1]:,}")
    
    # Feature categories
    feature_summary = {
        'target': ['early_sexual_debut'],
        'demographics': ['v012', 'v013'],
        'geographic': ['v101', 'v102'] + [col for col in df_processed.columns if col.startswith('region_') or col.startswith('is_urban') or col.startswith('is_rural')],
        'education': ['v106', 'v107', 'v149', 'v150', 'v151', 'v152'] + [col for col in df_processed.columns if 'education' in col],
        'socioeconomic': ['v130', 'v190', 'v191', 'hv270', 'hv271'] + [col for col in df_processed.columns if any(x in col for x in ['wealth', 'religion', 'catholic', 'protestant', 'muslim'])],
        'household': ['hv009'] + [col for col in df_processed.columns if any(x in col for x in ['asset', 'electricity', 'radio', 'tv'])],
        'marital': ['v501', 'v502'] + [col for col in df_processed.columns if 'married' in col],
        'fertility': ['v201', 'v213', 'bord', 'b5'],
        'family_planning': ['v301', 'v312', 'v602'],
        'health': ['v157', 'v158', 'v384a', 'v384b'],
        'employment': ['v714'] + [col for col in df_processed.columns if 'employed' in col],
        'interactions': interactions_created,
        'scaled': scaled_names if scaling_info else []
    }
    
    print("\nFeature categories:")
    total_features = 0
    for category, features in feature_summary.items():
        available_features = [f for f in features if f in df_processed.columns]
        if available_features:
            print(f"  {category.title()}: {len(available_features)} features")
            total_features += len(available_features)
    
    print(f"\nTotal categorized features: {total_features}")
    
    # Missing data after preprocessing
    missing_after = df_processed.isnull().sum().sum()
    print(f"Missing values after preprocessing: {missing_after:,}")
    
    if missing_after == 0:
        print("âœ“ All missing values handled successfully")
    else:
        print(f"  {missing_after} missing values remain")
    
    # Data types summary
    print(f"\nData types summary:")
    dtype_counts = df_processed.dtypes.value_counts()
    for dtype, count in dtype_counts.items():
        print(f"  {dtype}: {count} variables")
    
    # Memory usage
    memory_mb = df_processed.memory_usage(deep=True).sum() / 1024**2
    print(f"\nMemory usage: {memory_mb:.1f} MB")
    
    return df_processed, {
        'scaling_info': scaling_info,
        'feature_summary': feature_summary,
        'target_strategy': target_strategy,
        'interactions_created': interactions_created,
        'preprocessing_stats': {
            'original_shape': df.shape,
            'final_shape': df_processed.shape,
            'missing_before': missing_before.sum(),
            'missing_after': missing_after,
            'memory_mb': memory_mb
        }
    }

print("âœ“ Preprocessing function defined successfully")


âœ“ Preprocessing function defined successfully


In [None]:
try:
    df = pd.read_csv(dataset_path)
    print("âœ“ Dataset loaded successfully")
    print(f"  Shape: {df.shape}")
    print(f"  Records: {df.shape[0]:,}")
    print(f"  Features: {df.shape[1]:,}")
    
    # Display basic info
    print("\nDataset info:")
    print(df.info())
    
except FileNotFoundError:
    print(" Dataset not found. Please check the file path.")
except Exception as e:
    print(f" Error loading dataset: {e}")


âœ“ Dataset loaded successfully
  Shape: (14634, 44)
  Records: 14,634
  Features: 44

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14634 entries, 0 to 14633
Data columns (total 44 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   v525                14634 non-null  int64  
 1   early_sexual_debut  14634 non-null  float64
 2   v012                14634 non-null  int64  
 3   v013                14634 non-null  int64  
 4   v101                14634 non-null  int64  
 5   v102                14634 non-null  int64  
 6   v106                14634 non-null  int64  
 7   v107                13282 non-null  float64
 8   v130                14634 non-null  int64  
 9   v190                14634 non-null  int64  
 10  v191                14634 non-null  int64  
 11  hv270               14634 non-null  int64  
 12  hv271               14634 non-null  int64  
 13  v149                14634 non-null  int64  
 14  v

In [7]:
print("First few rows of raw data:")
print(df.head())

print("\nTarget variable distribution:")
print(df['early_sexual_debut'].value_counts(dropna=False))


First few rows of raw data:
   v525  early_sexual_debut  v012  v013  v101  v102  v106  v107  v130  v190  \
0     0                 1.0    24     2     1     2     3   3.0     1     4   
1    17                 1.0    42     6     1     2     1   4.0     2     3   
2    22                 0.0    32     4     1     2     2   6.0     1     5   
3     0                 1.0    29     3     1     2     2   6.0     2     3   
4    19                 0.0    36     5     1     2     2   0.0     1     4   

   ...  v158  v384a  v384b  hv206  hv207  hv208           caseid  v001  v002  \
0  ...     0      0      0      1      0      0         1   3 02     1     3   
1  ...     0      0      0      1      0      0         1   4 01     1     4   
2  ...     2      1      1      1      1      1         1   5 02     1     5   
3  ...     2      0      0      1      1      0         1   6 03     1     6   
4  ...     2      1      0      1      0      0         1   7 02     1     7   

   household_id 

In [None]:
try:
    df_processed, preprocessing_info = data_preprocessing(
        df, 
        exclude_features, 
        target_strategy=target_strategy
    )
    
    print("\n" + "="*80)
    print("PREPROCESSING COMPLETE")
    print("="*80)
    print(f"âœ“ Ready for modeling with {df_processed.shape[0]:,} samples and {df_processed.shape[1]:,} features")
    
except Exception as e:
    print(f" Error during preprocessing: {e}")
    raise


PHASE 1.2: DATA PREPROCESSING
Starting dataset shape: (14634, 44)
Excluded features: ['v525', 'v512', 'v511', 'v212']
Target strategy: binary

1.2.1 MISSING VALUE HANDLING
Missing data summary (>1% missing):
  v511: 6,060 (41.4%)
  v512: 6,060 (41.4%)
  v212: 5,420 (37.0%)
  v107: 1,352 (9.2%)

1.2.1.1 TARGET VARIABLE PROCESSING (binary)
----------------------------------------
Target variable before processing:
  0.0: 7,919 (54.1%)
  1.0: 6,715 (45.9%)

After excluding never-had-sex cases: 14,634 records
Target variable after processing:
  0: 7,919 (54.1%)
  1: 6,715 (45.9%)

1.2.1.2 REMOVING LEAKAGE FEATURES
----------------------------------------
Removing features: ['v525', 'v512', 'v511', 'v212']
Dataset shape after feature removal: (14634, 40)

1.2.1.3 IMPUTATION STRATEGY
----------------------------------------
Numeric variables for imputation: 35
Categorical variables for imputation: 0
Applied median imputation to numeric variables

1.2.2 FEATURE ENGINEERING

1.2.2.1 BINARY IND

In [9]:
print("Sample of processed data:")
print(df_processed.head(10))

print("\nProcessed data info:")
print(df_processed.info())

print("\nTarget variable distribution after preprocessing:")
print(df_processed['early_sexual_debut'].value_counts())

Sample of processed data:
   early_sexual_debut  v012  v013  v101  v102  v106  v107  v130  v190  \
0                   1  24.0   2.0   1.0   2.0   3.0   3.0   1.0   4.0   
1                   1  42.0   6.0   1.0   2.0   1.0   4.0   2.0   3.0   
2                   0  32.0   4.0   1.0   2.0   2.0   6.0   1.0   5.0   
3                   1  29.0   3.0   1.0   2.0   2.0   6.0   2.0   3.0   
4                   0  36.0   5.0   1.0   2.0   2.0   0.0   1.0   4.0   
5                   1  23.0   2.0   1.0   2.0   2.0   6.0   2.0   5.0   
6                   0  38.0   5.0   1.0   2.0   2.0   6.0   1.0   5.0   
7                   0  30.0   4.0   1.0   2.0   2.0   6.0   3.0   3.0   
8                   1  46.0   7.0   1.0   2.0   1.0   3.0   1.0   2.0   
9                   0  41.0   6.0   1.0   2.0   1.0   4.0   2.0   5.0   

       v191  ...  age_wealth_interaction  education_wealth_interaction  \
0   61640.0  ...               1479360.0                      184920.0   
1  -44072.0  ...      

In [10]:
print("="*80)
print("PREPROCESSING STATISTICS")
print("="*80)

stats = preprocessing_info['preprocessing_stats']
print(f"\nOriginal shape: {stats['original_shape']}")
print(f"Final shape: {stats['final_shape']}")
print(f"Records removed: {stats['original_shape'][0] - stats['final_shape'][0]:,}")
print(f"Features removed: {stats['original_shape'][1] - stats['final_shape'][1]}")
print(f"Missing values before: {stats['missing_before']:,}")
print(f"Missing values after: {stats['missing_after']:,}")
print(f"Memory usage: {stats['memory_mb']:.2f} MB")

print(f"\nInteractions created:")
for interaction in preprocessing_info['interactions_created']:
    print(f"  - {interaction}")



PREPROCESSING STATISTICS

Original shape: (14634, 44)
Final shape: (14634, 66)
Records removed: 0
Features removed: -22
Missing values before: 18,892
Missing values after: 0
Memory usage: 8.80 MB

Interactions created:
  - age_education_interaction
  - age_wealth_interaction
  - education_wealth_interaction
  - urban_education_interaction


In [11]:
print("="*80)
print("FEATURE CATEGORIES SUMMARY")
print("="*80)

feature_summary = preprocessing_info['feature_summary']

for category, features in feature_summary.items():
    available_features = [f for f in features if f in df_processed.columns]
    if available_features:
        print(f"\n{category.upper()} ({len(available_features)} features):")
        for feature in available_features[:10]:  # Show first 10
            print(f"  - {feature}")
        if len(available_features) > 10:
            print(f"  ... and {len(available_features) - 10} more")


FEATURE CATEGORIES SUMMARY

TARGET (1 features):
  - early_sexual_debut

DEMOGRAPHICS (2 features):
  - v012
  - v013

GEOGRAPHIC (9 features):
  - v101
  - v102
  - region_kigali
  - region_south
  - region_west
  - region_north
  - region_east
  - is_urban
  - is_rural

EDUCATION (10 features):
  - v106
  - v107
  - v149
  - v150
  - v151
  - v152
  - has_education
  - age_education_interaction
  - education_wealth_interaction
  - urban_education_interaction

SOCIOECONOMIC (10 features):
  - v130
  - v190
  - v191
  - hv270
  - hv271
  - is_catholic
  - is_protestant
  - is_muslim
  - age_wealth_interaction
  - education_wealth_interaction

HOUSEHOLD (5 features):
  - hv009
  - has_electricity
  - has_radio
  - has_tv
  - total_assets

MARITAL (4 features):
  - v501
  - v502
  - ever_married
  - currently_married

FERTILITY (4 features):
  - v201
  - v213
  - bord
  - b5

FAMILY_PLANNING (3 features):
  - v301
  - v312
  - v602

HEALTH (4 features):
  - v157
  - v158
  - v384a
  - v3

In [None]:
missing_check = df_processed.isnull().sum()
missing_vars = missing_check[missing_check > 0]

if len(missing_vars) == 0:
    print("âœ“ No missing values in processed dataset")
else:
    print(" Missing values found:")
    for var, count in missing_vars.items():
        pct = (count / len(df_processed)) * 100
        print(f"  {var}: {count:,} ({pct:.2f}%)")


âœ“ No missing values in processed dataset


In [None]:
output_path = r"C:/Users/USER/Desktop/seraphine_thesis/findatasets/rwanda_dhs_processed.csv"

try:
    df_processed.to_csv(output_path, index=False)
    print(f"âœ“ Processed data saved successfully")
    print(f"  Location: {output_path}")
    print(f"  Size: {df_processed.shape[0]:,} rows Ã— {df_processed.shape[1]:,} columns")
    
except Exception as e:
    print(f" Error saving processed data: {e}")


âœ“ Processed data saved successfully
  Location: C:/Users/USER/Desktop/seraphine_thesis/findatasets/rwanda_dhs_processed.csv
  Size: 14,634 rows Ã— 66 columns


In [None]:
print("="*80)
print("FINAL SUMMARY REPORT")
print("="*80)

print("\n DATASET OVERVIEW")
print(f"  Original records: {df.shape[0]:,}")
print(f"  Processed records: {df_processed.shape[0]:,}")
print(f"  Records retained: {df_processed.shape[0]/df.shape[0]*100:.1f}%")
print(f"  Original features: {df.shape[1]:,}")
print(f"  Final features: {df_processed.shape[1]:,}")

print("\n TARGET VARIABLE")
target_dist = df_processed['early_sexual_debut'].value_counts()
for value, count in target_dist.items():
    pct = (count / len(df_processed)) * 100
    label = "Early Debut" if value == 1 else "Late Debut"
    print(f"  {label} ({value}): {count:,} ({pct:.1f}%)")

print("\n FEATURE ENGINEERING")
print(f"  Interaction terms created: {len(preprocessing_info['interactions_created'])}")
if preprocessing_info['scaling_info']:
    print(f"  Variables scaled: {len(preprocessing_info['scaling_info']['original_vars'])}")
print(f"  Binary indicators created: Multiple")

print("\n OUTPUT")
print(f"  File saved: {output_path}")
print(f"  Memory usage: {stats['memory_mb']:.2f} MB")

print("\n" + "="*80)
print("âœ“ PREPROCESSING PIPELINE COMPLETED SUCCESSFULLY")
print("="*80)

FINAL SUMMARY REPORT

ðŸ“Š DATASET OVERVIEW
  Original records: 14,634
  Processed records: 14,634
  Records retained: 100.0%
  Original features: 44
  Final features: 66

ðŸŽ¯ TARGET VARIABLE
  Late Debut (0): 7,919 (54.1%)
  Early Debut (1): 6,715 (45.9%)

âœ¨ FEATURE ENGINEERING
  Interaction terms created: 4
  Variables scaled: 3
  Binary indicators created: Multiple

ðŸ’¾ OUTPUT
  File saved: C:/Users/USER/Desktop/seraphine_thesis/findatasets/rwanda_dhs_processed.csv
  Memory usage: 8.80 MB

âœ“ PREPROCESSING PIPELINE COMPLETED SUCCESSFULLY
