# 🧹 Data Cleaning Workflow: patient_demographics

**Generated:** 2025-12-09 12:47:39  
**Purpose:** Systematic data cleaning and preparation

This notebook provides a comprehensive data cleaning workflow.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load dataset
# The path is a raw string: in a normal literal the Windows backslashes form
# invalid escape sequences (\d, \h, \p), which newer Python versions warn about
# and which would silently corrupt the path if a folder name ever began with a
# recognised escape letter (e.g. \n or \t).
# NOTE(review): this is an absolute, machine-specific location — consider a
# path relative to a configurable data directory so the notebook runs elsewhere.
DATA_PATH = r'D:\data-dojo-1\datasets\healthcare\patient_demographics.csv'
df = pd.read_csv(DATA_PATH)
df_original = df.copy()  # Untouched backup, used later for the before/after summary

print(f"Original shape: {df.shape}")
print("Starting data cleaning workflow...")

## 1. 🕳️ Handle Missing Values

In [None]:
# Check for missing values
# Columns whose missing percentage is strictly above this value are dropped.
MISSING_DROP_THRESHOLD_PCT = 50

missing_summary = df.isnull().sum()
missing_percentage = (missing_summary / len(df)) * 100

print("=== MISSING VALUES SUMMARY ===")
for col in missing_summary[missing_summary > 0].index:
    count = missing_summary[col]
    percentage = missing_percentage[col]
    print(f"{col}: {count} missing ({percentage:.1f}%)")

# Handle missing values
# Option 1: Drop columns that are mostly empty (above the threshold)
high_missing_cols = missing_percentage[missing_percentage > MISSING_DROP_THRESHOLD_PCT].index
if len(high_missing_cols) > 0:
    print(f"\nDropping columns with >{MISSING_DROP_THRESHOLD_PCT}% missing: {list(high_missing_cols)}")
    df = df.drop(columns=high_missing_cols)

# Option 2: Fill numeric columns with median
# The filled column is assigned back explicitly. Calling
# `df[col].fillna(..., inplace=True)` operates on a temporary selection: it
# raises a FutureWarning on recent pandas and leaves `df` unchanged once
# Copy-on-Write is active, so the nulls would silently survive.
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

# Option 3: Fill categorical columns with mode
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if df[col].isnull().any():
        modes = df[col].mode()
        # mode() is empty when the column holds no non-null value; skip the
        # fill in that case instead of failing on the [0] lookup.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

print(f"\nAfter cleaning - Missing values: {df.isnull().sum().sum()}")

## 2. 🗑️ Remove Duplicates

In [None]:
# Handle duplicates
# Flag every row that repeats an earlier row (the first occurrence is kept),
# count the flagged rows, then keep only the unflagged ones.
duplicate_mask = df.duplicated()
duplicates_before = duplicate_mask.sum()
df = df[~duplicate_mask]

print(f"Duplicates removed: {duplicates_before}")
print(f"New shape: {df.shape}")

## 3. ✅ Cleaning Summary

In [None]:
# Final summary
# Gather the before/after figures first, then report them in one place.
shape_before = df_original.shape
shape_after = df.shape
rows_removed = len(df_original) - len(df)
missing_remaining = df.isnull().sum().sum()

print("=== CLEANING SUMMARY ===")
print(f"Original shape: {shape_before}")
print(f"Final shape: {shape_after}")
print(f"Rows removed: {rows_removed}")
print(f"Missing values remaining: {missing_remaining}")

# Save cleaned data (written to the current working directory, without the index column)
df.to_csv('cleaned_dataset.csv', index=False)
print("\nCleaned dataset saved as 'cleaned_dataset.csv'")