##  The 5-Step Cleaning Framework

    Inspect → Understand what's broken
    Decide → Choose cleaning strategy (drop, fill, transform)
    Apply → Execute transformations
    Validate → Verify results with assertions
    Document → Log decisions for audit trails

In [None]:
import pandas as pd
import numpy as np

In [None]:


# Deliberately messy sample orders. The columns contain: a missing and a
# repeated order_id, mixed-case / malformed / empty emails, two date
# separators plus an 'N/A' placeholder, a negative and an implausibly large
# amount, and inconsistently cased or missing statuses.
data = {
    'order_id': [
        'ORD001', 'ORD002', 'ORD003',
        'ORD002', 'ORD004', None,
    ],
    'customer_email': [
        'john@email.com', 'JANE@EMAIL.COM', 'bob@email.com',
        'JANE@EMAIL.COM', 'invalid-email', '',
    ],
    'order_date': [
        '2025-01-15', '2025/01/16', '2025-01-17',
        '2025/01/16', 'N/A', '2025-01-20',
    ],
    'amount': [100.50, 200, -50, 200, 999999, 150.75],
    'status': [
        'completed', 'Completed', 'pending',
        'Completed', 'SHIPPED', None,
    ],
}

# Column-oriented dict -> one DataFrame column per key.
df = pd.DataFrame(data)

# Show the untouched frame and its dimensions as the "before" reference.
print("ORIGINAL DATA:", df, sep="\n")
print(f"\nShape: {df.shape}")

## Handling Missing Values

In [None]:
# Step 1 - inspect: count real nulls, then empty strings (isnull() does not
# treat '' as missing, so they have to be counted separately).
print("missing values")
print(df.isnull().sum())
print("\nEmpty strings (not detected as null):")
print((df == '').sum())

# Step 2 - decide a strategy per column
#   order_id       -> critical key: drop the row when it is missing
#   customer_email -> fill with a placeholder so the order stays trackable
#   status         -> fill with 'unknown'

# Step 3 - apply the strategies
# Convert empty strings to NaN so a single null-handling path covers both.
# Rebinding `df` (instead of inplace=True) leaves later cells seeing the same
# NaN-normalised frame without mutating the object in place.
df = df.replace('', np.nan)

# dropna removes the ROWS that lack an order_id. fillna returns a new object,
# so its result must be kept; calling it without assigning the result leaves
# the nulls in place.
df_cleaned = (
    df.dropna(subset=['order_id'])
    .fillna({'customer_email': 'no-email@unknown.com', 'status': 'unknown'})
)

# Step 4 - validate: none of the handled columns may still contain nulls.
assert df_cleaned[['order_id', 'customer_email', 'status']].notna().all().all()

print("\nAfter handling nulls:")
print(df_cleaned)


In [None]:
# Report duplicates two ways: rows identical in every column, and every row
# whose order_id occurs more than once (keep=False marks all occurrences).
fully_duplicated_count = df_cleaned.duplicated().sum()
print(f"\nDuplicate rows: {fully_duplicated_count}")

shares_order_id = df_cleaned.duplicated(subset=['order_id'], keep=False)
print("\nDuplicate order_ids:")
print(df_cleaned[shares_order_id])

# Keep only the first row seen for each order_id.
df_cleaned = df_cleaned.drop_duplicates(subset=['order_id'], keep='first')

rows_and_columns = df_cleaned.shape
print(f"\nShape after deduplication: {rows_and_columns}")

In [None]:
# Bare last-line expression: the notebook renders the current df_cleaned frame
# as this cell's output.
df_cleaned

## Standardizing Data Types & Formats

In [None]:
# Fix dates (handle multiple formats).
# The column mixes 'YYYY-MM-DD' and 'YYYY/MM/DD'. pandas >= 2.0 infers a single
# format from the first value, so with errors='coerce' the other style would
# silently become NaT and a valid order would be dropped below. Unifying the
# separator first lets both styles parse. astype(str) keeps the cell safe to
# re-run after the column has already been converted to datetime.
normalized_dates = (
    df_cleaned['order_date'].astype(str).str.replace('/', '-', regex=False)
)
parsed_dates = pd.to_datetime(
    normalized_dates,
    errors='coerce',  # unparseable placeholders such as 'N/A' become NaT
)
# assign() builds a new frame rather than writing into a possible slice of an
# earlier one, which avoids SettingWithCopyWarning.
df_cleaned = df_cleaned.assign(order_date=parsed_dates)

# Drop rows with invalid dates
df_cleaned = df_cleaned.dropna(subset=['order_date'])

# Standardize text (lowercase, trim whitespace)
df_cleaned = df_cleaned.assign(
    customer_email=df_cleaned['customer_email'].str.lower().str.strip(),
    status=df_cleaned['status'].str.lower().str.strip(),
)


print("\nAfter type conversion:")
print(df_cleaned.dtypes)
print(df_cleaned)

In [None]:
# Outlier handling, first look: summary statistics (count, mean, std,
# min/max, quartiles) for the amount column.
amount_summary = df_cleaned['amount'].describe()
print("\nAmount statistics:", amount_summary, sep="\n")

In [None]:
# Flag suspicious values: a row is marked invalid when its amount is negative
# or above 10000 (suspiciously high); every other row stays valid.
order_amounts = df_cleaned['amount']
suspicious_mask = (order_amounts < 0) | (order_amounts > 10000)
df_cleaned['is_valid'] = ~suspicious_mask


# Show only the rows that failed the check.
print("\nInvalid rows:")
print(df_cleaned.loc[~df_cleaned['is_valid']])


# Two ways to act on the flag, left disabled so the reader can pick one:

# Option 1: Remove invalid rows
# df_cleaned = df_cleaned[df_cleaned['is_valid']].drop(columns=['is_valid'])

# Option 2: Cap values (alternative approach)
# df_cleaned['amount'] = df_cleaned['amount'].clip(lower=0, upper=10000)
