# 🧽 Advanced Data Cleaning with Python

In [None]:
import pandas as pd
import numpy as np
import re

## 1. Sample Data

In [None]:
# Deliberately messy sample records used by the cleaning steps below:
# names with stray whitespace and inconsistent casing, malformed / empty /
# missing emails, dates written in several notations, out-of-range ages,
# and gaps in income.
data = pd.DataFrame(dict(
    name=[' Alice ', 'bob', 'BOB', 'Charlie', 'alice', None],
    email=[
        'ALICE@EXAMPLE.com',
        'bob@example.COM',
        'not-an-email',
        None,
        'alice@example.com',
        '',
    ],
    join_date=[
        '2021/01/05',
        '05-01-2021',
        'Jan 5, 2021',
        '2021.01.05',
        None,
        '2021-01-05',
    ],
    age=[25, 27, 27, 30, -1, 300],
    income=[50000, None, 48000, 52000, 51000, None],
))
data

## 2. Normalize Categorical Text

In [None]:
# Normalise names in two steps: trim surrounding whitespace, then lowercase,
# so that variants such as ' Alice ' and 'alice' end up identical.
# Missing names stay missing.
trimmed_names = data['name'].str.strip()
data['name_clean'] = trimmed_names.str.lower()
data

## 3. Validate Email Format with Regex

In [None]:
# Flag rows whose email has the shape local-part@domain.tld.
# - str.fullmatch requires the pattern to cover the entire string, so no
#   ^ / $ / \b anchors are needed and a trailing newline cannot slip through
#   the way it can with `$`.
# - The TLD class is [A-Za-z]: a '|' inside a character class is a literal
#   pipe, not alternation, so it must not appear there.
# - na=False makes missing emails (None/NaN) count as invalid instead of NaN.
EMAIL_PATTERN = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'
data['valid_email'] = data['email'].str.fullmatch(EMAIL_PATTERN, na=False)
data[['email', 'valid_email']]

## 4. Convert to Datetime

In [None]:
# Parse every value on its own so that each notation in the column is
# recognised. Handing the whole Series to pd.to_datetime lets pandas (2.0+)
# infer a single format from the first non-null value and, with
# errors='coerce', silently turn every other notation into NaT.
# (On pandas >= 2.0, pd.to_datetime(..., format='mixed') is the built-in
# way to request per-element inference.)
# errors='coerce' still maps unparseable strings to NaT; missing values end
# up as NaT after the final conversion.
# NOTE: ambiguous strings such as '05-01-2021' are read month-first by
# default; pass dayfirst=True if the source data is day-first.
parsed_dates = [pd.to_datetime(raw, errors='coerce') for raw in data['join_date']]
# The outer to_datetime guarantees a datetime64 column even when every
# parsed value is missing.
data['join_date_parsed'] = pd.to_datetime(pd.Series(parsed_dates, index=data.index))
data[['join_date', 'join_date_parsed']]

## 5. Replace Outliers with NaN (Simple Rule)

In [None]:
# Treat ages outside a plausible human range as missing; the rows are kept.
# Series.between is inclusive on both ends, so 0 and 120 themselves survive.
# Series.where keeps values where the mask is True and writes NaN elsewhere;
# ages that are already missing fail the mask and stay missing.
MIN_AGE, MAX_AGE = 0, 120
plausible_age = data['age'].between(MIN_AGE, MAX_AGE)
data['age_clean'] = data['age'].where(plausible_age)
data[['age', 'age_clean']]

## 6. Handle Duplicates Using a Normalized Name Key

In [None]:
# Build a comparison key (lowercased, then trimmed) so that spelling variants
# of the same name collide, then keep only the first row for each key.
dedupe_key = 'name_lower'
lowered_names = data['name'].str.lower()
data[dedupe_key] = lowered_names.str.strip()
data_no_dups = data.drop_duplicates(subset=[dedupe_key], keep='first')
data_no_dups[['name', dedupe_key]]

## 7. Impute Missing Income

In [None]:
# Mean imputation: fill each missing income with the average of the observed
# incomes (Series.mean skips missing values by default).
income = data['income']
mean_income = income.mean()
data['income_filled'] = income.fillna(value=mean_income)
data[['income', 'income_filled']]