# In [1]:
#Challenge 1: E-commerce Dataset
# Import required libraries
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# Seed NumPy's global random generator so the np.random draws below repeat
# from run to run. Note: order dates are anchored to datetime.now(), so the
# generated dates still shift with the day the script is executed.
np.random.seed(42)

# Function to generate random dates within a range
def random_dates(start_date, end_date, n):
    """
    Draw n random datetimes at whole-day offsets from start_date.

    Offsets are sampled uniformly from 0 up to (but not including) the
    number of whole days between start_date and end_date, using NumPy's
    global random state (so np.random.seed controls the result).

    Parameters:
    start_date (datetime): Earliest value that can be returned
    end_date (datetime): Upper bound of the range
    n (int): Number of dates to generate

    Returns:
    list: n datetime objects, each start_date plus a whole number of days

    Raises:
    ValueError: If end_date is earlier than start_date
    """
    date_range = (end_date - start_date).days

    if date_range < 0:
        raise ValueError("end_date must not be earlier than start_date")

    if date_range == 0:
        # The span is shorter than one whole day, so the only possible offset
        # is 0. np.random.randint(0, 0, n) would raise "low >= high" here.
        return [start_date] * n

    random_days = np.random.randint(0, date_range, n)
    # Convert numpy.int64 to Python int for timedelta compatibility
    return [start_date + timedelta(days=int(day)) for day in random_days]

# ---------------------------------------------------------------------------
# Build a deliberately messy e-commerce orders dataset.
# NOTE: the order of the np.random calls below determines the generated data
# for a given seed; keep the sequence intact when editing.
# ---------------------------------------------------------------------------
n_orders = 1000

# Customer and product identifiers (IDs 1..200 and 1..50 respectively)
customer_ids = np.random.randint(1, 201, size=n_orders)
product_ids = np.random.randint(1, 51, size=n_orders)

# Order dates spread over the 730 days leading up to "now"
end_date = datetime.now()
start_date = end_date - timedelta(days=730)
order_dates = random_dates(start_date, end_date, n_orders)


def _messy_price(position, amount):
    """Render a price as '$x.xx', a raw numeric string, or NaN by row position."""
    if position % 20 != 0:
        return f"${amount:.2f}"
    if position % 40 == 0:
        # Every 40th row: plain stringified float without the dollar sign
        return str(amount)
    # Remaining multiples of 20: missing value
    return np.nan


# Prices: mostly '$'-formatted strings, with some raw strings and gaps
base_prices = np.random.uniform(10, 500, n_orders)
prices = [_messy_price(position, amount) for position, amount in enumerate(base_prices)]

# Quantities: valid 1..10 values, then inject negatives and zeros as errors.
quantities = np.random.randint(1, 11, n_orders)
# The replacement values are drawn BEFORE the target positions, matching the
# right-hand-side-first evaluation of a single assignment statement.
negative_values = np.random.randint(-5, 0, 20)
negative_positions = np.random.choice(n_orders, 20)
quantities[negative_positions] = negative_values
zero_positions = np.random.choice(n_orders, 10)
quantities[zero_positions] = 0

# Shipping status with inconsistent capitalisation
status_options = [
    'delivered', 'DELIVERED', 'Delivered',
    'in transit', 'IN TRANSIT',
    'pending', 'Pending', 'PENDING',
    'cancelled', 'CANCELLED',
]
shipping_status = np.random.choice(status_options, size=n_orders)

# Email addresses: mostly valid, some with a broken domain, some missing
domains = ['gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com']


def _messy_email(position, cid):
    """Return a valid address, an address with a bad domain, or None by row position."""
    if position % 15 != 0:
        # Only this branch consumes a random draw
        return f"customer{cid}@{np.random.choice(domains)}"
    if position % 30 == 0:
        return f"customer{cid}@invalid"
    return None


emails = [_messy_email(position, cid) for position, cid in enumerate(customer_ids)]

# Assemble the messy dataset
order_columns = {
    'order_id': range(1, n_orders + 1),
    'customer_id': customer_ids,
    'product_id': product_ids,
    'order_date': order_dates,
    'price': prices,
    'quantity': quantities,
    'shipping_status': shipping_status,
    'customer_email': emails,
}
messy_orders = pd.DataFrame(order_columns)

# Append 50 distinct rows a second time to simulate duplicate orders
duplicate_indices = np.random.choice(n_orders, size=50, replace=False)
duplicates = messy_orders.iloc[duplicate_indices].copy()
messy_orders = pd.concat([messy_orders, duplicates], ignore_index=True)

# In [None]:
class DataCleaningPracticePipeline:
    """
    Practice pipeline for data cleaning and transformation
    """
    
    def __init__(self, config=None):
        """
        Initialize the pipeline with configuration parameters
        
        Parameters:
        config (dict): Configuration dictionary with cleaning parameters
        """
        self.config = config or self._default_config()
        self.cleaning_report = {}

    def _default_config(self):
        """Default configuration for the cleaning pipeline"""
        return {
            'handle_missing': True,
            'missing_strategy': 'median',  # 'mean', 'median', 'mode', 'drop', 'knn'
            'handle_duplicates': True,
            'duplicate_subset': None,  # Columns to check for duplicates
            'handle_outliers': True,
            'outlier_method': 'iqr',  # 'iqr', 'zscore', 'cap'
            'standardize_text': True,
            'validate_data_types': True,
            'create_features': False,
            'scaling_method': None,  # 'standard', 'minmax', 'robust'
            'verbose': True
        }  
    
    def clean_data(self, df):
        """
        Main method to clean the DataFrame
        
        Parameters:
        df (pd.DataFrame): Input DataFrame to clean
        
        Returns:
        pd.DataFrame: Cleaned DataFrame
        dict: Cleaning report with statistics
        """
        if self.config['verbose']:
            print("🧹 STARTING DATA CLEANING PIPELINE")
            print("=" * 50)
            print(f"Original dataset shape: {df.shape}")

        # Create a copy to avoid modifying original data
        df_clean = df.copy()

        # Initialize cleaning report
        self.cleaning_report = {
            'original_shape': df.shape,
            'final_shape': None,
            'rows_removed': 0,
            'columns_added': 0,
            'steps': []
        }

        # 1. HANDLE MISSING VALUES