    Task: Complete Pipeline for a Dataset
1. Objective: Build a complex pipeline with multiple transformations.
2. Steps:
    - Load a sample dataset.
    - Define a transformation pipeline with both imputation and scaling.

In [2]:
# Write your code from here
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import NotFittedError
import unittest

# Function to validate data
def validate_data(df):
    """Check that a DataFrame is usable by the impute-and-scale pipeline.

    Args:
        df: pandas DataFrame to check.

    Returns:
        None. The function only raises on invalid input.

    Raises:
        ValueError: if ``df.empty`` is true (no rows or no columns).
        TypeError: if any column has a non-numeric dtype. Boolean columns
            are rejected as well; timedelta columns count as non-numeric.
    """
    if df.empty:
        raise ValueError("Dataset is empty.")

    def _is_numeric(dtype):
        # pandas' dtype helpers understand extension dtypes (e.g. nullable
        # Int64/Float64), which np.issubdtype cannot interpret. Bool is
        # excluded explicitly because is_numeric_dtype reports it as numeric.
        return (
            pd.api.types.is_numeric_dtype(dtype)
            and not pd.api.types.is_bool_dtype(dtype)
        )

    if not all(_is_numeric(dtype) for dtype in df.dtypes):
        raise TypeError("All columns must be numeric for this pipeline.")

# Sample data: two numeric columns, each with missing values for the imputer.
data = {
    'Age': [25, np.nan, 30, 45, np.nan],
    'Income': [50000, 54000, np.nan, 75000, 70000]
}
df = pd.DataFrame(data)

# Build the pipeline before the try block so the name `pipeline` is always
# bound, even when validation of the data fails; code further down reads it
# as a module-level name.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # replace NaN with the column mean
    ('scaler', StandardScaler())                  # standardize each column
])

# Validate, then fit and transform. Validation problems are reported
# instead of aborting the cell.
try:
    validate_data(df)

    processed_data = pipeline.fit_transform(df)
    # fit_transform returns a bare array here, so restore the column labels
    # and the original row index.
    processed_df = pd.DataFrame(processed_data, columns=df.columns, index=df.index)
    print("✅ Cleaned and Scaled Data:\n", processed_df)

except (ValueError, TypeError) as e:
    print("⚠️ Error:", e)

# ========== Unit Tests ==========

class TestPipeline(unittest.TestCase):
    """Unit tests for `validate_data` and the impute-then-scale pipeline."""

    def setUp(self):
        """Create fresh input frames and a fresh, unfitted pipeline per test."""
        self.valid_df = pd.DataFrame({
            'Feature1': [1, 2, np.nan],
            'Feature2': [4, 5, 6]
        })
        self.empty_df = pd.DataFrame()
        self.invalid_df = pd.DataFrame({
            'Name': ['Alice', 'Bob', 'Charlie'],
            'Age': [25, 30, 35]
        })
        # A private pipeline keeps the test from refitting the module-level
        # `pipeline`, which would overwrite the state fitted on the sample data.
        self.pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ])

    def test_valid_pipeline(self):
        """A numeric frame with a NaN is imputed and standardized."""
        validate_data(self.valid_df)
        result = self.pipeline.fit_transform(self.valid_df)
        self.assertEqual(result.shape, (3, 2))
        # Feature1 becomes [1, 2, 1.5] after mean imputation; standardizing
        # both columns (population std) gives +/- sqrt(1.5) and 0.
        z = np.sqrt(1.5)
        expected = np.array([
            [-z, -z],
            [z, 0.0],
            [0.0, z],
        ])
        np.testing.assert_allclose(result, expected, atol=1e-9)

    def test_empty_dataframe(self):
        """An empty frame is rejected with ValueError."""
        with self.assertRaises(ValueError):
            validate_data(self.empty_df)

    def test_non_numeric_data(self):
        """A frame containing a string column is rejected with TypeError."""
        with self.assertRaises(TypeError):
            validate_data(self.invalid_df)

# Collect every test method of TestPipeline, then run the suite in-process.
# The run call stays the last expression so its result object is still
# what the cell displays.
pipeline_suite = unittest.TestLoader().loadTestsFromTestCase(TestPipeline)
unittest.TextTestRunner().run(pipeline_suite)


...
----------------------------------------------------------------------
Ran 3 tests in 0.008s

OK


✅ Cleaned and Scaled Data:
         Age    Income
0 -1.265924 -1.304743
1  0.000000 -0.878704
2 -0.506370  0.000000
3  1.772294  1.357998
4  0.000000  0.825450


<unittest.runner.TextTestResult run=3 errors=0 failures=0>

In [None]:
# Task: Imputation Function








# Scaling Function









# Combined Transformation Function







