    Task: Complete Pipeline for a Dataset
1. Objective: Build a complex pipeline with multiple transformations.
2. Steps:
    - Load a sample dataset.
    - Define a transformation pipeline with both imputation and scaling.

In [None]:
# Write your code from here


...
----------------------------------------------------------------------
Ran 3 tests in 0.008s

OK


✅ Cleaned and Scaled Data:
         Age    Income
0 -1.265924 -1.304743
1  0.000000 -0.878704
2 -0.506370  0.000000
3  1.772294  1.357998
4  0.000000  0.825450


<unittest.runner.TextTestResult run=3 errors=0 failures=0>

In [3]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import NotFittedError
import unittest

# -----------------------
# Function: Data Validator
# -----------------------
def validate_data(df):
    """
    Validates input DataFrame:
    - Ensures it's not empty.
    - Ensures all columns are numeric.
    - Ensures no negative or suspicious values for age/income.
    """
    if df.empty:
        raise ValueError("Dataset is empty.")
    if not all(df.dtypes.apply(lambda dt: np.issubdtype(dt, np.number))):
        raise TypeError("All columns must be numeric.")
    if (df < 0).any().any():
        raise ValueError("Dataset contains negative values, which may be invalid.")

# ---------------------
# Step 1: Load Dataset
# ---------------------
data = {
    'Age': [25, np.nan, 30, 45, np.nan],
    'Income': [50000, 54000, np.nan, 75000, 70000]
}
df = pd.DataFrame(data)

# ---------------------
# Step 2: Validate Data
# ---------------------
try:
    validate_data(df)

    # ---------------------
    # Step 3: Define Pipeline
    # ---------------------
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    # ---------------------
    # Step 4: Transform Data
    # ---------------------
    processed_data = pipeline.fit_transform(df)
    processed_df = pd.DataFrame(processed_data, columns=df.columns)
    print("✅ Cleaned and Scaled Data:\n", processed_df)

except (ValueError, TypeError) as e:
    print("⚠️ Error:", e)

# ---------------------
# Step 5: Unit Tests
# ---------------------
class TestPipeline(unittest.TestCase):

    def setUp(self):
        # Valid DataFrame
        self.valid_df = pd.DataFrame({
            'Feature1': [1, 2, np.nan],
            'Feature2': [4, 5, 6]
        })
        # Empty DataFrame
        self.empty_df = pd.DataFrame()
        # Non-numeric DataFrame
        self.invalid_df = pd.DataFrame({
            'Name': ['Alice', 'Bob', 'Charlie'],
            'Age': [25, 30, 35]
        })
        # DataFrame with negative values
        self.negative_df = pd.DataFrame({
            'Age': [20, -5, 30],
            'Income': [40000, 50000, 60000]
        })

    def test_valid_pipeline(self):
        validate_data(self.valid_df)
        result = pipeline.fit_transform(self.valid_df)
        self.assertEqual(result.shape, (3, 2))
        # Assert specific transformation result
        expected_mean = np.nanmean(self.valid_df['Feature1'])
        expected_first_row = [(1 - expected_mean) / np.nanstd(self.valid_df['Feature1']), 
                              (4 - 5) / np.std([4, 5, 6])]
        np.testing.assert_almost_equal(result[0], expected_first_row, decimal=5)

    def test_empty_dataframe(self):
        with self.assertRaises(ValueError):
            validate_data(self.empty_df)

    def test_non_numeric_data(self):
        with self.assertRaises(TypeError):
            validate_data(self.invalid_df)

    def test_negative_values(self):
        with self.assertRaises(ValueError):
            validate_data(self.negative_df)

unittest.TextTestRunner().run(unittest.TestLoader().loadTestsFromTestCase(TestPipeline))


...F
FAIL: test_valid_pipeline (__main__.TestPipeline)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_18783/3113358262.py", line 91, in test_valid_pipeline
    np.testing.assert_almost_equal(result[0], expected_first_row, decimal=5)
  File "/usr/local/lib/python3.10/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/home/vscode/.local/lib/python3.10/site-packages/numpy/testing/_private/utils.py", line 521, in assert_almost_equal
    return assert_array_almost_equal(actual, desired, decimal, err_msg)
  File "/usr/local/lib/python3.10/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/home/vscode/.local/lib/python3.10/site-packages/numpy/testing/_private/utils.py", line 1034, in assert_array_almost_equal
    assert_array_compare(compare, x, y, err_msg=err_msg, verbose=verbose,
  File "/usr/local/lib/python3.10/contextlib.py", line 79, in inner
    return func(*a

✅ Cleaned and Scaled Data:
         Age    Income
0 -1.265924 -1.304743
1  0.000000 -0.878704
2 -0.506370  0.000000
3  1.772294  1.357998
4  0.000000  0.825450


<unittest.runner.TextTestResult run=4 errors=0 failures=1>

In [None]:
# Task: Imputation Function








# Scaling Function









# Combined Transformation Function







