### Handling Missing Values - Imputation within ML Pipelines
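**Description**: Implement a scikit-learn pipeline that imputes missing values (mean for numeric columns, most frequent value for categorical columns), one-hot encodes the categorical columns, and trains a random forest classifier.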

In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

def validate_input_data(df, required_columns, numeric_cols, categorical_cols):
    """Check that ``df`` has the expected columns and column dtypes.

    Args:
        df: DataFrame to validate.
        required_columns: Column names that must be present in ``df``.
        numeric_cols: Column names whose dtype must be numeric.
        categorical_cols: Column names whose dtype must be object, pandas
            string (``pd.StringDtype``) or categorical.

    Raises:
        ValueError: If any name in ``required_columns`` is absent from ``df``.
        TypeError: If a column in ``numeric_cols`` is not numeric, or a column
            in ``categorical_cols`` is not object/string/categorical.
    """
    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    # Check numeric columns are numeric type
    for col in numeric_cols:
        if not pd.api.types.is_numeric_dtype(df[col]):
            raise TypeError(f"Column '{col}' should be numeric")

    # Check categorical columns are of object, string or categorical type
    for col in categorical_cols:
        column = df[col]
        # isinstance checks replace the deprecated
        # pd.api.types.is_categorical_dtype and also accept the dedicated
        # pandas string dtype, which is_object_dtype alone rejects.
        dtype = getattr(column, "dtype", None)
        is_categorical_like = pd.api.types.is_object_dtype(column) or isinstance(
            dtype, (pd.CategoricalDtype, pd.StringDtype)
        )
        if not is_categorical_like:
            raise TypeError(f"Column '{col}' should be categorical")

def build_pipeline(numeric_features, categorical_features):
    """Assemble the preprocessing + classification pipeline.

    Args:
        numeric_features: Columns routed to mean imputation.
        categorical_features: Columns routed to most-frequent imputation
            followed by one-hot encoding.

    Returns:
        An unfitted ``Pipeline`` with a ``'preprocessor'`` step (a
        ``ColumnTransformer``) and a ``'classifier'`` step (a
        ``RandomForestClassifier`` with ``random_state=42``).
    """
    # Categorical branch: fill gaps first, then encode. The encoder is
    # configured with handle_unknown='ignore'.
    impute_then_encode = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    column_steps = [
        ('num', SimpleImputer(strategy='mean'), numeric_features),
        ('cat', impute_then_encode, categorical_features),
    ]

    return Pipeline(steps=[
        ('preprocessor', ColumnTransformer(transformers=column_steps)),
        ('classifier', RandomForestClassifier(random_state=42)),
    ])

def run_pipeline(df):
    """Validate ``df``, train the pipeline and print a classification report.

    The frame must contain numeric ``Age`` and ``Income`` columns, a
    categorical ``Gender`` column and the ``Purchased`` target. The data is
    split 70/30 (``random_state=42``), the pipeline is fitted on the training
    part and evaluated on the held-out part.

    Args:
        df: Input DataFrame holding the features and the target column.

    Returns:
        None. The classification report is printed to stdout.

    Raises:
        ValueError, TypeError: Propagated from ``validate_input_data`` when
            the frame does not match the expected schema.
    """
    target = 'Purchased'
    numeric_cols = ['Age', 'Income']
    categorical_cols = ['Gender']
    # Same order as the feature lists followed by the target.
    required_columns = numeric_cols + categorical_cols + [target]

    validate_input_data(df, required_columns, numeric_cols, categorical_cols)

    features = df.drop(columns=[target])
    labels = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )

    model = build_pipeline(numeric_cols, categorical_cols)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print(classification_report(y_test, predictions))


# Example dataset: every feature column has at least one missing entry so the
# imputers have something to fill.
data = dict(
    Age=[25, 30, None, 22, 28, None, 35],
    Income=[50000, None, 62000, 58000, None, 54000, 61000],
    Gender=['Male', 'Female', 'Female', None, 'Male', 'Female', 'Male'],
    Purchased=[0, 1, 0, 1, 0, 1, 0],
)

df = pd.DataFrame(data)

# Demo run: report any failure as a message instead of a traceback.
try:
    run_pipeline(df)
except Exception as exc:
    print(f"Error: {exc}")


# Unit tests
def test_validate_input_data():
    """Exercise the accept, missing-column and wrong-dtype paths of validation."""
    import pytest

    required = ['Age', 'Income', 'Gender', 'Purchased']
    numeric = ['Age', 'Income']
    categorical = ['Gender']

    # A well-formed frame must pass without raising.
    valid_frame = pd.DataFrame({
        'Age': [1, 2],
        'Income': [3, 4],
        'Gender': ['M', 'F'],
        'Purchased': [0, 1],
    })
    validate_input_data(valid_frame, required, numeric, categorical)

    # 'Gender' is absent -> ValueError.
    no_gender = pd.DataFrame({'Age': [1], 'Income': [2], 'Purchased': [0]})
    with pytest.raises(ValueError):
        validate_input_data(no_gender, required, numeric, categorical)

    # 'Age' holds text instead of numbers -> TypeError.
    text_age = pd.DataFrame({'Age': ['a'], 'Income': [2], 'Gender': ['M'], 'Purchased': [0]})
    with pytest.raises(TypeError):
        validate_input_data(text_age, required, numeric, categorical)


def test_pipeline_runs():
    """Smoke test: the full pipeline runs on a tiny frame with missing values."""
    sample = pd.DataFrame(dict(
        Age=[25, None, 35],
        Income=[50000, 60000, None],
        Gender=['Male', 'Female', None],
        Purchased=[1, 0, 1],
    ))
    # Passing means no exception was raised; the printed report is not checked.
    run_pipeline(sample)


if __name__ == "__main__":
    # pytest is not used directly here; importing it first makes a missing
    # install fail before any test starts.
    import pytest

    # Run the tests in order; the first failure aborts with its exception.
    for test_case in (test_validate_input_data, test_pipeline_runs):
        test_case()
    print("\nAll tests passed!")