    Task: Complete Pipeline for a Dataset
1. Objective: Build a preprocessing pipeline that chains multiple transformations (imputation followed by scaling).
2. Steps:
    - Load a sample dataset (Iris) and introduce missing values so that the imputation step has something to fill.
    - Define a transformation pipeline with both imputation and scaling.

In [8]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Step 1: Read the Iris CSV (it has no header row, so column names are supplied here)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
df = pd.read_csv(url, names=columns, header=None)

# Blank out roughly 10% of the feature cells so the imputer has work to do.
# The seed is fixed so the same cells are blanked on every run.
np.random.seed(42)
mask = np.random.rand(len(df), len(columns) - 1) < 0.1  # one draw per feature cell
df.loc[:, df.columns != 'class'] = df.drop(columns=['class']).mask(mask)

# Feature matrix: every column except the label
X = df.drop(columns=['class'])

# Step 2: Chain mean imputation and standardization into one estimator
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # replace NaNs with the column mean
    ('scaler', StandardScaler()),                 # standardize each feature
])

# Step 3: Fit on X and transform it in a single call
X_transformed = pipeline.fit_transform(X)

# Show the first five rows before and after the pipeline
print("Original data with missing values (first 5 rows):", X.head(), sep="\n")
print("\nTransformed data after imputation and scaling (first 5 rows):", X_transformed[:5], sep="\n")


Original data with missing values (first 5 rows):
   sepal_length  sepal_width  petal_length  petal_width
0           5.1          3.5           1.4          0.2
1           4.9          3.0           NaN          0.2
2           4.7          3.2           NaN          0.2
3           4.6          3.1           1.5          0.2
4           5.0          3.6           1.4          0.2

Transformed data after imputation and scaling (first 5 rows):
[[-1.02472346  1.08451012 -1.43456995 -1.32386852]
 [-1.29195113 -0.10968944  0.         -1.32386852]
 [-1.55917881  0.36799038  0.         -1.32386852]
 [-1.69279265  0.12915047 -1.37322779 -1.32386852]
 [-1.1583373   1.32335003 -1.43456995 -1.32386852]]


In [11]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


# Imputation Function
def impute_data(X, strategy='mean', fill_value=None):
    """
    Imputes missing values in X using SimpleImputer.

    A new SimpleImputer is created and fitted on X on every call, so the
    fill statistics are learned from the same data that is transformed.

    Parameters:
    - X: pd.DataFrame or np.ndarray with missing values
    - strategy: str, imputation strategy ('mean', 'median', 'most_frequent', 'constant')
    - fill_value: replacement value forwarded to SimpleImputer; it is meant for
      strategy='constant'. The default (None) leaves SimpleImputer's own
      default in effect, so existing calls behave as before.

    Returns:
    - np.ndarray: imputed data
    """
    # fill_value is passed through unchanged; SimpleImputer decides how to use it.
    imputer = SimpleImputer(strategy=strategy, fill_value=fill_value)
    return imputer.fit_transform(X)


# Scaling Function
def scale_data(X):
    """
    Standardizes X with a freshly created StandardScaler.

    The scaler is fitted on X and applied to that same X in one step.

    Parameters:
    - X: np.ndarray or pd.DataFrame with numerical features

    Returns:
    - np.ndarray: scaled data
    """
    return StandardScaler().fit_transform(X)


# Combined Transformation Function
def impute_and_scale(X, impute_strategy='mean'):
    """
    Fills missing values in X and then scales the result.

    Delegates to impute_data first and passes its output to scale_data.

    Parameters:
    - X: pd.DataFrame or np.ndarray with numerical features and missing values
    - impute_strategy: str, strategy handed to impute_data

    Returns:
    - np.ndarray: transformed data after imputation and scaling
    """
    return scale_data(impute_data(X, strategy=impute_strategy))
