    Task: Complete Pipeline for a Dataset
1. Objective: Build a complex pipeline with multiple transformations.
2. Steps:
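    - Load a sample dataset and artificially introduce missing values.
    - Define a transformation pipeline with both imputation and scaling.
    - Fit the pipeline, transform the data, and confirm that no missing values remain.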

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing
import numpy as np

# Step 1: Load sample dataset
# housing.frame carries the feature columns plus the MedHouseVal target, so the
# target is masked, imputed and scaled together with the features below.
housing = fetch_california_housing(as_frame=True)
df = housing.frame.copy()

# Step 2: Introduce missing values artificially (~10%)
# Seeding the global NumPy RNG keeps the blanked cells identical across runs.
np.random.seed(42)
mask = np.random.rand(*df.shape) < 0.1
df[mask] = np.nan

print("Missing values per column BEFORE imputation:")
print(df.isnull().sum())

# Step 3: Define pipeline with imputation and scaling
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # fill missing with the column mean
    ('scaler', StandardScaler()),                  # standardize features
])

# Step 4: Fit pipeline and transform data
# The pipeline output is an array, so it gets its own name instead of binding
# df_transformed to an array first and to a DataFrame afterwards.
transformed_values = pipeline.fit_transform(df)

# Rebuild a DataFrame, restoring both the column labels and the original row index
df_transformed = pd.DataFrame(transformed_values, columns=df.columns, index=df.index)

print("\nMissing values per column AFTER pipeline:")
print(df_transformed.isnull().sum())

print("\nFirst 5 rows of processed data:")
print(df_transformed.head())



Missing values per column BEFORE imputation:
MedInc         2052
HouseAge       2092
AveRooms       2010
AveBedrms      2105
Population     2089
AveOccup       2075
Latitude       2055
Longitude      2079
MedHouseVal    2074
dtype: int64

Missing values per column AFTER pipeline:
MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

First 5 rows of processed data:
     MedInc  HouseAge      AveRooms  AveBedrms  Population  AveOccup  \
0  2.476802  1.039054  6.441551e-01  -0.169777   -1.018855 -0.051443   
1  2.463568  0.000000  3.344082e-01  -0.292437    0.900240 -0.097003   
2  1.883065  1.960652  1.185600e+00  -0.052531   -0.858238 -0.026225   
3  0.985454  1.960652  3.687858e-16  -0.053445   -0.801007  0.000000   
4 -0.013689  0.000000  3.525599e-01  -0.034497   -0.794545 -0.089682   

       Latitude  Longitude  MedHouseVal  
0  3.506221e-15  -1.402253     2.251183  
1  

In [16]:
# Task: Imputation Function








# Scaling Function









# Combined Transformation Function









In [17]:
from sklearn.impute import SimpleImputer
import pandas as pd

def impute_data(df: pd.DataFrame, columns, strategy: str = 'mean') -> pd.DataFrame:
    """
    Impute missing values in specified columns using SimpleImputer.

    The input frame is not modified; the work is done on a copy.

    Parameters:
        df (pd.DataFrame): Input data
        columns (list or str): Columns to impute. A single column name may be
            passed as a plain string. An empty selection is a no-op.
        strategy (str): Imputation strategy (mean, median, most_frequent, constant)

    Returns:
        pd.DataFrame: Copy of df with the selected columns imputed
    """
    # A bare string would select a 1-D Series rather than a 2-D frame, so wrap
    # it to keep the selection two-dimensional for the imputer.
    if isinstance(columns, str):
        columns = [columns]

    df_copy = df.copy()

    # Nothing selected: there is nothing to fit, so hand back the untouched copy
    # instead of asking the imputer to fit a zero-column frame.
    if len(columns) == 0:
        return df_copy

    # NOTE(review): SimpleImputer may drop columns that contain no observed
    # values, which would make the assignment below fail - confirm against the
    # installed scikit-learn version if all-NaN columns are possible.
    imputer = SimpleImputer(strategy=strategy)
    df_copy[columns] = imputer.fit_transform(df_copy[columns])
    return df_copy


In [18]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

def scale_data(df: pd.DataFrame, columns) -> pd.DataFrame:
    """
    Scale specified columns using StandardScaler.

    The input frame is not modified; the work is done on a copy.

    Parameters:
        df (pd.DataFrame): Input data
        columns (list or str): Columns to scale. A single column name may be
            passed as a plain string. An empty selection is a no-op.

    Returns:
        pd.DataFrame: Copy of df with the selected columns scaled
    """
    # A bare string would select a 1-D Series rather than a 2-D frame, so wrap
    # it to keep the selection two-dimensional for the scaler.
    if isinstance(columns, str):
        columns = [columns]

    df_copy = df.copy()

    # Nothing selected: there is nothing to fit, so hand back the untouched copy
    # instead of asking the scaler to fit a zero-column frame.
    if len(columns) == 0:
        return df_copy

    scaler = StandardScaler()
    df_copy[columns] = scaler.fit_transform(df_copy[columns])
    return df_copy


In [19]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import pandas as pd

def transform_data(df: pd.DataFrame, columns, impute_strategy: str = 'mean') -> pd.DataFrame:
    """
    Pipeline to impute missing values and scale specified columns.

    Imputation runs first and scaling second, both inside one sklearn Pipeline.
    The input frame is not modified; the work is done on a copy.

    Parameters:
        df (pd.DataFrame): Input data
        columns (list or str): Columns to transform. A single column name may be
            passed as a plain string. An empty selection is a no-op.
        impute_strategy (str): Strategy for imputation

    Returns:
        pd.DataFrame: Transformed DataFrame with imputed and scaled columns
    """
    # A bare string would select a 1-D Series rather than a 2-D frame, so wrap
    # it to keep the selection two-dimensional for the pipeline.
    if isinstance(columns, str):
        columns = [columns]

    df_copy = df.copy()

    # Nothing selected: there is nothing to fit, so hand back the untouched copy
    # instead of asking the pipeline to fit a zero-column frame.
    if len(columns) == 0:
        return df_copy

    # NOTE(review): the imputer step may drop columns that contain no observed
    # values, which would make the assignment below fail - confirm against the
    # installed scikit-learn version if all-NaN columns are possible.
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy=impute_strategy)),
        ('scaler', StandardScaler())
    ])

    df_copy[columns] = pipeline.fit_transform(df_copy[columns])

    return df_copy


In [20]:
import pandas as pd

# Small demo frame: two numeric columns with gaps plus one categorical column
data = {
    'Age': [25, 30, None, 22, 40],
    'Income': [50000, None, 60000, 52000, 58000],
    'Gender': ['M', 'F', 'M', 'F', 'F'],
}
df = pd.DataFrame(data)

# Only these columns are handed to the helpers; Gender is left as it is
numeric_cols = ['Age', 'Income']

# Two-stage route, stage 1: fill the gaps
df_imputed = impute_data(df, columns=numeric_cols)

# Two-stage route, stage 2: standardize the already-imputed columns
df_scaled = scale_data(df_imputed, columns=numeric_cols)

# One-shot route: impute and scale the raw frame through the combined pipeline
df_transformed = transform_data(df, columns=numeric_cols)
