    Task: Complete Pipeline for a Dataset
1. Objective: Build a complex pipeline with multiple transformations.
2. Steps:
    - Load a sample dataset.
    - Define a transformation pipeline with both imputation and scaling.

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Step 1: Build a small sample frame; each column has exactly one missing entry (None)
data = {
    'Age': [25, None, 35, 40, 50],
    'Income': [50000, 60000, None, 80000, 90000],
    'Experience': [2, 5, 7, None, 10]
}
df = pd.DataFrame(data)
print("Original Dataset with Missing Values:", df, sep="\n")

# Step 2: Columns that go through the numeric pipeline
numeric_features = ['Age', 'Income', 'Experience']

# Step 3: Numeric pipeline -- impute first so the scaler never sees NaN
impute_step = ('imputer', SimpleImputer(strategy='mean'))   # Fill missing values
scale_step = ('scaler', StandardScaler())                   # Scale the features
numeric_transformer = Pipeline(steps=[impute_step, scale_step])

# Step 4: Route the numeric columns through that pipeline
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)]
)

# Step 5: Fit on the sample frame and transform it in one call
processed_data = preprocessor.fit_transform(df)

# Step 6: Re-attach the column labels so the result is easy to read
processed_df = pd.DataFrame(processed_data, columns=numeric_features)

print("\nTransformed Dataset (after Imputation + Scaling):", processed_df, sep="\n")


Original Dataset with Missing Values:
    Age   Income  Experience
0  25.0  50000.0         2.0
1   NaN  60000.0         5.0
2  35.0      NaN         7.0
3  40.0  80000.0         NaN
4  50.0  90000.0        10.0

Transformed Dataset (after Imputation + Scaling):
        Age    Income  Experience
0 -1.550434 -1.414214   -1.533930
1  0.000000 -0.707107   -0.383482
2 -0.310087  0.000000    0.383482
3  0.310087  0.707107    0.000000
4  1.550434  1.414214    1.533930


In [2]:
from sklearn.impute import SimpleImputer
import pandas as pd

def impute_data(df, strategy='mean'):
    """
    Impute missing values in a DataFrame using the specified strategy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose missing entries should be filled.
    strategy : str, default 'mean'
        Forwarded unchanged to ``SimpleImputer(strategy=...)``.

    Returns
    -------
    pandas.DataFrame
        A new frame carrying the same column labels and the same row
        index as ``df``.

    Notes
    -----
    NOTE(review): the scikit-learn documentation states that columns which
    are entirely missing at fit time are discarded for non-constant
    strategies; in that case the column labels below would no longer match
    the transformed data -- confirm inputs have at least one observed value
    per column.
    """
    imputer = SimpleImputer(strategy=strategy)
    imputed_data = imputer.fit_transform(df)
    # The transformed values come back without the caller's row labels (by
    # default), so re-attach both axes. Passing index=df.index keeps results
    # aligned with the input when it has a non-default index (e.g. after
    # filtering or a train/test split).
    return pd.DataFrame(imputed_data, columns=df.columns, index=df.index)


In [3]:
from sklearn.preprocessing import StandardScaler

def scale_data(df):
    """
    Scale numerical features using StandardScaler.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of numeric columns to standardize.

    Returns
    -------
    pandas.DataFrame
        A new frame carrying the same column labels and the same row
        index as ``df``, holding the scaler's output.
    """
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df)
    # Re-attach both column labels and the original row index; without
    # index=df.index the rows would be renumbered 0..n-1 and no longer line
    # up with the input frame when it has a non-default index.
    return pd.DataFrame(scaled_data, columns=df.columns, index=df.index)


In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

def preprocess_data(df, numeric_features, strategy='mean'):
    """
    Apply both imputation and scaling to numeric features.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; only the columns named in ``numeric_features`` are
        passed to the transformer.
    numeric_features : list of str
        Column labels to impute and scale. They are also used, in the same
        order, as the column labels of the returned frame.
    strategy : str, default 'mean'
        Imputation strategy forwarded to ``SimpleImputer``. The default
        reproduces the previous hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        Imputed-then-scaled values for ``numeric_features``, with the same
        row index as ``df``.
    """
    # Impute before scaling so the scaler is fitted on the filled-in values.
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy=strategy)),
        ('scaler', StandardScaler())
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features)
    ])

    processed_data = preprocessor.fit_transform(df)
    # Keep the caller's row index so the output stays aligned with ``df``
    # instead of being renumbered 0..n-1.
    return pd.DataFrame(processed_data, columns=numeric_features, index=df.index)


In [6]:
# Sample data: each column has exactly one missing entry (None)
data = {
    'Age': [25, None, 35, 40, 50],
    'Income': [50000, 60000, None, 80000, 90000],
    'Experience': [2, 5, 7, None, 10]
}
df = pd.DataFrame(data)

# Define columns to transform
numeric_columns = ['Age', 'Income', 'Experience']

# Step-by-step usage: impute, then scale the imputed frame ...
imputed_df = impute_data(df[numeric_columns])
scaled_df = scale_data(imputed_df)
# ... and the one-shot helper that does both on the raw frame
combined_df = preprocess_data(df, numeric_columns)

# Print every stage under its heading, in pipeline order
for heading, frame in (
    ("Original:\n", df),
    ("\nImputed:\n", imputed_df),
    ("\nScaled:\n", scaled_df),
    ("\nCombined (Imputed + Scaled):\n", combined_df),
):
    print(heading, frame)


Original:
     Age   Income  Experience
0  25.0  50000.0         2.0
1   NaN  60000.0         5.0
2  35.0      NaN         7.0
3  40.0  80000.0         NaN
4  50.0  90000.0        10.0

Imputed:
     Age   Income  Experience
0  25.0  50000.0         2.0
1  37.5  60000.0         5.0
2  35.0  70000.0         7.0
3  40.0  80000.0         6.0
4  50.0  90000.0        10.0

Scaled:
         Age    Income  Experience
0 -1.550434 -1.414214   -1.533930
1  0.000000 -0.707107   -0.383482
2 -0.310087  0.000000    0.383482
3  0.310087  0.707107    0.000000
4  1.550434  1.414214    1.533930

Combined (Imputed + Scaled):
         Age    Income  Experience
0 -1.550434 -1.414214   -1.533930
1  0.000000 -0.707107   -0.383482
2 -0.310087  0.000000    0.383482
3  0.310087  0.707107    0.000000
4  1.550434  1.414214    1.533930
