    Task: Complete Pipeline for a Dataset
1. Objective: Build a complex pipeline with multiple transformations.
2. Steps:
    - Load a sample dataset.
    - Define a transformation pipeline with both imputation and scaling.

In [1]:
# Write your code from here
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Step 1: Build a small demo frame; NaN marks the entries that need imputing
data = dict(
    age=[25, np.nan, 22, 40, np.nan],
    salary=[50000, 60000, np.nan, 80000, 70000],
)
df = pd.DataFrame(data)

# Every column is a feature here, so X simply aliases the frame (no copy is made)
X = df

# Step 2: Chain the preprocessing steps; they run in the order listed
pipeline = Pipeline(
    steps=[
        # Replace each missing value with the mean of its column
        ('imputer', SimpleImputer(strategy='mean')),
        # Standardize each column to zero mean and unit variance
        ('scaler', StandardScaler()),
    ]
)

# Step 3: Learn the imputation/scaling statistics from X and apply them to X
X_transformed = pipeline.fit_transform(X)

# Show the input next to the transformed array for comparison
print("Original data:")
print(X)
print("\nData after imputation and scaling:")
print(X_transformed)



Original data:
    age   salary
0  25.0  50000.0
1   NaN  60000.0
2  22.0      NaN
3  40.0  80000.0
4   NaN  70000.0

Data after imputation and scaling:
[[-0.65582584 -1.5       ]
 [ 0.         -0.5       ]
 [-1.14769521  0.        ]
 [ 1.80352105  1.5       ]
 [ 0.          0.5       ]]


In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Imputation Function
def impute_data(X, strategy='mean', fill_value=None):
    """
    Imputes missing values in the dataset X using SimpleImputer.

    The imputer is fitted on X itself and then applied to X, so the
    replacement statistics are derived from the same data that is returned.

    Parameters:
    - X: pd.DataFrame or np.ndarray with missing values.
    - strategy: str, imputation strategy ('mean', 'median', 'most_frequent', or 'constant').
    - fill_value: replacement value forwarded to SimpleImputer; per the
      scikit-learn documentation it is only used when strategy='constant'.
      The default None keeps SimpleImputer's own default behaviour, so
      existing two-argument calls behave exactly as before.

    Returns:
    - np.ndarray: imputed data.
    """
    # fill_value is passed through unchanged so that the 'constant' strategy
    # advertised above can be used with a caller-chosen replacement value.
    imputer = SimpleImputer(strategy=strategy, fill_value=fill_value)
    return imputer.fit_transform(X)

# Scaling Function
def scale_data(X):
    """
    Standardizes the features of X with a freshly created StandardScaler.

    The scaler is fitted on X and immediately applied to the same X.

    Parameters:
    - X: np.ndarray or pd.DataFrame of numerical data.

    Returns:
    - np.ndarray: scaled data.
    """
    # A new scaler per call: nothing is remembered between invocations.
    return StandardScaler().fit_transform(X)

# Combined Transformation Function
def impute_and_scale(X, impute_strategy='mean'):
    """
    Fills in missing values of X and then scales the result.

    Imputation runs first so that the scaling step receives data
    without missing entries.

    Parameters:
    - X: pd.DataFrame or np.ndarray with numerical features.
    - impute_strategy: str, imputation strategy passed to SimpleImputer.

    Returns:
    - np.ndarray: transformed data after imputation and scaling.
    """
    # Inner call imputes, outer call scales the imputed array.
    return scale_data(impute_data(X, strategy=impute_strategy))


In [4]:
import pandas as pd
import numpy as np

# Same demo values as the pipeline example, so the two outputs can be compared
data = dict(
    age=[25, np.nan, 22, 40, np.nan],
    salary=[50000, 60000, np.nan, 80000, 70000],
)
df = pd.DataFrame.from_dict(data)

# Run the function-based variant: mean imputation followed by scaling
result = impute_and_scale(df, impute_strategy='mean')
print(result)


[[-0.65582584 -1.5       ]
 [ 0.         -0.5       ]
 [-1.14769521  0.        ]
 [ 1.80352105  1.5       ]
 [ 0.          0.5       ]]
