    Task: Complete Pipeline for a Dataset
1. Objective: Build a complex pipeline with multiple transformations.
2. Steps:
    - Load a sample dataset.
    - Define a transformation pipeline with both imputation and scaling.

In [10]:
# Write your code from here
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Step 1: Build a tiny frame in which both columns contain gaps
data = pd.DataFrame(
    {
        'Age': [25, 30, None, 22, 40],
        'Salary': [50000, None, 60000, 52000, None],
    }
)
print("Original Data:\n", data)

# Step 2: Chain the two transformers; the pipeline runs them in the order
# listed, so every gap is filled with its column mean before standardization.
pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Step 3: Fit both stages on the frame and transform it in a single call
processed_data = pipeline.fit_transform(data)

# Re-attach the column labels to the transformed values so the result is
# readable as a table.
processed_df = pd.DataFrame(processed_data, columns=data.columns)
print("\nProcessed Data (Imputed + Scaled):\n", processed_df)


Original Data:
     Age   Salary
0  25.0  50000.0
1  30.0      NaN
2   NaN  60000.0
3  22.0  52000.0
4  40.0      NaN

Processed Data (Imputed + Scaled):
         Age    Salary
0 -0.695414 -1.195229
1  0.122720  0.000000
2  0.000000  1.792843
3 -1.186295 -0.597614
4  1.758989  0.000000


In [11]:
# Task: Imputation Function








# Scaling Function









# Combined Transformation Function









In [12]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Imputation Function: fills missing numerical values with mean
def impute_missing(data: pd.DataFrame) -> pd.DataFrame:
    """Fill missing numerical values with the mean of their column.

    Parameters
    ----------
    data : pd.DataFrame
        Numeric frame that may contain missing values.

    Returns
    -------
    pd.DataFrame
        A new frame with the same row index as ``data`` in which every
        missing value has been replaced by its column mean. Columns that
        have no observed values at all are absent from the result, because
        ``SimpleImputer`` discards them (there is no mean to fill with).
    """
    imputer = SimpleImputer(strategy='mean')
    imputed_array = imputer.fit_transform(data)
    # A column with no observed values gets a NaN statistic and is removed
    # from the transformed array, so label only the columns that survived;
    # passing every original label would raise a shape-mismatch error.
    kept_columns = data.columns[~pd.isna(imputer.statistics_)]
    # Carry the caller's index over instead of silently resetting it to 0..n-1.
    return pd.DataFrame(imputed_array, columns=kept_columns, index=data.index)

# Scaling Function: standardizes numerical features
def scale_features(data: pd.DataFrame) -> pd.DataFrame:
    """Standardize every column with ``StandardScaler``.

    Parameters
    ----------
    data : pd.DataFrame
        Numeric frame to scale.

    Returns
    -------
    pd.DataFrame
        A new frame holding the scaled values, with the same column labels
        and the same row index as ``data``.
    """
    scaler = StandardScaler()
    scaled_array = scaler.fit_transform(data)
    # Pass the original index through so rows stay aligned with the input
    # frame instead of being renumbered 0..n-1.
    return pd.DataFrame(scaled_array, columns=data.columns, index=data.index)

# Combined Transformation Function: imputation followed by scaling
def transform_data(data):
    """Run the full preprocessing chain on ``data``.

    Missing values are filled by ``impute_missing`` first, and the filled
    frame is then handed to ``scale_features``; the scaled result is returned.
    """
    return scale_features(impute_missing(data))

# Example usage:
if __name__ == "__main__":
    # Small demo frame: one gap in 'Age', two gaps in 'Salary'.
    sample_data = pd.DataFrame(
        {
            'Age': [25, None, 30, 22, 40],
            'Salary': [50000, 48000, None, 52000, None],
        }
    )
    print("Original Data:\n", sample_data)

    # Stage 1 on its own: fill the gaps.
    imputed = impute_missing(sample_data)
    print("\nAfter Imputation:\n", imputed)

    # Stage 2 on its own: scale the already-imputed frame.
    scaled = scale_features(imputed)
    print("\nAfter Scaling:\n", scaled)

    # Both stages through the combined helper, starting again from the raw
    # frame.
    combined = transform_data(sample_data)
    print("\nCombined Transformation:\n", combined)


Original Data:
     Age   Salary
0  25.0  50000.0
1   NaN  48000.0
2  30.0      NaN
3  22.0  52000.0
4  40.0      NaN

After Imputation:
      Age   Salary
0  25.00  50000.0
1  29.25  48000.0
2  30.00  50000.0
3  22.00  52000.0
4  40.00  50000.0

After Scaling:
         Age    Salary
0 -0.695414  0.000000
1  0.000000 -1.581139
2  0.122720  0.000000
3 -1.186295  1.581139
4  1.758989  0.000000

Combined Transformation:
         Age    Salary
0 -0.695414  0.000000
1  0.000000 -1.581139
2  0.122720  0.000000
3 -1.186295  1.581139
4  1.758989  0.000000
