    Task: Complete Pipeline for a Dataset
1. Objective: Build a complex pipeline with multiple transformations.
2. Steps:
    - Load a sample dataset.
    - Define a transformation pipeline with both imputation and scaling.

In [1]:
# Write your code from here
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Step 1: Create a sample dataset with missing values.
# Every column contains at least one NaN so the imputer has work to do.
data = {
    'Age': [25, np.nan, 35, 22, np.nan],
    'Salary': [50000, 60000, np.nan, 52000, 58000],
    'Experience': [2, 5, 7, np.nan, 10]
}

df = pd.DataFrame(data)
print("Original Data:")
print(df)

# Step 2: Define numerical columns (all three columns are numeric here)
numeric_features = ['Age', 'Salary', 'Experience']

# Step 3: Define transformation pipeline.
# Order matters: the scaler runs on the output of the imputer.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),     # Fill NaNs with the column mean
    ('scaler', StandardScaler())                     # Standardize each column
])

# Step 4: Create ColumnTransformer to route the numeric columns through the pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ]
)

# Step 5: Apply the complete pipeline to the dataset.
# The raw result gets its own name so `df_transformed` is only ever a DataFrame
# (re-running the cell never leaves it bound to a bare array).
transformed_values = preprocessor.fit_transform(df)

# Step 6: Convert to DataFrame for readability.
# Re-attach both the column labels and the original row index so the
# transformed rows stay aligned with `df`.
df_transformed = pd.DataFrame(
    transformed_values,
    columns=numeric_features,
    index=df.index,
)
print("\nTransformed Data:")
print(df_transformed)

Original Data:
    Age   Salary  Experience
0  25.0  50000.0         2.0
1   NaN  60000.0         5.0
2  35.0      NaN         7.0
3  22.0  52000.0         NaN
4   NaN  58000.0        10.0

Transformed Data:
        Age    Salary  Experience
0 -0.542001 -1.355815   -1.533930
1  0.000000  1.355815   -0.383482
2  1.780860  0.000000    0.383482
3 -1.238859 -0.813489    0.000000
4  0.000000  0.813489    1.533930


In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Toy dataset: three numeric columns, each with at least one missing (NaN) entry
data = dict(
    Age=[25, np.nan, 35, 22, np.nan],
    Salary=[50000, 60000, np.nan, 52000, 58000],
    Experience=[2, 5, 7, np.nan, 10],
)
df = pd.DataFrame.from_dict(data)

# 1. Imputation Function
def imputation_function(data, strategy='mean'):
    """Fill in missing values of ``data`` with a freshly fitted ``SimpleImputer``.

    Args:
        data: The input handed straight to ``SimpleImputer.fit_transform``.
        strategy: Imputation strategy forwarded to ``SimpleImputer``
            (defaults to ``'mean'``).

    Returns:
        Whatever ``fit_transform`` returns for ``data``. The imputer is fitted
        on the same data it transforms and is discarded afterwards.
    """
    return SimpleImputer(strategy=strategy).fit_transform(data)

# 2. Scaling Function
def scaling_function(data):
    """Scale ``data`` with a freshly fitted ``StandardScaler``.

    Args:
        data: The input handed straight to ``StandardScaler.fit_transform``.

    Returns:
        Whatever ``fit_transform`` returns for ``data``. The scaler is fitted
        on the same data it transforms and is discarded afterwards.
    """
    return StandardScaler().fit_transform(data)

# 3. Combined Transformation Function (Imputation + Scaling)
def combined_transformation_function(data, strategy='mean'):
    """Impute missing values in ``data`` and then scale the imputed result.

    Args:
        data: The input passed to ``imputation_function``.
        strategy: Imputation strategy forwarded to ``imputation_function``
            (defaults to ``'mean'``).

    Returns:
        The output of ``scaling_function`` applied to the imputed data.
    """
    # Imputation must come first: scaling is applied to its output.
    return scaling_function(imputation_function(data, strategy))

# Apply transformations to the dataset (imputation first, then scaling)
transformed_data = combined_transformation_function(df)

# Convert the result to DataFrame for better readability.
# Re-attach the column labels and the original row index so the transformed
# rows stay aligned with `df`.
transformed_df = pd.DataFrame(
    transformed_data,
    columns=df.columns,
    index=df.index,
)
print("Transformed Data after Imputation and Scaling:")
print(transformed_df)


Transformed Data after Imputation and Scaling:
        Age    Salary  Experience
0 -0.542001 -1.355815   -1.533930
1  0.000000  1.355815   -0.383482
2  1.780860  0.000000    0.383482
3 -1.238859 -0.813489    0.000000
4  0.000000  0.813489    1.533930
