### Ensuring Feature Consistency Between Training & Inference Pipelines

**Task 1**: Consistent Feature Preparation
- Step 1: Write a function for data preprocessing and imputation shared by both training and inference pipelines.
- Step 2: Demonstrate consistent application on both datasets.

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

def preprocess_data(df, imputer=None, scaler=None, fit=True):
    """
    Preprocess data consistently for training and inference.

    Only the numeric columns of ``df`` are kept. Missing values are filled
    by the imputer and the result is then scaled. Call once with
    ``fit=True`` on the training data, then pass the returned imputer and
    scaler back in with ``fit=False`` for any data seen at inference time,
    so both datasets go through exactly the same transformation.

    Parameters:
    - df (pd.DataFrame): input data
    - imputer (SimpleImputer): imputer instance (optional when fit=True;
      a mean-strategy imputer is created if omitted)
    - scaler (StandardScaler): scaler instance (optional when fit=True;
      a default StandardScaler is created if omitted)
    - fit (bool): if True, fit the imputer and scaler on ``df`` (instances
      passed in are re-fitted), else only transform

    Returns:
    - pd.DataFrame: preprocessed numeric columns, indexed like ``df``
    - SimpleImputer: fitted imputer
    - StandardScaler: fitted scaler

    Raises:
    - ValueError: if ``fit`` is False and ``imputer`` or ``scaler`` is
      missing, since there is nothing fitted to transform with
    """
    # Transform-only mode needs transformers fitted on the training data.
    # Fail early with a clear message instead of silently creating fresh,
    # unfitted instances that would error out deep inside transform().
    if not fit and (imputer is None or scaler is None):
        raise ValueError(
            "fit=False requires both a fitted imputer and a fitted scaler; "
            "pass the ones returned by a previous call with fit=True"
        )

    # Select numeric columns for simplicity
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    data = df[numeric_cols]

    # Initialize imputer and scaler if not provided (fit mode only)
    if imputer is None:
        imputer = SimpleImputer(strategy='mean')
    if scaler is None:
        scaler = StandardScaler()

    # Impute first, then scale the imputed values
    if fit:
        data_imputed = imputer.fit_transform(data)
        data_scaled = scaler.fit_transform(data_imputed)
    else:
        data_imputed = imputer.transform(data)
        data_scaled = scaler.transform(data_imputed)

    # Return as DataFrame with original column names and index.
    # NOTE(review): this assumes the transformers return one output column
    # per numeric input column - confirm for columns that are entirely NaN.
    processed_df = pd.DataFrame(data_scaled, columns=numeric_cols, index=df.index)

    return processed_df, imputer, scaler

# Example usage: fit the preprocessing on training data, then reuse the
# fitted imputer and scaler unchanged on new (inference) data.

# Training sample - every column has one missing value
train_df = pd.DataFrame(
    {
        'age': [25, 30, np.nan, 22, 40],
        'income': [50000, 60000, 55000, np.nan, 65000],
        'score': [200, 220, 210, 215, np.nan],
    }
)

# Inference sample - new rows with the same columns
inference_df = pd.DataFrame(
    {
        'age': [28, np.nan, 35],
        'income': [52000, 58000, np.nan],
        'score': [205, 210, 215],
    }
)

# Training pass: fit=True creates and fits the imputer and scaler
train_processed, imputer, scaler = preprocess_data(train_df, fit=True)
print("Processed Training Data:", train_processed, sep="\n")

# Inference pass: fit=False only applies the already-fitted transformers
inference_processed, _, _ = preprocess_data(
    inference_df,
    imputer=imputer,
    scaler=scaler,
    fit=False,
)
print("\nProcessed Inference Data:", inference_processed, sep="\n")

**Task 2**: Pipeline Integration
- Step 1: Use sklearn pipelines to encapsulate the preprocessing steps.
- Step 2: Configure an identical pipeline for both model training and inference.

In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Names of the model inputs and of the label column
feature_cols = ['age', 'income', 'score']
target_col = 'target'

# Labelled training sample (with gaps in every feature)
train_df = pd.DataFrame(
    {
        'age': [25, 30, np.nan, 22, 40],
        'income': [50000, 60000, 55000, np.nan, 65000],
        'score': [200, 220, 210, 215, np.nan],
        'target': [0, 1, 0, 1, 0],
    }
)

# Unlabelled inference sample
inference_df = pd.DataFrame(
    {
        'age': [28, np.nan, 35],
        'income': [52000, 58000, np.nan],
        'score': [205, 210, 215],
    }
)

# One pipeline object holds imputation, scaling and the classifier, so the
# steps fitted during training are the ones applied at prediction time.
pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression()),
    ]
)

# Training phase: fit every step on the training features and labels
X_train, y_train = train_df[feature_cols], train_df[target_col]
pipeline.fit(X_train, y_train)

# Inference phase: the fitted pipeline transforms and predicts in one call
X_infer = inference_df[feature_cols]
predictions = pipeline.predict(X_infer)

print("Predictions on inference data:", predictions, sep="\n")

**Task 3**: Saving and Loading Preprocessing Models
- Step 1: Save the transformation model after fitting it to the training data.
- Step 2: Load and apply the saved model during inference.

In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import joblib  # for saving and loading models

# Names of the model inputs and of the label column
feature_cols = ['age', 'income', 'score']
target_col = 'target'

# Labelled training sample
train_df = pd.DataFrame(
    {
        'age': [25, 30, np.nan, 22, 40],
        'income': [50000, 60000, 55000, np.nan, 65000],
        'score': [200, 220, 210, 215, np.nan],
        'target': [0, 1, 0, 1, 0],
    }
)

# Preprocessing and classifier bundled into a single object
pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression()),
    ]
)

# Training phase
X_train, y_train = train_df[feature_cols], train_df[target_col]
pipeline.fit(X_train, y_train)

# Persist the fitted pipeline; the same path is used for saving and loading
model_path = 'trained_pipeline.joblib'
joblib.dump(pipeline, model_path)

# ----- Later or in inference script -----

# Restore the fitted pipeline from disk.
# NOTE: joblib files are pickle-based - only load files you trust.
loaded_pipeline = joblib.load(model_path)

# Unlabelled inference sample
inference_df = pd.DataFrame(
    {
        'age': [28, np.nan, 35],
        'income': [52000, 58000, np.nan],
        'score': [205, 210, 215],
    }
)
X_infer = inference_df[feature_cols]

# The loaded pipeline performs imputation + scaling + classification
predictions = loaded_pipeline.predict(X_infer)

print("Predictions using loaded pipeline:", predictions, sep="\n")