### Ensuring Feature Consistency Between Training & InferencePipelines:

**Task 1**: Consistent Feature Preparation
- Step 1: Write a function for data preprocessing and imputation shared by both training and inference pipelines.
- Step 2: Demonstrate consistent application on both datasets.

In [1]:
# write your code from here
import pandas as pd
from sklearn.impute import SimpleImputer

def preprocess_data(df, imputer=None, fit_imputer=True):
    """
    Preprocesses the input DataFrame by imputing missing values.
    
    Parameters:
    - df: pd.DataFrame, input data to preprocess
    - imputer: sklearn.imputer.SimpleImputer or None, the imputer instance to use
    - fit_imputer: bool, whether to fit the imputer or just transform
    
    Returns:
    - df_processed: pd.DataFrame, preprocessed data with imputed values
    - imputer: fitted imputer instance (if fit_imputer=True), else same as input
    """
    # For example, let's impute numeric columns with median
    numeric_cols = df.select_dtypes(include='number').columns
    
    if imputer is None:
        imputer = SimpleImputer(strategy='median')
    
    if fit_imputer:
        df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
    else:
        df[numeric_cols] = imputer.transform(df[numeric_cols])
    
    return df, imputer

# Sample data with missing values
train_df = pd.DataFrame({
    'A': [1, 2, None, 4, 5],
    'B': [10, None, 30, 40, 50],
    'C': ['cat', 'dog', 'dog', 'cat', 'mouse']
})

test_df = pd.DataFrame({
    'A': [None, 3, 4],
    'B': [15, None, 45],
    'C': ['dog', 'cat', 'mouse']
})

print("Before preprocessing:")
print("Train:\n", train_df)
print("Test:\n", test_df)

# Preprocess train (fit imputer)
train_processed, imputer = preprocess_data(train_df.copy(), fit_imputer=True)

# Preprocess test (use same fitted imputer)
test_processed, _ = preprocess_data(test_df.copy(), imputer=imputer, fit_imputer=False)

print("\nAfter preprocessing:")
print("Train:\n", train_processed)
print("Test:\n", test_processed)


Before preprocessing:
Train:
      A     B      C
0  1.0  10.0    cat
1  2.0   NaN    dog
2  NaN  30.0    dog
3  4.0  40.0    cat
4  5.0  50.0  mouse
Test:
      A     B      C
0  NaN  15.0    dog
1  3.0   NaN    cat
2  4.0  45.0  mouse

After preprocessing:
Train:
      A     B      C
0  1.0  10.0    cat
1  2.0  35.0    dog
2  3.0  30.0    dog
3  4.0  40.0    cat
4  5.0  50.0  mouse
Test:
      A     B      C
0  3.0  15.0    dog
1  3.0  35.0    cat
2  4.0  45.0  mouse


**Task 2**: Pipeline Integration
- Step 1: Use sklearn pipelines to encapsulate the preprocessing steps.
- Step 2: Configure identical pipelines for both training and building inference models.

In [2]:
# write your code from here
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml

# Sample dataset: Titanic (contains numeric & categorical data + missing values)
titanic = fetch_openml('titanic', version=1, as_frame=True)
df = titanic.frame

# Split features and target
X = df.drop(columns='survived')
y = df['survived'].astype(int)

# Identify numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['category', 'object']).columns.tolist()

# Numeric pipeline: impute with median, then scale
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical pipeline: impute with most frequent, then one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine pipelines
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])


In [3]:
from sklearn.ensemble import RandomForestClassifier

# Full pipeline: preprocessing + classifier
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Fit on training data
model_pipeline.fit(X, y)

# Example: Predict on new data (inference)
X_new = X.sample(5, random_state=42)  # simulate new data
predictions = model_pipeline.predict(X_new)

print("Predictions on new data:", predictions)


Predictions on new data: [0 1 0 0 0]


**Task 3**: Saving and Loading Preprocessing Models
- Step 1: Save the transformation model after fitting it to the training data.
- Step 2: Load and apply the saved model during inference.

In [8]:
# write your code from here
import joblib

# Assume model_pipeline from previous step is already fitted
filename = 'titanic_model_pipeline.joblib'

# Save the entire pipeline (preprocessing + model)
joblib.dump(model_pipeline, filename)
print(f"Pipeline saved to {filename}")

# Load the pipeline
loaded_pipeline = joblib.load(filename)
print("Pipeline loaded.")

# Example new data (simulate inference data)
import pandas as pd

new_data = pd.DataFrame({
    'pclass': [3, 1],
    'age': [22, 38],
    'sibsp': [1, 1],
    'parch': [0, 0],
    'fare': [7.25, 71.2833],
    'sex': ['male', 'female'],
    'embarked': ['S', 'C'],
    'deck': [None, 'C'],
    'embark_town': ['Southampton', 'Cherbourg'],
    'alive': ['no', 'yes'],
    'class': ['Third', 'First'],
    'who': ['man', 'woman'],
    'adult_male': [True, False],
    'alone': [False, False]
})

# Predict with loaded pipeline
preds = loaded_pipeline.predict(new_data)
print("Predictions on new data:", preds)


Pipeline saved to titanic_model_pipeline.joblib
Pipeline loaded.


ValueError: columns are missing: {'name', 'cabin', 'boat', 'body', 'ticket', 'home.dest'}