In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load seaborn's bundled Titanic dataset.
titanic = sns.load_dataset('titanic')

# Regression target is the ticket fare; 'survived' is removed from the
# feature frame together with the target column itself.
y = titanic['fare']
X = titanic.drop(columns=['survived', 'fare'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Columns handed to the numeric and categorical preprocessing branches.
num_features = ['age', 'sibsp', 'parch']
cat_features = ['pclass', 'sex', 'embarked']

def build_pipe(scaler, encoder):
    """Assemble an unfitted preprocessing + linear-regression pipeline.

    The columns named in the module-level ``num_features`` list are
    median-imputed and then passed through ``scaler``; the columns named in
    ``cat_features`` are imputed with their most frequent value and then
    passed through ``encoder``. Both branches are combined by a
    ``ColumnTransformer`` whose output feeds a ``LinearRegression``.

    Args:
        scaler: Transformer placed after imputation in the numeric branch.
        encoder: Transformer placed after imputation in the categorical
            branch.

    Returns:
        A ``Pipeline`` with steps ``'pre'`` (the column transformer) and
        ``'lr'`` (the regressor).
    """
    numeric_steps = [
        ('imp', SimpleImputer(strategy='median')),
        ('scaler', scaler),
    ]
    categorical_steps = [
        ('imp', SimpleImputer(strategy='most_frequent')),
        ('encoder', encoder),
    ]

    # Each branch only sees its own column subset.
    preprocessor = ColumnTransformer(transformers=[
        ('num', Pipeline(steps=numeric_steps), num_features),
        ('cat', Pipeline(steps=categorical_steps), cat_features),
    ])

    model_steps = [
        ('pre', preprocessor),
        ('lr', LinearRegression()),
    ]
    return Pipeline(steps=model_steps)
# Candidate preprocessing configurations, keyed by the label shown in the
# results table. Every call builds fresh scaler/encoder instances so that no
# fitted state is shared between pipelines.
#
# All three encoders tolerate categories that were not present when fitting:
# the one-hot encoders ignore them, and the ordinal encoder maps them to -1
# instead of raising at predict time (its default is handle_unknown='error').
pipelines = {
    "Normalization + OHE": build_pipe(
        MinMaxScaler(),
        OneHotEncoder(drop='first', handle_unknown='ignore'),
    ),
    "Standardization + OHE": build_pipe(
        StandardScaler(),
        OneHotEncoder(drop='first', handle_unknown='ignore'),
    ),
    "Standardization + Label": build_pipe(
        StandardScaler(),
        OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
    ),
}

# Fit every candidate pipeline on the training split and collect its
# hold-out error metrics, one dict per pipeline.
results = []
print("Training and evaluating pipelines...")

for name, pipe in pipelines.items():
    # fit() returns the fitted pipeline itself, so prediction can be chained.
    preds = pipe.fit(X_train, y_train).predict(X_test)

    mse = mean_squared_error(y_test, preds)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    results.append(dict(name=name, rmse=rmse, mae=mae, r2=r2))
    print(f"- Completed: {name}")

# Rank the pipelines from lowest to highest RMSE for the report.
df_results = pd.DataFrame(results)
df_results = df_results.sort_values(by='rmse')

print("\n--- Pipeline Performance Comparison ---")
print(df_results.to_string(index=False))

Training and evaluating pipelines...
- Completed: Normalization + OHE
- Completed: Standardization + OHE
- Completed: Standardization + Label

--- Pipeline Performance Comparison ---
                   name      rmse       mae       r2
Standardization + Label 30.470255 20.837023 0.400014
    Normalization + OHE 30.920169 18.870241 0.382165
  Standardization + OHE 30.920169 18.870241 0.382165
