In [1]:
import pandas as pd
import seaborn as sns

In [2]:
# Load the "titanic" example dataset through seaborn's load_dataset helper.
# NOTE(review): load_dataset pulls from seaborn's online example-data repository
# (with a local cache), so a fresh environment may need network access — confirm
# before relying on this notebook offline.
titanic = sns.load_dataset("titanic")

In [3]:
# Restrict the frame to the modelling columns (five predictors plus the
# `survived` target), then discard every row that has a missing value in
# any of them.
titanic = titanic.loc[
    :,
    ["pclass", "sex", "age", "fare", "embarked", "survived"],
].dropna()

In [None]:
# Separate the target (`survived`) from the predictor columns.
y = titanic["survived"]
X = titanic.drop(columns=["survived"])

***Train / Validation / Test Split***

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# First split: hold out 20% of the rows as the test set. Stratifying on `y`
# keeps the class proportions the same on both sides; the fixed seed makes
# the split repeatable.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [7]:
# Second split: take 20% of the train+validation pool as the validation set,
# again stratified on the target and seeded for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    test_size=0.2,
    stratify=y_trainval,
    random_state=42,
)

***Build ColumnTransformer***

In [8]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [9]:
# Column groups handed to the ColumnTransformer branches:
# continuous measurements vs. discrete / label-valued columns.
num_features = ["age", "fare"]
cat_features = ["pclass", "sex", "embarked"]

In [10]:
# Numeric branch: standardise each column using statistics learned at fit time.
numeric_transformer = StandardScaler()
# Categorical branch: one-hot encode, omitting the first category of each
# feature (drop="first") so the encoded columns are not redundant.
# NOTE(review): handle_unknown is left at its default, so a category that was
# absent when the encoder was fitted would raise at transform time — confirm
# that every level appears in the training split.
categorical_transformer = OneHotEncoder(drop="first")

In [11]:
# Route each column group through its own transformer. Columns not listed in
# either group are not passed on (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features),
    ]
)

***Build Pipeline***

In [None]:

# Chain preprocessing and the classifier so that the scaler and encoder are
# fitted only on whatever data is passed to `clf.fit`.
clf = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)


In [13]:
from sklearn.metrics import accuracy_score

# Fit the preprocessing steps and the model on the training split only.
clf.fit(X_train, y_train)

# Predict on both held-out splits.
y_val_pred = clf.predict(X_val)
y_test_pred = clf.predict(X_test)

# NOTE(review): the test score is reported here, before the scaling
# comparison further down; treat it as a final check rather than a number to
# tune against.
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))


Validation Accuracy: 0.7719298245614035
Test Accuracy: 0.7902097902097902


In [None]:

# Ablation: same encoder and classifier as `clf`, but the numeric columns are
# passed through untouched instead of being standardised.
preprocessor_noscale = ColumnTransformer(
    [
        ("cat", OneHotEncoder(drop="first"), cat_features),
        ("num", "passthrough", num_features),
    ]
)

clf_noscale = Pipeline(
    [
        ("preprocessor", preprocessor_noscale),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)
clf_noscale.fit(X_train, y_train)

# Compare the two fitted pipelines on the validation split.
print(
    "Val Accuracy (No Scaling):",
    accuracy_score(y_val, clf_noscale.predict(X_val)),
)
print(
    "Val Accuracy (With Scaling):",
    accuracy_score(y_val, clf.predict(X_val)),
)


Val Accuracy (No Scaling): 0.7719298245614035
Val Accuracy (With Scaling): 0.7719298245614035
