In [1]:
# use scikit learn with column transformer 
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Load seaborn's example Titanic dataset into a DataFrame and preview the first rows
titanic = sns.load_dataset(name='titanic')
titanic.head(n=5)

In [None]:
# Count missing values per column to see which features need imputation
titanic.isna().sum()

In [4]:
# Feature matrix: `age` and `fare` are handled as numeric columns downstream,
# `sex`, `class` and `embark_town` as categorical columns.
X = titanic.loc[:, ['age', 'fare', 'sex', 'class', 'embark_town']]
# Target vector: the `survived` column.
y = titanic.loc[:, 'survived']

In [5]:
# Numeric branch: fill missing values with the column median, then standardize.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' keeps transform() from raising on
# categories that were not seen during fit.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

In [6]:
# Route each column group through its matching sub-pipeline:
# numeric columns -> num_transformer, categorical columns -> cat_transformer.
preprocessor = ColumnTransformer([
    ('num', num_transformer, ['age', 'fare']),
    ('cat', cat_transformer, ['sex', 'class', 'embark_town']),
])

In [7]:
# End-to-end model: preprocessing followed by a random forest classifier.
# The step names ('preprocessor', 'classifier') are the prefixes used when
# addressing nested parameters as `<step>__<param>`.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessing steps and the classifier on the training split only
pipeline.fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = pipeline.predict(X_test)

# Compare predictions against the true held-out labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2f}")


In [None]:
# Inspect the features exactly as the classifier saw them during training.
# The preprocessor step was already fitted by `pipeline.fit(...)`, so it is only
# applied here (transform) rather than re-fitted: calling fit_transform on it
# would redo the work and mutate a component of the trained pipeline.
# Requires the training cell above to have been run first.
fitted_preprocessor = pipeline.named_steps['preprocessor']
X_transformed = fitted_preprocessor.transform(X_train)
print("Transformed Feature Shape:", X_transformed.shape)

In [None]:
# Show the first transformed row (kept 2-D by slicing instead of indexing)
X_transformed[0:1]

In [22]:
# Hyperparameter grid for the grid search.
# Keys follow the `<pipeline step>__<parameter>` convention, so both entries
# target the 'classifier' step (the RandomForestClassifier) of `pipeline`.
# 3 values x 4 values = 12 candidate combinations.
param_grid = {
    'classifier__n_estimators': [5, 10, 100],
    'classifier__max_depth': [None, 10, 20, 30],
}

In [None]:
# Search every combination in `param_grid` with 3-fold cross-validation,
# using only the training split; verbose=2 logs progress for each fit.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    verbose=2,
)
grid_search.fit(X_train, y_train)

In [None]:
# Report the winning parameter combination and its mean cross-validated score
# (computed on validation folds of the training split, not a training-set score)
print(
    f"Best parameters found: {grid_search.best_params_}",
    f"Best cross validation score: {grid_search.best_score_}",
    sep="\n",
)

In [None]:
# Score the tuned pipeline on the held-out test split.
# Author's note: not much improvement was observed.
test_score = grid_search.score(X=X_test, y=y_test)
print(f"Test set score: {test_score}")