In [2]:
#import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [3]:
# Load dataset
data = sns.load_dataset('titanic')

# Select features and target
X = data[['pclass', 'age', 'fare', 'sex', 'embarked']]
y = data['survived']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing setup
numeric_features = ['age', 'fare']
categorical_features = ['pclass', 'sex', 'embarked']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Define models
models = [
    ('Random Forest', RandomForestClassifier(random_state=42)), 
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('XGBoost', XGBClassifier(random_state=42))
]

best_model = None
best_accuracy = 0.0

# Train and evaluate each model
for name, model in models:
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

# Final pipeline with best model
final_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', best_model)
])

final_pipe.fit(X_train, y_train)
y_pred = final_pipe.predict(X_test)
final_accuracy = accuracy_score(y_test, y_pred)
print(f"Best Model Accuracy: {final_accuracy:.4f}")


Random Forest Accuracy: 0.7821
Gradient Boosting Accuracy: 0.8212
XGBoost Accuracy: 0.7989
Best Model Accuracy: 0.8212
