In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")

df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

X = df.drop('Survived', axis=1)
y = df['Survived']

In [3]:
num_features = ['Age', 'Fare', 'SibSp', 'Parch']
cat_features = ['Pclass', 'Sex', 'Embarked']

In [4]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler',   StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot',  OneHotEncoder(drop='first', handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
], remainder='drop')

In [5]:
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=180,
        max_depth=7,
        min_samples_split=5,
        random_state=42,
        n_jobs=-1
    ))
])

In [6]:
# Train
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

model_pipeline.fit(X_train, y_train)

print("Train accuracy:", model_pipeline.score(X_train, y_train))
print("Test  accuracy:", model_pipeline.score(X_test, y_test))

Train accuracy: 0.8820224719101124
Test  accuracy: 0.8044692737430168


In [8]:
import os

# Create the 'model' folder if it doesn't exist
os.makedirs("model", exist_ok=True)

joblib.dump(model_pipeline, "model/titanic_model_full_pipeline.pkl")
print("Model saved → model/titanic_model_full_pipeline.pkl")


Model saved → model/titanic_model_full_pipeline.pkl
