In [13]:
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [14]:
# Columns fed to the model; the 'Survived' label is carried alongside them below
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Read the Titanic CSV, keep only the modelling columns plus the label, and
# discard every row that has a missing value in any of those columns.
# The original row index is kept as-is (it is not reset after the drop).
df = (
    pd.read_csv("https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")
    .loc[:, [*features, 'Survived']]
    .dropna()
)

# Preview the first ten remaining rows
print(df.head(10))

    Pclass     Sex   Age  SibSp  Parch     Fare Embarked  Survived
0        3    male  22.0      1      0   7.2500        S         0
1        1  female  38.0      1      0  71.2833        C         1
2        3  female  26.0      0      0   7.9250        S         1
3        1  female  35.0      1      0  53.1000        S         1
4        3    male  35.0      0      0   8.0500        S         0
6        1    male  54.0      0      0  51.8625        S         0
7        3    male   2.0      3      1  21.0750        S         0
8        3  female  27.0      0      2  11.1333        S         1
9        2  female  14.0      1      0  30.0708        C         1
10       3  female   4.0      1      1  16.7000        S         1


In [15]:
# Column groups: one-hot encode the label-like columns, standardise the numeric ones
cat_features = ['Pclass', 'Sex', 'Embarked']
num_features = ['Age', 'SibSp', 'Parch', 'Fare']

# One transformer per group. handle_unknown='ignore' tells the encoder not to
# raise on a category it did not see while fitting.
cat_transformer = OneHotEncoder(handle_unknown='ignore')
num_transformer = StandardScaler()

# Route each column group to its transformer; the numeric block stays listed
# first, so its output columns come first in the transformed matrix.
preprocessor = ColumnTransformer(
    [
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features),
    ]
)

In [16]:
# Separate the label from the model inputs
y = df['Survived']
X = df[features]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Report the (rows, columns) of each feature split
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

X_train shape: (569, 7)
X_test shape: (143, 7)


In [17]:
# Chain the preprocessing step and the random forest into one estimator, so the
# scaler/encoder are fitted on the training split only and re-applied
# automatically at prediction time.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=100)),
])

# Fit preprocessing + classifier on the training split
pipeline.fit(X_train, y_train)
print("Model training completed.")

# Predict labels for the held-out rows; y_pred is kept for evaluation
y_pred = pipeline.predict(X_test)
print("Predictions on test set completed.")

Model training completed.
Predictions on test set completed.


In [18]:
# Evaluate the model
# `y_pred` already holds the test-set predictions computed right after the
# pipeline was fitted, so it is scored directly instead of running a second,
# identical `pipeline.predict(X_test)` pass.

# Calculate accuracy: the fraction of held-out rows whose predicted label
# matches the true label in y_test
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.76


In [19]:
# Save and load the model
# The whole fitted pipeline (preprocessing + classifier) is persisted, so the
# reloaded object accepts raw feature columns without any separate scaling or
# encoding step. `joblib` is imported in the notebook's import cell.
MODEL_PATH = 'titanic_model.pkl'
joblib.dump(pipeline, MODEL_PATH)

# NOTE: joblib.load unpickles arbitrary Python objects and can execute code
# while doing so; only load model files that you created or otherwise trust.
loaded_model = joblib.load(MODEL_PATH)

# Round-trip check: the reloaded pipeline must reproduce the in-memory
# pipeline's predictions on the held-out rows before success is reported.
assert np.array_equal(loaded_model.predict(X_test), pipeline.predict(X_test)), \
    "Reloaded model predictions differ from the in-memory pipeline"
print("Model saved and loaded successfully.")

Model saved and loaded successfully.


In [20]:
# Predict with the loaded model
# Build a one-row frame with the same column names the pipeline was fitted on
# (given here as a single record rather than as per-column lists).
sample_data = pd.DataFrame([
    {
        'Pclass': 3,
        'Sex': 'male',
        'Age': 22,
        'SibSp': 1,
        'Parch': 0,
        'Fare': 7.25,
        'Embarked': 'S',
    }
])

# predict() returns one label per input row; there is a single row here, so
# element 0 is the answer for this passenger.
prediction = loaded_model.predict(sample_data)
print(f"Prediction for sample data (1=Survived, 0=Did not survive): {prediction[0]}")

Prediction for sample data (1=Survived, 0=Did not survive): 0
