In [1]:
!pip install scikit-learn



In [3]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [4]:
import seaborn as sns
# Load seaborn's "titanic" example dataset; every later cell works from this `df`.
df = sns.load_dataset("titanic")

In [5]:
# Keep only the target plus the features used by the model.
selected_columns = [
    "survived",
    "pclass",
    "sex",
    "age",
    "sibsp",
    "parch",
    "fare",
    "embarked",
]
# .copy() makes the subset an independent frame: later cells assign to its
# columns, which would otherwise trigger pandas' SettingWithCopyWarning.
df = df[selected_columns].copy()

In [9]:
# Fill gaps: median for the numeric age, most frequent value for embarked.
# NOTE(review): both statistics are computed on the full frame, i.e. before
# the train/test split further down.
age_median = df["age"].median()
most_common_port = df["embarked"].mode()[0]

df["age"] = df["age"].fillna(age_median)
df["embarked"] = df["embarked"].fillna(most_common_port)

In [10]:
# Encode the two text features as integers.
# The numeric-dtype guard makes this cell safe to re-run: calling .map() on
# values that are already encoded matches none of the label keys and would
# turn the whole column into NaN.
for column, codes in (
    ("sex", {"male": 0, "female": 1}),
    ("embarked", {"C": 0, "Q": 1, "S": 2}),
):
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(codes)

In [11]:
# Separate the target column from the predictors.
y = df["survived"]
X = df.drop(columns=["survived"])

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [14]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Re-apply cleaning to the split features so the model only sees numeric, NaN-free input.
# (NaNs can appear e.g. when the encoding cell is run twice: mapping already-encoded values yields NaN.)

def clean_titanic(data):
    """Return a numeric, NaN-free copy of a Titanic feature frame.

    ``sex`` and ``embarked`` are converted to fixed integer codes when they
    still hold labels (object, category or pandas string dtype). Columns that
    are already numeric are left alone, so calling this on already-encoded
    data is safe.

    Remaining gaps are imputed per column rather than blanket-filled with 0,
    because 0 is a real code ('male' / 'C') and a misleading age:

    * ``sex`` / ``embarked``: most frequent value of the column,
    * other numeric columns: column median,
    * other non-numeric columns: most frequent value.

    A column with no usable values at all is still filled with 0, but a
    ``UserWarning`` is emitted, since the feature carries no information.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame containing at least ``sex`` and ``embarked`` columns.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy; ``data`` itself is not modified.
    """
    import warnings

    code_maps = {
        'sex': {'male': 0, 'female': 1},
        'embarked': {'C': 0, 'Q': 1, 'S': 2},
    }

    d = data.copy()

    for col, codes in code_maps.items():
        # Anything non-numeric still holds labels and needs encoding.
        if not pd.api.types.is_numeric_dtype(d[col]):
            # Going through object keeps a missing value missing; astype(str)
            # could turn it into the literal text 'nan'.
            d[col] = d[col].astype(object).map(codes)

    for col in d.columns:
        if not d[col].isna().any():
            continue

        present = d[col].dropna()
        if present.empty:
            # Nothing to impute from: keep the historical fill value but make
            # the problem visible instead of silently training on zeros.
            warnings.warn(
                f"clean_titanic: column {col!r} has no non-missing values; "
                "filling with 0. Reload the data and re-run preprocessing.",
                UserWarning,
                stacklevel=2,
            )
            fill_value = 0
        elif col not in code_maps and pd.api.types.is_numeric_dtype(d[col]):
            fill_value = present.median()
        else:
            fill_value = present.mode().iloc[0]

        d[col] = d[col].fillna(fill_value)

    return d

# Clean both splits with the same routine so training and evaluation share one encoding.
X_train_clean, X_test_clean = (clean_titanic(part) for part in (X_train, X_test))

# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression(max_iter=1000).fit(X_train_clean, y_train)
print("Model trained successfully!")

Model trained successfully!


In [17]:
# Evaluate on the held-out split, using its cleaned features.
y_pred = model.predict(X_test_clean)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy:", accuracy)

Accuracy: 0.7318435754189944


In [18]:
# Persist the fitted estimator to disk as a binary pickle.
with open("titanic_model.pkl", mode="wb") as model_file:
    pickle.dump(model, model_file)

print("Model saved.")

Model saved.


In [19]:
# Read the estimator back from titanic_model.pkl.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open("titanic_model.pkl", mode="rb") as model_file:
    loaded_model = pickle.load(model_file)

print("Model loaded.")

Model loaded.


In [24]:
# Sanity check: the reloaded model's score on the cleaned test split should
# equal the accuracy printed for the in-memory model.
loaded_model.score(X_test_clean, y_test)

0.7318435754189944

In [26]:
# Spot-check the reloaded model on the first five test rows.
# The cleaned frame is used because the raw X_test may still contain NaNs.
samples = X_test_clean.head(5)
predictions = loaded_model.predict(samples)

print("Predictions:", predictions)
print("Actual:", y_test.head(5).values)

Predictions: [0 0 0 1 0]
Actual: [1 0 0 1 1]


In [29]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the cleaned test features (the raw split may contain NaNs),
# then show the confusion matrix followed by the per-class report.
y_pred = loaded_model.predict(X_test_clean)

for report in (
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
):
    print(report)

[[94 11]
 [37 37]]
              precision    recall  f1-score   support

           0       0.72      0.90      0.80       105
           1       0.77      0.50      0.61        74

    accuracy                           0.73       179
   macro avg       0.74      0.70      0.70       179
weighted avg       0.74      0.73      0.72       179

