## Titanic survival
# Using logistic regression models to predict the survival chance of titanic based on different factors

In [76]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

In [77]:
full_titanic = pd.read_csv("../data/titanic.csv")
columns_to_select = ['Survived', 'Age', 'Sex']
titanic = full_titanic.loc[:, columns_to_select]
titanic.dropna(inplace=True)

In [78]:
## Preprocessing and enconding

for age in (0, 18, 25, 40, 60, 100):
    titanic.loc[(titanic['Age'] <= age), 'Age'] = age

categorical_features = ['Sex','Age']
categorical_encoder = OneHotEncoder(sparse_output=False)

preprocessor = ColumnTransformer(transformers=[
    ('categorical', categorical_encoder, categorical_features)
    ])

preprocessor.set_output(transform='pandas')
encoded_features = preprocessor.fit_transform(titanic)

label_encoder = LabelEncoder()
encoded_label = label_encoder.fit_transform(titanic['Survived'])
encoded_titanic = encoded_features.assign(Survived=encoded_label)

In [80]:
X = encoded_titanic[encoded_titanic.columns.difference(['Survived'])]
y = encoded_titanic['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Cant plot because x is multidimensional

# model metrics
from sklearn.metrics import mean_squared_error

mse = np.sqrt(mean_squared_error(y_test,y_pred))
print(f'Mean error: {mse:3.3} ({mse/np.mean(y_pred)*100:3.3}%)')

score = model.score(X_train,y_train)
print('Model determination: ', score)


Mean error: 0.473 (1.24%)
Model determination:  0.7813084112149533
