In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [3]:
# Load the passenger CSV and keep only the three model inputs plus the target column
titanic = pd.read_csv("./titanic_data.csv")[["Pclass", "Age", "Sex", "Survived"]]
titanic.head()

Unnamed: 0,Pclass,Age,Sex,Survived
0,3,22.0,male,0
1,1,38.0,female,1
2,3,26.0,female,1
3,1,35.0,female,1
4,3,35.0,male,0


In [4]:
# Encode the string-valued Sex column as integer category codes
titanic["Sex"] = titanic["Sex"].astype("category").cat.codes

# Count the missing values in each column
titanic.isnull().sum()

Pclass        0
Age         177
Sex           0
Survived      0
dtype: int64

In [5]:
# Clean the data: treat empty-string ages as missing, then drop rows without an age.
# The result is reassigned instead of calling inplace=True on a column selection:
# a chained in-place call operates on a temporary object under pandas Copy-on-Write
# and would leave the DataFrame unchanged (pandas 2.2 warns about exactly this).
titanic = titanic.assign(Age=titanic["Age"].replace("", np.nan)).dropna(subset=["Age"])
titanic.isnull().sum()

Pclass      0
Age         0
Sex         0
Survived    0
dtype: int64

In [6]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    titanic.drop(columns="Survived"),
    titanic["Survived"],
    test_size=0.2,
    random_state=1234,
)


In [7]:
# Build a logistic-regression classifier and fit it on the training split
# (fit() returns the estimator itself, so construction and training can be chained)
logReg = LogisticRegression(solver="lbfgs").fit(x_train, y_train)
logReg

## Let's use our trained model

We want to predict whether a 33-year-old woman and her 13-year-old son would survive if they were travelling in 2nd class on the Titanic.

In [8]:
# Feature vectors for the two passengers, in the training column order
# [Pclass, Age, Sex]; Sex uses the category codes from earlier (0 = female, 1 = male).
woman = np.array([2, 33, 0])
boy = np.array([2, 13, 1])

# Wrap the vectors in a DataFrame that carries the training column names, so
# scikit-learn does not warn that X lacks the feature names seen during fit.
passengers = pd.DataFrame([woman, boy], columns=x_train.columns)
result = logReg.predict(passengers)

print(f"Woman: {result[0]} \n Boy: {result[1]}")

Woman: 1 
 Boy: 0




### Scoring the model
This number is the model's accuracy: the fraction of passengers in the test set whose survival it predicts correctly.

In [10]:
# Mean accuracy on the held-out test set, rounded to two decimals.
# Built-in round() works whether score() returns a NumPy scalar or a plain
# Python float (a plain float has no .round() method).
round(logReg.score(x_test, y_test), 2)

0.78