In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier

# Two-entry colour list: position 0 is red, position 1 is blue.
colors = ['red', 'blue']

# Load the seaborn Titanic sample, discard the columns that are not used for
# modelling, encode the two text columns as integers, and finally keep only
# the rows that have no missing values.
titanic_ds = (
    sns.load_dataset('titanic')
    .drop(columns=['embark_town', 'embarked', 'deck', 'class', 'alive'])
    .assign(
        sex=lambda frame: frame['sex'].map({'male': 0, 'female': 1}),
        who=lambda frame: frame['who'].map({'man': 0, 'woman': 1, 'child': 2}),
    )
    .dropna()
)

titanic_ds

In [None]:
# Target is the 'survived' column; every remaining column is a feature.
y = titanic_ds['survived']
X = titanic_ds.drop(columns='survived')

# Hold out 30% of the rows for evaluation, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=60,
)
X_train

In [None]:
# Fit a logistic regression on the training split (fit() returns the fitted
# estimator itself), then report mean accuracy on the held-out split.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
accuracy = model.score(X_test, y_test)
accuracy

In [None]:
# Seed the tree so that the fitted model -- and therefore the report printed
# below and the figures quoted in the narrative -- is the same on every run.
# The seed matches the one already used for the train/test split.
dtree = DecisionTreeClassifier(random_state=60)
dtree.fit(X_train, y_train)

# Per-class precision/recall/F1 plus overall accuracy on the held-out split.
predictions = dtree.predict(X_test)
print(classification_report(y_test, predictions))

## The overall accuracy obtained with the Decision Tree is good: the classification report shows 0.80. Even so, it is 0.04 lower than the accuracy we obtained with the Logistic Regression model.

In [None]:
# Preview the first five rows of the cleaned frame.
titanic_ds.head(n=5)

In [None]:
# Label the nodes with the columns the tree was actually fitted on.  The
# target column 'survived' is not a feature: listing it first would shift
# every feature label in the plot by one position.
fn = list(X.columns)
cn = ['Died', 'Survived']  # class 0 -> 'Died', class 1 -> 'Survived'

plt.figure(figsize=(80, 40))

# Trailing semicolon hides the list of annotation objects plot_tree returns.
plot_tree(dtree, filled=True, feature_names=fn, class_names=cn);

In [None]:
def _make_passenger(sex, age, who, adult_male, pclass=0, sibsp=0, parch=0, fare=25, alone=True):
    """Return a one-row DataFrame describing a hypothetical passenger.

    The columns are emitted in the order pclass, sex, age, sibsp, parch,
    fare, who, adult_male, alone, which is the order of the hand-written
    dictionaries this helper replaces.

    Args:
        sex: 0 for male, 1 for female (the encoding applied to the dataset).
        age: Age in years.
        who: 0 for man, 1 for woman, 2 for child (the dataset encoding).
        adult_male: Whether the passenger is an adult male.
        pclass: Passenger class.
        sibsp: Value for the 'sibsp' column.
        parch: Value for the 'parch' column.
        fare: Value for the 'fare' column.
        alone: Whether the passenger travels alone.

    Returns:
        A pandas DataFrame with exactly one row.
    """
    # NOTE(review): the default pclass of 0 is carried over unchanged from the
    # original cell; passenger classes in this dataset are presumably 1-3, so
    # 0 may lie outside the range the models were trained on -- confirm.
    return pd.DataFrame({
        'pclass': [pclass],
        'sex': [sex],
        'age': [age],
        'sibsp': [sibsp],
        'parch': [parch],
        'fare': [fare],
        'who': [who],
        'adult_male': [adult_male],
        'alone': [alone],
    })


avgMale = _make_passenger(sex=0, age=20, who=0, adult_male=True)
avgFemale = _make_passenger(sex=1, age=20, who=1, adult_male=False)
mySelfTest = _make_passenger(sex=0, age=19, who=0, adult_male=True)

# The model predicts the 'survived' column, so 0 means the passenger died and
# 1 means the passenger survived; the legend in the message must say so.
print(f'Do you survive the Titanic?\nSubject - Talon (Died = 0, Survived = 1): {dtree.predict(avgMale)}\nSubject - Mikhaela (Died = 0, Survived = 1): {dtree.predict(avgFemale)}\nSubject - Alejandro (Died = 0, Survived = 1): {dtree.predict(mySelfTest)}')

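## In this example, the model predicts that an adult male travelling alone on the Titanic had a higher chance of survival than an adult woman travelling alone under the same conditions. This suggests that women travelling alone had a lower chance of survival than men; the outcome changes when the woman was accompanied by someone else. (Note that the model predicts the `survived` column, so an output of 1 means the passenger survived and 0 means the passenger died.)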

In [None]:
# Re-split the data with a smaller hold-out set (10%) for the random forest.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1, random_state=60)

# Seed the forest so the fitted ensemble, and therefore the score shown
# below, is the same on every run.  The seed matches the split's seed.
forest = RandomForestClassifier(random_state=60)
forest.fit(X_train, y_train)

# Mean accuracy on the held-out split.
forest.score(X_test, y_test)

## The Random Forest Classifier seems to obtain the best score. This could be attributed to the nature of a random forest, where each tree votes and the votes are aggregated to produce the output. We also need to keep in mind that the test set is smaller here (10% of the data instead of 30%), which leaves fewer test samples on which the model can make errors, so this score is not directly comparable with the previous ones.

In [None]:
# Label the nodes with the columns the forest was actually fitted on.  The
# target column 'survived' is not a feature: listing it first would shift
# every feature label in the plot by one position.
fn = list(X.columns)
cn = ['Died', 'Survived']  # class 0 -> 'Died', class 1 -> 'Survived'

plt.figure(figsize=(80, 40))

# Only the first tree of the ensemble is drawn.  The trailing semicolon hides
# the list of annotation objects plot_tree returns.
plot_tree(forest.estimators_[0], filled=True, feature_names=fn, class_names=cn);

In [None]:
# Forest predictions for the hypothetical male and female passengers, printed
# side by side in that order.
print(*(forest.predict(passenger) for passenger in (avgMale, avgFemale)))

## Here we obtain the same results as before: the adult male travelling alone has a better chance of survival than the adult female travelling alone.