In [4]:
# Project libs
from titanic.data import download_data

# Python built-in libs 
import os
import os.path as osp
from typing import (
    List, 
    Tuple, 
    Dict, 
    Any, 
    Union, 
    Optional, 
    Callable, 
)
# Data Science libs
import pandas as pd
import numpy as np
import seaborn as sns

In [5]:
# Root folder of the project's data, relative to this notebook.
data_dir = '../data'
train_csv = osp.join(data_dir, 'processed', 'train.csv')

# Use the PassengerId column as the row index instead of a regular column.
df = pd.read_csv(train_csv, index_col='PassengerId')
df

Unnamed: 0_level_0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Complexity 2 (35%)




## 9. Is there something particular between the age of the people and the variable Survived?


In [16]:
# Overlaid violin plots of the `age` column, one per value of `survived`.
# NOTE(review): `px` is not imported in the visible import cell; presumably it
# is `plotly.express` imported in a cell not shown here. Confirm before running.
violin_kwargs = dict(y="age", color="survived", violinmode="overlay")
fig = px.violin(df, **violin_kwargs)

fig.show()

Distribution of age (violin plot), split by whether the passenger survived or not.

There is a considerable concentration of younger people (children) among the survivors.

## 10. Did large families survive more? SibSp + Parch

In [19]:
# Family-size feature: element-wise sum of the `sibsp` and `parch` columns.
relatives_aboard = df['sibsp'].add(df['parch'])
df['familysz'] = relatives_aboard
df['familysz'].head()


PassengerId
1    1
2    1
3    0
4    1
5    0
Name: familysz, dtype: int64

In [40]:
def _outcome_label(flag) -> str:
    """Map a truthy `survived` value to 'perc_survived', otherwise 'perc_died'."""
    if bool(flag):
        return 'perc_survived'
    return 'perc_died'


# Relabel the outcome so the crosstab columns get readable names.
outcome_labels = df['survived'].apply(_outcome_label)

# Each row (one family size) is normalised to sum to 1, then scaled to percent.
family_outcome_share = pd.crosstab(
    df['familysz'],
    outcome_labels,
    normalize='index',
)
counts = family_outcome_share * 100
counts

survived,perc_died,perc_survived
familysz,Unnamed: 1_level_1,Unnamed: 2_level_1
0,69.646182,30.353818
1,44.720497,55.279503
2,42.156863,57.843137
3,27.586207,72.413793
4,80.0,20.0
5,86.363636,13.636364
6,66.666667,33.333333
7,100.0,0.0
10,100.0,0.0


In [53]:
# Family-size categories keyed by label. Each value is a (lower, upper) pair
# that `bin_func` treats as the half-open interval lower <= x < upper.
# The bounds are annotated as float because the open-ended upper bound is
# `np.inf`, which is a float rather than an int.
bins: Dict[str, Tuple[float, float]] = {
    'zero': (0, 1),
    'medium': (1, 4),
    'large': (4, np.inf),
}

def bin_func(x: float, bins: Dict[str, Tuple[float, float]]) -> Optional[str]:
    """Return the label of the first interval in `bins` that contains `x`.

    Args:
        x: Value to classify.
        bins: Mapping from label to a ``(lower, upper)`` pair, interpreted as
            the half-open interval ``lower <= x < upper``. Intervals are
            checked in the mapping's iteration order.

    Returns:
        The label of the first matching interval, or ``None`` when no interval
        matches. This includes NaN input, for which every comparison is False.
    """
    for label, (lower, upper) in bins.items():
        if lower <= x < upper:
            return label
    # No interval matched (value out of range or NaN): report it explicitly
    # instead of relying on the implicit fall-through return.
    return None

# Label each passenger's family size with its category from `bins`;
# the keyword argument is forwarded by `Series.apply` to `bin_func`.
df['cat_familysz'] = df['familysz'].apply(bin_func, bins=bins)

In [54]:
# Survival split per family-size category; rows are normalised to sum to 1
# and then scaled to percent.
family_cat_share = pd.crosstab(
    index=df['cat_familysz'],
    columns=df['survived'],
    normalize='index',
)
family_cat_share * 100

survived,0,1
cat_familysz,Unnamed: 1_level_1,Unnamed: 2_level_1
large,83.870968,16.129032
medium,42.123288,57.876712
zero,69.646182,30.353818


We defined 'large' families as those with a `familysz` (`sibsp` + `parch`) greater than or equal to 4. About 84% of the individuals from these families did not survive.

## 11. For this data set, How true is it that "women and children first…"?


Yes. To check this, we defined 5 age categories, specified by the ranges below.

In [58]:
# Age categories keyed by label. Each value is a (lower, upper) pair that
# `bin_func` treats as the half-open interval lower <= age < upper.
# The mapping goes from label (str) to bounds (tuple), and the open-ended
# upper bound `np.inf` is a float.
age_mapper: Dict[str, Tuple[float, float]] = {
    'child': (0, 13),
    'teen': (13, 22),
    'young': (22, 30),
    'adult': (30, 50),
    'old': (50, np.inf),
}

# Label each passenger's age with its category from `age_mapper`;
# the keyword argument is forwarded by `Series.apply` to `bin_func`.
df['age_group'] = df['age'].apply(bin_func, bins=age_mapper)

If we look at the number of survivors broken down by these two variables (age group and sex), we can see that more women than men survived in every age group except 'child' (19 vs. 21), with a particularly large gap in the 'teen' and 'old' categories.

In [80]:
# Number of survivors per (age group, sex) cell: the `survived` values are
# summed within each cell of the cross-tabulation.
# The aggregation is passed by name ('sum') instead of the Python builtin
# `sum`, because recent pandas versions emit a FutureWarning when a builtin
# callable is used as an aggregation function.
age_sex_counts = pd.crosstab(
    df['age_group'],
    df['sex'],
    values=df['survived'],
    aggfunc='sum'
)

age_sex_counts

sex,female,male
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1
adult,72,35
child,19,21
old,20,7
teen,38,9
young,48,21


And globally:

In [81]:
# Collapse the age groups: total survivors per sex (sum down the rows).
age_sex_counts.sum(axis=0)

sex
female    197
male       93
dtype: int64

Furthermore, the age group with the highest percentage of survivors is 'child'.

In [82]:
# Survival split per age group; rows are normalised to sum to 1 and then
# scaled to percent.
age_group_share = pd.crosstab(
    index=df['age_group'],
    columns=df['survived'],
    normalize='index',
)
age_group_share * 100

survived,0,1
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1
adult,58.203125,41.796875
child,42.028986,57.971014
old,63.513514,36.486486
teen,65.185185,34.814815
young,61.666667,38.333333


In [84]:
# Persist the frame, including the columns added in this notebook.
# NOTE: this is the same path the data was loaded from at the top of the
# notebook, so the file on disk is overwritten in place.
processed_train_path = osp.join(data_dir, 'processed', 'train.csv')
df.to_csv(processed_train_path)