In [1]:
# Project libs
from titanic.data import download_data

# Python built-in libs 
import os
import os.path as osp
from typing import (
    List, 
    Tuple, 
    Dict, 
    Any, 
    Union, 
    Optional, 
    Callable, 
)
# Data Science libs
import pandas as pd
import numpy as np

# Data downloading

# Complexity 1 (35%)




## 1. Load the train.csv dataset with index_col for PassengerId

In [2]:
# Root of the data folder, given relative to the current working directory.
data_dir = '../data'
# Read the raw training set and use PassengerId as the index at parse time
# (index_col), instead of re-indexing the frame in place after loading.
df = pd.read_csv(osp.join(data_dir, 'raw', 'train.csv'), index_col='PassengerId')
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 2. Convert column names to lowercase


In [3]:
# Lowercase every column label on the existing frame; the index is not touched.
df.columns = df.columns.str.lower()
df.head()

Unnamed: 0_level_0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 3. Explore the dataset and identify the variables that have missing data.


In [4]:
# Number of missing values in each column ...
nans_sum = df.isnull().sum(axis=0)
# ... restricted to the columns that actually contain missing values.
nans_sum.loc[nans_sum.gt(0)]

age         177
cabin       687
embarked      2
dtype: int64

## 5. Filter the data (evaluate whether by rows or by columns) and obtain the records that
do not have missing data


There are 183 records that don't have missing data

In [5]:
# Keep only the rows (records) in which no column is missing.
df.dropna(axis='index', how='any')

Unnamed: 0_level_0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S
12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
...,...,...,...,...,...,...,...,...,...,...,...
872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S
873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B51 B53 B55,S
880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


## 6. What is the largest number of records (values) for the Survived variable?
(0.1)


549 people did not survive; 342 did

In [6]:
# Tally the passengers per outcome (0 = did not survive, 1 = survived).
df.loc[:, 'survived'].value_counts()

0    549
1    342
Name: survived, dtype: int64

## 7. Who survived more, women or men?


68% of the people who survived were women, consistent with the 'children and women first' policy.

In [7]:
# Row-normalised contingency table: for each survival outcome, the share of
# women and men, expressed as a percentage (each row sums to 100).
pd.crosstab(index=df['survived'], columns=df['sex'], normalize='index').mul(100)

sex,female,male
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,14.754098,85.245902
1,68.128655,31.871345


## 8. Calculate the average passenger fare for class 1 (pclass)

In [8]:
# Mean fare over the passengers whose pclass is 1, selected in a single .loc step.
df.loc[df['pclass'].eq(1), 'fare'].mean()

84.1546875

The average passenger fare for pclass 1 is 84.15

In [9]:
# Persist the frame under <data_dir>/processed/train.csv.
# to_csv does not create missing parent folders, so make sure the target
# directory exists first; exist_ok=True keeps re-runs of this cell harmless.
processed_dir = osp.join(data_dir, 'processed')
os.makedirs(processed_dir, exist_ok=True)
df.to_csv(osp.join(processed_dir, 'train.csv'))