## Data Profiling for Titanic Dataset
* Inspect the dataset
* Get rid of empty values

In [9]:
import pandas as pd

In [10]:
# Load the Titanic training data from ./raw/train.csv into a DataFrame.
train_df = pd.read_csv("./raw/train.csv")

In [11]:
# Preview the first five rows of the training data.
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [12]:
# Load ./raw/gender_submission.csv into a DataFrame
# (presumably a sample submission file -- confirm against the data source).
gender_sub = pd.read_csv("./raw/gender_submission.csv")

In [13]:
# Preview the first five rows of the gender_submission data.
gender_sub.head(n=5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [14]:
# Size of the training frame as (number of rows, number of columns).
train_df.shape

(891, 12)

In [15]:
# Column labels available in the training frame.
train_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [43]:
# Random question: how many people survived?
# Select the PassengerId of every row where Survived == 1; the length of the
# displayed Series is the number of survivors.
series = train_df["PassengerId"]
series[train_df["Survived"].eq(1)]

1        2
2        3
3        4
8        9
9       10
      ... 
875    876
879    880
880    881
887    888
889    890
Name: PassengerId, Length: 342, dtype: int64

In [23]:
# What's the survival ratio?
# Computed here as survivors divided by non-survivors (i.e. the odds of
# surviving), not as the fraction of all passengers who survived.
ratio = (
    len(series[train_df["Survived"] == 1])
    / len(series[train_df["Survived"] == 0])
)
ratio
# Open question: how to iterate over the values of a Series selected with .loc?

0.6229508196721312

In [48]:
# Male passengers who survived.
# Boolean masks must be combined with the element-wise `&` operator; using
# Python's `and` on two Series is what raises
# "The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all()."
train_df[train_df["Survived"].eq(1) & train_df["Sex"].eq("male")]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
21,22,1,2,"Beesley, Mr. Lawrence",male,34.0,0,0,248698,13.0000,D56,S
23,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5000,A6,S
36,37,1,3,"Mamee, Mr. Hanna",male,,0,0,2677,7.2292,,C
55,56,1,1,"Woolner, Mr. Hugh",male,,0,0,19947,35.5000,C52,S
...,...,...,...,...,...,...,...,...,...,...,...,...
838,839,1,3,"Chip, Mr. Chang",male,32.0,0,0,1601,56.4958,,S
839,840,1,1,"Marechal, Mr. Pierre",male,,0,0,11774,29.7000,C47,C
857,858,1,1,"Daly, Mr. Peter Denis",male,51.0,0,0,113055,26.5500,E17,S
869,870,1,3,"Johnson, Master. Harold Theodor",male,4.0,1,1,347742,11.1333,,S


In [102]:
# How many surviving passengers share the same family name?
# Only rows where SibSp > 0 AND Parch > 0 AND Survived == 1 are considered
# (all three conditions must hold).
name = train_df["Name"]
survived_name_with_family_onboard = name[
    train_df["SibSp"].gt(0) & train_df["Parch"].gt(0) & train_df["Survived"].eq(1)
]
# The family name is the part of "Name" before the first comma.
last_name = survived_name_with_family_onboard.map(
    lambda full_name: full_name.partition(",")[0]
)
# Number of selected passengers per family name.
last_name.groupby(last_name).count()

Name
Abbott              1
Allison             1
Andersson           1
Asplund             3
Baclini             3
Becker              2
Beckwith            2
Brown               1
Caldwell            1
Carter              4
Christy             1
Collyer             1
Compton             1
Coutts              2
Davies              1
Dean                1
Drew                1
Fortune             2
Frolicher-Stehli    1
Goldsmith           1
Hamalainen          1
Hart                1
Hays                1
Herman              2
Hocking             1
Jacobsohn           1
Johnson             2
Laroche             2
Moubarek            2
Nakid               1
Navratil            2
Peter               1
Quick               1
Richards            3
Ryerson             2
Sandstrom           1
Spedden             1
Taussig             1
Thayer              1
Wells               1
West                2
Wick                1
Name: Name, dtype: int64

In [103]:
# Find out who they are: show the full names of the survivors selected by the
# SibSp > 0 / Parch > 0 / Survived == 1 filter.
survived_name_with_family_onboard

10                       Sandstrom, Miss. Marguerite Rut
25     Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...
43              Laroche, Miss. Simonne Marie Anne Andree
58                          West, Miss. Constance Mirium
65                              Moubarek, Master. Gerios
                             ...                        
831                      Richards, Master. George Sibley
835                          Compton, Miss. Sara Rebecca
856           Wick, Mrs. George Dennick (Mary Hitchcock)
869                      Johnson, Master. Harold Theodor
871     Beckwith, Mrs. Richard Leonard (Sallie Monypeny)
Name: Name, Length: 62, dtype: object

In [None]:
# TODO: group passengers by family name, mapping each family to its members:
# <family name> -> member A, member B, member C, ...