In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the passenger table; the relative path expects titanic.csv in the
# notebook's working directory.
df = pd.read_csv("titanic.csv")

In [4]:
# Preview the loaded frame; pandas truncates long frames to the first and
# last rows in the rendered output.
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(418, 12)

In [6]:
# Summary statistics for the numeric columns. The lower `count` for Age and
# Fare reflects missing values in those columns.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,0.363636,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.481622,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,0.0,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,0.0,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,1.0,3.0,39.0,1.0,0.0,31.5
max,1309.0,1.0,3.0,76.0,8.0,9.0,512.3292


In [7]:
# Column dtypes as inferred by read_csv; text columns are stored as `object`.
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [8]:
# Missing values per column: isna() flags each missing cell and sum() counts
# the flags column by column.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [9]:
# isnull() is pandas' alias for isna(), so this repeats the missing-value
# count from the previous cell.
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [10]:
# Distinct non-null values per column. nunique() does not count rows; the
# passenger count is visible indirectly because PassengerId has as many
# distinct values as the frame has rows.

df.nunique()

PassengerId    418
Survived         2
Pclass           3
Name           418
Sex              2
Age             79
SibSp            7
Parch            8
Ticket         363
Fare           169
Cabin           76
Embarked         3
dtype: int64

In [61]:
# Survivor count.
# Summing the boolean mask counts only the True rows (True == 1); count()
# would instead tally every non-null entry, True and False alike.

df["Survived"].eq(1).sum()

152

In [25]:
# Passenger class (Pclass) breakdown of the survivors.

df_survived = df.loc[df["Survived"].eq(1)]
df_survived["Pclass"].value_counts()

3    72
1    50
2    30
Name: Pclass, dtype: int64

In [58]:
# Number of female passengers on board.
df["Sex"].eq("female").sum()

152

In [54]:
# Female survivors. count() reports non-null cells per column, so the number
# of matching rows is the figure shown for columns without missing values
# (e.g. PassengerId); Age and Cabin come out lower because they have gaps.
df.loc[df["Survived"].eq(1) & df["Sex"].eq("female")].count()

PassengerId    152
Survived       152
Pclass         152
Name           152
Sex            152
Age            127
SibSp          152
Parch          152
Ticket         152
Fare           152
Cabin           44
Embarked       152
dtype: int64

In [59]:
# Male survivors. Every column counts zero matching rows in this dataset.
# NOTE(review): survivors, females and female survivors all total 152 in the
# cells above, i.e. Survived lines up exactly with Sex == "female" here -
# worth confirming where the Survived labels in this file came from.
df.loc[df["Survived"].eq(1) & df["Sex"].eq("male")].count()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [68]:
# Passengers younger than 16 (rows with a missing Age never satisfy the
# comparison, so they are left out).

df.loc[df["Age"].lt(16)].count()

PassengerId    32
Survived       32
Pclass         32
Name           32
Sex            32
Age            32
SibSp          32
Parch          32
Ticket         32
Fare           32
Cabin           4
Embarked       32
dtype: int64

In [74]:
# Passengers older than 65 (strictly greater; a missing Age never matches).

df.loc[df["Age"].gt(65)].count()

PassengerId    2
Survived       2
Pclass         2
Name           2
Sex            2
Age            2
SibSp          2
Parch          2
Ticket         2
Fare           2
Cabin          2
Embarked       2
dtype: int64

In [71]:
# Passengers with no cabin recorded (Cabin is missing).

df["Cabin"].isna().sum()

327

In [72]:
# Passengers with a cabin recorded (Cabin is present).

df["Cabin"].notna().sum()

91

In [86]:
# Male passengers with a cabin recorded.

(df["Cabin"].notna() & df["Sex"].eq("male")).sum()

47

In [87]:
# Female passengers with a cabin recorded.
(df["Cabin"].notna() & df["Sex"].eq("female")).sum()

44

In [93]:
# Bucket passengers into labelled age groups.
# With right=False every bin is closed on the left and open on the right:
#   Infant          [0, 6)
#   Child           [6, 16)
#   Youth           [16, 20)
#   Adult           [20, 65)
#   Senior Citizen  [65, 110)
# Rows with a missing Age get a missing AgeGroup.

bins = [0, 6, 16, 20, 65, 110]
labels = [
    "Infant",
    "Child",
    "Youth",
    "Adult",
    "Senior Citizen",
]
df["AgeGroup"] = pd.cut(x=df["Age"], bins=bins, right=False, labels=labels)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,Adult
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,Adult
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Adult
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,Adult
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,Adult
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,
414,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,Adult
415,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,Adult
416,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,


In [100]:
# Survivors per AgeGroup, leaving out passengers whose age (and therefore
# AgeGroup) is missing.
# A single .loc lookup replaces the chained df[col][mask] indexing, and
# value_counts(dropna=True) discards the missing groups itself, so no
# separate notnull() filter is needed.

df.loc[df["Survived"].eq(1), "AgeGroup"].value_counts(dropna=True)

Adult             98
Youth             15
Infant             7
Child              6
Senior Citizen     1
Name: AgeGroup, dtype: int64