In [77]:
import seaborn as sns
import numpy as np
import pandas as pd
from warnings import filterwarnings
filterwarnings('ignore')

### Loading Dataset

In [2]:
df = sns.load_dataset('titanic')

In [3]:
df.head(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False


In [4]:
df.deck.value_counts()

deck
C    59
B    47
D    33
E    32
A    15
F    13
G     4
Name: count, dtype: int64

In [5]:
df.shape

(891, 15)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [7]:
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [8]:
df[df['age'].isnull()]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
17,1,2,male,,0,0,13.0000,S,Second,man,True,,Southampton,yes,True
19,1,3,female,,0,0,7.2250,C,Third,woman,False,,Cherbourg,yes,True
26,0,3,male,,0,0,7.2250,C,Third,man,True,,Cherbourg,no,True
28,1,3,female,,0,0,7.8792,Q,Third,woman,False,,Queenstown,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,0,3,male,,0,0,7.2292,C,Third,man,True,,Cherbourg,no,True
863,0,3,female,,8,2,69.5500,S,Third,woman,False,,Southampton,no,False
868,0,3,male,,0,0,9.5000,S,Third,man,True,,Southampton,no,True
878,0,3,male,,0,0,7.8958,S,Third,man,True,,Southampton,no,True


In [9]:
# We fill all null ages with a mean of age column

df['age'] = df['age'].fillna(df['age'].mean())

In [10]:
# We fill embark_town with mode since it is categorical datatype

df['embark_town'] = df['embark_town'].fillna(df['embark_town'].mode)

In [11]:
# Either we fill embarked or just drop that column it's up to you because embarked and embark_town both are the same.
df['embarked'] = df['embarked'].fillna(df['embarked'].mode)

In [12]:
df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,13.002015,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,22.0,0.0,0.0,7.9104
50%,0.0,3.0,29.699118,0.0,0.0,14.4542
75%,1.0,3.0,35.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


- 1.	What is the total number of passengers in the dataset?

In [13]:
df.shape[0]

891

- 2. How many passengers survived the sinking?

In [14]:
df['survived'].value_counts()

survived
0    549
1    342
Name: count, dtype: int64

In [15]:
df[df.survived == 1].shape[0]

342

3. What is the average age of passengers on the Titanic?

In [16]:
round(df.age.mean(),0)

30.0

In [17]:
round(df.age.mean())

30

4.	How many passengers traveled in each passenger class (1st, 2nd, 3rd)?

In [18]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [19]:
df['class'].value_counts()

class
Third     491
First     216
Second    184
Name: count, dtype: int64

In [20]:
df.pclass.value_counts()

pclass
3    491
1    216
2    184
Name: count, dtype: int64

5.	What is the distribution of passengers by gender?

In [21]:
df['sex'].value_counts()

sex
male      577
female    314
Name: count, dtype: int64

In [22]:
df.sex.value_counts()

sex
male      577
female    314
Name: count, dtype: int64

6.	What is the survival rate among male passengers?

In [23]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [24]:
male_pass = df[(df.sex== 'male')].shape[0]
male_pass

577

In [25]:
male_sur = df[(df.sex=='male')& (df.survived==1)].shape[0]
male_sur

109

In [26]:
sur_rate= (male_sur/male_pass)*100
sur_rate

18.890814558058924

7.	What is the survival rate among female passengers?

In [27]:
female_pass = df[df.sex=='female'].shape[0]
female_pass

314

In [28]:
female_sur= df[(df.sex=='female')&(df.survived==1)].shape[0]
female_sur

233

In [29]:
sur_rate = (female_sur/female_pass)*100
sur_rate

74.20382165605095

8.	How many passengers embarked from each port (C = Cherbourg, Q = Queenstown, S = Southampton)?

In [30]:
df['embarked'].value_counts()

embarked
S                                                                                                                                                                                         644
C                                                                                                                                                                                         168
Q                                                                                                                                                                                          77
<bound method Series.mode of 0      S\n1      C\n2      S\n3      S\n4      S\n      ..\n886    S\n887    S\n888    S\n889    C\n890    Q\nName: embarked, Length: 891, dtype: object>      2
Name: count, dtype: int64

9.	What is the survival rate for passengers who embarked from Cherbourg?

In [31]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [32]:
emb_pass= df[df.embarked=='C'].shape[0]
emb_pass

168

In [33]:
emb_sur = df[(df.embarked=='C')&(df.survived==1)].shape[0]
emb_sur

93

In [34]:
sur_rate =(emb_sur/emb_pass)*100
sur_rate

55.35714285714286

10.	How many passengers had siblings or spouses aboard the Titanic?

In [35]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [36]:
siblings =df[df.sibsp==1].shape[0]
siblings

209

11.	What is the survival rate for passengers who had siblings or spouses aboard?

In [37]:
survival_rate = df.groupby('sibsp')['survived'].mean()
survival_rate*100

sibsp
0    34.539474
1    53.588517
2    46.428571
3    25.000000
4    16.666667
5     0.000000
8     0.000000
Name: survived, dtype: float64

In [38]:
siblsp_pass= df[df.sibsp!=0].shape[0]
siblsp_pass

283

In [39]:
siblsp_suv=df[(df.sibsp!=0) & (df.survived==1)].shape[0]
siblsp_suv

132

In [40]:
sur_rate = round((siblsp_suv/siblsp_pass)*100,2)
sur_rate

46.64

12.	How many passengers had parents or children aboard the Titanic?

In [41]:
parchl= df[df.parch!=0].shape[0]
parchl

213

In [42]:
df.parch.value_counts()

parch
0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: count, dtype: int64

13.	What is the survival rate for passengers who had parents or children aboard?

In [43]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [44]:
parch_pass = df[df.parch !=0].shape[0]
parch_pass

213

In [45]:
parch_sur = df[(df.parch !=0)& (df.survived==1)].shape[0]
parch_sur

109

In [46]:
sur_rate = (parch_sur/parch_pass)*100
sur_rate

51.173708920187785

14.	What is the average fare paid by passengers?

In [47]:
Avg_fare = df['fare'].mean()
Avg_fare

32.204207968574636

15.	What is the highest fare paid for a ticket?

In [48]:
Max_fare = df['fare'].max()
Max_fare

512.3292

16.	What is the survival rate for passengers who paid the highest fare?

In [49]:
Sur_rate = df[(df['fare']==Max_fare) & (df.survived==1)].shape[0]
Sur_rate

3

In [50]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [51]:
max_fare = df[df.fare==Max_fare].shape[0]
max_fare

3

In [52]:
sur = df[(df.fare==512.3292) & (df.survived==1)].shape[0]
sur

3

In [53]:
sur_rate = (sur/max_fare)*100
sur_rate

100.0

17.	What is the median age of male passengers?

In [54]:
median_age = df['age'].median()
median_age

29.69911764705882

In [55]:
male = df[df.sex=='male']
male_median = male['age'].median()
male_median

29.69911764705882

 18.	What is the median age of female passengers?

In [56]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [57]:
median_female = df.groupby('sex')['age'].median()
round(median_female,0)

sex
female    30.0
male      30.0
Name: age, dtype: float64

In [58]:
female = df[df.sex=='female']
female_median = female['age'].median()
female_median

29.69911764705882

19.	What is the survival rate for passengers under the age of 18?

In [59]:
Age_bel18 = df[df.age<18].shape[0]
Age_bel18

113

In [60]:
sur = df[(df.age<18) & (df.survived==1)].shape[0]
sur

61

In [61]:
sur_rate = (sur/Age_bel18)*100
sur_rate

53.98230088495575

20.	What is the survival rate for passengers aged 18-30?

In [62]:
age_18_30 = df[(df.age>=18) & (df.age<=30)].shape[0]
age_18_30

473

In [63]:
sur = df[(df.age>=18) & (df.age<=30) & (df.survived==1)].shape[0]
sur

157

In [64]:
sur_rate = (sur/age_18_30)*100
sur_rate

33.192389006342495

21.	What is the survival rate for passengers aged 30-50?

In [65]:
age_grp_30_50 = df[(df['age'] >= 30) & (df['age'] <= 50)]
survival_rate = age_grp_30_50[age_grp_30_50.survived == 1].shape[0] / age_grp_30_50.shape[0]
print(survival_rate*100)

42.10526315789473


22.	What is the survival rate for passengers over the age of 50?

In [66]:
above_50 = df[df.age > 50]
survival_rate = above_50[above_50.survived == 1].shape[0] / above_50.shape[0]
print(survival_rate*100)

34.375


23.	How many passengers had cabins assigned?

In [67]:
df.head(20)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,29.699118,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False


In [68]:
pass_having_cabins = df['deck'].value_counts().sum()
pass_having_cabins

203

24.	What is the survival rate for passengers with cabins assigned?

In [69]:
cab_pass = df[~(df.deck.isnull())].shape[0]
cab_pass

203

In [70]:
sur = df[(~(df.deck.isnull()) & (df.survived==1))].shape[0]
sur

136

In [71]:
sur_rate = (sur/cab_pass)*100
sur_rate

66.99507389162561

25.	What is the survival rate for passengers with missing cabin data?

In [72]:
null_df = df[df.deck.isnull()]

In [73]:
survival_rate = null_df[null_df.survived == 1].shape[0]/null_df.shape[0]
print(survival_rate * 100)

29.941860465116278


28.	What is the average age of passengers in each passenger class?

In [78]:
df.groupby("class")["age"].mean()

class
First     37.048118
Second    29.866958
Third     26.403259
Name: age, dtype: float64

29.	What is the survival rate for passengers in each passenger class?

In [79]:
df.groupby('class')['survived'].mean() * 100

class
First     62.962963
Second    47.282609
Third     24.236253
Name: survived, dtype: float64

30.	What is the survival rate for passengers traveling alone (without siblings, spouses, parents, or children)?


In [80]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [81]:
df.alone.value_counts()

alone
True     537
False    354
Name: count, dtype: int64

In [82]:
alone_pass = df[df.alone==True].shape[0]
alone_pass

537

In [83]:
sur = df[(df.alone==True) & (df.survived==1)].shape[0]
sur

163

In [84]:
sur_rate = round((sur/alone_pass)*100,2)
sur_rate

30.35