In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Graphics in SVG format are more sharp and legible
%config InlineBackend.figure_format = 'svg'
pd.set_option("display.precision", 2)

In [2]:
data = pd.read_csv("titanic_train.csv", index_col="PassengerId")

In [27]:
data.head(5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_category,Age_category_forty
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1.0,1.0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C,2.0,1.0
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S,1.0,1.0
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2.0,1.0
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,2.0,1.0


In [4]:
def age_category(age):
    """
    < 30 -> 1
    >= 30, <55 -> 2
    >= 55 -> 3
    """
    if age < 30:
        return 1
    elif age <= 60:
        return 2
    elif age > 60:
        return 3


In [5]:
def age_category_forty(age):
    """
    < 30 -> 1
    >= 30, <55 -> 2
    >= 55 -> 3
    """
    if age <= 40:
        return 1
    elif age > 40:
        return 2

In [6]:
age_categories = [age_category(age) for age in data.Age]
age_categories_forty = [age_category_forty(age) for age in data.Age]
data["Age_category"] = age_categories
data["Age_category_forty"] = age_categories_forty

1. How many men/women were there onboard?

In [7]:
f"male = {data[data['Sex'] == 'male']['Name'].count()}"

'male = 577'

In [8]:
f"female = {data[data['Sex'] == 'female']['Name'].count()}"

'female = 314'

In [9]:
f"female {data.groupby('Sex').count().iloc[0,0]}"

'female 314'

In [10]:
f"male {data.groupby('Sex').count().iloc[1,0]}"

'male 577'

2. Print the distribution of the Pclass feature. Then the same, but for men and women separately. How many men from second class were there onboard?

In [11]:
data.groupby('Pclass')['Name'].count()

Pclass
1    216
2    184
3    491
Name: Name, dtype: int64

In [12]:
data.groupby(['Pclass', 'Sex']).count().iloc[3, 0]

108

3. What are median and standard deviation of Fare?. Round to two decimals.

In [13]:
data.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Age_category,Age_category_forty
count,891.0,891.0,714.0,891.0,891.0,891.0,714.0,714.0
mean,0.38,2.31,29.7,0.52,0.38,32.2,1.49,1.21
std,0.49,0.84,14.53,1.1,0.81,49.69,0.56,0.41
min,0.0,1.0,0.42,0.0,0.0,0.0,1.0,1.0
25%,0.0,2.0,20.12,0.0,0.0,7.91,1.0,1.0
50%,0.0,3.0,28.0,0.0,0.0,14.45,1.0,1.0
75%,1.0,3.0,38.0,1.0,0.0,31.0,2.0,1.0
max,1.0,3.0,80.0,8.0,6.0,512.33,3.0,2.0


In [14]:
f"std - {data['Fare'].std()}"

'std - 49.6934285971809'

In [15]:
f"median - {data['Fare'].median()}"

'median - 14.4542'

4. Is that true that the mean age of survived people is higher than that of passengers who eventually died?

In [16]:
not_survived_mean_age = data.groupby('Survived')['Age'].mean().iloc[0]
survived_mean_age = data.groupby('Survived')['Age'].mean().iloc[1]

f"groupby - {survived_mean_age > not_survived_mean_age}"

'groupby - False'

In [17]:
not_survived_mean_age = data[data['Survived'] == 0]['Age'].mean()
survived_mean_age = data[data['Survived'] == 1]['Age'].mean()

f"index - {survived_mean_age > not_survived_mean_age}"

'index - False'

5. Is that true that passengers younger than 30 y.o. survived more frequently than those older than 60 y.o.? What are shares of survived people among young and old people?

In [18]:
survived_age_frequency = data.groupby('Age_category')
for age_cat, sub_df in survived_age_frequency:
    print(age_cat)
    print((sub_df[sub_df['Survived'] == 1]['Survived'].count()/sub_df['Survived'].count()).round(3)*100)

1.0
40.6
2.0
41.9
3.0
22.7


6. Is that true that women survived more frequently than men? What are shares of survived people among men and women?

In [19]:
survived_sex_frequency = data.groupby('Sex')
for sex_cat, sub_df in survived_sex_frequency:
    print(sex_cat)
    print((sub_df[sub_df['Survived'] == 1]['Survived'].count()/sub_df['Survived'].count()).round(3)*100)

female
74.2
male
18.9


7. What's the most popular first name among male passengers?

In [20]:
data.loc[(data.Name.str.contains('William ')) & (data.Sex == 'male')]['Survived'].count()

33

In [21]:
data.loc[(data.Name.str.contains('John ')) & (data.Sex == 'male')]['Survived'].count()

20

In [22]:
data.loc[(data.Name.str.contains('Thomas ')) & (data.Sex == 'male')]['Survived'].count()

11

In [23]:
data.loc[(data.Name.str.contains('Charles ')) & (data.Sex == 'male')]['Survived'].count()

13

8. How is average age for men/women dependent on Pclass? Choose all correct statements:

In [36]:
group_by_pclass_and_sex = data.groupby(['Pclass', 'Sex'])

On average, men of 1 class are older than 40,
On average, women of 1 class are older than 40

In [49]:
for pclass_sex, sub_df in group_by_pclass_and_sex:
    print(pclass_sex)
    print(f"young - {sub_df[sub_df['Age_category_forty'] == 1]['Survived'].count()}")
    print(f"old   - {sub_df[sub_df['Age_category_forty'] == 2]['Survived'].count()}")

(1, 'female')
young - 59
old   - 26
(1, 'male')
young - 51
old   - 50
(2, 'female')
young - 61
old   - 13
(2, 'male')
young - 78
old   - 21
(3, 'female')
young - 93
old   - 9
(3, 'male')
young - 222
old   - 31


Men of all classes are on average older than women of the same class

In [52]:
group_by_pclass = data.groupby('Pclass')

In [56]:
for pclass, sub_df in group_by_pclass:
    print(pclass)
    print(sub_df[(sub_df['Sex'] == 'male')]['Age'].mean() > sub_df[(sub_df['Sex'] == 'female')]['Age'].mean())

1
True
2
True
3
True
