In [42]:
import numpy as np
import pandas as pd

from scipy.stats import pearsonr

# Read data from .csv file.

In [2]:
titanic_data = pd.read_csv('titanic.csv', index_col='PassengerId')
titanic_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Count number of men and women

In [3]:
male_sex = "male"
female_sex = "female"

In [8]:
sex_data = np.array(titanic_data["Sex"])
male_num = np.sum(sex_data == male_sex)
female_num = np.sum(sex_data == female_sex)

# check valid
if male_num + female_num != len(sex_data):
    print("Wrong numbers!")

print("Number of men: %d\nNumber of women: %d" % (male_num, female_num))

Number of men: 577
Number of women: 314


# Count percentage of survived people

In [21]:
survived_data = np.array(titanic_data["Survived"])
survived = np.sum(survived_data == True)
survived_percentage = round(float(survived) / float(len(survived_data)), 4)

print("Percentage of survived people: %.2f%%" % (survived_percentage * 100.0))

Percentage of survived people: 38.38%


# Count percentage of people in First Class

In [22]:
class_data = np.array(titanic_data["Pclass"])
first_class_num = np.sum(class_data == 1)
first_class_percentage = round(float(first_class_num) / float(len(class_data)), 4)

print("Percentage of people in first class: %.2f%%" % (first_class_percentage * 100.0))

Percentage of people in first class: 24.24%


# Count mean and median of passengers' age

In [41]:
age_data = np.array(titanic_data["Age"])
nan_idx = np.isnan(age_data)
not_nan_idx = ~ nan_idx
not_nan_age_data = age_data[not_nan_idx]

mean_age = round(np.mean(not_nan_age_data), 2)
median_age = round(np.median(not_nan_age_data), 2)

print("Age mean: %.2f, Age median: %.2f" % (mean_age, median_age))

Age mean: 29.70, Age median: 28.00


# Count Pearson's coeff between SibSp and Parch

In [44]:
sibsp_data = np.array(titanic_data["SibSp"])
parch_data = np.array(titanic_data["Parch"])

pearson_coeff, _ = pearsonr(sibsp_data, parch_data)
pearson_coeff = round(pearson_coeff, 2)

print("Pearson's coeff between `SibSp` and `Parch` columns: %.2f" % pearson_coeff)

Pearson's coeff between `SibSp` and `Parch` columns: 0.41


# Most popular woman name

In [62]:
female_sex = "female"
woman_idx = np.array(np.where(titanic_data["Sex"] == female_sex))
# print(woman_idx)
name_data = np.array(titanic_data["Name"])[woman_idx]
print(name_data)

woman_prefixies = ["Mrs.", "Miss.", "Master.", "Ms.", "Mlle.", "Mme.", "Dr."]

"""
num = 0
invalid_names = []
for i, name in enumerate(name_data):
    splitted_to_comma = name.split(',')
    splitted = splitted_to_comma[1].strip().split(' ')
    if splitted[0] in woman_prefixies:
        num += 1
    else:
        invalid_names.append(name)
        
print(invalid_names)
"""

def get_name(full_name_str):
    splitted_to_comma = full_name_str.split(',')
    full_name = splitted_to_comma[1]
    if '(' in full_name:
        splitted_brackets = full_name.split('(')
        return splitted_brackets[1].split(' ')[0]
    else:
        return splitted_to_comma[1].strip().split(' ')[1]

woman_names = np.array([get_name(name) for name in name_data])

names_dict = dict()
for name in woman_names:
    if name in names_dict.keys():
        names_dict[name] += 1
    else:
        names_dict[name] = 1

max_num = max(names_dict.values())
name_with_max_num = list(filter(lambda name: names_dict[name] == max_num, names_dict.keys()))
print(name_with_max_num)

[['Cumings, Mrs. John Bradley (Florence Briggs Thayer)'
  'Heikkinen, Miss. Laina' 'Futrelle, Mrs. Jacques Heath (Lily May Peel)'
  'Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)'
  'Nasser, Mrs. Nicholas (Adele Achem)' 'Sandstrom, Miss. Marguerite Rut'
  'Bonnell, Miss. Elizabeth' 'Vestrom, Miss. Hulda Amanda Adolfina'
  'Hewlett, Mrs. (Mary D Kingcome) '
  'Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)'
  'Masselmani, Mrs. Fatima' 'McGowan, Miss. Anna "Annie"'
  'Palsson, Miss. Torborg Danira'
  'Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)'
  'O\'Dwyer, Miss. Ellen "Nellie"'
  'Spencer, Mrs. William Augustus (Marie Eugenie)'
  'Glynn, Miss. Mary Agatha' 'Vander Planke, Miss. Augusta Maria'
  'Nicola-Yarred, Miss. Jamila'
  'Ahlin, Mrs. Johan (Johanna Persdotter Larsson)'
  'Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)'
  'Laroche, Miss. Simonne Marie Anne Andree'
  'Devaney, Miss. Margaret Delia' "O'Driscoll, Miss. Bridget"
  'Arnold-Franchi, M

AttributeError: 'numpy.ndarray' object has no attribute 'split'