In [1]:
#useful libraries
import pandas as pd
import numpy as np

In [2]:
#useful functions

# This function can be used to analyze any DataFrame: it takes each field of the DataFrame
# and calculates their unique values.
# If their number is meaningful it also tells us the frequency of that value in the field.
def find_unique(list_columns, df, unique_limit=50, frequency=10):
    """Print how many distinct values each low-cardinality column holds.

    For every column in ``list_columns`` whose number of distinct values
    (as returned by ``Series.nunique()``) is below ``unique_limit``, a
    summary line is printed. When that number is also below ``frequency``,
    the column's ``value_counts()`` is printed as well.

    Parameters
    ----------
    list_columns : iterable
        Column labels of ``df`` to inspect.
    df : pandas.DataFrame
        The DataFrame under investigation.
    unique_limit : int, default 50
        Columns with at least this many distinct values are skipped.
    frequency : int, default 10
        Value frequencies are printed only for columns with fewer distinct
        values than this.

    Returns
    -------
    None
        The report is written to standard output.
    """
    # The two thresholds depend on the DataFrame being analysed, so they are
    # parameters (defaulting to the values originally hard-coded here).
    for col in list_columns:
        unique_val = df[col].nunique()
        if unique_val < unique_limit:
            print(f"\nThe column {col} contains {unique_val} unique values")
            if unique_val < frequency:
                print(f"With this frequency: {df[col].value_counts()}")
                
# Also this function can be used to analyze any DataFrame: it tells us if the fields of the Dataframe
# under investigation contain any null values
def find_null(list_columns, df):
    """Report, for each listed column of ``df``, how many null values it has.

    Columns without any null value are skipped silently; for the others one
    line per column is printed to standard output.
    """
    for col in list_columns:
        missing = df[col].isna().sum()
        if missing <= 0:
            # Nothing to report for a fully populated column.
            continue
        print(f"The column {col} contains {missing} null values")

In [3]:
Titanic = pd.read_csv("titanic_crew_passengers.csv")

In [121]:
Titanic

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
0,"Abbing, Mr. Anthony",male,42.0,3rd,Southampton,United States,5547.0,7.11,0.0,0.0,no
1,"Abbott, Mr. Eugene Joseph",male,13.0,3rd,Southampton,United States,2673.0,20.05,0.0,2.0,no
2,"Abbott, Mr. Rossmore Edward",male,16.0,3rd,Southampton,United States,2673.0,20.05,1.0,1.0,no
3,"Abbott, Mrs. Rhoda Mary 'Rosa'",female,39.0,3rd,Southampton,England,2673.0,20.05,1.0,1.0,yes
4,"Abelseth, Miss. Karen Marie",female,16.0,3rd,Southampton,Norway,348125.0,7.13,0.0,0.0,yes
...,...,...,...,...,...,...,...,...,...,...,...
2202,"Wynn, Mr. Walter",male,41.0,deck crew,Belfast,England,,,,,yes
2203,"Yearsley, Mr. Harry",male,40.0,victualling crew,Southampton,England,,,,,yes
2204,"Young, Mr. Francis James",male,32.0,engineering crew,Southampton,England,,,,,no
2205,"Zanetti, Sig. Minio",male,20.0,restaurant staff,Southampton,England,,,,,no






LET'S DO SOME EXPLORATORY DATA ANALYSIS

In [5]:
Titanic.shape

(2207, 11)

The DataFrame contains 2207 rows and 11 columns.
This dataset contains some information about the people aboard the Titanic during its first and last voyage (we all know how it ended). The content of most fields is self-explanatory (name, class, ticket fare...) except for these fields:
   - SibSp indicates how many siblings (brother, sister, stepbrother, stepsister) or spouses (husband, wife) were on the ship with the passenger: from 0 to n; fiancés are not considered.
   - Parch indicates how many parents (mother, father) or children (daughter, son, stepdaughter, stepson)
      were on the Titanic with the passenger, from 0 to n.
      Some children travelled only with a nanny, therefore parch = 0 for them.
   - Embarked indicates where the person embarked: B = Belfast (Northern Ireland); C = Cherbourg (France); S = Southampton (England); Q = Queenstown (Ireland). The following cell replaces these codes with the full port names to make the data more readable:

In [120]:
# Expand the one-letter port codes of the "embarked" column into full names.
# Values that are not one of the four codes are left unchanged.
Titanic["embarked"] = Titanic["embarked"].replace(
    {
        "S": "Southampton",
        "Q": "Queenstown",
        "C": "Cherbourg",
        "B": "Belfast",
    }
)

In [6]:
Titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2207 entries, 0 to 2206
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      2207 non-null   object 
 1   gender    2207 non-null   object 
 2   age       2205 non-null   float64
 3   class     2207 non-null   object 
 4   embarked  2207 non-null   object 
 5   country   2126 non-null   object 
 6   ticketno  1316 non-null   float64
 7   fare      1291 non-null   float64
 8   sibsp     1307 non-null   float64
 9   parch     1307 non-null   float64
 10  survived  2207 non-null   object 
dtypes: float64(5), object(6)
memory usage: 189.8+ KB


We can see the Dataset is quite complete and the type for each field is right. But there are some Null data:

In [7]:
find_null(Titanic.columns, Titanic)

The column age contains 2 null values
The column country contains 81 null values
The column ticketno contains 891 null values
The column fare contains 916 null values
The column sibsp contains 900 null values
The column parch contains 900 null values


Age field has just two Null data, so I think it's a good idea to swap the null value with the mean:

In [8]:
Titanic[Titanic["age"].isna()]

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
439,"Gheorgheff, Mr. Stanio",male,,3rd,C,Bulgaria,349254.0,7.1711,0.0,0.0,no
677,"Kraeff, Mr. Theodor",male,,3rd,C,Bulgaria,349253.0,7.1711,0.0,0.0,no


In [9]:
# Replace the missing ages with the mean age.
# The result is assigned back to the column: calling fillna(inplace=True) on
# Titanic["age"] is a chained in-place operation that acts on an intermediate
# Series, raises a FutureWarning in pandas 2.x and does not update the
# DataFrame when Copy-on-Write is enabled.
Titanic["age"] = Titanic["age"].fillna(Titanic["age"].mean())

Around half of the "ticketno", "fare", "sibsp" and "parch" values (ticket number, ticket cost, siblings/spouse and parents/children) are missing. This is not good: let's see if I can discover the reason by looking at the rows containing those null values. I start from the "ticketno" column because it is the one with the fewest missing values among the four.

In [10]:
Titanic[Titanic["ticketno"].isnull()]

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
476,"Green, Mr. George",male,20.0,engineering crew,S,England,,,,,no
653,"Kelly, Mr. James",male,44.0,engineering crew,S,England,,,,,no
1209,"Törnquist, Mr. William Henry",male,26.0,3rd,S,Sweden,,,0.0,0.0,yes
1319,"Abbott, Mr. Ernest Owen",male,21.0,victualling crew,S,England,,,,,no
1320,"Abrams, Mr. William Thomas",male,34.0,engineering crew,S,England,,,,,no
...,...,...,...,...,...,...,...,...,...,...,...
2202,"Wynn, Mr. Walter",male,41.0,deck crew,B,England,,,,,yes
2203,"Yearsley, Mr. Harry",male,40.0,victualling crew,S,England,,,,,yes
2204,"Young, Mr. Francis James",male,32.0,engineering crew,S,England,,,,,no
2205,"Zanetti, Sig. Minio",male,20.0,restaurant staff,S,England,,,,,no


Looking at the rows, I see that most of the people without a ticket number were part of the crew, which is reasonable: they didn't have a ticket, they didn't pay to travel on the Titanic, and I assume they travelled by themselves. I group them by class to check whether this is true.

In [11]:
Titanic[Titanic["ticketno"].isnull()].groupby("class")["class"].count()

class
3rd                   1
deck crew            66
engineering crew    324
restaurant staff     69
victualling crew    431
Name: class, dtype: int64

So yes, most of the null values belong to the crew's data. I think it's better to create two different DF, one for the passengers and one for the crew:

In [12]:
Titanic.groupby("class")["class"].count()

class
1st                 324
2nd                 284
3rd                 709
deck crew            66
engineering crew    324
restaurant staff     69
victualling crew    431
Name: class, dtype: int64

In [122]:
# Passengers are the rows whose class is one of the three travel classes.
# .copy() makes the subset an independent DataFrame, so the clean-up
# assignments performed on it later never touch (or warn about) `Titanic`.
Titanic_passengers = Titanic[Titanic["class"].isin(["1st", "2nd", "3rd"])].copy()

In [14]:
Titanic_passengers.shape

(1317, 11)

The DF has 1317 rows, the number of passengers of Titanic.

In [123]:
# Crew members are the rows whose class is one of the four crew departments.
# .copy() makes the subset an independent DataFrame, so the clean-up
# assignments performed on it later never touch (or warn about) `Titanic`.
Titanic_crew = Titanic[
    Titanic["class"].isin(
        ["deck crew", "engineering crew", "restaurant staff", "victualling crew"]
    )
].copy()

In [17]:
Titanic_crew.shape

(890, 11)

The crew consisted of 890 people.

Now I can analyze, another time, null data and see if something has changed.

In [18]:
find_null(Titanic_passengers.columns, Titanic_passengers)

The column country contains 77 null values
The column ticketno contains 1 null values
The column fare contains 26 null values
The column sibsp contains 10 null values
The column parch contains 10 null values


Country has 77 null values: maybe the origin of the passenger is not known and we can find values looking for some relatives for example. There are a lot of other data we can use in this column so I leave it for later.

There is one person without number of ticket:

In [18]:
Titanic_passengers[Titanic_passengers["ticketno"].isna()]

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
1209,"Törnquist, Mr. William Henry",male,26.0,3rd,S,Sweden,,,0.0,0.0,yes


This person had no relatives on board, but I've decided to look for some information online: I've discovered that he travelled with five co-workers and that they were seamen for the "American Line" company. They had a problem with the ship they were supposed to use to return to the USA, so they paid for third-class accommodation on the Titanic.
I've found the number of their ticket: 370160, so I can replace the null value. Then I take a look at this group:

In [124]:
# Fill the missing ticket number with 370160.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is a chained in-place operation that raises a FutureWarning in
# pandas 2.x and does not update the DataFrame when Copy-on-Write is enabled.
Titanic_passengers["ticketno"] = Titanic_passengers["ticketno"].fillna(370160)

In [24]:
Titanic_passengers[Titanic_passengers["ticketno"] == 370160]

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
211,"Carver, Mr. Alfred John",male,28.0,3rd,S,England,370160.0,7.05,0.0,0.0,no
615,"Johnson, Mr. August",male,49.0,3rd,S,United States,370160.0,7.05,0.0,0.0,no
618,"Johnson, Mr. William Cahoone Jr.",male,19.0,3rd,S,United States,370160.0,7.05,0.0,0.0,no
1094,"Shannon, Mr. Andrew John",male,35.0,3rd,S,Ireland,370160.0,7.05,0.0,0.0,no
1163,"Storey, Mr. Thomas",male,59.0,3rd,S,England,370160.0,7.05,0.0,0.0,no
1209,"Törnquist, Mr. William Henry",male,26.0,3rd,S,Sweden,370160.0,7.05,0.0,0.0,yes


Mr. Törnquist was the only one of the group who survived; all of his fellow American Line employees died. I see that five of them have null values in the "fare" column, but one of them paid 7.05 pounds: the ticket is the same, so it is likely that all of them paid that price. Moreover, I see that two of them have null values in the relatives columns, but I can assume they didn't travel with their families, since they were working for the American Line company and it is just an unfortunate coincidence that they were on the Titanic.
So I can replace all these null values.

In [125]:
# Rows sharing ticket 370160: set the fare to 7.05 and both relatives counters
# to 0 in a single assignment (values are listed in the same order as the columns).
american_line_group = Titanic_passengers["ticketno"] == 370160
Titanic_passengers.loc[american_line_group, ["fare", "sibsp", "parch"]] = [7.05, 0, 0]

After the cleaning process let's take another look at the null data:

In [128]:
find_null(Titanic_passengers.columns, Titanic_passengers)

The column country contains 77 null values
The column fare contains 21 null values
The column sibsp contains 8 null values
The column parch contains 8 null values


I see that the columns containing informations about families on Titanic have 8 null data (probably the same rows):

In [26]:
Titanic_passengers[Titanic_passengers["sibsp"].isna()]

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
144,"Brailey, Mr. William Theodore Ronald",male,24.0,2nd,S,England,250654.0,,,,no
150,"Bricoux, Mr. Roger Marie",male,20.0,2nd,S,France,250654.0,,,,no
237,"Clarke, Mr. John Frederick Preston",male,28.0,2nd,S,England,250654.0,,,,no
516,"Hartley, Mr. Wallace Henry",male,33.0,2nd,S,England,250654.0,,,,no
576,"Hume, Mr. John Law",male,21.0,2nd,S,,250654.0,,,,no
680,"Krins, Mr. Georges Alexandre",male,23.0,2nd,S,England,250654.0,,,,no
1189,"Taylor, Mr. Percy Cornelius",male,40.0,2nd,S,England,250654.0,,,,no
1304,"Woodward, Mr. John Wesley",male,32.0,2nd,S,England,250654.0,,,,no


All of those people have the same ticket number and no fare: looking for some information about this ticket online, I've found that they were the musicians of the Titanic! All of them were very young, and the group is known for having played music for as long as they possibly could during the ship's sinking, intending to calm the passengers. All of the members perished.

I think none of them paid a ticket fare, so I can set these values to 0; and I can assume that, like the American Line group, they didn't travel with any relatives, as they were working on the Titanic. Moreover, I've discovered that Mr Hume was Scottish, so I can also fill in this null value:

In [129]:
# Rows sharing ticket 250654: fare and both relatives counters are set to 0.
musicians = Titanic_passengers["ticketno"] == 250654
Titanic_passengers.loc[musicians, ["fare", "sibsp", "parch"]] = 0
# Fill in the country of the row named "Hume, Mr. John Law".
Titanic_passengers.loc[Titanic_passengers["name"] == "Hume, Mr. John Law", "country"] = "Scotland"

In [130]:
Titanic_passengers[Titanic_passengers["ticketno"] == 250654]

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
144,"Brailey, Mr. William Theodore Ronald",male,24.0,2nd,Southampton,England,250654.0,0.0,0.0,0.0,no
150,"Bricoux, Mr. Roger Marie",male,20.0,2nd,Southampton,France,250654.0,0.0,0.0,0.0,no
237,"Clarke, Mr. John Frederick Preston",male,28.0,2nd,Southampton,England,250654.0,0.0,0.0,0.0,no
516,"Hartley, Mr. Wallace Henry",male,33.0,2nd,Southampton,England,250654.0,0.0,0.0,0.0,no
576,"Hume, Mr. John Law",male,21.0,2nd,Southampton,Scotland,250654.0,0.0,0.0,0.0,no
680,"Krins, Mr. Georges Alexandre",male,23.0,2nd,Southampton,England,250654.0,0.0,0.0,0.0,no
1189,"Taylor, Mr. Percy Cornelius",male,40.0,2nd,Southampton,England,250654.0,0.0,0.0,0.0,no
1304,"Woodward, Mr. John Wesley",male,32.0,2nd,Southampton,England,250654.0,0.0,0.0,0.0,no


In [131]:
find_null(Titanic_passengers.columns, Titanic_passengers)

The column country contains 76 null values
The column fare contains 13 null values


The fare column contains 13 null values:

In [30]:
Titanic_passengers[Titanic_passengers["fare"].isna()]

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
45,"Andrews, Mr. Thomas",male,39.0,1st,B,Northern Ireland,112050.0,,0.0,0.0,no
189,"Campbell, Mr. William Henry",male,21.0,2nd,B,Northern Ireland,239853.0,,0.0,0.0,no
230,"Chisholm, Mr. Roderick Robert Crispin",male,43.0,1st,B,Scotland,112051.0,,0.0,0.0,no
288,"Cunningham, Mr. Alfred Fleming",male,22.0,2nd,B,Northern Ireland,239853.0,,0.0,0.0,no
423,"Frost, Mr. Anthony Wood",male,39.0,2nd,B,England,239854.0,,0.0,0.0,no
424,"Fry, Mr. Richard",male,39.0,1st,S,England,112058.0,,0.0,0.0,no
515,"Harrison, Mr. William Henry",male,46.0,1st,S,England,112059.0,,0.0,0.0,no
588,"Ismay, Mr. Joseph Bruce",male,49.0,1st,S,England,112058.0,,0.0,0.0,yes
676,"Knight, Mr. Robert",male,41.0,2nd,B,Northern Ireland,239855.0,,0.0,0.0,no
940,"Parkes, Mr. Francis",male,21.0,2nd,B,Northern Ireland,239853.0,,0.0,0.0,no


The group is formed by 1st and 2nd class male passengers. Looking at the ticket numbers and classes it's clear that these people could somehow know each other: the numbers are very similar, with just the last digit being different (and just one person seems to be an outsider, Mr Reuchlin).
In fact it is possible to find some information about those people online:
   - Andrews, Campbell, Chisholm, Cunningham, Frost, Knight, Parkes, Parr and Watson were the nine members of the guarantee group of the Belfast shipbuilder Harland & Wolff. They were given passenger accommodation, but they were also regarded as members of the crew. Headed by the master shipbuilder, Thomas Andrews Jr, the group's responsibility was to accompany the ship on her maiden voyage to oversee any unfinished work and to find and fix any problems that might arise during the voyage. Mr. Andrews was the architect of the Titanic: he's quite famous, as it is said he acted like a hero during the sinking; you can also see him in the movie.
   - Mr. Fry was Mr. Ismay's valet and Mr. Harrison was his private secretary. Mr. Ismay was the CEO of the White Star Line; you can also see him in the movie. He's the only one of this entire group who survived, and in fact there is some controversy surrounding his rescue.
   - Mr Reuchlin was director of passage for the Holland America Line, which was part of the International Mercantile Marine, J. Pierpont Morgan's conglomerate that also owned the White Star Line. He was a guest of honour invited by the White Star Line, which is why his ticket was complimentary.
   
For all these reasons it is possible to say that nobody in this group paid for the ticket, and it's possible to replace the null values with 0:

In [132]:
# Set the remaining missing fares to 0.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is a chained in-place operation that raises a FutureWarning in
# pandas 2.x and does not update the DataFrame when Copy-on-Write is enabled.
Titanic_passengers["fare"] = Titanic_passengers["fare"].fillna(0)

In [None]:
# The one-letter port codes were expanded to full names right after loading
# (before Titanic_passengers was created), so the filter must use "Belfast":
# comparing against "B" matches no row when the notebook is run top to bottom.
Titanic_passengers[Titanic_passengers["embarked"] == "Belfast"]

Notice that the Guarantee Group is the only group in the passengers' DataFrame that embarked in Belfast, most likely because the company needed them to make sure the first Titanic trip (Belfast-Southampton) went smoothly.

Now let's take a look at crew's null data:

In [133]:
find_null(Titanic_crew.columns, Titanic_crew)

The column country contains 4 null values
The column ticketno contains 890 null values
The column fare contains 890 null values
The column sibsp contains 890 null values
The column parch contains 890 null values


Four columns contain only null data: in this case it's a good idea to just drop them:

In [142]:
Titanic_crew = Titanic_crew.drop(columns = ["ticketno", "fare", "sibsp", "parch"])

In country columns there are just four null data: maybe I can solve the problem or just look for some information online:

In [34]:
Titanic_crew[Titanic_crew["country"].isna()]

Unnamed: 0,name,gender,age,class,embarked,country,survived
1426,"Bristow, Mr. Harry",male,38.0,victualling crew,S,,no
1830,"McAndrews, Mr. William",male,23.0,engineering crew,S,,no
2066,"Smith, Mr. John Richard Jago",male,35.0,victualling crew,S,,no
2078,"Steel, Mr. Robert Edward",male,30.0,engineering crew,S,,no


Mr. Bristow and Mr. Smith were born in Cornwall, Mr. McAndrews in England, Mr. Steel in Guernsey in the Channel Islands: it's possible to replace null values.

In [134]:
# Set the missing countries to "England", then override the row named
# "Steel, Mr. Robert Edward" with "Channel Islands".
# The fillna result is assigned back to the column: fillna(inplace=True) on a
# column selection is a chained in-place operation that raises a FutureWarning
# in pandas 2.x and does not update the DataFrame when Copy-on-Write is enabled.
Titanic_crew["country"] = Titanic_crew["country"].fillna("England")
Titanic_crew.loc[Titanic_crew["name"] == "Steel, Mr. Robert Edward", "country"] = "Channel Islands"

NOW IT CAN BE USEFUL TO TAKE A LOOK TO SOME STATISTICAL DATA ABOUT THE TWO DF:

In [36]:
Titanic_passengers.describe()

Unnamed: 0,age,ticketno,fare,sibsp,parch
count,1317.0,1317.0,1317.0,1317.0,1317.0
mean,29.627858,284280.9,32.772055,0.495824,0.382688
std,13.841236,633236.3,51.902554,1.039211,0.863444
min,0.166667,2.0,0.0,0.0,0.0
25%,21.0,14263.0,7.1711,0.0,0.0
50%,28.0,111427.0,14.0901,0.0,0.0
75%,38.0,347077.0,31.0506,1.0,0.0
max,74.0,3101317.0,512.0607,8.0,9.0


These statistical data help us to better understand the DF.
   - Note the Age field, where the mean is just 29, but the maximum age is 74 and also the third quartile is just 38: these data tell us that Passengers were quite young.
   - We can also have some information about the Fare of tickets, it's interesting that maximum was 512 pounds (per person, the room could cost much more), but mean is 33 pounds and second quartile is just 14 pounds; of course most of passengers were in third class and paid a lot less than the few people in first class.

In [37]:
Titanic_crew.describe()

Unnamed: 0,age
count,890.0
mean,31.652809
std,8.946643
min,15.0
25%,25.0
50%,31.0
75%,37.0
max,63.0


The crew was also very young (mean is 31).

NOW LET'S TAKE A LOOK TO UNIQUE VALUES OF THE TWO DF:

In [38]:
find_unique(Titanic_passengers.columns, Titanic_passengers)


The column gender contains 2 unique values
With this frequency: gender
male      851
female    466
Name: count, dtype: int64

The column class contains 3 unique values
With this frequency: class
3rd    709
1st    324
2nd    284
Name: count, dtype: int64

The column embarked contains 4 unique values
With this frequency: embarked
S    914
C    271
Q    123
B      9
Name: count, dtype: int64

The column country contains 47 unique values

The column sibsp contains 7 unique values
With this frequency: sibsp
0.0    899
1.0    319
2.0     42
4.0     22
3.0     20
8.0      9
5.0      6
Name: count, dtype: int64

The column parch contains 8 unique values
With this frequency: parch
0.0    1010
1.0     170
2.0     113
3.0       8
5.0       6
4.0       6
6.0       2
9.0       2
Name: count, dtype: int64

The column survived contains 2 unique values
With this frequency: survived
no     817
yes    500
Name: count, dtype: int64


Looking at unique values we can already say a lot about Titanic's passengers:
   - there were many more men than women (almost double), and most of them were travelling by themselves;
   - there were more people in third class than in first and second class together;
   - most passengers (about 69%) embarked in Southampton, England, but 9 of them embarked in Belfast (later we'll take a look at them);
   - most passengers travelled by themselves (without any relatives);
   - just 500 passengers out of 1317 survived: later I'll see if there is any relation between survival rate and gender, class or age.

In [39]:
find_unique(Titanic_crew.columns, Titanic_crew)


The column gender contains 2 unique values
With this frequency: gender
male      867
female     23
Name: count, dtype: int64

The column age contains 46 unique values

The column class contains 4 unique values
With this frequency: class
victualling crew    431
engineering crew    324
restaurant staff     69
deck crew            66
Name: count, dtype: int64

The column embarked contains 2 unique values
With this frequency: embarked
S    702
B    188
Name: count, dtype: int64

The column country contains 19 unique values

The column survived contains 2 unique values
With this frequency: survived
no     679
yes    211
Name: count, dtype: int64


Looking at unique values we can already say a lot about Titanic's crew:
   - Out of 890 people, just 23 were women. I'm not surprised: in 1912 it was unusual for a woman of working age to leave her family and embark on a ship bound for another continent. Later I analyze these data to see if I find something meaningful about them.
   - Most of the crew belonged to the victualling and engineering departments. The restaurant staff was the staff of the only "à la carte" restaurant that the Titanic had, The Ritz (there were also dining rooms, obviously differentiated for the three classes in position, elegance and menu).
   - Most of them embarked in England: it can be interesting to see who embarked in Belfast, and why.
   - Around 76% of them (679 out of 890) didn't survive.

LET'S BETTER ANALYZE THE CREW:

In [40]:
Titanic_crew[Titanic_crew["gender"] == "female"]

Unnamed: 0,name,gender,age,class,embarked,country,survived
1380,"Bennett, Mrs. Mabel",female,33.0,victualling crew,S,England,yes
1402,"Bliss, Mrs. Emma",female,45.0,victualling crew,S,Switzerland,yes
1413,"Bowker, Miss. Ruth Harwood",female,31.0,restaurant staff,S,England,yes
1463,"Caton, Miss. Annie",female,33.0,victualling crew,S,England,yes
1616,"Gold, Mrs. Jane Kate Coulson",female,45.0,victualling crew,S,England,yes
1626,"Gregson, Miss. Mary",female,45.0,victualling crew,S,England,yes
1724,"Jessop, Miss. Violet Constance",female,24.0,victualling crew,S,England,yes
1772,"Lavington, Miss. Elizabeth",female,40.0,victualling crew,S,England,yes
1775,"Leather, Mrs. Elizabeth Mary",female,50.0,victualling crew,S,England,yes
1811,"Marsden, Miss. Evelyn",female,28.0,victualling crew,S,England,yes


Most of the women were part of the victualling crew (of course it would be very unusual to find them in the engineering crew at the beginning of the 20th century). The Victualling Department provided all the services for the occupants of the ship: food, housekeeping, laundry, room service, etc. Just three of them didn't survive, so most of the women who were part of the crew survived.
Let's see some quick figures about gender and survival rate:

In [41]:
# Count crew members per gender once, then read both totals from that result.
crew_gender_counts = Titanic_crew["gender"].value_counts()
m = crew_gender_counts["male"]
f = crew_gender_counts["female"]
print(f"Number of men in the crew: {m}")
print(f"Number of women in the crew: {f}")
# Survivors / non-survivors per gender (displayed as the cell output).
Titanic_crew.groupby(["gender", "survived"])["survived"].count()

Number of men in the crew: 867
Number of women in the crew: 23


gender  survived
female  no            3
        yes          20
male    no          676
        yes         191
Name: survived, dtype: int64

As we can see, about 87% of the women survived (20 out of 23), but just 22% of the men did (191 out of 867).

In [42]:
# The one-letter port codes were expanded to full names right after loading
# (before Titanic_crew was created), so the filter must use "Belfast":
# comparing against "B" matches no row when the notebook is run top to bottom.
Titanic_crew[Titanic_crew["embarked"] == "Belfast"].groupby("class")["embarked"].count()

class
deck crew            23
engineering crew     43
victualling crew    122
Name: embarked, dtype: int64

Workers who embarked in Belfast are the ones the company needed to let the Titanic take life: they had to prepare the ship for passengers and to move it from Belfast, where it was built, to Southampton, the first passengers' boarding; in fact there is no restaurant staff.

Now it's better to count and compare these data (age, sex, country, survival rate...) using a tool like Google Looker: sometimes a chart is much more readable than a table.


EXPLORATORY DATA ANALYSIS IS DONE.

Before passing the two DataFrames to Looker, I would like to take a look at some passenger data: I want to find a relation between survival rate or socio-economic class and other data.
To analyze the "gender" field I create a DataFrame which helps us understand whether gender and class are connected with the survival rate.

In [44]:
Titanic_passengers

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
0,"Abbing, Mr. Anthony",male,42.0,3rd,S,United States,5547.0,7.1100,0.0,0.0,no
1,"Abbott, Mr. Eugene Joseph",male,13.0,3rd,S,United States,2673.0,20.0500,0.0,2.0,no
2,"Abbott, Mr. Rossmore Edward",male,16.0,3rd,S,United States,2673.0,20.0500,1.0,1.0,no
3,"Abbott, Mrs. Rhoda Mary 'Rosa'",female,39.0,3rd,S,England,2673.0,20.0500,1.0,1.0,yes
4,"Abelseth, Miss. Karen Marie",female,16.0,3rd,S,Norway,348125.0,7.1300,0.0,0.0,yes
...,...,...,...,...,...,...,...,...,...,...,...
1314,"Yvois, Miss. Henriette",female,24.0,2nd,S,France,248747.0,13.0000,0.0,0.0,no
1315,"Zakarian, Mr. Mapriededer",male,22.0,3rd,C,Turkey,2656.0,7.0406,0.0,0.0,no
1316,"Zakarian, Mr. Ortin",male,27.0,3rd,C,Turkey,2670.0,7.0406,0.0,0.0,no
1317,"Zenni, Mr. Philip",male,25.0,3rd,C,Lebanon,2620.0,7.0406,0.0,0.0,yes


In [45]:
df_survived = pd.DataFrame(Titanic_passengers.groupby(["gender", "class"])["survived"].value_counts())
df_survived = df_survived.reset_index()
df_survived

Unnamed: 0,gender,class,survived,count
0,female,1st,yes,139
1,female,1st,no,5
2,female,2nd,yes,94
3,female,2nd,no,12
4,female,3rd,no,110
5,female,3rd,yes,106
6,male,1st,no,118
7,male,1st,yes,62
8,male,2nd,no,154
9,male,2nd,yes,24


This DataFrame shows us how many people survived or not, divided by gender and class, but data are difficult to read, because the number of male and female are different. So let's create a field where we can see these data in percentage.

In [47]:
m = Titanic_passengers["gender"].value_counts()["male"]
f = Titanic_passengers["gender"].value_counts()["female"]
print(m, f)

851 466


Note that the percentage must be calculated on different quantities, in fact women are 466 and men are 851.

In [48]:
df_survived["percentage"] = np.where(df_survived["gender"] == "female", df_survived["count"]/f*100, df_survived["count"]/m*100)
df_survived

Unnamed: 0,gender,class,survived,count,percentage
0,female,1st,yes,139,29.828326
1,female,1st,no,5,1.072961
2,female,2nd,yes,94,20.171674
3,female,2nd,no,12,2.575107
4,female,3rd,no,110,23.60515
5,female,3rd,yes,106,22.746781
6,male,1st,no,118,13.86604
7,male,1st,yes,62,7.285546
8,male,2nd,no,154,18.096357
9,male,2nd,yes,24,2.820212


Looking at the percentage table we can see that most of the first and second class women survived, whereas only about half of the women in third class did. In total about 73% of the women survived (339 out of 466).

The situation for men is quite different. More than half of them belonged to third class, and most of those died. In second class the situation was also terrible for them, and in first class less than half of them survived.
Well, maybe it is true that they decided to load women and children onto the lifeboats first... but just for first and second class.

Now I'm very curious to know who the 5 first-class women and the 12 second-class women who died were: maybe they didn't want to leave their husbands or sons?

In [51]:
Titanic_passengers[(Titanic_passengers["gender"] == "female") & (Titanic_passengers["class"] == "1st") & (Titanic_passengers["survived"] == "no")]

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
23,"Allison, Miss. Helen Loraine",female,2.0,1st,S,Canada,113781.0,151.16,1.0,2.0,no
25,"Allison, Mrs. Bessie Waldo",female,25.0,1st,S,United States,113781.0,151.16,1.0,2.0,no
381,"Evans, Miss. Edith Corse",female,36.0,1st,C,United States,17531.0,31.1307,0.0,0.0,no
587,"Isham, Miss. Ann Elizabeth",female,50.0,1st,C,United States,17595.0,28.1403,0.0,0.0,no
1168,"Straus, Mrs. Rosalie Ida",female,63.0,1st,S,Germany,17483.0,221.1507,1.0,0.0,no


We can say almost nothing about the deaths of Miss Isham and Miss Evans: they were 50 and 36 years old, they both embarked in Cherbourg, they had no relatives on the ship and they didn't pay a lot for their tickets.

Mrs. Straus instead travelled with her husband Isidor. It is known that both of them were offered a place in a lifeboat, but when he refused, because there were still women and children to save, she decided to stay with him. They both died. The story of Ida's bravery and loyalty became much celebrated.

 Mrs. and Miss. Allison instead, are for sure related: not only the last name is the same, but also number of ticket and of relatives. Bessie was the mother and Helen Loraine was her 2 years old daughter: the story of this family is quite tragic:

In [52]:
Titanic_passengers[Titanic_passengers["name"].str.contains("Allison")]

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
22,"Allison, Master. Hudson Trevor",male,0.916667,1st,S,Canada,113781.0,151.16,1.0,2.0,yes
23,"Allison, Miss. Helen Loraine",female,2.0,1st,S,Canada,113781.0,151.16,1.0,2.0,no
24,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1st,S,Canada,113781.0,151.16,1.0,2.0,no
25,"Allison, Mrs. Bessie Waldo",female,25.0,1st,S,United States,113781.0,151.16,1.0,2.0,no


The Allison family consisted of the father, the mother, a two-year-old daughter, Helen Loraine, and a son less than one year old, Hudson Trevor. They travelled with a nanny, a cook, a chauffeur and a personal maid. After the ship hit the iceberg the nanny, unable to calm the hysteria of Bessie, who wanted to stay inside her cabin at all costs, took Trevor in her arms and ran away with him, taking refuge on a lifeboat. Mr. and Mrs. Allison and Loraine remained on board the Titanic, looking for Trevor, unaware that the nanny had already brought him to safety in a lifeboat.

In [56]:
Titanic_passengers[(Titanic_passengers["gender"] == "female") & (Titanic_passengers["class"] == "2nd") & (Titanic_passengers["survived"] == "no")]

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
208,"Carter, Mrs. Lilian",female,45.0,2nd,S,England,244252.0,26.0,1.0,0.0,no
223,"Chapman, Mrs. Sara Elizabeth",female,29.0,2nd,S,England,29037.0,26.0,1.0,0.0,no
265,"Corbett, Mrs. Irene",female,30.0,2nd,S,United States,237249.0,13.0,0.0,0.0,no
266,"Corey, Mrs. Mary Emma",female,32.0,2nd,S,,13534.0,21.0,0.0,0.0,no
425,"Funk, Miss. Annie Clemmer",female,38.0,2nd,S,India,237671.0,13.0,0.0,0.0,no
546,"Hiltunen, Miss. Marta",female,18.0,2nd,S,Finland,250650.0,13.0,1.0,1.0,no
571,"Howard, Mrs. Ellen Truelove",female,61.0,2nd,S,England,24065.0,26.0,1.0,0.0,no
640,"Karnes, Mrs. Claire",female,22.0,2nd,S,United States,13534.0,21.0,0.0,0.0,no
685,"Lahtinen, Mrs. Anna Amelia",female,34.0,2nd,S,United States,250651.0,26.0,1.0,1.0,no
750,"Mack, Mrs. Mary",female,57.0,2nd,S,England,3.0,10.1,0.0,0.0,no


About the women of second class who died, I can say that all of them were adults, but of different ages, from 18 to 61 years old; 5 of them travelled with a husband or sibling; two of them travelled with a parent or child.
I've noticed that their ticket numbers are different, so I don't think there was a relation between them. Only two of them have the same number, 13534. Let's look for some information about them.

In [59]:
# Every passenger travelling on ticket 13534.
Titanic_passengers.loc[Titanic_passengers["ticketno"].eq(13534)]

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
266,"Corey, Mrs. Mary Emma",female,32.0,2nd,S,United States,13534.0,21.0,0.0,0.0,no
640,"Karnes, Mrs. Claire",female,22.0,2nd,S,United States,13534.0,21.0,0.0,0.0,no


They're the only passengers with this ticket. So I've looked for some information about them online: they were both from Pennsylvania, they were friends and their husbands worked together. They were both returning home from England because they were pregnant. The reason why they didn't leave the ship is not known.
Thanks to this research I can fill in Mrs. Corey's missing data.

In [135]:
# Fill in the country for everyone on ticket 13534 (this overwrites the value
# for both holders of the ticket, including the one that was missing).
Titanic_passengers.loc[
    Titanic_passengers["ticketno"].eq(13534),
    "country",
] = "United States"

AGE FIELD:
Now I want to find a relation between "Age" field and other data about Passengers:

First I look for a connection between age and sex (notice that I'm using the quartiles calculated by describe() below as the edges of the age groups):

In [64]:
# Summary statistics of the numeric columns; the 25% / 50% / 75% / max values
# of "age" are the bin edges used for the age groups in the following cells.
Titanic_passengers.describe()

Unnamed: 0,age,ticketno,fare,sibsp,parch
count,1317.0,1317.0,1317.0,1317.0,1317.0
mean,29.627858,284280.9,32.772055,0.495824,0.382688
std,13.841236,633236.3,51.902554,1.039211,0.863444
min,0.166667,2.0,0.0,0.0,0.0
25%,21.0,14263.0,7.1711,0.0,0.0
50%,28.0,111427.0,14.0901,0.0,0.0
75%,38.0,347077.0,31.0506,1.0,0.0
max,74.0,3101317.0,512.0607,8.0,9.0


In [65]:
# Count women and men inside each age group. The bin edges 21 / 28 / 38 / 74 are
# the age quartiles and maximum reported by describe().
# observed=False is passed explicitly: grouping on the categorical key produced by
# pd.cut without it raises a FutureWarning in recent pandas, and it keeps an empty
# age group visible as a 0 count instead of silently dropping it.
Titanic_passengers.groupby(
    [pd.cut(Titanic_passengers["age"], [0, 21, 28, 38, 74]), "gender"],
    observed=False,
)["gender"].count()

age       gender
(0, 21]   female    145
          male      224
(21, 28]  female    107
          male      218
(28, 38]  female     99
          male      198
(38, 74]  female    115
          male      211
Name: gender, dtype: int64

In [None]:
# TODO: add a chart here showing the percentages (same for the analysis below).

I don't see any connection between age and sex.

LET'S TRY WITH AGE AND SURVIVAL RATE:

In [66]:
# Count survivors and victims inside each age group. The bin edges 21 / 28 / 38 / 74
# are the age quartiles and maximum reported by describe().
# observed=False is passed explicitly: grouping on the categorical key produced by
# pd.cut without it raises a FutureWarning in recent pandas, and it keeps an empty
# age group visible as a 0 count instead of silently dropping it.
Titanic_passengers.groupby(
    [pd.cut(Titanic_passengers["age"], [0, 21, 28, 38, 74]), "survived"],
    observed=False,
)["survived"].count()

age       survived
(0, 21]   no          229
          yes         140
(21, 28]  no          205
          yes         120
(28, 38]  no          179
          yes         118
(38, 74]  no          204
          yes         122
Name: survived, dtype: int64

Again, there is no connection between the "age" and "survived" fields; the results for each quartile are very similar to each other.

LET'S TRY WITH AGE AND CLASS:

In [67]:
# Count passengers per travel class inside each age group. The bin edges
# 21 / 28 / 38 / 74 are the age quartiles and maximum reported by describe().
# observed=False is passed explicitly: grouping on the categorical key produced by
# pd.cut without it raises a FutureWarning in recent pandas, and it keeps an empty
# age group visible as a 0 count instead of silently dropping it.
Titanic_passengers.groupby(
    [pd.cut(Titanic_passengers["age"], [0, 21, 28, 38, 74]), "class"],
    observed=False,
)["class"].count()

age       class
(0, 21]   1st       31
          2nd       68
          3rd      270
(21, 28]  1st       47
          2nd       70
          3rd      208
(28, 38]  1st       74
          2nd       80
          3rd      143
(38, 74]  1st      172
          2nd       66
          3rd       88
Name: class, dtype: int64

Finally, here we find a connection: as passengers get older, their number increases in first class and decreases in third class (second class remains almost constant). This is a pattern that my generation knows very well, because we can also find it in our own society: the older generation is richer than the younger ones.
Another hypothesis is that it may have been difficult for older working-class people to travel.

To better analyze this pattern I add "Sex" field in the table:

In [68]:
# Count women and men per travel class inside each age group. The bin edges
# 21 / 28 / 38 / 74 are the age quartiles and maximum reported by describe().
# observed=False is passed explicitly: grouping on the categorical key produced by
# pd.cut without it raises a FutureWarning in recent pandas, and it keeps an empty
# age group visible as a 0 count instead of silently dropping it.
Titanic_passengers.groupby(
    [pd.cut(Titanic_passengers["age"], [0, 21, 28, 38, 74]), "class", "gender"],
    observed=False,
)["gender"].count()

age       class  gender
(0, 21]   1st    female     19
                 male       12
          2nd    female     28
                 male       40
          3rd    female     98
                 male      172
(21, 28]  1st    female     25
                 male       22
          2nd    female     26
                 male       44
          3rd    female     56
                 male      152
(28, 38]  1st    female     33
                 male       41
          2nd    female     30
                 male       50
          3rd    female     36
                 male      107
(38, 74]  1st    female     67
                 male      105
          2nd    female     22
                 male       44
          3rd    female     26
                 male       62
Name: gender, dtype: int64

What I said before about the relation between age and social class seems to be true especially for men (but in percentage they were a lot more than women).






NOW IT'S TIME TO RECONSTRUCT THE HISTORY OF THE FAMILIES THAT LIVED THROUGH THIS TRAGEDY:


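I create a DataFrame with the passengers whose "sibsp" value is greater than 1 (to exclude passengers who travelled with just their spouse) and I sort it in descending order by the "sibsp" column and then by the "name" column (so, within the same "sibsp" value, names appear in reverse alphabetical order):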

In [69]:
# Quick look at the first rows (head() shows 5 rows by default).
Titanic_passengers.head()

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
0,"Abbing, Mr. Anthony",male,42.0,3rd,S,United States,5547.0,7.11,0.0,0.0,no
1,"Abbott, Mr. Eugene Joseph",male,13.0,3rd,S,United States,2673.0,20.05,0.0,2.0,no
2,"Abbott, Mr. Rossmore Edward",male,16.0,3rd,S,United States,2673.0,20.05,1.0,1.0,no
3,"Abbott, Mrs. Rhoda Mary 'Rosa'",female,39.0,3rd,S,England,2673.0,20.05,1.0,1.0,yes
4,"Abelseth, Miss. Karen Marie",female,16.0,3rd,S,Norway,348125.0,7.13,0.0,0.0,yes


In [136]:
# Passengers travelling with more than one sibling/spouse, largest groups first.
# ascending=False applies to both sort keys, so within the same "sibsp" value
# the names come out in reverse alphabetical order.
df_sibsp = (
    Titanic_passengers
    .loc[lambda df: df["sibsp"].gt(1)]
    .sort_values(by=["sibsp", "name"], ascending=False)
)

In [137]:
# Show the 50 rows with the highest "sibsp" values.
df_sibsp.head(n=50)

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
1068,"Sage, Mr. George John",male,20.0,3rd,Southampton,England,2343.0,69.11,8.0,2.0,no
1067,"Sage, Mr. Frederick",male,17.0,3rd,Southampton,England,2343.0,69.11,8.0,2.0,no
1066,"Sage, Mr. Douglas Bullen",male,18.0,3rd,Southampton,England,2343.0,69.11,8.0,2.0,no
1065,"Sage, Miss. Stella Anne",female,21.0,3rd,Southampton,England,2343.0,69.11,8.0,2.0,no
1064,"Sage, Miss. Elizabeth Ada",female,10.0,3rd,Southampton,England,2343.0,69.11,8.0,2.0,no
1063,"Sage, Miss. Dorothy",female,14.0,3rd,Southampton,England,2343.0,69.11,8.0,2.0,no
1062,"Sage, Miss. Constance Gladys",female,8.0,3rd,Southampton,England,2343.0,69.11,8.0,2.0,no
1061,"Sage, Master. Thomas Henry",male,5.0,3rd,Southampton,England,2343.0,69.11,8.0,2.0,no
1060,"Sage, Master. Anthony William",male,13.0,3rd,Southampton,England,2343.0,69.11,8.0,2.0,no
466,"Goodwin, Mr. Charles Edward",male,14.0,3rd,Southampton,England,2144.0,46.18,5.0,2.0,no


It's immediately possible to notice that the biggest families belong to third class and that their members had the same ticket (with the same price): this can be useful if we want to look for relatives using the last name, because we may also find other passengers with the same one.
Let's see how many people with relatives belonged to each class and what their survival chance was:

In [73]:
# Survivors and victims per class among passengers with more than one sibling/spouse.
(
    df_sibsp
    .groupby(by=["class", "survived"])["survived"]
    .count()
)

class  survived
1st    no           2
       yes         10
2nd    no           6
       yes          7
3rd    no          63
       yes         11
Name: survived, dtype: int64

In [None]:
# TODO: add a chart here.

We see that most of them belong to third class, quite an interesting anthropological fact if we consider that they are also the poorest. Moreover, most of them died, and considering that many of them were children, this is going to be quite a sad story.

Let's analyze the largest families (I use Titanic's passengers DataFrame to analyze each family, so I can find also the parents: in fact parents have small numbers in "sibsp" columns, even 1 or 0, but large numbers in "parch" columns, so be careful to similar last names or people who don't belong to the family):

In [76]:
# All passengers whose name contains "Sage" (substring match anywhere in the name).
Titanic_passengers.loc[lambda df: df["name"].str.contains("Sage")]

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
1060,"Sage, Master. Anthony William",male,13.0,3rd,S,England,2343.0,69.11,8.0,2.0,no
1061,"Sage, Master. Thomas Henry",male,5.0,3rd,S,England,2343.0,69.11,8.0,2.0,no
1062,"Sage, Miss. Constance Gladys",female,8.0,3rd,S,England,2343.0,69.11,8.0,2.0,no
1063,"Sage, Miss. Dorothy",female,14.0,3rd,S,England,2343.0,69.11,8.0,2.0,no
1064,"Sage, Miss. Elizabeth Ada",female,10.0,3rd,S,England,2343.0,69.11,8.0,2.0,no
1065,"Sage, Miss. Stella Anne",female,21.0,3rd,S,England,2343.0,69.11,8.0,2.0,no
1066,"Sage, Mr. Douglas Bullen",male,18.0,3rd,S,England,2343.0,69.11,8.0,2.0,no
1067,"Sage, Mr. Frederick",male,17.0,3rd,S,England,2343.0,69.11,8.0,2.0,no
1068,"Sage, Mr. George John",male,20.0,3rd,S,England,2343.0,69.11,8.0,2.0,no
1069,"Sage, Mr. John George",male,45.0,3rd,S,England,2343.0,69.11,1.0,9.0,no


The Sage family was the largest one that sailed on the Titanic: mother, father and 9 children. Their story is quite famous: the entire family was travelling on the Titanic (the parents are the ones with 9 children in the "parch" field), the parents and their children, 5 male and 4 female; the youngest was just 5 years old. As we can see from the data they travelled in third class, and this is the reason why they couldn't survive: when they arrived on the deck every lifeboat was already sealed (as we know, many of them half empty).

In [78]:
# All passengers whose name contains "Goodwin" (substring match anywhere in the name).
Titanic_passengers.loc[lambda df: df["name"].str.contains("Goodwin")]

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
461,"Goodwin, Master. Harold Victor",male,9.0,3rd,S,England,2144.0,46.18,5.0,2.0,no
462,"Goodwin, Master. Sidney Leslie",male,1.0,3rd,S,England,2144.0,46.18,5.0,2.0,no
463,"Goodwin, Master. William Frederick",male,11.0,3rd,S,England,2144.0,46.18,5.0,2.0,no
464,"Goodwin, Miss. Jessie Allis",female,10.0,3rd,S,England,2144.0,46.18,5.0,2.0,no
465,"Goodwin, Miss. Lillian Amy",female,16.0,3rd,S,England,2144.0,46.18,5.0,2.0,no
466,"Goodwin, Mr. Charles Edward",male,14.0,3rd,S,England,2144.0,46.18,5.0,2.0,no
467,"Goodwin, Mr. Frederick Joseph",male,42.0,3rd,S,England,2144.0,46.18,1.0,6.0,no
468,"Goodwin, Mrs. Augusta",female,43.0,3rd,S,England,2144.0,46.18,1.0,6.0,no


The second largest family, the Goodwins, had 6 children. Unfortunately their fate was no different from that of the Sages. 

In [79]:
# All passengers whose name contains "Rice" (case-sensitive substring match).
Titanic_passengers.loc[lambda df: df["name"].str.contains("Rice")]

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
1006,"Rice, Master. Albert",male,10.0,3rd,Q,Ireland,382652.0,29.0206,4.0,1.0,no
1007,"Rice, Master. Arthur",male,4.0,3rd,Q,Ireland,382652.0,29.0206,4.0,1.0,no
1008,"Rice, Master. Eric",male,7.0,3rd,Q,Ireland,382652.0,29.0206,4.0,1.0,no
1009,"Rice, Master. Eugene Francis",male,2.0,3rd,Q,Ireland,382652.0,29.0206,4.0,1.0,no
1010,"Rice, Master. George Hugh",male,8.0,3rd,Q,Ireland,382652.0,29.0206,4.0,1.0,no
1011,"Rice, Mrs. Margaret",female,39.0,3rd,Q,Ireland,382652.0,29.0206,0.0,5.0,no


In [82]:
# All passengers whose name contains "Panula" (substring match anywhere in the name).
Titanic_passengers.loc[lambda df: df["name"].str.contains("Panula")]

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
933,"Panula, Master. Eino Viljam",male,1.0,3rd,S,Finland,3101295.0,39.1309,4.0,1.0,no
934,"Panula, Master. Jaako Arnold",male,15.0,3rd,S,Finland,3101295.0,39.1309,4.0,1.0,no
935,"Panula, Master. Juha Niilo",male,7.0,3rd,S,Finland,3101295.0,39.1309,4.0,1.0,no
936,"Panula, Master. Urho Abraham",male,2.0,3rd,S,Finland,3101295.0,39.1309,4.0,1.0,no
937,"Panula, Mr. Ernesti Arvid",male,16.0,3rd,S,Finland,3101295.0,39.1309,4.0,1.0,no
938,"Panula, Mrs. Maija Emelia Abrahamintytar",female,41.0,3rd,S,Finland,3101295.0,39.1309,0.0,5.0,no


Also in third class we see the Rice family and the Panula family: each consists of a mother travelling without her husband but with her 5 sons, all of them very young; none of them survived.

There's a mistake in the "country" column: the Panula family came from Finland, but one of them is recorded as coming from the USA (the ticket number and fare are the same as his brothers', so it's surely a mistake).

In [138]:
# Correct the country recorded for Jaako Arnold Panula.
Titanic_passengers.loc[
    Titanic_passengers["name"].eq("Panula, Master. Jaako Arnold"),
    "country",
] = "Finland"

In [83]:
# All passengers whose name contains "Asplund". The match is on the name only,
# so it also returns an Asplund travelling on a different ticket.
Titanic_passengers.loc[lambda df: df["name"].str.contains("Asplund")]

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
56,"Asplund, Master. Carl Edgar",male,5.0,3rd,S,Sweden,347077.0,31.0709,4.0,2.0,no
57,"Asplund, Master. Clarence Gustaf Hugo",male,9.0,3rd,S,Sweden,347077.0,31.0709,4.0,2.0,no
58,"Asplund, Master. Edvin Rojj Felix",male,3.0,3rd,S,Sweden,347077.0,31.0709,4.0,2.0,yes
59,"Asplund, Master. Filip Oscar",male,13.0,3rd,S,Sweden,347077.0,31.0709,4.0,2.0,no
60,"Asplund, Miss. Lillian Gertrud",female,5.0,3rd,S,Sweden,347077.0,31.0709,4.0,2.0,yes
61,"Asplund, Mr. Carl Oscar Vilhelm Gustafsson",male,40.0,3rd,S,Sweden,347077.0,31.0709,1.0,5.0,no
62,"Asplund, Mr. Johan Charles",male,23.0,3rd,S,Sweden,350054.0,7.1511,0.0,0.0,yes
63,"Asplund, Mrs. Selma Augusta Emilia",female,38.0,3rd,S,Sweden,347077.0,31.0709,1.0,5.0,yes


The Asplund family is the only big third-class family of which at least the mother and two children survived. In fact, their story is quite well known thanks to them (Lillian died in 2006).

In the list we see some other smaller families, belonging to third class and most of them didn't survive.

Let's see if we find something interesting about families in second and first class:

In [88]:
# Passengers with more than one sibling/spouse who were not in third class.
df_sibsp.loc[df_sibsp["class"].ne("3rd")]

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
1001,"Renouf, Mrs. Lillian",female,30.0,2nd,S,Channel Islands,31027.0,21.0,3.0,0.0,yes
410,"Fortune, Mr. Charles Alexander",male,19.0,1st,S,Canada,19950.0,263.0,3.0,2.0,no
409,"Fortune, Miss. Mabel Helen",female,23.0,1st,S,Canada,19950.0,263.0,3.0,2.0,yes
408,"Fortune, Miss. Ethel Flora",female,28.0,1st,S,Canada,19950.0,263.0,3.0,2.0,yes
407,"Fortune, Miss. Alice Elizabeth",female,24.0,1st,S,Canada,19950.0,263.0,3.0,2.0,yes
1051,"Ryerson, Miss. Susan Parker Suzette",female,21.0,1st,C,United States,17608.0,262.0706,2.0,2.0,yes
1050,"Ryerson, Miss. Emily Borie",female,18.0,1st,C,United States,17608.0,262.0706,2.0,2.0,yes
1049,"Ryerson, Master. John Borie",male,13.0,1st,C,United States,17608.0,262.0706,2.0,2.0,yes
1015,"Richards, Mrs. Emily",female,24.0,2nd,S,England,29106.0,18.15,2.0,3.0,yes
815,"Minahan, Dr. William Edward",male,44.0,1st,Q,United States,19928.0,90.0,2.0,0.0,no


The list is quite short (as I said, most of the people who travelled with relatives were in third class), but in the first and second rows I see two people travelling with 3 siblings: Mrs. Renouf in second class and a member of the Fortune family in first class.

In [89]:
# All passengers whose name contains "Fortune" (substring match anywhere in the name).
Titanic_passengers.loc[lambda df: df["name"].str.contains("Fortune")]

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
407,"Fortune, Miss. Alice Elizabeth",female,24.0,1st,S,Canada,19950.0,263.0,3.0,2.0,yes
408,"Fortune, Miss. Ethel Flora",female,28.0,1st,S,Canada,19950.0,263.0,3.0,2.0,yes
409,"Fortune, Miss. Mabel Helen",female,23.0,1st,S,Canada,19950.0,263.0,3.0,2.0,yes
410,"Fortune, Mr. Charles Alexander",male,19.0,1st,S,Canada,19950.0,263.0,3.0,2.0,no
411,"Fortune, Mr. Mark",male,64.0,1st,S,Canada,19950.0,263.0,1.0,4.0,no
412,"Fortune, Mrs. Mary",female,60.0,1st,S,Canada,19950.0,263.0,1.0,4.0,yes


The Fortune family is the largest one outside third class: it travelled in first class and consisted of mother, father, 3 daughters and 1 son, the youngest. All of them were adults. The only son and the father didn't survive, maybe to let all the women and children escape on the lifeboats.

In [90]:
# All passengers whose name contains "Renouf" (substring match anywhere in the name).
Titanic_passengers.loc[lambda df: df["name"].str.contains("Renouf")]

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
1000,"Renouf, Mr. Peter Henry",male,33.0,2nd,S,Channel Islands,31027.0,21.0,1.0,0.0,no
1001,"Renouf, Mrs. Lillian",female,30.0,2nd,S,Channel Islands,31027.0,21.0,3.0,0.0,yes


Mrs. Renouf's family is more difficult to find: she was an adult travelling with 3 siblings, or with her husband and two siblings. Searching by last name I found another passenger who I think was her husband (same ticket, similar age, and he travelled with one sibling or his wife), so "Renouf" isn't her original last name.
At least we can understand that her family was the biggest of second class (3 siblings and the husband of one of them); she survived (her husband didn't).

For the analysis it can also be interesting to know how many women and men travelled by themselves (without family):

In [96]:
# Passengers with no sibling/spouse and no parent/child on board, counted by gender.
(
    Titanic_passengers
    .loc[lambda df: df["sibsp"].eq(0) & df["parch"].eq(0)]
    .groupby("gender")["gender"]
    .count()
)

gender
female    194
male      604
Name: gender, dtype: int64

194 women out of 466 were travelling by themselves, around 42%, and 604 men out of 851, around 71%. This means that most men were alone (or with friends or fiancés, which we cannot know), while it was less common to find a woman travelling by herself. (I have to say that 42% is a lot more than I expected for the society of the period, but I'm quite sure that many of them were travelling with friends or, in any case, with people they knew very well.)

We can also see how many children travelled by themselves (consider that they could travel with the nanny, who is not considered part of the family; moreover I put in the analysis passengers who were less than 13, so I don't consider Teenagers):

In [104]:
# Children under 13 recorded with no sibling/spouse and no parent/child on board.
Titanic_passengers.loc[
    lambda df: df["age"].lt(13) & df["sibsp"].eq(0) & df["parch"].eq(0)
]

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
376,"Emanuel, Miss. Virginia Ethel",female,5.0,3rd,S,United States,364516.0,12.0906,0.0,0.0,yes
579,"Ibrāhīm, Mr. Husayn Mahmūd Husayn",male,11.0,3rd,C,,2699.0,18.1509,0.0,0.0,no
1091,"Seman, Master. Betros",male,10.0,3rd,C,Lebanon,2622.0,4.0,0.0,0.0,no
1257,"Watt, Miss. Robertha Josephine",female,12.0,2nd,S,Scotland,33595.0,15.15,0.0,0.0,yes


Of course there aren't many children travelling by themselves.
In fact, looking for some information about them online, I discovered that not all of them were really alone. Let's check, for example, Miss Watt:

In [109]:
# All passengers whose name contains "Watt" (substring match anywhere in the name).
Titanic_passengers.loc[lambda df: df["name"].str.contains("Watt")]

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
1257,"Watt, Miss. Robertha Josephine",female,12.0,2nd,S,Scotland,33595.0,15.15,0.0,1.0,yes
1258,"Watt, Mrs. Elizabeth",female,40.0,2nd,S,Scotland,33595.0,15.15,0.0,1.0,yes


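There is a mistake in the data: Miss Watt appears among the children travelling alone ("parch" = 0), although she shares her ticket and last name with Mrs. Elizabeth Watt. I correct the "parch" value for both of them here, so that the exported CSV file contains the corrected data.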

In [139]:
# Record one parent/child on board for both Watt passengers.
Titanic_passengers.loc[
    Titanic_passengers["name"].isin(
        ["Watt, Miss. Robertha Josephine", "Watt, Mrs. Elizabeth"]
    ),
    "parch",
] = 1

In [None]:
# TODO: a small chart could be added here.

CONCLUSIONS:
Now that I've found the most meaningful statistical data and some relations within the data I have, I finally create the CSV files containing the processed data. Later I'll present what I've discovered using more specific software, like Google Looker Studio and Microsoft Power BI. Thanks to them I'll be able to show not only what I've discovered, but also statistical data (for example the average fare for each class or the most used harbour).


N.B. FOR CODERS
In the code below I transform some data, but only because I'm using software set to the Italian locale, where the comma is the decimal separator, and my Power Query doesn't recognise the dot as a decimal separator. If you don't have this kind of problem you don't have to run the code. I've decided to convert every float column to int, except for the fare, because I don't want to lose the real price of the tickets. That's why the passengers CSV file uses a semicolon as field separator and a comma as decimal separator.

In [150]:
# Cast the whole-number columns from float to int before exporting.
# Note: astype(int) truncates, so fractional ages (infants under one year)
# become 0.
Titanic_crew["age"] = Titanic_crew["age"].astype(int)

for int_column in ("age", "ticketno", "sibsp", "parch"):
    Titanic_passengers[int_column] = Titanic_passengers[int_column].astype(int)

# The fare stays a float, rounded to two decimals.
Titanic_passengers["fare"] = Titanic_passengers["fare"].round(2)

In [154]:
# Passengers: ';' as field separator and ',' as decimal mark.
Titanic_passengers.to_csv("passengers.csv", sep=";", decimal=",")
# Crew: written with the to_csv defaults (comma-separated).
# NOTE(review): this differs from the passengers file format - confirm it is intended.
Titanic_crew.to_csv(path_or_buf="crew.csv")

This is the link to the report: https://lookerstudio.google.com/reporting/c2ea5699-05af-41e4-b6f0-75ee6ea1745a

In [152]:
# Check the crew DataFrame's columns, non-null counts and dtypes.
Titanic_crew.info()

<class 'pandas.core.frame.DataFrame'>
Index: 890 entries, 476 to 2206
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      890 non-null    object
 1   gender    890 non-null    object
 2   age       890 non-null    int64 
 3   class     890 non-null    object
 4   embarked  890 non-null    object
 5   country   890 non-null    object
 6   survived  890 non-null    object
dtypes: int64(1), object(6)
memory usage: 55.6+ KB


In [155]:
# Display the processed passengers DataFrame (pandas truncates the middle rows).
Titanic_passengers

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
0,"Abbing, Mr. Anthony",male,42,3rd,Southampton,United States,5547,7.11,0,0,no
1,"Abbott, Mr. Eugene Joseph",male,13,3rd,Southampton,United States,2673,20.05,0,2,no
2,"Abbott, Mr. Rossmore Edward",male,16,3rd,Southampton,United States,2673,20.05,1,1,no
3,"Abbott, Mrs. Rhoda Mary 'Rosa'",female,39,3rd,Southampton,England,2673,20.05,1,1,yes
4,"Abelseth, Miss. Karen Marie",female,16,3rd,Southampton,Norway,348125,7.13,0,0,yes
...,...,...,...,...,...,...,...,...,...,...,...
1314,"Yvois, Miss. Henriette",female,24,2nd,Southampton,France,248747,13.00,0,0,no
1315,"Zakarian, Mr. Mapriededer",male,22,3rd,Cherbourg,Turkey,2656,7.04,0,0,no
1316,"Zakarian, Mr. Ortin",male,27,3rd,Cherbourg,Turkey,2670,7.04,0,0,no
1317,"Zenni, Mr. Philip",male,25,3rd,Cherbourg,Lebanon,2620,7.04,0,0,yes


In [157]:
# Non-null counts per column for passengers travelling with at least one relative.
Titanic_passengers.loc[
    lambda df: df["sibsp"].gt(0) | df["parch"].gt(0)
].count()

name        521
gender      521
age         521
class       521
embarked    521
country     504
ticketno    521
fare        521
sibsp       521
parch       521
survived    521
dtype: int64

In [160]:
# Non-null counts per column for the passengers who survived.
Titanic_passengers.loc[Titanic_passengers["survived"].eq("yes")].count()

name        500
gender      500
age         500
class       500
embarked    500
country     472
ticketno    500
fare        500
sibsp       500
parch       500
survived    500
dtype: int64