# Titanic EDA example

Things to consider:
1. Age of surviving passengers
2. Sex of surviving passengers
3. Class of surviving passengers
4. There are some columns that are hard to decipher what they mean.  Look them up and find out
5. What are the biggest factors in ensuring survival?

Please try to demonstrate the answers to the above and anything else you can think of.  
Try to display your answers using graphs as much as possible.  

We will come back to this dataset later to do some 'classification' so we can see if your initial thoughts are true!! (No pressure :) ) 


In [1]:
#imports
import pandas as pd

from datetime import datetime 

import numpy

import matplotlib as mp

import plotly as py

import plotly.graph_objects as go


In [2]:
df = pd.read_csv('titanic.csv')

In [3]:
df2 = df.copy()

In [4]:
df2.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Age of surviving passengers

In [135]:
age_survive = df2.copy()
age_survive = age_survive.loc[age_survive.Survived == 1]
age_survive.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [168]:
age_survive.Age.mean()

28.343689655172415

In [136]:
age_survive.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 342 entries, 1 to 889
Data columns (total 12 columns):
PassengerId    342 non-null int64
Survived       342 non-null int64
Pclass         342 non-null int64
Name           342 non-null object
Sex            342 non-null object
Age            290 non-null float64
SibSp          342 non-null int64
Parch          342 non-null int64
Ticket         342 non-null object
Fare           342 non-null float64
Cabin          136 non-null object
Embarked       340 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 34.7+ KB


In [137]:
bins = [0.0,12.0,17.0,24.0,34.0,44.0,54.0,64.0,74.0,150.0]
labels = ['Under 12','12-17','18-24','25-34','35-44','45-54','55-64','65-74','75 or over']

age_survive['age_group'] = pd.cut(age_survive['Age'], bins = bins , labels = labels )




age_survive_group = age_survive.groupby(['age_group'])[['PassengerId']].count().rename(columns = {'PassengerId': 'Passenger_count'})
age_survive_group = age_survive_group.reset_index()




age_survive_group.head()


Unnamed: 0,age_group,Passenger_count
0,Under 12,40
1,12-17,21
2,18-24,57
3,25-34,78
4,35-44,51


In [138]:
age_survive_graph = go.Figure(go.Bar(x = age_survive_group.age_group,
                                    y = age_survive_group.Passenger_count,
                                     textposition = 'auto'
                                    ))


age_survive_graph.update_layout(
    title=go.layout.Title(
        text="Titanic: Number of surviving passengers per age group",
        xref="paper",
        x=0
    ),
    
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(
            text="Number of passengers",
            font=dict(
                family="Courier New, monospace",
                size=16,
                color="#7f7f7f"
            )
        )
    ),
        xaxis=go.layout.XAxis(
        title=go.layout.xaxis.Title(
            text="Age groups",
            font=dict(
                family="Courier New, monospace",
                size=16,
                color="#7f7f7f"
                )
            )
        )
)


age_survive_graph

In [112]:
age_survive_2 = df2.copy()


bins = [0.0,12.0,17.0,24.0,34.0,44.0,54.0,64.0,74.0,150.0]
labels = ['Under 12','12-17','18-24','25-34','35-44','45-54','55-64','65-74','75 or over']

age_survive_2['age_group'] = pd.cut(age_survive_2['Age'], bins = bins , labels = labels )

age_survive_group_2 = age_survive_2.groupby(['age_group','Survived'])[['PassengerId']].count().rename(columns = {'PassengerId': 'Passenger_count'})
age_survive_group_2 = age_survive_group_2.reset_index()


age_group_survived = age_survive_group_2.loc[age_survive_group_2.Survived == 1]
age_group_passed = age_survive_group_2.loc[age_survive_group_2.Survived == 0]

age_group_percent = age_group_survived.merge(age_group_passed, left_on = 'age_group', right_on = 'age_group').rename(columns = {'Passenger_count_y': 'passenger_passed'})

age_group_percent['survival'] = age_group_percent.Passenger_count_x / (age_group_percent.Passenger_count_x + age_group_percent.passenger_passed)

graph_a = go.Figure(go.Bar(x = age_group_percent.age_group, 
                           y = age_group_percent.survival,
                           marker_color = '#A01B41',
                           text = round((age_group_percent.survival),1)*100,
                           textposition='auto'
                           
                           ) )


graph_a.update_layout(
    title=go.layout.Title(
        text="Percentage of surviving passengers per age group",
        xref="paper",
        x=0
    ),
    
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(
            text="Percentage (%)",
            font=dict(
                family="Courier New, monospace",
                size=16,
                color="#7f7f7f"
            )
        )
    )
)

graph_a.update_xaxes(type = 'category', tickangle=-90, tickfont=dict(color='grey', size=12))

graph_a.update_yaxes(tickformat = '%')
   
graph_a.show()


# Sex of surviving passengers

In [80]:
gender = df2.copy()
gender = gender.loc[gender.Survived == 1]
gender.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [81]:
bins = [0.0,12.0,17.0,24.0,34.0,44.0,54.0,64.0,74.0,150.0]
labels = ['Under 12','12-17','18-24','25-34','35-44','45-54','55-64','65-74','75 or over']

gender['age_group'] = pd.cut(gender['Age'], bins = bins , labels = labels )

In [82]:
gender_group = gender.groupby(['Sex','age_group'])[['PassengerId']].count().rename(columns = {'PassengerId':'Passenger_count'})
gender_group = gender_group.reset_index()
gender_group.head()

Unnamed: 0,Sex,age_group,Passenger_count
0,female,Under 12,19.0
1,female,12-17,19.0
2,female,18-24,47.0
3,female,25-34,47.0
4,female,35-44,36.0


In [164]:
gender_group_male = gender_group.loc[gender_group.Sex == 'male']
gender_group_female = gender_group.loc[gender_group.Sex == 'female']

fig = go.Figure(data=[
    go.Bar(name='Male', x=gender_group_male.age_group, y= gender_group_male.Passenger_count),
    go.Bar(name='Female', x=gender_group_female.age_group, y= gender_group_female.Passenger_count)
])
# Change the bar mode
fig.update_layout(barmode='stack')
fig.update_layout(
    title=go.layout.Title(
        text="Number of passengers survuved by gender per age group",
        xref="paper",
        x=0
    ))
fig.show()

In [151]:
gender_percent = df2.copy()

bins = [0.0,12.0,17.0,24.0,34.0,44.0,54.0,64.0,74.0,150.0]
labels = ['Under 12','12-17','18-24','25-34','35-44','45-54','55-64','65-74','75 or over']

gender_percent['age_group'] = pd.cut(gender_percent['Age'], bins = bins , labels = labels )

gender_percent = gender_percent.groupby(['Sex','Survived','age_group'])[['PassengerId']].count().rename(columns = {'PassengerId':'Passenger_count'})
gender_percent = gender_percent.reset_index()

gender_percent.head()

Unnamed: 0,Sex,Survived,age_group,Passenger_count
0,female,0,Under 12,13.0
1,female,0,12-17,4.0
2,female,0,18-24,15.0
3,female,0,25-34,16.0
4,female,0,35-44,9.0


In [157]:

gender_percent_survive = gender_percent.loc[gender_percent.Survived == 1]
gender_percent_passed = gender_percent.loc[gender_percent.Survived == 0]

gender_survival_rate = gender_percent_survive.merge(gender_percent_passed, left_on = ['Sex','age_group'], right_on = ['Sex','age_group']).rename(columns = {'Passenger_count_y':'Passenger_count_passed'})



gender_survival_rate['gender_survive_percent'] = gender_survival_rate.Passenger_count_x / (gender_survival_rate.Passenger_count_x + gender_survival_rate.Passenger_count_passed)


gender_survival_rate



Unnamed: 0,Sex,Survived_x,age_group,Passenger_count_x,Survived_y,Passenger_count_passed,gender_survive_percent
0,female,1,Under 12,19.0,0,13.0,0.59375
1,female,1,12-17,19.0,0,4.0,0.826087
2,female,1,18-24,47.0,0,15.0,0.758065
3,female,1,25-34,47.0,0,16.0,0.746032
4,female,1,35-44,36.0,0,9.0,0.8
5,female,1,45-54,20.0,0,6.0,0.769231
6,female,1,55-64,9.0,0,1.0,0.9
7,female,1,65-74,,0,,
8,female,1,75 or over,,0,,
9,male,1,Under 12,21.0,0,16.0,0.567568


In [165]:

gender_survival_rate_male = gender_survival_rate.loc[gender_survival_rate.Sex == 'male']
gender_survival_rate_female = gender_survival_rate.loc[gender_survival_rate.Sex == 'female']

fig = go.Figure(data=[
    go.Bar(name='Male', x=gender_survival_rate_male.age_group, y= gender_survival_rate_male.gender_survive_percent),
    go.Bar(name='Female', x=gender_survival_rate_female.age_group, y= gender_survival_rate_female.gender_survive_percent)
])
# Change the bar mode
fig.update_layout(
    title=go.layout.Title(
        text="Survival rates by gender and age group",
        xref="paper",
        x=0
    ))
fig.show()


# Class of surviving passengers

In [84]:
pass_class = df2.copy()
pass_class = pass_class.loc[pass_class.Survived == 1]
pass_class.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [86]:
pass_class_group = pass_class.groupby(['Pclass'])[['PassengerId']].count().rename(columns = {'PassengerId':'Passenger_count'})

pass_class_group = pass_class_group.reset_index()

pass_class_group.head()


Unnamed: 0,Pclass,Passenger_count
0,1,136
1,2,87
2,3,119


In [166]:
fig = go.Figure(data=[go.Pie(labels=pass_class_group.Pclass, values=pass_class_group.Passenger_count)])
fig.update_layout(
    title=go.layout.Title(
        text="Survival rate per class",
        xref="paper",
        x=0
    ))
fig.show()

# Band ticket price

In [114]:
ticket_price = df2.copy()
ticket_price.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [128]:

bins = [0.0,50.0,100.0,150.0,200.0,250.0,300.0,350.0,400.0,450.0,500.0,550.0]
labels =['0-49','50-149','100-149','150-199','200-249','250-299','300-349','350-399','400-449','450-499','500-599'] 

ticket_price['ticket_band'] = pd.cut(ticket_price['Fare'], bins = bins, labels = labels)

ticket_price_group = ticket_price.groupby(['ticket_band','Survived'])[['PassengerId']].count().rename(columns = {'PassengerId':'Passenger_count'})
ticket_price_group = ticket_price_group.reset_index()
ticket_price_group.head()

Unnamed: 0,ticket_band,Survived,Passenger_count
0,0-49,0,484.0
1,0-49,1,232.0
2,50-149,0,37.0
3,50-149,1,70.0
4,100-149,0,5.0


In [130]:
ticket_price_survive = ticket_price_group.loc[ticket_price_group.Survived == 1]
ticket_price_passed = ticket_price_group.loc[ticket_price_group.Survived == 0]

ticket_price_percent = ticket_price_survive.merge(ticket_price_passed, left_on = 'ticket_band', right_on = 'ticket_band').rename(columns = {'Passenger_count_y':'Passenger_count_passed'})

ticket_price_percent['ticket_survive_percent'] = ticket_price_percent.Passenger_count_x / (ticket_price_percent.Passenger_count_x + ticket_price_percent.Passenger_count_passed)
ticket_price_percent.head()

Unnamed: 0,ticket_band,Survived_x,Passenger_count_x,Survived_y,Passenger_count_passed,ticket_survive_percent
0,0-49,1,232.0,0,484.0,0.324022
1,50-149,1,70.0,0,37.0,0.654206
2,100-149,1,19.0,0,5.0,0.791667
3,150-199,1,6.0,0,3.0,0.666667
4,200-249,1,7.0,0,4.0,0.636364


In [131]:

graph_a = go.Figure(go.Bar(x = ticket_price_percent.ticket_band, 
                           y = ticket_price_percent.ticket_survive_percent,
                           marker_color = '#A01B41',
                           text = round((ticket_price_percent.ticket_survive_percent),1)*100,
                           textposition='auto'
                           
                           ) )


graph_a.update_layout(
    title=go.layout.Title(
        text="Percentage of surviving passengers per ticket pricing",
        xref="paper",
        x=0
    ),
    
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(
            text="Percentage (%)",
            font=dict(
                family="Courier New, monospace",
                size=16,
                color="#7f7f7f"
            )
        )
    )
)

graph_a.update_xaxes(type = 'category', tickangle=-90, tickfont=dict(color='grey', size=12))

graph_a.update_yaxes(tickformat = '%')
   
graph_a.show()
