# Titanic Dataset Exploration
***

# 0 - Importing modules and titanic dataset
***

In [526]:
import numpy as np
import pandas as pd
import seaborn as sns
pd.set_option("display.max_rows", 10) 
titanic = pd.DataFrame(sns.load_dataset("titanic"))
display(titanic)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


# 1 - Data Cleaning

In [527]:
pd.isnull(titanic)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
887,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
888,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False
889,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [528]:
titanic.info()
print(f" Number of null values: {titanic.isnull().sum()}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
 Number of null values: survived         0
pclass           0
sex              

### Due to the high number of missing values in the 'deck' column it is best to remove this from the dataframe  

In [529]:
titanic = titanic.drop("deck", axis = 1)

In [530]:
titanic.duplicated().sum()

111

In [531]:
titanic = titanic.drop_duplicates()

### 111 rows of duplicated data that can be dropped

### Age does not have as many missing values and it may not be appropriate to remove these rows.<br><br> However the 'ages' column has floats and some ages are .5 which needs to be changed. To do this the NaN values must be filled.

In [532]:
titanic["age"] = titanic["age"].fillna(0)
titanic["age"] = titanic["age"].astype(int)
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22,1,0,7.2500,S,Third,man,True,Southampton,no,False
1,1,1,female,38,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26,0,0,7.9250,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35,1,0,53.1000,S,First,woman,False,Southampton,yes,False
4,0,3,male,35,0,0,8.0500,S,Third,man,True,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,female,39,0,5,29.1250,Q,Third,woman,False,Queenstown,no,False
887,1,1,female,19,0,0,30.0000,S,First,woman,False,Southampton,yes,True
888,0,3,female,0,1,2,23.4500,S,Third,woman,False,Southampton,no,False
889,1,1,male,26,0,0,30.0000,C,First,man,True,Cherbourg,yes,True


### There are quite a few columns that display duplicated data that can be removed. embarked is derived from embark_town, alone is detrimined by both sibsp(# of siblings or spouses on board) and parch(#number of parents or children on board) being 0, pclass (ticket class) and class are the same, alive does not seem relevant if survived is present. Who is determined by sex and age, which is the same for how is determined adult_male.

In [533]:
titanic = titanic.drop("pclass", axis = 1)
titanic = titanic.drop("embarked", axis = 1)
titanic = titanic.drop("alive", axis = 1)
titanic = titanic.drop("alone", axis = 1)
titanic = titanic.drop("adult_male", axis = 1)
titanic = titanic.drop("who", axis = 1)
titanic

Unnamed: 0,survived,sex,age,sibsp,parch,fare,class,embark_town
0,0,male,22,1,0,7.2500,Third,Southampton
1,1,female,38,1,0,71.2833,First,Cherbourg
2,1,female,26,0,0,7.9250,Third,Southampton
3,1,female,35,1,0,53.1000,First,Southampton
4,0,male,35,0,0,8.0500,Third,Southampton
...,...,...,...,...,...,...,...,...
885,0,female,39,0,5,29.1250,Third,Queenstown
887,1,female,19,0,0,30.0000,First,Southampton
888,0,female,0,1,2,23.4500,Third,Southampton
889,1,male,26,0,0,30.0000,First,Cherbourg


# 2 - Explanatory Data Analysis

In [534]:
titanic.describe()

Unnamed: 0,survived,age,sibsp,parch,fare
count,780.0,780.0,780.0,780.0,780.0
mean,0.412821,25.820513,0.525641,0.417949,34.829108
std,0.492657,17.054202,0.988046,0.838536,52.26344
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,16.0,0.0,0.0,8.05
50%,0.0,26.0,0.0,0.0,15.95
75%,1.0,36.0,1.0,1.0,34.375
max,1.0,80.0,8.0,6.0,512.3292


### The ages that were changed to 0 need to be removed so that they do not affect the average

In [535]:
age = pd.DataFrame(titanic, columns=["age","class"])
age["age"] = age["age"].replace(0,np.nan)
age = age.dropna()
print(age)
age.groupby("class", observed = True).mean(numeric_only = True)

      age   class
0    22.0   Third
1    38.0   First
2    26.0   Third
3    35.0   First
4    35.0   Third
..    ...     ...
883  28.0  Second
885  39.0   Third
887  19.0   First
889  26.0   First
890  32.0   Third

[670 rows x 2 columns]


Unnamed: 0_level_0,age
class,Unnamed: 1_level_1
First,38.382514
Second,30.412903
Third,25.307229


In [536]:
fare = pd.DataFrame(titanic, columns=["fare","class"])
fare.groupby("class", observed = True).mean(numeric_only = True)

Unnamed: 0_level_0,fare
class,Unnamed: 1_level_1
First,85.159631
Second,21.889279
Third,13.670843


In [537]:
total = titanic["fare"].sum()
titanic.insert(8,"Percent of Total Ticket Sales",(titanic["fare"]/total) * 100)
titanic

Unnamed: 0,survived,sex,age,sibsp,parch,fare,class,embark_town,Percent of Total Ticket Sales
0,0,male,22,1,0,7.2500,Third,Southampton,0.026687
1,1,female,38,1,0,71.2833,First,Cherbourg,0.262392
2,1,female,26,0,0,7.9250,Third,Southampton,0.029172
3,1,female,35,1,0,53.1000,First,Southampton,0.195460
4,0,male,35,0,0,8.0500,Third,Southampton,0.029632
...,...,...,...,...,...,...,...,...,...
885,0,female,39,0,5,29.1250,Third,Queenstown,0.107208
887,1,female,19,0,0,30.0000,First,Southampton,0.110429
888,0,female,0,1,2,23.4500,Third,Southampton,0.086319
889,1,male,26,0,0,30.0000,First,Cherbourg,0.110429


In [538]:
titanic["survived"].sum()

322

In [539]:
table = pd.crosstab(titanic['survived'],titanic['sex'])
table

sex,female,male
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,76,382
1,216,106


### Females were about twice as likely to have survived compared to males

In [542]:
table = pd.crosstab(titanic['survived'],titanic['class'])
table

class,First,Second,Third
survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,77,81,300
1,135,83,104


### First class had the most survivors 