In [1]:
import seaborn as sns
import pandas as pd
import certifi

# The Titanic Dataset

In [11]:
# sns.load_data set function is part of the Seaborn Library in Python. it allows you to load
# built-in datasets for data visualisation and analysis. it provides easy access to databases such
# as the Titanic dataset

sns.load_dataset('titanic')

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


The function is used to displau the number of null values in each column of a dataframe called df. 
the isna() function returns a df with the same shape as the original where each element is either true = indicating a null values for false. The sum() function is then applied to this df which calculates the sum of True values for each column. Print() is used to display the result

In [13]:
df = sns.load_dataset('titanic') 
print(df.isna().sum())


survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


# Dropping drops any rows with NULL values
The dropna() function drops the null values. I used to_string() method to convert the Dataset to a string representation. By using the index parameter to False, it can exclude the row index from the output.

In [22]:
Dropped = df.dropna()
print(Dropped.to_string(index=False))

 survived  pclass    sex   age  sibsp  parch     fare embarked  class   who  adult_male deck embark_town alive  alone
        1       1 female 38.00      1      0  71.2833        C  First woman       False    C   Cherbourg   yes  False
        1       1 female 35.00      1      0  53.1000        S  First woman       False    C Southampton   yes  False
        0       1   male 54.00      0      0  51.8625        S  First   man        True    E Southampton    no   True
        1       3 female  4.00      1      1  16.7000        S  Third child       False    G Southampton   yes  False
        1       1 female 58.00      0      0  26.5500        S  First woman       False    C Southampton   yes   True
        1       2   male 34.00      0      0  13.0000        S Second   man        True    D Southampton   yes   True
        1       1   male 28.00      0      0  35.5000        S  First   man        True    A Southampton   yes   True
        0       1   male 19.00      3      2 263.0000   

# Duplicated Data
This funcrtion below is used to identify and mark dulplicate rows in a df. it returns a boolean series where each element is True if the corresponding row is a duplicate and False otherwise.

In [24]:
duplicates = df.duplicated()
print("Duplicate rows:")
print(duplicates)

Duplicate rows:
0      False
1      False
2      False
3      False
4      False
       ...  
886     True
887    False
888    False
889    False
890    False
Length: 891, dtype: bool


df.drop_duplicates function is used to remove duplicate rows from the dataframe.

# Renaming columns
This function allows you to rename one or more columns in the dataframe

In [25]:
rename_columns = df.rename(columns={"sibsp": "siblings and spouses aboard", "parch": "parents and child aboard"})

In [28]:
print(rename_columns.to_string(index=False))

 survived  pclass    sex   age  siblings and spouses aboard  parents and child aboard     fare embarked  class   who  adult_male deck embark_town alive  alone
        0       3   male 22.00                            1                         0   7.2500        S  Third   man        True  NaN Southampton    no  False
        1       1 female 38.00                            1                         0  71.2833        C  First woman       False    C   Cherbourg   yes  False
        1       3 female 26.00                            0                         0   7.9250        S  Third woman       False  NaN Southampton   yes   True
        1       1 female 35.00                            1                         0  53.1000        S  First woman       False    C Southampton   yes  False
        0       3   male 35.00                            0                         0   8.0500        S  Third   man        True  NaN Southampton    no   True
        0       3   male   NaN                

# Descriptive statistics
This function in pandas is used to generate descriptive statistics of a datatframe. it basically provides a summary of each column.

In [29]:
df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


# Average fare and age for each class
I used the agg() function to specify the columns we want to calculate the mean for. e.g. 'Age': 'mean'. df.groupby()function in pandas is used to group data in a dataframe based on one or more columns. it can also allow you to split the data into groups.

In [32]:
average_age_fare = df.groupby('class').agg({'age': 'mean', 'fare': 'mean'})
print(average_age_fare)

              age       fare
class                       
First   38.233441  84.154687
Second  29.877630  20.662183
Third   25.140620  13.675550


  average_age_fare = df.groupby('class').agg({'age': 'mean', 'fare': 'mean'})


In [33]:
average_age_fare = df.groupby('class', observed=False).agg({'age': 'mean', 'fare': 'mean'})
print(average_age_fare)

              age       fare
class                       
First   38.233441  84.154687
Second  29.877630  20.662183
Third   25.140620  13.675550


# Creating a new column


In [47]:
total_percentage = df['fare'].sum()
print(total_percentage)

28693.9493


# The numbers of surviors 

In [41]:
survivors = df.groupby('sex')['survived'].sum()
print(survivors)

sex
female    233
male      109
Name: survived, dtype: int64


# Interesting insights 
We can use seaborn to create a bar plot to visualise the surivival rate of males and females. 
this can give us insights into gender based surivial. We can also analyse the passenger class distribution, this can help us undertsand the demographics on board

In [50]:
# Calculate the correlation matrix
correlation_matrix = drop_duplicates.corr()

# Create the heatmap
plt.figure(figsize = (10,8))
sns.heatmap(correlation_matrix, cmap = 'coolwarm')
plt.show()

NameError: name 'drop_duplicates' is not defined