1. Load the Titanic dataset (available in seaborn) into a pandas DataFrame 
2. Find the average age of passengers for each class (1st, 2nd, and 3rd).
3. Create a new DataFrame that contains the count of male and female passengers in each age group (e.g., 0-10, 11-20, etc.).
4. Find the name and ticket number of the passenger(s) who paid the highest fare and survived the disaster.
5. Calculate the survival rate for passengers who were traveling alone (without any siblings, spouses, parents, or children) versus those who were traveling with family members.
6. For each passenger, calculate the age difference with the oldest sibling (if any) and the age difference with the youngest sibling (if any).
7. Find the most common deck letter (A, B, C, etc.) for each passenger class.
8. Group the Titanic DataFrame by 'Embarked' (port of embarkation) and find the percentage of passengers who survived in each group.
9. Calculate the correlation matrix for the 'Age', 'Fare', and 'Survived' columns in the Titanic dataset and find the feature with the highest absolute correlation with 'Survived'.
10. Create a new DataFrame that contains the 'Pclass', 'Sex', 'Age', and 'Fare' columns from the Titanic dataset and pivot it to have 'Pclass' as the index, 'Sex' as the columns, and 'Fare' as the values, with 'Age' as the weights.


## Load the Titanic dataset (available in seaborn) into a pandas DataFrame

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
df = sns.load_dataset('titanic')

In [None]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## Find the average age of passengers for each class (1st, 2nd, and 3rd).

In [None]:
avg_age = df.groupby('class')['age'].mean()

In [None]:
print(f"The average age of passengers for each class is {avg_age}")

The average age of passengers for each class is class
First     38.233441
Second    29.877630
Third     25.140620
Name: age, dtype: float64


## Create a new DataFrame that contains the count of male and female passengers in each age group (e.g., 0-10, 11-20, etc.)

In [7]:
age_groups = pd.cut(df['age'] , bins = [0,10,20,30,40,50,60,70,80,90 , 100])

In [8]:
age_groups

0      (20.0, 30.0]
1      (30.0, 40.0]
2      (20.0, 30.0]
3      (30.0, 40.0]
4      (30.0, 40.0]
           ...     
886    (20.0, 30.0]
887    (10.0, 20.0]
888             NaN
889    (20.0, 30.0]
890    (30.0, 40.0]
Name: age, Length: 891, dtype: category
Categories (10, interval[int64, right]): [(0, 10] < (10, 20] < (20, 30] < (30, 40] ... (60, 70] < (70, 80] < (80, 90] < (90, 100]]

In [9]:
passenger_count = df.groupby(['sex' ,age_groups]).size().unstack(fill_value=0)

In [10]:
passenger_count 

age,"(0, 10]","(10, 20]","(20, 30]","(30, 40]","(40, 50]","(50, 60]","(60, 70]","(70, 80]","(80, 90]","(90, 100]"
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
female,31,46,81,55,31,14,3,0,0,0
male,33,69,149,100,55,28,14,5,0,0


## Find the name and ticket number of the passenger(s) who paid the highest fare and survived the disaster.

In [11]:
survived_passangers = df[df['survived']==1]

In [12]:
survived_passangers 

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
875,1,3,female,15.0,0,0,7.2250,C,Third,child,False,,Cherbourg,yes,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
880,1,2,female,25.0,0,1,26.0000,S,Second,woman,False,,Southampton,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [13]:
highest_fare_survivors = survived_passangers[survived_passangers['fare'] == survived_passangers['fare'].max()]

In [14]:
highest_fare_survivors

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
258,1,1,female,35.0,0,0,512.3292,C,First,woman,False,,Cherbourg,yes,True
679,1,1,male,36.0,0,1,512.3292,C,First,man,True,B,Cherbourg,yes,False
737,1,1,male,35.0,0,0,512.3292,C,First,man,True,B,Cherbourg,yes,True


## Calculate the survival rate for passengers who were traveling alone (without any siblings, spouses, parents, or children) versus those who were traveling with family members.

In [15]:
df['family_size'] = df['sibsp'] + df['parch'] +1

In [16]:
alone_survived_rate = df[df['family_size']==1]['survived'].mean()
with_family_survived_rate = df[df['family_size']>1]['survived'].mean()

In [17]:
print("The survival rate for passanger who were travelling alone is ", alone_survived_rate)
print("The survival rate for passanger who were travelling with family is ", with_family_survived_rate)

The survival rate for passanger who were travelling alone is  0.30353817504655495
The survival rate for passanger who were travelling with family is  0.5056497175141242


## For each passenger, calculate the age difference with the oldest sibling (if any) and the age difference with the youngest sibling (if any).

In [18]:
df['OldestSiblingAgeDiff'] = df.apply(lambda row: row['age'] - df[(df['sibsp'] > 0) & (df['sibsp'] == row['sibsp'])]['age'].max(), axis=1)

In [19]:
df['YoungestSiblingAgeDiff'] = df.apply(lambda row: row['age'] - df[(df['sibsp'] > 0) & (df['sibsp'] == row['sibsp'])]['age'].min(), axis=1)

In [20]:
df['OldestSiblingAgeDiff'] 

0     -48.0
1     -32.0
2       NaN
3     -35.0
4       NaN
       ... 
886     NaN
887     NaN
888     NaN
889     NaN
890     NaN
Name: OldestSiblingAgeDiff, Length: 891, dtype: float64

In [21]:
df['YoungestSiblingAgeDiff']

0      21.33
1      37.33
2        NaN
3      34.33
4        NaN
       ...  
886      NaN
887      NaN
888      NaN
889      NaN
890      NaN
Name: YoungestSiblingAgeDiff, Length: 891, dtype: float64

## Find the most common deck letter (A, B, C, etc.) for each passenger class.

In [22]:
def extract_cabin_letter(cabin):
    return cabin[0] if not pd.isnull(cabin) else pd.NA

df['cabin_letter'] = df['deck'].apply(extract_cabin_letter)
most_common_letter = df.groupby('pclass')['cabin_letter'].apply(lambda x:x.mode().iloc[0])

In [23]:
most_common_letter

pclass
1    C
2    F
3    F
Name: cabin_letter, dtype: object

## Group the Titanic DataFrame by 'Embarked' (port of embarkation) and find the percentage of passengers who survived in each group.

In [25]:
df.groupby('embarked')['survived'].mean()*100

embarked
C    55.357143
Q    38.961039
S    33.695652
Name: survived, dtype: float64

## Calculate the correlation matrix for the 'Age', 'Fare', and 'Survived' columns in the Titanic dataset and find the feature with the highest absolute correlation with 'Survived'.

In [30]:
correlation_matrix = df[['age','fare','survived']].corr()

In [31]:
correlation_matrix 

Unnamed: 0,age,fare,survived
age,1.0,0.096067,-0.077221
fare,0.096067,1.0,0.257307
survived,-0.077221,0.257307,1.0


In [33]:
correlation_matrix['survived'].abs().nlargest(2).index[1]

'fare'

## Create a new DataFrame that contains the 'Pclass', 'Sex', 'Age', and 'Fare' columns from the Titanic dataset and pivot it to have 'Pclass' as the index, 'Sex' as the columns, and 'Fare' as the values, with 'Age' as the weights.

In [34]:
pivot_df = df[['pclass','sex','age','fare']].pivot_table(index = 'pclass' ,columns = 'sex' , values='fare',aggfunc='mean', fill_value=0, margins=True, margins_name='Total', dropna=False )

In [35]:
pivot_df.head()

sex,female,male,Total
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,106.125798,67.226127,84.154687
2,21.970121,19.741782,20.662183
3,16.11881,12.661633,13.67555
Total,44.479818,25.523893,32.204208
