In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Read the Titanic passenger records; the path is resolved relative to the
# current working directory.
Data = pd.read_csv(filepath_or_buffer='Titanic_DataSet.csv')

# Preview the first five rows of the loaded frame.
Data.head(n=5)

In [None]:
# (number of rows, number of columns) of the loaded frame.
Data.shape

In [None]:
# Print the column names, non-null counts and dtypes.
# `info` is a method: without the call parentheses the cell only displays the
# bound-method repr instead of the summary.
Data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
Data.describe()

In [None]:
# Number of missing values in each column.
Data.isnull().sum()

There are 177 null values in the Age column,
687 in the Cabin column,
and 2 in the Embarked column.

We can drop the cabin column

In [None]:
# Remove the Cabin column.
# `columns=` already selects the axis, so the redundant `axis=1` is dropped.
# Rebinding instead of `inplace=True`, together with `errors="ignore"`, keeps
# the cell safe to re-run (the in-place form raises KeyError the second time).
Data = Data.drop(columns="Cabin", errors="ignore")

Replacing the null values in the age column with the average value

In [None]:
# Impute missing ages with the mean age.
# Assign the result back to the column: calling `fillna(..., inplace=True)` on
# `Data['Age']` is a chained in-place operation, which emits a FutureWarning in
# pandas 2.x and does not modify `Data` at all under Copy-on-Write.
Data['Age'] = Data['Age'].fillna(Data['Age'].mean())

Replacing the null values in the embarked column with the most frequently occurring value

In [None]:
# Impute missing embarkation ports with the most common port (first mode).
# Assign the result back to the column: the chained `fillna(..., inplace=True)`
# form warns in pandas 2.x and is a silent no-op under Copy-on-Write.
Data['Embarked'] = Data['Embarked'].fillna(Data['Embarked'].mode()[0])

# Re-check the per-column missing-value counts after imputation.
Data.isnull().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
Data.duplicated().sum()

Identifying survival status of the passengers

In [None]:
# Display the Survived column (pandas truncates the output for long Series).
Data['Survived']

In [None]:
# Number of passengers per Survived value, most frequent first.
Data['Survived'].value_counts()

Survival Count Visualization

In [None]:
# Bar chart of how many passengers fall under each Survived value.
# `palette` without `hue` is deprecated in seaborn 0.13, so the x variable is
# also passed as `hue` with the redundant legend suppressed (the same form the
# ticket-class count plot in this notebook uses).
sns.countplot(x='Survived', hue='Survived', data=Data, palette='viridis', legend=False)
plt.xlabel("Survival status")
plt.ylabel("Number of people")
plt.xticks(ticks=[0, 1], labels=['Not survived', 'survived'])
plt.show()

In [None]:
# Share of passengers per survival outcome.
# Order the counts by the Survived code (0, then 1) so the labels are tied to
# the values rather than to whichever outcome happens to be more frequent.
survival_counts = Data['Survived'].value_counts().reindex([0, 1], fill_value=0)
plt.pie(
    survival_counts,
    explode=[0, 0.04],
    autopct="%1.2f%%",
    # The second wedge was previously labelled " Didn't Survive", which made
    # both wedges read as non-survivors.
    labels=['Not survived', 'Survived'],
    startangle=90,
    colors=['#66b3ff', '#ff9999'],
)
plt.title('Survival of People')
plt.show()


Gender-based survival statistics

In [None]:
# Distinct values present in the Sex column.
Data['Sex'].unique()


In [None]:
# Number of passengers of each sex.
# `palette` without `hue` is deprecated in seaborn 0.13, so the x variable is
# also passed as `hue` with the redundant legend suppressed.
sns.countplot(x='Sex', hue='Sex', data=Data, palette='viridis', legend=False)
plt.xlabel("Gender")
plt.ylabel("Number of people")
plt.show()

In [None]:
# Split each survival outcome by sex; `hue` supplies the second dimension.
outcome_tick_labels = ["Didn't  survive", 'survived']
sns.countplot(
    data=Data,
    x='Survived',
    hue='Sex',
    palette='viridis',
)
plt.ylabel("Number of people")
plt.xlabel("Survival status")
plt.xticks(ticks=[0, 1], labels=outcome_tick_labels)
plt.show()

In [None]:
# Pie chart of survival outcomes among male passengers.
male_outcomes = Data.loc[Data['Sex'] == 'male', 'Survived']
# Count the men per Survived value (grouping aligns on the row index).
male_outcome_counts = male_outcomes.groupby(Data['Survived']).count()
male_outcome_counts.plot.pie(
    figsize=(3, 6),
    explode=[0, 0.05],
    autopct='%1.1f%%',
    labels=["Didn't survive", "Survived"],
)
plt.title("Male survival rate")
plt.ylabel("")  # hide the Series name pandas would otherwise print as y-label
plt.show()

In [None]:
# Pie chart of survival outcomes among female passengers.
female_outcomes = Data.loc[Data['Sex'] == 'female', 'Survived']
# Count the women per Survived value (grouping aligns on the row index).
female_outcome_counts = female_outcomes.groupby(Data['Survived']).count()
female_outcome_counts.plot.pie(
    figsize=(3, 6),
    explode=[0, 0.05],
    autopct='%1.1f%%',
    labels=["Didn't survive", "Survived"],
)
plt.title("Female survival rate")
plt.ylabel("")  # hide the Series name pandas would otherwise print as y-label
plt.show()

Observation : Survival rate of women was much higher compared to men

Illustrating the passenger count for each ticket class

In [None]:
# Passenger count per ticket class. `hue` repeats the x variable only so the
# palette is applied per class; the resulting legend is suppressed.
sns.countplot(
    data=Data,
    x='Pclass',
    hue='Pclass',
    palette='viridis',
    legend=False,
)
plt.ylabel("Number of people")
plt.xlabel("Pclass")
plt.show()

Visualisation of survival segmented by ticket class

In [None]:
# Survival outcome counts, with one bar per ticket class inside each outcome.
outcome_tick_labels = ['Not survived', 'survived']
sns.countplot(
    data=Data,
    x='Survived',
    hue='Pclass',
    palette='viridis',
)
plt.ylabel("Number of people")
plt.xlabel("Survival status")
plt.xticks(ticks=[0, 1], labels=outcome_tick_labels)
plt.show()

Observation : The people with ticket class 3 had the lowest survival rate

In [None]:
# One panel per sex; within each panel passengers are counted per ticket class
# and coloured by survival outcome.
sns.catplot(
    data=Data,
    kind='count',
    x='Pclass',
    hue='Survived',
    col='Sex',
    palette='viridis',
)
plt.tight_layout()

Observation: Males of the ticket class 3 had the lowest survival rate

Survival distribution based on port of embarkation

In [None]:
# Number of passengers per embarkation port.
# `palette` without `hue` is deprecated in seaborn 0.13, so the x variable is
# also passed as `hue` with the redundant legend suppressed.
sns.countplot(x='Embarked', hue='Embarked', data=Data, palette='viridis', legend=False)
plt.xlabel("Embarked")
plt.ylabel("Number of people")

plt.show()

In [None]:
# Survival outcome counts, with one bar per embarkation port inside each outcome.
outcome_tick_labels = ['Not survived', 'survived']
sns.countplot(
    data=Data,
    x='Survived',
    hue='Embarked',
    palette='viridis',
)
plt.ylabel("Number of people")
plt.xlabel("Survival status")
plt.xticks(ticks=[0, 1], labels=outcome_tick_labels)
plt.show()

Visualisation of number of siblings/spouses (sibsp) and number of parents/children (parch) on board 

In [None]:
# Side-by-side passenger counts for SibSp (left) and Parch (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
for axis, column in zip(axes, ('SibSp', 'Parch')):
    # `palette` without `hue` is deprecated in seaborn 0.13, so the x variable
    # is also passed as `hue` with the redundant legend suppressed.
    sns.countplot(x=column, hue=column, data=Data, ax=axis, palette='viridis', legend=False)
    axis.set_ylabel("Number of people")
plt.show()

In [None]:
# Draw two separate figures: survival outcome counts broken down first by
# SibSp, then by Parch.
family_charts = (
    ('SibSp', "Survival population of Sibsp"),
    ('Parch', "Survival population of Parch"),
)
for family_column, chart_title in family_charts:
    sns.countplot(data=Data, x='Survived', hue=family_column, palette='viridis')
    plt.title(chart_title)
    plt.xlabel("Survival status")
    plt.ylabel("Number of people")
    plt.xticks(ticks=[0, 1], labels=['Not survived', 'survived'])
    plt.show()

Fare and Age patterns with Kernel Density Estimation (KDE)

In [None]:
# Histograms with a KDE overlay: Fare on the left panel, Age on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
fare_axis, age_axis = axes

fare_values = Data['Fare']
age_values = Data['Age'].dropna()  # ignore any ages that are still missing

sns.histplot(fare_values, ax=fare_axis, kde=True)
sns.histplot(age_values, ax=age_axis, kde=True)
plt.show()

Illustrating survival rates across different age groups

In [None]:
# Age bin edges and their labels. With right=False each bin is [lower, upper),
# e.g. 'Infant' covers ages 0 up to (but not including) 5.
cut_points = [0, 5, 12, 18, 35, 60, 100]
label_names = ['Infant', "Child", 'Teenager', "Young Adult", 'Adult', 'Senior']

# Create the "Age_categories" column (a categorical with the labels above).
Data['Age_categories'] = pd.cut(Data['Age'], bins=cut_points, labels=label_names, right=False)

# Mean of Survived per age category, i.e. the survival rate of each group.
# `observed` is passed explicitly: the index is categorical, and relying on the
# default is deprecated in recent pandas (it emits a FutureWarning and the
# default is slated to change).
age_cat_pivot = Data.pivot_table(index="Age_categories", values="Survived", observed=False)

# One colour per age category.
colors = ['blue', 'green', 'orange', 'purple', 'pink', 'brown']

# Bar chart of the survival rate, one differently coloured bar per category.
fig, ax = plt.subplots()
bars = ax.bar(age_cat_pivot.index, age_cat_pivot['Survived'], color=colors)

# Legend built from proxy rectangles in the same colour order as the labels.
handles = [plt.Rectangle((0, 0), 1, 1, color=bar_color) for bar_color in colors]
ax.legend(handles, label_names)
ax.set_title('Survival Rates by Age Category')
ax.set_xlabel('Age Category')
ax.set_ylabel('Survival Rate')
plt.xticks(rotation=40)
plt.show()

Observation : 
* Seniors had the lowest survival rate.
* Infants had the highest survival rate

Analysing Correlation

In [None]:
# Encode the text columns as integers so they can take part in the pairwise
# correlation below.
# NOTE(review): the Embarked codes are arbitrary labels rather than an ordering,
# so its correlation coefficients should be read with caution.
category_codes = {
    'Sex': {'male': 1, 'female': 0},
    'Embarked': {'S': 0, 'C': 1, 'Q': 2},
}
for column, codes in category_codes.items():
    # Only encode columns that are not numeric yet, so re-running the cell is a
    # no-op. `map` yields a numeric column directly instead of relying on the
    # deprecated silent downcasting of `replace`; values missing from the
    # mapping become NaN.
    if not pd.api.types.is_numeric_dtype(Data[column]):
        Data[column] = Data[column].map(codes)

# The stray `Data.head` (no call parentheses, not the last expression) that
# used to sit here displayed nothing and has been removed.

Data_num = Data[['Fare', 'Parch', 'SibSp', 'Age', 'Sex', 'Pclass', 'Embarked', 'Survived']]

# Annotated heatmap of the pairwise correlation matrix.
sns.heatmap(Data_num.corr(), annot=True)
plt.show()
     

Observation : The Fare, Parch, SibSp, Age, Sex, Pclass and Embarked columns all show some correlation with Survived

### Analysis Summary

* **Gender-Based Survival Rates:**
Women exhibited a significantly higher survival rate compared to men. This aligns with the "women and children first" evacuation protocol followed during the Titanic disaster, reflecting the societal norms of the time that prioritized the safety of women and children.

* **Survival Rates by Passenger Class:**
Passengers in Class 3, despite being the largest group, had the lowest survival rates. This suggests a strong link between socio-economic status and survival probability, with higher-class passengers likely having better access to lifeboats and emergency resources.

* **Interaction Between Gender and Passenger Class:**
Further analysis reveals that male passengers in Class 3 had the poorest survival rates. This indicates that both gender and socio-economic status significantly influenced survival chances, with lower-class males being the most at risk.

* **Impact of Age on Survival:**
Infants had the highest survival rate and seniors the lowest, which is consistent with the "women and children first" policy that prioritized children. This highlights the significant role age played in determining survival outcomes.

* **Factors Correlated with Survival:**
Several variables were found to be significantly correlated with survival outcomes, including fare, gender, passenger class (Pclass), and embarkation point (Embarked). These factors are crucial for understanding the dynamics of survival during the Titanic disaster.