In [None]:
# %% [markdown]
# # Data Science Project: Titanic Survival Analysis
# **Folder**: `Computer_Science_Mathematics/Data_Science/Notebooks/`  
# **Dataset**: `../Data/titanic.csv`

In [None]:
# %% [markdown]
# ## 1. Import Libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Apply seaborn's 'whitegrid' style to the plots drawn in the cells below.
sns.set_style('whitegrid')

In [None]:
# %% [markdown]
# ## 2. Load Data
# Read the passenger table from the CSV (relative path).
titanic_csv = '../Data/titanic.csv'
df = pd.read_csv(titanic_csv)

# Report (rows, columns), then end the cell on a preview of the first rows.
print("Data shape:", df.shape)
df.head(n=5)

In [None]:
# %% [markdown]
# ## 3. Data Cleaning
# Handle missing values
# Fill missing ages with the median age and missing embarkation values with the
# most frequent one. The result is assigned back to the column instead of
# calling `fillna(..., inplace=True)` on `df[col]`: that chained in-place form
# operates on an intermediate Series, emits a FutureWarning on pandas 2.2+, and
# leaves `df` unchanged once Copy-on-Write is the default (pandas 3.0).
age_median = df['Age'].median()
embarked_mode = df['Embarked'].mode()[0]  # mode() returns a Series; take its first entry
df['Age'] = df['Age'].fillna(age_median)
df['Embarked'] = df['Embarked'].fillna(embarked_mode)

In [None]:
# Drop irrelevant columns.
# Rebinding `df` (rather than `inplace=True`) together with `errors='ignore'`
# keeps this cell re-runnable: on a second run the columns are already gone and
# the call is a no-op instead of raising KeyError.
df = df.drop(columns=['Cabin', 'PassengerId', 'Ticket'], errors='ignore')

In [None]:
# %% [markdown]
# ## 4. Exploratory Data Analysis (EDA)
# %% [markdown]
# ### Survival Rate by Gender
# savefig() does not create missing folders, so make sure the target exists
# (a fresh checkout may not contain ../Outputs/Figures).
os.makedirs('../Outputs/Figures', exist_ok=True)

# barplot aggregates `Survived` per `Sex` value with its default estimator (the mean).
# NOTE(review): seaborn >= 0.13 deprecates `palette` without `hue`; consider
# `hue='Sex', legend=False` once the minimum seaborn version is confirmed.
plt.figure(figsize=(8,5))
sns.barplot(x='Sex', y='Survived', data=df, palette='viridis')
plt.title('Survival Rate by Gender')
# Write the file before show(), while the figure is still the current one.
plt.savefig('../Outputs/Figures/survival_by_gender.png')
plt.show()

In [None]:
# %% [markdown]
# ### Age Distribution of Passengers
# savefig() does not create missing folders, so make sure the target exists
# (a fresh checkout may not contain ../Outputs/Figures).
os.makedirs('../Outputs/Figures', exist_ok=True)

# Histogram of the `Age` column in 30 bins with a kernel density estimate overlaid.
plt.figure(figsize=(10,6))
sns.histplot(df['Age'], bins=30, kde=True, color='purple')
plt.title('Age Distribution')
# Write the file before show(), while the figure is still the current one.
plt.savefig('../Outputs/Figures/age_distribution.png')
plt.show()

In [None]:
# %% [markdown]
# ### Correlation Heatmap
# savefig() does not create missing folders, so make sure the target exists
# (a fresh checkout may not contain ../Outputs/Figures).
os.makedirs('../Outputs/Figures', exist_ok=True)

# Pairwise correlations of the numeric columns only (`numeric_only=True` skips
# the non-numeric ones), with each coefficient printed in its cell.
plt.figure(figsize=(10,8))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm')
plt.title('Feature Correlations')
# Write the file before show(), while the figure is still the current one.
plt.savefig('../Outputs/Figures/correlation_heatmap.png')
plt.show()

In [None]:
# %% [markdown]
# ## 5. Feature Engineering
# Family size = SibSp + Parch, plus one.
relatives_aboard = df['SibSp'] + df['Parch']
df['FamilySize'] = relatives_aboard + 1