# Titanic - Exploratory Data Analysis

This notebook explores the Titanic dataset to understand feature distributions, missing values, and correlations with survival.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to every plot drawn after this point.
sns.set_style('whitegrid')

## 1. Load Data

In [None]:
# Read the raw train and test CSVs; paths are relative to the notebook's
# working directory.
train_df = pd.read_csv('../data/raw/train.csv')
test_df = pd.read_csv('../data/raw/test.csv')

# Report the (rows, columns) shape of each split, one per line.
print(
    f"Train shape: {train_df.shape}",
    f"Test shape: {test_df.shape}",
    sep="\n",
)

## 2. Basic Inspection

In [None]:
# Preview the first rows of the training frame (pandas shows 5 by default).
train_df.head()

In [None]:
# Print column dtypes, non-null counts and memory usage for the training frame.
train_df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); by default
# pandas reports these for the numeric columns only.
train_df.describe()

## 3. Missing Values

In [None]:
# Boolean mask of the training frame: True wherever a value is missing.
missing_mask = train_df.isna()

# Draw the mask as a heatmap so gaps show up as contrasting cells per column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(missing_mask, cmap='viridis', cbar=False, ax=ax)
ax.set_title('Missing Values Heatmap')
plt.show()

# Number of missing entries per column.
print(missing_mask.sum())

## 4. Feature Analysis

### Survival by Gender

In [None]:
# Count rows for each 'Survived' value, with separate bars per 'Sex' value.
gender_ax = sns.countplot(data=train_df, x='Survived', hue='Sex')
gender_ax.set_title('Survival by Gender')
plt.show()

### Survival by Class

In [None]:
# Count rows for each 'Survived' value, with separate bars per 'Pclass' value.
class_ax = sns.countplot(data=train_df, x='Survived', hue='Pclass')
class_ax.set_title('Survival by Passenger Class')
plt.show()

### Age Distribution

In [None]:
# Keep only the rows where 'Age' is present before plotting.
known_ages = train_df['Age'].dropna()

# 30-bin histogram of the known ages with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(known_ages, bins=30, kde=True, ax=ax)
ax.set_title('Age Distribution')
plt.show()

### Correlation Matrix

In [None]:
# Restrict to numeric columns; correlation is only computed on those.
numeric_df = train_df.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr()

# Annotated heatmap of the pairwise correlations, values shown to 2 decimals.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()