# Titanic Data Analysis

This notebook performs exploratory data analysis on the Titanic dataset, including data overview, cleaning, visualization, and summary of findings.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'whitegrid' style globally so every plot below shares it.
sns.set(style='whitegrid')

In [None]:
# Load the two CSV inputs from the working directory.
# `train_df` is the frame analysed throughout this notebook.
train_df = pd.read_csv('train.csv')
# NOTE(review): `gender_df` is not referenced by any visible cell — confirm it is still needed.
gender_df = pd.read_csv('gender_submission.csv')

## 1. Data Overview
Use `.info()` and `.describe()` to understand the dataset structure and basic statistics.

In [None]:
# Print a concise summary of the frame: column names, non-null counts and dtypes.
train_df.info()

In [None]:
# Summary statistics; by default `describe()` covers only the numeric columns.
# Left as the cell's trailing expression so the notebook renders it as a table.
train_df.describe()

### Categorical Value Counts

In [None]:
# Print the frequency of each distinct value for the categorical columns.
for column in ('Sex', 'Pclass', 'Embarked'):
    print(train_df[column].value_counts())

## 2. Data Cleaning
- Fill missing `Age` with median
- Fill missing `Embarked` with mode

In [None]:
# Impute missing values by assigning the filled column back to the frame.
# The previous form, `train_df['Age'].fillna(..., inplace=True)`, mutates an
# intermediate Series: pandas 2.x emits a FutureWarning for it, and with
# Copy-on-Write enabled the original frame is left unchanged, so the
# imputation would silently not happen.
age_median = train_df['Age'].median()
# `mode()` returns a Series (there may be ties); take the first entry.
embarked_mode = train_df['Embarked'].mode()[0]

train_df['Age'] = train_df['Age'].fillna(age_median)
train_df['Embarked'] = train_df['Embarked'].fillna(embarked_mode)

# Sanity check: both columns must be fully populated after imputation.
assert train_df[['Age', 'Embarked']].notna().all().all(), 'imputation left missing values'

## 3. Exploratory Visualizations

In [None]:
# Count passengers per `Sex`, with bars split by the `Survived` column.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=train_df, x='Sex', hue='Survived', ax=ax)
ax.set_title('Survival by Gender')
plt.show()

In [None]:
# Count passengers per `Pclass`, with bars split by the `Survived` column.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=train_df, x='Pclass', hue='Survived', ax=ax)
ax.set_title('Survival by Class')
plt.show()

In [None]:
# Histogram of the `Age` column in 30 bins.
# Note: the cleaning cell fills missing ages with the median, so when it has
# run, the bin containing the median is inflated by the imputed rows.
fig, ax = plt.subplots(figsize=(8, 5))
train_df['Age'].hist(bins=30, ax=ax)
ax.set_title('Age Distribution')
# Label both axes so the figure can be read on its own.
ax.set_xlabel('Age')
ax.set_ylabel('Count')
plt.show()

## 4. Correlation Heatmap

In [None]:
# Pairwise correlation (pandas default method) between the selected columns.
fig, ax = plt.subplots(figsize=(8, 6))
numeric = train_df[['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]
sns.heatmap(
    numeric.corr(),
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    # Pin the diverging colormap to the full correlation range and centre it
    # on zero, so colour encodes sign and magnitude consistently instead of
    # being rescaled to whatever min/max this particular matrix contains.
    vmin=-1,
    vmax=1,
    center=0,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()

## 5. Summary of Findings
- Females had higher survival rates
- First-class passengers survived more often
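- Younger passengers appear to have had better chances (check the `Age`–`Survived` cell of the correlation heatmap; this notebook has no direct age-vs-survival plot)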
- Higher fare correlates with survival